@tangle-network/agent-eval 0.33.1 → 0.34.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +33 -0
- package/dist/index.d.ts +261 -1
- package/dist/index.js +477 -88
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/package.json +12 -22
package/dist/index.js
CHANGED
|
@@ -333,7 +333,7 @@ var RunCritic = class {
|
|
|
333
333
|
);
|
|
334
334
|
const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
|
|
335
335
|
if (!success) notes.push("run did not complete with pass=true");
|
|
336
|
-
const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((
|
|
336
|
+
const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum3, span) => sum3 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
|
|
337
337
|
const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp01(
|
|
338
338
|
trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score
|
|
339
339
|
) : void 0;
|
|
@@ -348,7 +348,7 @@ var RunCritic = class {
|
|
|
348
348
|
(span) => typeof span.testsTotal === "number" && span.testsTotal > 0
|
|
349
349
|
);
|
|
350
350
|
const testReality = sandboxTests.length ? sandboxTests.reduce(
|
|
351
|
-
(
|
|
351
|
+
(sum3, span) => sum3 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1),
|
|
352
352
|
0
|
|
353
353
|
) / sandboxTests.length : toolSpans2.some(
|
|
354
354
|
(span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))
|
|
@@ -370,7 +370,7 @@ var RunCritic = class {
|
|
|
370
370
|
const costUsd = trace.budget.length ? Math.max(
|
|
371
371
|
...trace.budget.filter((entry) => entry.dimension === "usd").map((entry) => entry.consumed),
|
|
372
372
|
0
|
|
373
|
-
) : llmSpans2.reduce((
|
|
373
|
+
) : llmSpans2.reduce((sum3, span) => sum3 + (span.costUsd ?? 0), 0);
|
|
374
374
|
const wallSeconds = trace.run.endedAt && trace.run.startedAt ? Math.max(0, (trace.run.endedAt - trace.run.startedAt) / 1e3) : 0;
|
|
375
375
|
return {
|
|
376
376
|
success,
|
|
@@ -1960,12 +1960,12 @@ function allocateBudget(policy, args) {
|
|
|
1960
1960
|
return policy.totalUsd / Math.max(1, args.runningCount);
|
|
1961
1961
|
}
|
|
1962
1962
|
function sumFindingCost(findings) {
|
|
1963
|
-
let
|
|
1963
|
+
let sum3 = 0;
|
|
1964
1964
|
for (const f of findings) {
|
|
1965
1965
|
const c = f.metadata?.cost_usd;
|
|
1966
|
-
if (typeof c === "number" && Number.isFinite(c))
|
|
1966
|
+
if (typeof c === "number" && Number.isFinite(c)) sum3 += c;
|
|
1967
1967
|
}
|
|
1968
|
-
return
|
|
1968
|
+
return sum3;
|
|
1969
1969
|
}
|
|
1970
1970
|
function selectPriorFindings(source, analystId) {
|
|
1971
1971
|
if (!source) return void 0;
|
|
@@ -2184,10 +2184,10 @@ function ghCliClient(opts = {}) {
|
|
|
2184
2184
|
await exec("git", ["branch", "-D", input.branchName], { cwd });
|
|
2185
2185
|
await run("git", ["checkout", "-b", input.branchName]);
|
|
2186
2186
|
const { mkdir, writeFile } = await import("fs/promises");
|
|
2187
|
-
const { dirname:
|
|
2187
|
+
const { dirname: dirname6, join: join4, resolve } = await import("path");
|
|
2188
2188
|
for (const change of input.fileChanges) {
|
|
2189
2189
|
const abs = resolve(cwd, change.path);
|
|
2190
|
-
await mkdir(
|
|
2190
|
+
await mkdir(dirname6(abs), { recursive: true });
|
|
2191
2191
|
await writeFile(abs, change.contents, "utf8");
|
|
2192
2192
|
await run("git", ["add", join4(change.path)]);
|
|
2193
2193
|
}
|
|
@@ -3722,6 +3722,178 @@ function liveProofToReleaseTrace(config, trajectory, durationMs) {
|
|
|
3722
3722
|
};
|
|
3723
3723
|
}
|
|
3724
3724
|
|
|
3725
|
+
// src/pr-review-benchmark.ts
|
|
3726
|
+
/**
 * Default relative weights for the five PR-review score dimensions.
 * Used by `aggregatePrReviewScore` as the fallback for any dimension the
 * caller does not override. The values are relative: the aggregate divides
 * by their sum, so they do not need to add up to 1.
 */
var DEFAULT_PR_REVIEW_SCORE_WEIGHTS = {
  recall: 4,
  precision: 2,
  actionability: 1.5,
  severityCalibration: 1,
  lowNoise: 1
};
|
|
3733
|
+
/**
 * Returns the comments of an audit case that are attributed to one source.
 *
 * @param {{comments: Array<{source: *}>}} auditCase - case whose `comments` are filtered.
 * @param {*} source - value compared with `===` against each comment's `source`.
 * @returns {Array} new array holding the matching comments, in original order.
 */
function commentsForSource(auditCase, source) {
  const isFromSource = ({ source: commentSource }) => commentSource === source;
  return auditCase.comments.filter(isFromSource);
}
|
|
3736
|
+
/**
 * Scores a single reviewer source on an audit case.
 *
 * Narrows the case's comments to those attributed to `source` and hands them
 * to `scorePrReviewComments` together with the untouched `weights`.
 *
 * @param {object} auditCase - audit case carrying `comments` and reference findings.
 * @param {*} source - source identifier to score.
 * @param {object} [weights={}] - optional per-dimension weight overrides.
 * @returns {object} whatever `scorePrReviewComments` returns for that subset.
 */
function scorePrReviewSource(auditCase, source, weights = {}) {
  const sourceComments = commentsForSource(auditCase, source);
  return scorePrReviewComments(auditCase, sourceComments, source, weights);
}
|
|
3739
|
+
/**
 * Scores a set of review comments against an audit case's reference findings.
 *
 * Produces five ratios (recall, precision, actionability, severity
 * calibration, low noise), folds them into one aggregate via
 * `aggregatePrReviewScore`, and attaches explanatory notes.
 *
 * @param {object} auditCase - supplies `id` and `referenceFindings`.
 * @param {Array<object>} comments - the comments being scored.
 * @param {*} source - echoed back on the result as `source`.
 * @param {object} [weights={}] - per-dimension weight overrides for the aggregate.
 * @returns {object} score record with counts, matches, the five ratios,
 *   `aggregate`, and `notes`.
 */
function scorePrReviewComments(auditCase, comments, source, weights = {}) {
  const references = auditCase.referenceFindings;
  const referenceCount = references.length;
  const commentCount = comments.length;

  const matchedFindings = matchReferenceFindings(references, comments);
  const matchedCommentIds = new Set(matchedFindings.map((match) => match.commentId));
  const positiveComments = comments.filter(({ outcome }) => isPositiveOutcome(outcome));
  const negativeComments = comments.filter(({ outcome }) => isNegativeOutcome(outcome));
  const actionableComments = comments.filter(isActionableComment);
  const severityComments = comments.filter((comment) => comment.severity);
  const severityAligned = severityComments.filter(
    (comment) => isSeverityAligned(comment, references, matchedFindings)
  );

  // Recall: share of references matched. With no references, an empty review
  // is perfect and any comment at all scores zero.
  let recall;
  if (referenceCount) {
    recall = matchedFindings.length / referenceCount;
  } else {
    recall = commentCount === 0 ? 1 : 0;
  }

  // Precision: prefer explicit outcome labels; without labels fall back to the
  // share of comments that matched a reference; with no comments at all it is
  // 1 only when there was nothing to find.
  const labelledCount = positiveComments.length + negativeComments.length;
  let precision2;
  if (labelledCount > 0) {
    precision2 = positiveComments.length / labelledCount;
  } else if (commentCount > 0) {
    precision2 = matchedCommentIds.size / commentCount;
  } else {
    precision2 = referenceCount === 0 ? 1 : 0;
  }

  const actionability = commentCount ? actionableComments.length / commentCount : 1;

  // No comment carries a severity: neutral 0.5 when something matched, else 1.
  let severityCalibration;
  if (severityComments.length) {
    severityCalibration = severityAligned.length / severityComments.length;
  } else {
    severityCalibration = matchedFindings.length ? 0.5 : 1;
  }

  const lowNoise = commentCount ? 1 - negativeComments.length / commentCount : 1;

  const aggregate2 = aggregatePrReviewScore(
    { recall, precision: precision2, actionability, severityCalibration, lowNoise },
    weights
  );
  const notes = buildScoreNotes({
    comments,
    referenceCount,
    matchedFindings,
    negativeComments,
    actionableComments
  });

  return {
    caseId: auditCase.id,
    source,
    commentCount,
    referenceCount,
    matchedFindings,
    recall,
    precision: precision2,
    actionability,
    severityCalibration,
    lowNoise,
    aggregate: aggregate2,
    notes
  };
}
|
|
3780
|
+
/**
 * Rolls per-case PR-review scores up into one summary row per source.
 *
 * Scores are grouped by `score.source` in first-seen order; each group yields
 * its case count, total comment count, and the mean of every score dimension.
 * Rows are returned sorted by `aggregateMean`, highest first.
 *
 * @param {Array<object>} scores - records as produced by `scorePrReviewComments`.
 * @returns {Array<object>} one summary per distinct source.
 */
function summarizePrReviewBenchmark(scores) {
  const bySource = /* @__PURE__ */ new Map();
  for (const score of scores) {
    // Append to the existing bucket instead of re-spreading it on every score,
    // which copied the whole bucket each time (quadratic per source).
    const bucket = bySource.get(score.source);
    if (bucket) bucket.push(score);
    else bySource.set(score.source, [score]);
  }
  const meanOf = (rows, pick) => mean(rows.map(pick));
  return [...bySource.entries()].map(([source, sourceScores]) => ({
    source,
    caseCount: sourceScores.length,
    commentCount: sum(sourceScores.map((score) => score.commentCount)),
    aggregateMean: meanOf(sourceScores, (score) => score.aggregate),
    recallMean: meanOf(sourceScores, (score) => score.recall),
    precisionMean: meanOf(sourceScores, (score) => score.precision),
    actionabilityMean: meanOf(sourceScores, (score) => score.actionability),
    severityCalibrationMean: meanOf(sourceScores, (score) => score.severityCalibration),
    lowNoiseMean: meanOf(sourceScores, (score) => score.lowNoise)
  })).sort((a, b) => b.aggregateMean - a.aggregateMean);
}
|
|
3797
|
+
/**
 * Collapses the five PR-review dimensions into one weighted average.
 *
 * Each weight comes from `weights` when that entry is a finite number,
 * otherwise from `DEFAULT_PR_REVIEW_SCORE_WEIGHTS`; negative weights count as
 * zero. The same resolved weight is used in both numerator and denominator,
 * so a negative, non-finite, or unknown extra entry in `weights` can no
 * longer skew the result or turn it into NaN. For non-negative overrides of
 * the five known dimensions the result matches the previous implementation.
 *
 * @param {{recall: number, precision: number, actionability: number,
 *   severityCalibration: number, lowNoise: number}} dimensions - raw scores;
 *   each is passed through `clamp01` before weighting.
 * @param {object} [weights={}] - optional per-dimension overrides.
 * @returns {number} weighted mean, or 0 when every weight resolves to zero.
 */
function aggregatePrReviewScore(dimensions, weights = {}) {
  const overrides = weights ?? {};
  const dimensionNames = ["recall", "precision", "actionability", "severityCalibration", "lowNoise"];
  let weightSum = 0;
  let weightedTotal = 0;
  for (const name of dimensionNames) {
    const override = overrides[name];
    const resolved = typeof override === "number" && Number.isFinite(override) ? override : DEFAULT_PR_REVIEW_SCORE_WEIGHTS[name];
    const weight = Math.max(0, resolved);
    weightSum += weight;
    weightedTotal += weight * clamp01(dimensions[name]);
  }
  if (weightSum <= 0) return 0;
  return weightedTotal / weightSum;
}
|
|
3803
|
+
/**
 * Greedily pairs each reference finding with its best-scoring comment.
 *
 * References are processed in order. For each one, every comment not yet
 * claimed is scored with `matchScore`; the highest score of at least 0.55
 * wins (the earliest comment wins ties) and that comment id becomes
 * unavailable to later references.
 *
 * @param {Array<object>} references - reference findings, each with an `id`.
 * @param {Array<object>} comments - candidate comments, each with an `id`.
 * @returns {Array<{referenceId: *, commentId: *, score: number}>} one entry
 *   per reference that found a match.
 */
function matchReferenceFindings(references, comments) {
  const MIN_MATCH_SCORE = 0.55;
  const claimedIds = /* @__PURE__ */ new Set();
  const matches = [];
  for (const reference of references) {
    let best;
    for (const comment of comments) {
      if (claimedIds.has(comment.id)) continue;
      const score = matchScore(reference, comment);
      // Negated `>=` so a NaN score is rejected as well.
      if (!(score >= MIN_MATCH_SCORE)) continue;
      if (best === undefined || score > best.score) best = { comment, score };
    }
    if (best === undefined) continue;
    claimedIds.add(best.comment.id);
    matches.push({ referenceId: reference.id, commentId: best.comment.id, score: best.score });
  }
  return matches;
}
|
|
3815
|
+
/**
 * Heuristic similarity between a reference finding and a review comment.
 *
 * Additive signals, clamped through `clamp01` at the end:
 *   +1    the reference lists this comment id in `sourceCommentIds`
 *   +0.35 both carry a path and the paths are equal after `normalizePath`
 *   +0.15 both carry a line and the lines differ by at most 3
 *   +0.5 × the fraction of the reference's terms (keywords plus title tokens,
 *          normalized, at least 3 characters) that occur in the comment body
 *
 * @param {object} reference - reference finding (`title` required; `path`,
 *   `line`, `keywords`, `sourceCommentIds` optional).
 * @param {object} comment - review comment (`id`, `body`; `path`, `line` optional).
 * @returns {number} result of `clamp01` applied to the summed signals.
 */
function matchScore(reference, comment) {
  const linkedBonus = reference.sourceCommentIds?.includes(comment.id) ? 1 : 0;

  const samePath = reference.path && comment.path && normalizePath(reference.path) === normalizePath(comment.path);
  const pathBonus = samePath ? 0.35 : 0;

  const nearbyLine = reference.line && comment.line && Math.abs(reference.line - comment.line) <= 3;
  const lineBonus = nearbyLine ? 0.15 : 0;

  // Distinct normalized terms worth matching; very short ones are ignored.
  const wantedTerms = /* @__PURE__ */ new Set();
  for (const rawTerm of [...reference.keywords ?? [], ...tokenize(reference.title)]) {
    const term = normalizeTerm(rawTerm);
    if (term.length >= 3) wantedTerms.add(term);
  }

  let keywordBonus = 0;
  if (wantedTerms.size > 0) {
    const bodyTerms = new Set(tokenize(comment.body).map(normalizeTerm));
    let overlap = 0;
    for (const term of wantedTerms) {
      if (bodyTerms.has(term)) overlap += 1;
    }
    keywordBonus = 0.5 * (overlap / wantedTerms.size);
  }

  return clamp01(linkedBonus + pathBonus + lineBonus + keywordBonus);
}
|
|
3831
|
+
/**
 * Decides whether a review comment is concrete enough to act on.
 *
 * A comment qualifies when it is anchored (it has a `path`, or its body names
 * a code location such as "file", "line" or "function") and its body also
 * contains an action word such as "fix", "add" or "validate". Both word lists
 * match whole words, case-insensitively.
 *
 * @param {{body: string, path?: string}} comment - comment to inspect.
 * @returns {boolean} true when the comment is anchored and suggests an action.
 */
function isActionableComment(comment) {
  const text = comment.body.trim();
  const mentionsLocation = /\b(file|line|function|method|class|module|test|migration)\b/i;
  const suggestsAction = /\b(fix|change|add|remove|guard|check|reject|validate|test|assert|return|throw|fail|block)\b/i;
  const isAnchored = Boolean(comment.path) || mentionsLocation.test(text);
  return isAnchored && suggestsAction.test(text);
}
|
|
3840
|
+
/**
 * Checks whether a comment's severity is plausible given what it matched.
 *
 * - No severity on the comment: not aligned.
 * - Comment matched no reference: aligned only for "nit" or "low".
 * - Comment matched a reference: aligned when the two `severityRank` values
 *   differ by at most one step.
 *
 * @param {{id: *, severity?: string}} comment - comment under test.
 * @param {Array<{id: *, severity: string}>} references - reference findings.
 * @param {Array<{referenceId: *, commentId: *}>} matches - reference/comment pairs.
 * @returns {boolean} whether the severity counts as aligned.
 */
function isSeverityAligned(comment, references, matches) {
  const { severity } = comment;
  if (!severity) return false;

  const match = matches.find(({ commentId }) => commentId === comment.id);
  if (!match) return ["nit", "low"].includes(severity);

  const reference = references.find(({ id }) => id === match.referenceId);
  if (!reference) return false;

  const rankGap = severityRank(severity) - severityRank(reference.severity);
  return Math.abs(rankGap) <= 1;
}
|
|
3848
|
+
/**
 * Builds the human-readable notes attached to a PR-review score.
 *
 * Up to three notes, always in this order: no reference was matched, some
 * comments carry a negative outcome, and none of the comments was actionable.
 *
 * @param {{comments: Array, referenceCount: number, matchedFindings: Array,
 *   negativeComments: Array, actionableComments: Array}} input - counts and
 *   subsets computed by the scorer.
 * @returns {string[]} the notes that apply (possibly empty).
 */
function buildScoreNotes(input) {
  // Each rule pairs a lazily evaluated condition with its lazily built message.
  const rules = [
    [
      () => input.referenceCount > 0 && input.matchedFindings.length === 0,
      () => "no reference findings matched"
    ],
    [
      () => input.negativeComments.length > 0,
      () => `${input.negativeComments.length} comment(s) labelled rejected/duplicate/noise`
    ],
    [
      () => input.comments.length > 0 && input.actionableComments.length === 0,
      () => "comments were not actionable enough for a PR reviewer benchmark"
    ]
  ];
  return rules.filter(([applies]) => applies()).map(([, describe]) => describe());
}
|
|
3861
|
+
/**
 * Tells whether a comment outcome counts in the reviewer's favour.
 *
 * @param {*} outcome - outcome label of a comment.
 * @returns {boolean} true only for "accepted" or "fixed".
 */
function isPositiveOutcome(outcome) {
  const positiveOutcomes = ["accepted", "fixed"];
  return positiveOutcomes.includes(outcome);
}
|
|
3864
|
+
/**
 * Tells whether a comment outcome counts against the reviewer.
 *
 * @param {*} outcome - outcome label of a comment.
 * @returns {boolean} true only for "rejected", "duplicate" or "noise".
 */
function isNegativeOutcome(outcome) {
  const negativeOutcomes = ["rejected", "duplicate", "noise"];
  return negativeOutcomes.includes(outcome);
}
|
|
3867
|
+
/**
 * Maps a severity label to an ordinal rank.
 *
 * "nit" is 1, "low" 2, "medium" 3, "high" 4 and "critical" 5. Any other
 * value yields `undefined`.
 *
 * @param {*} severity - severity label.
 * @returns {number|undefined} rank from 1 to 5, or `undefined` when unknown.
 */
function severityRank(severity) {
  // Ordered from least to most severe; rank is position + 1.
  const ascending = ["nit", "low", "medium", "high", "critical"];
  const position = ascending.indexOf(severity);
  return position === -1 ? void 0 : position + 1;
}
|
|
3881
|
+
/**
 * Splits text into tokens made of letters, digits and the characters
 * `_ . $ / -`, so file paths and dotted or hyphenated identifiers stay whole.
 *
 * @param {string} input - text to split.
 * @returns {string[]} tokens in order of appearance; empty when none are found.
 */
function tokenize(input) {
  const tokenPattern = /[a-zA-Z0-9_.$/-]+/g;
  return Array.from(input.matchAll(tokenPattern), (found) => found[0]);
}
|
|
3884
|
+
/**
 * Canonical form of a token for comparison: lower-cased, with any leading and
 * trailing run of characters other than `a-z`, `0-9` and `_` removed.
 * Characters in the middle of the token are left alone.
 *
 * @param {string} input - raw token.
 * @returns {string} normalized token (possibly empty).
 */
function normalizeTerm(input) {
  const lowered = input.toLowerCase();
  const edgeJunk = /^[^a-z0-9_]+|[^a-z0-9_]+$/g;
  return lowered.replace(edgeJunk, "");
}
|
|
3887
|
+
/**
 * Strips a single leading `./` (with any number of slashes) from a path so
 * that `./src/a.js` and `src/a.js` compare equal. Nothing else is altered.
 *
 * @param {string} input - path as written in a finding or comment.
 * @returns {string} the path without the leading current-directory prefix.
 */
function normalizePath(input) {
  const leadingDotSlash = /^\.\/+/;
  return input.replace(leadingDotSlash, "");
}
|
|
3890
|
+
/**
 * Arithmetic mean of a list of numbers, using `sum` for the total.
 *
 * @param {number[]} values - numbers to average.
 * @returns {number} the mean, or 0 for an empty list.
 */
function mean(values) {
  const count = values.length;
  if (!count) return 0;
  return sum(values) / count;
}
|
|
3893
|
+
/**
 * Adds up a list of numbers from left to right, starting at 0.
 *
 * @param {number[]} values - numbers to add.
 * @returns {number} the total; 0 for an empty list.
 */
function sum(values) {
  let total = 0;
  values.forEach((value) => {
    total += value;
  });
  return total;
}
|
|
3896
|
+
|
|
3725
3897
|
// src/production-loop.ts
|
|
3726
3898
|
async function runProductionLoop(opts) {
|
|
3727
3899
|
validate2(opts);
|
|
@@ -5217,14 +5389,14 @@ async function runHarnessExperiment(config) {
|
|
|
5217
5389
|
const score = config.score ?? ((trace) => critic.scoreTrace(trace));
|
|
5218
5390
|
const results = await mapLimit(jobs, config.parallelism ?? 1, async (request) => {
|
|
5219
5391
|
const trace = await config.adapter.run(request);
|
|
5220
|
-
const
|
|
5392
|
+
const runScore2 = await score(trace, request);
|
|
5221
5393
|
const result = {
|
|
5222
5394
|
variant: request.variant,
|
|
5223
5395
|
scenario: request.scenario,
|
|
5224
5396
|
trialIndex: request.trialIndex,
|
|
5225
5397
|
trace,
|
|
5226
|
-
score:
|
|
5227
|
-
aggregate: aggregateRunScore(
|
|
5398
|
+
score: runScore2,
|
|
5399
|
+
aggregate: aggregateRunScore(runScore2, config.weights)
|
|
5228
5400
|
};
|
|
5229
5401
|
await config.onResult?.(result);
|
|
5230
5402
|
return result;
|
|
@@ -5251,10 +5423,10 @@ function summarizeHarnessResults(results) {
|
|
|
5251
5423
|
return {
|
|
5252
5424
|
variant,
|
|
5253
5425
|
runs,
|
|
5254
|
-
aggregateMean:
|
|
5255
|
-
passRate:
|
|
5256
|
-
costUsdMean:
|
|
5257
|
-
wallSecondsMean:
|
|
5426
|
+
aggregateMean: mean2(runs.map((r) => r.aggregate)),
|
|
5427
|
+
passRate: mean2(runs.map((r) => r.score.success)),
|
|
5428
|
+
costUsdMean: mean2(runs.map((r) => r.score.costUsd)),
|
|
5429
|
+
wallSecondsMean: mean2(runs.map((r) => r.score.wallSeconds)),
|
|
5258
5430
|
scoreMean: meanRunScore(runs.map((r) => r.score))
|
|
5259
5431
|
};
|
|
5260
5432
|
}).sort((a, b) => b.aggregateMean - a.aggregateMean);
|
|
@@ -5291,22 +5463,22 @@ async function mapLimit(items, limit, fn) {
|
|
|
5291
5463
|
);
|
|
5292
5464
|
return results;
|
|
5293
5465
|
}
|
|
5294
|
-
function
|
|
5295
|
-
return values.length ? values.reduce((
|
|
5466
|
+
function mean2(values) {
|
|
5467
|
+
return values.length ? values.reduce((sum3, value) => sum3 + value, 0) / values.length : 0;
|
|
5296
5468
|
}
|
|
5297
5469
|
function meanRunScore(scores) {
|
|
5298
5470
|
return {
|
|
5299
|
-
success:
|
|
5300
|
-
goalProgress:
|
|
5301
|
-
repoGroundedness:
|
|
5302
|
-
driftPenalty:
|
|
5303
|
-
toolUseQuality:
|
|
5304
|
-
patchQuality:
|
|
5305
|
-
testReality:
|
|
5306
|
-
finalGate:
|
|
5307
|
-
reviewerBlockers:
|
|
5308
|
-
costUsd:
|
|
5309
|
-
wallSeconds:
|
|
5471
|
+
success: mean2(scores.map((s) => s.success)),
|
|
5472
|
+
goalProgress: mean2(scores.map((s) => s.goalProgress)),
|
|
5473
|
+
repoGroundedness: mean2(scores.map((s) => s.repoGroundedness)),
|
|
5474
|
+
driftPenalty: mean2(scores.map((s) => s.driftPenalty)),
|
|
5475
|
+
toolUseQuality: mean2(scores.map((s) => s.toolUseQuality)),
|
|
5476
|
+
patchQuality: mean2(scores.map((s) => s.patchQuality)),
|
|
5477
|
+
testReality: mean2(scores.map((s) => s.testReality)),
|
|
5478
|
+
finalGate: mean2(scores.map((s) => s.finalGate)),
|
|
5479
|
+
reviewerBlockers: mean2(scores.map((s) => s.reviewerBlockers)),
|
|
5480
|
+
costUsd: mean2(scores.map((s) => s.costUsd)),
|
|
5481
|
+
wallSeconds: mean2(scores.map((s) => s.wallSeconds)),
|
|
5310
5482
|
notes: scores.flatMap((s) => s.notes ?? [])
|
|
5311
5483
|
};
|
|
5312
5484
|
}
|
|
@@ -5645,7 +5817,7 @@ function rankRows(rows, weights) {
|
|
|
5645
5817
|
}
|
|
5646
5818
|
return [...buckets.entries()].map(([variantId, values]) => ({
|
|
5647
5819
|
variantId,
|
|
5648
|
-
mean: values.reduce((
|
|
5820
|
+
mean: values.reduce((sum3, value) => sum3 + value, 0) / values.length,
|
|
5649
5821
|
runs: values.length
|
|
5650
5822
|
})).sort((a, b) => b.mean - a.mean);
|
|
5651
5823
|
}
|
|
@@ -5815,6 +5987,22 @@ var BudgetGuard = class {
|
|
|
5815
5987
|
}
|
|
5816
5988
|
};
|
|
5817
5989
|
|
|
5990
|
+
// src/agent-profile.ts
|
|
5991
|
+
import { createHash as createHash2 } from "crypto";
|
|
5992
|
+
/**
 * Computes a SHA-256 hex digest identifying an agent profile's behaviour.
 *
 * The digest covers the trimmed model, sorted copies of `skills` and `tools`,
 * `promptVersion` (null when absent) and `metadata` (empty object when
 * absent). The profile's `id` is not hashed; it only appears in the error
 * message. The value is passed through `canonicalize` (defined elsewhere in
 * this file; presumably it makes the JSON key order stable, confirm) before
 * being serialized.
 *
 * @param {object} profile - agent profile; `model` must be a non-blank string.
 * @returns {string} hex-encoded SHA-256 digest.
 * @throws {ValidationError} when `model` is missing, not a string, or blank.
 */
function agentProfileHash(profile) {
  const { model } = profile;
  const hasModel = typeof model === "string" && model.trim().length > 0;
  if (!hasModel) {
    throw new ValidationError(`AgentProfile "${profile.id}" has no model \u2014 cannot hash`);
  }

  // Sort copies so the caller's arrays are left untouched and order is irrelevant.
  const sortedCopy = (list) => [...list ?? []].sort();
  const behaviour = {
    model: model.trim(),
    skills: sortedCopy(profile.skills),
    promptVersion: profile.promptVersion ?? null,
    tools: sortedCopy(profile.tools),
    metadata: profile.metadata ?? {}
  };

  const hasher = createHash2("sha256");
  hasher.update(JSON.stringify(canonicalize(behaviour)));
  return hasher.digest("hex");
}
|
|
6005
|
+
|
|
5818
6006
|
// src/cost-tracker.ts
|
|
5819
6007
|
var CostTracker = class {
|
|
5820
6008
|
byScenario = /* @__PURE__ */ new Map();
|
|
@@ -6221,6 +6409,194 @@ function isObject(v) {
|
|
|
6221
6409
|
return typeof v === "object" && v !== null && !Array.isArray(v);
|
|
6222
6410
|
}
|
|
6223
6411
|
|
|
6412
|
+
// src/scorecard.ts
|
|
6413
|
+
import { appendFileSync as appendFileSync2, existsSync as existsSync4, mkdirSync as mkdirSync2, readFileSync as readFileSync3 } from "fs";
|
|
6414
|
+
import { dirname as dirname2 } from "path";
|
|
6415
|
+
/**
 * Median of a list of numbers, computed on a sorted copy so the input is not
 * reordered.
 *
 * @param {number[]} xs - values to summarize.
 * @returns {number} the middle value, the mean of the two middle values for
 *   an even count, or 0 for an empty list.
 */
function median(xs) {
  const count = xs.length;
  if (count === 0) return 0;
  const ascending = Array.from(xs).sort((left, right) => left - right);
  const upperMiddle = Math.floor(count / 2);
  if (count % 2 !== 0) return ascending[upperMiddle];
  return (ascending[upperMiddle - 1] + ascending[upperMiddle]) / 2;
}
|
|
6421
|
+
/**
 * Picks the score that represents a run: the holdout score when it is
 * present, otherwise the search score.
 *
 * @param {{outcome: {holdoutScore?: number, searchScore?: number}}} run - run record.
 * @returns {number|undefined|null} `holdoutScore` unless it is null/undefined,
 *   in which case `searchScore` is returned as-is.
 */
function runScore(run) {
  const { outcome } = run;
  const holdout = outcome.holdoutScore;
  // `== null` deliberately covers both null and undefined, like `??`.
  return holdout == null ? outcome.searchScore : holdout;
}
|
|
6424
|
+
/**
 * Averages the judges' per-dimension means across a set of runs.
 *
 * Reads `run.outcome.judgeScores.perDimMean` from each run that has it and
 * averages every dimension over the runs that reported a finite value for it.
 * Non-finite values are ignored.
 *
 * @param {Array<{outcome: {judgeScores?: {perDimMean?: Object<string, number>}}}>} runs
 * @returns {Object<string, number>|undefined} mean per dimension, or
 *   `undefined` when no run contributed any finite value.
 */
function aggregatePerDimension(runs) {
  const tallies = /* @__PURE__ */ new Map();
  for (const { outcome } of runs) {
    const perDim = outcome.judgeScores?.perDimMean;
    if (!perDim) continue;
    for (const dim of Object.keys(perDim)) {
      const value = perDim[dim];
      if (!Number.isFinite(value)) continue;
      let tally = tallies.get(dim);
      if (tally === undefined) {
        tally = { total: 0, count: 0 };
        tallies.set(dim, tally);
      }
      tally.total += value;
      tally.count += 1;
    }
  }
  if (tallies.size === 0) return void 0;
  const averages = {};
  tallies.forEach((tally, dim) => {
    averages[dim] = tally.total / tally.count;
  });
  return averages;
}
|
|
6442
|
+
/**
 * Converts a batch of runs into scorecard lines, one per scenario.
 *
 * Runs are grouped by `scenarioId` (runs without one are skipped). Each group
 * becomes a line carrying the profile hash, the model, the profile itself and
 * an entry with the commit, timestamp, individual scores, their median as
 * `composite`, and the ids of the runs that were scored.
 *
 * Only runs whose score is a finite number are counted. Previously only
 * `undefined` was dropped, so a `null` or NaN score reached `median` and the
 * persisted `scores` array; `aggregatePerDimension` already ignores
 * non-finite values in the same way.
 *
 * @param {Array<object>} runs - run records with `scenarioId`, `runId`, `outcome`.
 * @param {{profile: object, commitSha: *, timestamp?: string}} opts - profile
 *   to attribute the runs to, commit label, and optional timestamp (defaults
 *   to the current time in ISO format).
 * @returns {Array<object>} scorecard lines; scenarios with no scored run are omitted.
 * @throws whatever `agentProfileHash` throws for an invalid profile.
 */
function recordRuns(runs, opts) {
  const profileHash = agentProfileHash(opts.profile);
  const timestamp = opts.timestamp ?? (/* @__PURE__ */ new Date()).toISOString();
  const byScenario = /* @__PURE__ */ new Map();
  for (const run of runs) {
    const scenarioId = run.scenarioId;
    if (!scenarioId) continue;
    const bucket = byScenario.get(scenarioId);
    if (bucket) bucket.push(run);
    else byScenario.set(scenarioId, [run]);
  }
  const lines = [];
  for (const [scenarioId, scenarioRuns] of byScenario) {
    // Keep only usable numeric scores so the median and later statistics
    // are not skewed by null/NaN placeholders.
    const scored = scenarioRuns.map((run) => ({ run, score: runScore(run) })).filter((s) => typeof s.score === "number" && Number.isFinite(s.score));
    if (scored.length === 0) continue;
    const scores = scored.map((s) => s.score);
    const entry = {
      commitSha: opts.commitSha,
      timestamp,
      scores,
      composite: median(scores),
      runIds: scored.map((s) => s.run.runId)
    };
    // Per-dimension means are taken over every run of the scenario,
    // including runs that had no usable overall score.
    const perDimension = aggregatePerDimension(scenarioRuns);
    if (perDimension) entry.perDimension = perDimension;
    lines.push({
      scenarioId,
      profileHash,
      model: opts.profile.model,
      profile: opts.profile,
      entry
    });
  }
  return lines;
}
|
|
6477
|
+
/**
 * Appends scorecard lines to a JSONL log, one JSON document per line.
 *
 * Does nothing for an empty batch. Otherwise the log's parent directory is
 * created if needed and the batch is written in one append, terminated by a
 * newline.
 *
 * @param {string} logPath - path of the JSONL log file.
 * @param {Array<object>} lines - records to serialize and append.
 * @returns {void}
 */
function appendScorecard(logPath, lines) {
  if (lines.length === 0) return;
  const parentDir = dirname2(logPath);
  mkdirSync2(parentDir, { recursive: true });
  const payload = lines.map((line) => JSON.stringify(line)).join("\n");
  appendFileSync2(logPath, `${payload}\n`);
}
|
|
6483
|
+
/**
 * Convenience wrapper: builds scorecard lines with `recordRuns`, appends them
 * to the log with `appendScorecard`, and returns them to the caller.
 *
 * @param {string} logPath - path of the JSONL scorecard log.
 * @param {Array<object>} runs - runs to record.
 * @param {object} opts - options forwarded unchanged to `recordRuns`.
 * @returns {Array<object>} the lines that were appended.
 */
function recordRunsToScorecard(logPath, runs, opts) {
  const recorded = recordRuns(runs, opts);
  appendScorecard(logPath, recorded);
  return recorded;
}
|
|
6488
|
+
/**
 * Reads a JSONL scorecard log and folds it into per-cell timelines.
 *
 * A cell is keyed by `scenarioId::profileHash`; every accepted line appends
 * its `entry` to that cell's timeline, which is finally sorted by timestamp.
 * The last profile seen for each hash is collected in `profiles`.
 *
 * The loader is best-effort: blank lines, lines that are not valid JSON,
 * lines missing `scenarioId`, `profileHash` or `entry`, and lines whose
 * entry has no string `timestamp` are all skipped. The last case previously
 * made the timeline sort throw, because it calls `localeCompare` on the
 * timestamp.
 *
 * @param {string} logPath - path of the JSONL scorecard log.
 * @returns {{cells: Array<object>, profiles: Object<string, object>}} empty
 *   when the file does not exist.
 */
function loadScorecard(logPath) {
  if (!existsSync4(logPath)) return { cells: [], profiles: {} };
  const cells = /* @__PURE__ */ new Map();
  const profiles = {};
  for (const raw of readFileSync3(logPath, "utf8").split("\n")) {
    const line = raw.trim();
    if (!line) continue;
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Deliberate: a corrupt line must not make the whole log unreadable.
      continue;
    }
    if (!parsed?.scenarioId || !parsed.profileHash || !parsed.entry) continue;
    const { entry } = parsed;
    // The sort below needs a string timestamp on every entry.
    if (typeof entry !== "object" || typeof entry.timestamp !== "string") continue;
    const key = `${parsed.scenarioId}::${parsed.profileHash}`;
    let cell = cells.get(key);
    if (!cell) {
      cell = {
        scenarioId: parsed.scenarioId,
        profileHash: parsed.profileHash,
        model: parsed.model,
        timeline: []
      };
      cells.set(key, cell);
    }
    cell.timeline.push(entry);
    if (parsed.profile) profiles[parsed.profileHash] = parsed.profile;
  }
  for (const cell of cells.values()) {
    cell.timeline.sort((a, b) => a.timestamp.localeCompare(b.timestamp));
  }
  return { cells: [...cells.values()], profiles };
}
|
|
6521
|
+
/**
 * Compares the latest entry of every scorecard cell with a baseline entry and
 * labels the cell "improved", "regressed", "flat" or "new".
 *
 * Baseline choice: with `opts.baselineCommit`, the most recent entry for that
 * commit other than the current one; otherwise the entry just before the
 * current one. A cell without a baseline is reported as "new"; a cell with an
 * empty timeline is skipped.
 *
 * Verdict: when both entries hold at least two scores, the change must pass
 * both the effect-size test (|Cohen's d| >= `minEffect`) and the Welch t-test
 * (p <= `maxP`); otherwise the composite delta must reach `minDelta` in
 * absolute value. The sign of the delta then picks the direction.
 *
 * @param {{cells: Array<object>}} scorecard - as returned by `loadScorecard`.
 * @param {{minEffect?: number, maxP?: number, minDelta?: number,
 *   baselineCommit?: *}} [opts={}] - thresholds (defaults 0.5, 0.05, 0.05)
 *   and optional baseline commit.
 * @returns {{cells: Array<object>, summary: {improved: number,
 *   regressed: number, flat: number, new: number}}}
 */
function diffScorecard(scorecard, opts = {}) {
  const minEffect = opts.minEffect ?? 0.5;
  const maxP = opts.maxP ?? 0.05;
  const minDelta = opts.minDelta ?? 0.05;

  const directionOf = (delta) => (delta > 0 ? "improved" : "regressed");

  // Walks the timeline from newest to oldest when a specific commit is requested.
  const pickBaseline = (timeline, current) => {
    if (!opts.baselineCommit) return timeline[timeline.length - 2];
    for (let index = timeline.length - 1; index >= 0; index -= 1) {
      const candidate = timeline[index];
      if (candidate.commitSha === opts.baselineCommit && candidate !== current) return candidate;
    }
    return void 0;
  };

  const cells = [];
  const summary = { improved: 0, regressed: 0, flat: 0, new: 0 };

  for (const cell of scorecard.cells) {
    const { timeline } = cell;
    if (timeline.length === 0) continue;

    const current = timeline[timeline.length - 1];
    const baseline = pickBaseline(timeline, current);
    const row = {
      scenarioId: cell.scenarioId,
      profileHash: cell.profileHash,
      model: cell.model,
      current: current.composite,
      currentCommit: current.commitSha
    };

    if (!baseline) {
      row.verdict = "new";
      row.baseline = null;
      row.delta = null;
      row.cohensD = null;
      row.pValue = null;
      row.baselineCommit = null;
    } else {
      const delta = current.composite - baseline.composite;
      let effectSize = null;
      let pValue = null;
      let moved;
      if (baseline.scores.length >= 2 && current.scores.length >= 2) {
        effectSize = cohensD(baseline.scores, current.scores);
        const tTest = welchsTTest(baseline.scores, current.scores);
        pValue = Number.isFinite(tTest.p) ? tTest.p : null;
        moved = Math.abs(effectSize) >= minEffect && pValue !== null && pValue <= maxP;
      } else {
        // Too few samples for statistics: fall back to the raw composite delta.
        moved = Math.abs(delta) >= minDelta;
      }
      row.verdict = moved ? directionOf(delta) : "flat";
      row.baseline = baseline.composite;
      row.delta = delta;
      row.cohensD = effectSize;
      row.pValue = pValue;
      row.baselineCommit = baseline.commitSha;
    }

    cells.push(row);
    summary[row.verdict] += 1;
  }

  return { cells, summary };
}
|
|
6578
|
+
/**
 * Renders a scorecard diff as plain text.
 *
 * The first line is a count of regressed, improved, flat and new cells. It is
 * followed by one line per regressed or improved cell, regressions first and,
 * within each group, largest absolute delta first. Flat and new cells are not
 * listed.
 *
 * @param {{cells: Array<object>, summary: object}} diff - as returned by `diffScorecard`.
 * @returns {string} the report, lines joined with "\n".
 */
function formatScorecardDiff(diff) {
  const { summary } = diff;
  const fixed3 = (n) => n.toFixed(3);
  const verdictOrder = { regressed: 0, improved: 1 };

  const header = `Scorecard: ${summary.regressed} regressed \xB7 ${summary.improved} improved \xB7 ${summary.flat} flat \xB7 ${summary.new} new`;

  const describe = (cell) => {
    const label = cell.verdict === "regressed" ? "REGRESSED" : "improved";

    let deltaText = "\u2014";
    if (cell.delta !== null) {
      deltaText = cell.delta >= 0 ? `+${fixed3(cell.delta)}` : fixed3(cell.delta);
    }

    // Statistics are shown only when an effect size was computed.
    let stat = "";
    if (cell.cohensD !== null) {
      const pPart = cell.pValue !== null ? `, p=${cell.pValue.toFixed(3)}` : "";
      stat = ` (d=${cell.cohensD.toFixed(2)}${pPart})`;
    }

    return ` ${label} ${cell.scenarioId} \xB7 ${cell.model} \xB7 ${cell.profileHash.slice(0, 8)} ${fixed3(cell.baseline ?? 0)} \u2192 ${fixed3(cell.current)} ${deltaText}${stat}`;
  };

  const rows = diff.cells
    .filter(({ verdict }) => verdict === "regressed" || verdict === "improved")
    .sort(
      (a, b) => verdictOrder[a.verdict] - verdictOrder[b.verdict] || Math.abs(b.delta ?? 0) - Math.abs(a.delta ?? 0)
    )
    .map(describe);

  return [header, ...rows].join("\n");
}
|
|
6599
|
+
|
|
6224
6600
|
// src/series-convergence.ts
|
|
6225
6601
|
function analyzeSeries(values, options = {}) {
|
|
6226
6602
|
const window = options.window ?? 5;
|
|
@@ -6230,10 +6606,10 @@ function analyzeSeries(values, options = {}) {
|
|
|
6230
6606
|
return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
|
|
6231
6607
|
}
|
|
6232
6608
|
const tail = values.slice(-window);
|
|
6233
|
-
const
|
|
6234
|
-
const variance = tail.reduce((acc, v) => acc + (v -
|
|
6609
|
+
const mean5 = tail.reduce((a, b) => a + b, 0) / tail.length;
|
|
6610
|
+
const variance = tail.reduce((acc, v) => acc + (v - mean5) ** 2, 0) / tail.length;
|
|
6235
6611
|
const stdDev = Math.sqrt(variance);
|
|
6236
|
-
const refMean = Math.abs(
|
|
6612
|
+
const refMean = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
|
|
6237
6613
|
const cv = stdDev / refMean;
|
|
6238
6614
|
const stable = tail.length >= window && cv <= stableCv;
|
|
6239
6615
|
let tailRun = 0;
|
|
@@ -6254,7 +6630,7 @@ function analyzeSeries(values, options = {}) {
|
|
|
6254
6630
|
} else {
|
|
6255
6631
|
state = "noisy";
|
|
6256
6632
|
}
|
|
6257
|
-
return { state, windowMean:
|
|
6633
|
+
return { state, windowMean: mean5, windowCv: cv, tailRun, stable };
|
|
6258
6634
|
}
|
|
6259
6635
|
|
|
6260
6636
|
// src/slo.ts
|
|
@@ -7052,12 +7428,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
|
|
|
7052
7428
|
variantScores.push({ mutator: id, score, mutated });
|
|
7053
7429
|
all.push(score);
|
|
7054
7430
|
}
|
|
7055
|
-
const
|
|
7056
|
-
const variance = all.reduce((a, v) => a + (v -
|
|
7431
|
+
const mean5 = all.reduce((a, b) => a + b, 0) / all.length;
|
|
7432
|
+
const variance = all.reduce((a, v) => a + (v - mean5) ** 2, 0) / all.length;
|
|
7057
7433
|
const stdDev = Math.sqrt(variance);
|
|
7058
|
-
const ref = Math.abs(
|
|
7434
|
+
const ref = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
|
|
7059
7435
|
const robustness = Math.max(0, 1 - stdDev / ref);
|
|
7060
|
-
return { originalScore, variantScores, meanScore:
|
|
7436
|
+
return { originalScore, variantScores, meanScore: mean5, stdDev, robustness };
|
|
7061
7437
|
}
|
|
7062
7438
|
var lowercaseMutator = (p) => p.toLowerCase();
|
|
7063
7439
|
var sentenceReorderMutator = (p, seed) => {
|
|
@@ -7123,8 +7499,8 @@ async function paraphraseRobustnessScenarios(args) {
|
|
|
7123
7499
|
});
|
|
7124
7500
|
scores.push(out.score);
|
|
7125
7501
|
}
|
|
7126
|
-
const
|
|
7127
|
-
deltas[m.name] =
|
|
7502
|
+
const mean5 = scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
7503
|
+
deltas[m.name] = mean5 - originalScore;
|
|
7128
7504
|
paraphrasedAll.push(...scores);
|
|
7129
7505
|
}
|
|
7130
7506
|
const paraphrasedMean = paraphrasedAll.length === 0 ? originalScore : paraphrasedAll.reduce((a, b) => a + b, 0) / paraphrasedAll.length;
|
|
@@ -7737,8 +8113,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
7737
8113
|
const sRuns = runs.filter((r) => r.scenarioId === s.id);
|
|
7738
8114
|
const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
|
|
7739
8115
|
if (scores.length < 3) continue;
|
|
7740
|
-
const
|
|
7741
|
-
const variance = scores.reduce((a, b) => a + (b -
|
|
8116
|
+
const mean5 = scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
8117
|
+
const variance = scores.reduce((a, b) => a + (b - mean5) ** 2, 0) / scores.length;
|
|
7742
8118
|
if (variance > varianceThreshold) {
|
|
7743
8119
|
targets.push({
|
|
7744
8120
|
reason: "high-variance",
|
|
@@ -7969,7 +8345,7 @@ async function runSelfPlay(proposer, scorer, targets, options = {}) {
|
|
|
7969
8345
|
|
|
7970
8346
|
// src/command-runner.ts
|
|
7971
8347
|
import { spawnSync } from "child_process";
|
|
7972
|
-
import { existsSync as
|
|
8348
|
+
import { existsSync as existsSync5, readdirSync as readdirSync2, readFileSync as readFileSync4, statSync as statSync2 } from "fs";
|
|
7973
8349
|
import { join as join2 } from "path";
|
|
7974
8350
|
var localCommandRunner = {
|
|
7975
8351
|
name: "local",
|
|
@@ -7998,11 +8374,11 @@ var localCommandRunner = {
|
|
|
7998
8374
|
return r.status === 0 && (r.stdout ?? "").trim().length > 0;
|
|
7999
8375
|
},
|
|
8000
8376
|
async fileExists(path) {
|
|
8001
|
-
return
|
|
8377
|
+
return existsSync5(path);
|
|
8002
8378
|
},
|
|
8003
8379
|
async readFile(path) {
|
|
8004
8380
|
try {
|
|
8005
|
-
return
|
|
8381
|
+
return readFileSync4(path, "utf8");
|
|
8006
8382
|
} catch {
|
|
8007
8383
|
return null;
|
|
8008
8384
|
}
|
|
@@ -8240,7 +8616,7 @@ function extractErrorCount(text, opts = {}) {
|
|
|
8240
8616
|
for (const p of patterns) {
|
|
8241
8617
|
const matches = Array.from(text.matchAll(p.regex));
|
|
8242
8618
|
if (matches.length === 0) continue;
|
|
8243
|
-
const count = p.transform ? matches.reduce((
|
|
8619
|
+
const count = p.transform ? matches.reduce((sum3, m) => sum3 + p.transform(m), 0) : matches.length;
|
|
8244
8620
|
return {
|
|
8245
8621
|
count,
|
|
8246
8622
|
matched: p.name,
|
|
@@ -8934,8 +9310,8 @@ function multiToolchainLayer(config) {
|
|
|
8934
9310
|
}
|
|
8935
9311
|
|
|
8936
9312
|
// src/reference-replay.ts
|
|
8937
|
-
import { appendFileSync as
|
|
8938
|
-
import { dirname as
|
|
9313
|
+
import { appendFileSync as appendFileSync3, existsSync as existsSync6, mkdirSync as mkdirSync3, readFileSync as readFileSync5 } from "fs";
|
|
9314
|
+
import { dirname as dirname3 } from "path";
|
|
8939
9315
|
var DEFAULT_MATCH_THRESHOLD = 0.55;
|
|
8940
9316
|
var ALL_SPLITS = ["train", "dev", "test", "holdout"];
|
|
8941
9317
|
async function runReferenceReplay(cases, options) {
|
|
@@ -9053,14 +9429,14 @@ function jsonlReferenceReplayStore(path) {
|
|
|
9053
9429
|
return {
|
|
9054
9430
|
async save(run) {
|
|
9055
9431
|
await lock.runExclusive(() => {
|
|
9056
|
-
|
|
9057
|
-
|
|
9432
|
+
mkdirSync3(dirname3(path), { recursive: true });
|
|
9433
|
+
appendFileSync3(path, `${JSON.stringify(run)}
|
|
9058
9434
|
`);
|
|
9059
9435
|
});
|
|
9060
9436
|
},
|
|
9061
9437
|
async list() {
|
|
9062
9438
|
return lock.runExclusive(() => {
|
|
9063
|
-
if (!
|
|
9439
|
+
if (!existsSync6(path)) return [];
|
|
9064
9440
|
return readJsonl(path);
|
|
9065
9441
|
});
|
|
9066
9442
|
}
|
|
@@ -9149,7 +9525,7 @@ function decideReferenceReplayPromotion(baseline, candidate, policy = {}) {
|
|
|
9149
9525
|
regressions
|
|
9150
9526
|
};
|
|
9151
9527
|
}
|
|
9152
|
-
const requiredMeanDelta =
|
|
9528
|
+
const requiredMeanDelta = mean3(compared.map((item) => item.f1Delta));
|
|
9153
9529
|
if (requiredMeanDelta < minF1Delta) {
|
|
9154
9530
|
return {
|
|
9155
9531
|
promote: false,
|
|
@@ -9284,8 +9660,8 @@ function scorePair(scenario, matcher, reference, candidate) {
|
|
|
9284
9660
|
function buildScenarioScore(scenario, matches, falsePositives) {
|
|
9285
9661
|
const matched = matches.filter((match) => match.matched).length;
|
|
9286
9662
|
const total = scenario.references.length;
|
|
9287
|
-
const matchedWeight = matches.filter((match) => match.matched).reduce((
|
|
9288
|
-
const totalWeight = matches.reduce((
|
|
9663
|
+
const matchedWeight = matches.filter((match) => match.matched).reduce((sum3, match) => sum3 + match.weight, 0);
|
|
9664
|
+
const totalWeight = matches.reduce((sum3, match) => sum3 + match.weight, 0);
|
|
9289
9665
|
const precision2 = ratio(matched, matched + falsePositives);
|
|
9290
9666
|
const recall = ratio(matched, total);
|
|
9291
9667
|
return {
|
|
@@ -9311,11 +9687,11 @@ function aggregateBySplit(scores) {
|
|
|
9311
9687
|
return out;
|
|
9312
9688
|
}
|
|
9313
9689
|
function aggregateScenarioScores(scores) {
|
|
9314
|
-
const matched =
|
|
9315
|
-
const total =
|
|
9316
|
-
const falsePositives =
|
|
9317
|
-
const matchedWeight =
|
|
9318
|
-
const totalWeight =
|
|
9690
|
+
const matched = sum2(scores.map((score) => score.matched));
|
|
9691
|
+
const total = sum2(scores.map((score) => score.total));
|
|
9692
|
+
const falsePositives = sum2(scores.map((score) => score.falsePositives));
|
|
9693
|
+
const matchedWeight = sum2(scores.map((score) => score.matchedWeight));
|
|
9694
|
+
const totalWeight = sum2(scores.map((score) => score.totalWeight));
|
|
9319
9695
|
const precision2 = ratio(matched, matched + falsePositives);
|
|
9320
9696
|
const recall = ratio(matched, total);
|
|
9321
9697
|
return {
|
|
@@ -9382,11 +9758,11 @@ function clamp012(value) {
|
|
|
9382
9758
|
if (!Number.isFinite(value)) return 0;
|
|
9383
9759
|
return Math.max(0, Math.min(1, value));
|
|
9384
9760
|
}
|
|
9385
|
-
function
|
|
9761
|
+
function sum2(values) {
|
|
9386
9762
|
return values.reduce((acc, value) => acc + value, 0);
|
|
9387
9763
|
}
|
|
9388
|
-
function
|
|
9389
|
-
return values.length ?
|
|
9764
|
+
function mean3(values) {
|
|
9765
|
+
return values.length ? sum2(values) / values.length : 0;
|
|
9390
9766
|
}
|
|
9391
9767
|
function formatPct(value) {
|
|
9392
9768
|
return `${(value * 100).toFixed(1)}%`;
|
|
@@ -9403,7 +9779,7 @@ function throwIfAborted(signal) {
|
|
|
9403
9779
|
throw new Error(signal.reason ? String(signal.reason) : "reference replay aborted");
|
|
9404
9780
|
}
|
|
9405
9781
|
function readJsonl(path) {
|
|
9406
|
-
const raw =
|
|
9782
|
+
const raw = readFileSync5(path, "utf8");
|
|
9407
9783
|
const out = [];
|
|
9408
9784
|
for (const line of raw.split("\n")) {
|
|
9409
9785
|
const trimmed = line.trim();
|
|
@@ -9650,8 +10026,8 @@ function detectCalibrationDrift(runs, opts) {
|
|
|
9650
10026
|
alpha,
|
|
9651
10027
|
recentN: recent.length,
|
|
9652
10028
|
historyN: historical.length,
|
|
9653
|
-
recentMean:
|
|
9654
|
-
historyMean:
|
|
10029
|
+
recentMean: mean4(recent),
|
|
10030
|
+
historyMean: mean4(historical)
|
|
9655
10031
|
}
|
|
9656
10032
|
}
|
|
9657
10033
|
];
|
|
@@ -9771,7 +10147,7 @@ function chiSquareCritical(df, alpha) {
|
|
|
9771
10147
|
}
|
|
9772
10148
|
return TABLE[10][idx];
|
|
9773
10149
|
}
|
|
9774
|
-
function
|
|
10150
|
+
function mean4(xs) {
|
|
9775
10151
|
if (xs.length === 0) return 0;
|
|
9776
10152
|
return xs.reduce((s, x) => s + x, 0) / xs.length;
|
|
9777
10153
|
}
|
|
@@ -9971,8 +10347,8 @@ async function discoverPersonas(dir, opts = {}) {
|
|
|
9971
10347
|
}
|
|
9972
10348
|
|
|
9973
10349
|
// src/evolution-telemetry.ts
|
|
9974
|
-
import { appendFileSync as
|
|
9975
|
-
import { dirname as
|
|
10350
|
+
import { appendFileSync as appendFileSync4, existsSync as existsSync7, mkdirSync as mkdirSync4, readFileSync as readFileSync6, writeFileSync } from "fs";
|
|
10351
|
+
import { dirname as dirname4 } from "path";
|
|
9976
10352
|
var MutationTelemetry = class {
|
|
9977
10353
|
appender;
|
|
9978
10354
|
constructor(path) {
|
|
@@ -10001,17 +10377,17 @@ var LineageRecorder = class {
|
|
|
10001
10377
|
this.path = path;
|
|
10002
10378
|
this.snapshotPath = `${path}.snapshot`;
|
|
10003
10379
|
this.kindOf = kindOf ?? defaultKindOf;
|
|
10004
|
-
|
|
10005
|
-
if (
|
|
10380
|
+
mkdirSync4(dirname4(path), { recursive: true });
|
|
10381
|
+
if (existsSync7(this.snapshotPath)) {
|
|
10006
10382
|
try {
|
|
10007
|
-
const parsed = JSON.parse(
|
|
10383
|
+
const parsed = JSON.parse(readFileSync6(this.snapshotPath, "utf-8"));
|
|
10008
10384
|
for (const n of parsed) this.nodes.set(n.id, n);
|
|
10009
10385
|
} catch {
|
|
10010
10386
|
}
|
|
10011
10387
|
}
|
|
10012
|
-
if (
|
|
10388
|
+
if (existsSync7(path)) {
|
|
10013
10389
|
try {
|
|
10014
|
-
for (const line of
|
|
10390
|
+
for (const line of readFileSync6(path, "utf-8").split("\n")) {
|
|
10015
10391
|
if (!line.trim()) continue;
|
|
10016
10392
|
try {
|
|
10017
10393
|
const entry = JSON.parse(line);
|
|
@@ -10023,9 +10399,9 @@ var LineageRecorder = class {
|
|
|
10023
10399
|
} catch {
|
|
10024
10400
|
}
|
|
10025
10401
|
}
|
|
10026
|
-
if (
|
|
10402
|
+
if (existsSync7(path) && this.nodes.size === 0) {
|
|
10027
10403
|
try {
|
|
10028
|
-
const raw =
|
|
10404
|
+
const raw = readFileSync6(path, "utf-8").trim();
|
|
10029
10405
|
if (raw.startsWith("[")) {
|
|
10030
10406
|
const parsed = JSON.parse(raw);
|
|
10031
10407
|
for (const n of parsed) this.nodes.set(n.id, n);
|
|
@@ -10039,15 +10415,15 @@ var LineageRecorder = class {
|
|
|
10039
10415
|
const prev = this.nodes.get(node.id);
|
|
10040
10416
|
this.nodes.set(node.id, { ...prev, ...node });
|
|
10041
10417
|
try {
|
|
10042
|
-
if (
|
|
10043
|
-
const head =
|
|
10418
|
+
if (existsSync7(this.path)) {
|
|
10419
|
+
const head = readFileSync6(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
|
|
10044
10420
|
if (head === "[") {
|
|
10045
10421
|
writeFileSync(this.path, "");
|
|
10046
10422
|
}
|
|
10047
10423
|
}
|
|
10048
10424
|
} catch {
|
|
10049
10425
|
}
|
|
10050
|
-
|
|
10426
|
+
appendFileSync4(this.path, `${JSON.stringify(this.nodes.get(node.id))}
|
|
10051
10427
|
`);
|
|
10052
10428
|
});
|
|
10053
10429
|
}
|
|
@@ -10106,9 +10482,9 @@ var CostLedger = class {
|
|
|
10106
10482
|
mutex = new Mutex();
|
|
10107
10483
|
constructor(path) {
|
|
10108
10484
|
this.path = path;
|
|
10109
|
-
if (
|
|
10485
|
+
if (existsSync7(path)) {
|
|
10110
10486
|
try {
|
|
10111
|
-
const loaded = JSON.parse(
|
|
10487
|
+
const loaded = JSON.parse(readFileSync6(path, "utf-8"));
|
|
10112
10488
|
for (const k of Object.keys(this.totals)) {
|
|
10113
10489
|
if (k === "byGeneration") {
|
|
10114
10490
|
if (loaded.byGeneration && typeof loaded.byGeneration === "object") {
|
|
@@ -10125,7 +10501,7 @@ var CostLedger = class {
|
|
|
10125
10501
|
} catch {
|
|
10126
10502
|
}
|
|
10127
10503
|
} else {
|
|
10128
|
-
|
|
10504
|
+
mkdirSync4(dirname4(path), { recursive: true });
|
|
10129
10505
|
}
|
|
10130
10506
|
}
|
|
10131
10507
|
genBucket(generation) {
|
|
@@ -10277,16 +10653,16 @@ function precision(goldens, candidates, options = {}) {
|
|
|
10277
10653
|
}
|
|
10278
10654
|
|
|
10279
10655
|
// src/jsonl-trial-cache.ts
|
|
10280
|
-
import { appendFileSync as
|
|
10281
|
-
import { dirname as
|
|
10656
|
+
import { appendFileSync as appendFileSync5, existsSync as existsSync8, mkdirSync as mkdirSync5, readFileSync as readFileSync7 } from "fs";
|
|
10657
|
+
import { dirname as dirname5 } from "path";
|
|
10282
10658
|
var JsonlTrialCache = class {
|
|
10283
10659
|
map = /* @__PURE__ */ new Map();
|
|
10284
10660
|
path;
|
|
10285
10661
|
appender;
|
|
10286
10662
|
constructor(path) {
|
|
10287
10663
|
this.path = path;
|
|
10288
|
-
if (
|
|
10289
|
-
for (const line of
|
|
10664
|
+
if (existsSync8(path)) {
|
|
10665
|
+
for (const line of readFileSync7(path, "utf-8").split("\n")) {
|
|
10290
10666
|
if (!line.trim()) continue;
|
|
10291
10667
|
try {
|
|
10292
10668
|
const entry = JSON.parse(line);
|
|
@@ -10295,7 +10671,7 @@ var JsonlTrialCache = class {
|
|
|
10295
10671
|
}
|
|
10296
10672
|
}
|
|
10297
10673
|
} else {
|
|
10298
|
-
|
|
10674
|
+
mkdirSync5(dirname5(path), { recursive: true });
|
|
10299
10675
|
}
|
|
10300
10676
|
this.appender = new LockedJsonlAppender(path);
|
|
10301
10677
|
}
|
|
@@ -10318,7 +10694,7 @@ var JsonlTrialCache = class {
|
|
|
10318
10694
|
setSync(key, value) {
|
|
10319
10695
|
this.map.set(key, value);
|
|
10320
10696
|
const line = { key, result: value, writtenAt: Date.now() };
|
|
10321
|
-
|
|
10697
|
+
appendFileSync5(this.path, `${JSON.stringify(line)}
|
|
10322
10698
|
`);
|
|
10323
10699
|
}
|
|
10324
10700
|
};
|
|
@@ -10401,9 +10777,9 @@ function passOrthogonality(input) {
|
|
|
10401
10777
|
sims.push(cosineSimilarity(vectors[i], vectors[j]));
|
|
10402
10778
|
}
|
|
10403
10779
|
}
|
|
10404
|
-
const
|
|
10780
|
+
const mean5 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
|
|
10405
10781
|
return {
|
|
10406
|
-
orthogonality: Math.max(0, Math.min(1, 1 -
|
|
10782
|
+
orthogonality: Math.max(0, Math.min(1, 1 - mean5)),
|
|
10407
10783
|
passCount: passes.length,
|
|
10408
10784
|
similarities: sims
|
|
10409
10785
|
};
|
|
@@ -10678,6 +11054,7 @@ export {
|
|
|
10678
11054
|
DEFAULT_HARNESS_OBJECTIVES,
|
|
10679
11055
|
DEFAULT_MUTATION_PRIMITIVES,
|
|
10680
11056
|
DEFAULT_MUTATORS,
|
|
11057
|
+
DEFAULT_PR_REVIEW_SCORE_WEIGHTS,
|
|
10681
11058
|
DEFAULT_REDACTION_RULES,
|
|
10682
11059
|
DEFAULT_RED_TEAM_CORPUS,
|
|
10683
11060
|
DEFAULT_RUN_SCORE_WEIGHTS,
|
|
@@ -10766,13 +11143,16 @@ export {
|
|
|
10766
11143
|
adversarialJudge,
|
|
10767
11144
|
agentProfileCellHashMaterial,
|
|
10768
11145
|
agentProfileCellKey,
|
|
11146
|
+
agentProfileHash,
|
|
10769
11147
|
aggregateLlm,
|
|
11148
|
+
aggregatePrReviewScore,
|
|
10770
11149
|
aggregateRunScore,
|
|
10771
11150
|
aggregateTrialsByMode,
|
|
10772
11151
|
allCriticalPassed,
|
|
10773
11152
|
analyzeAntiSlop,
|
|
10774
11153
|
analyzeSeries,
|
|
10775
11154
|
analyzeTraces,
|
|
11155
|
+
appendScorecard,
|
|
10776
11156
|
argHash,
|
|
10777
11157
|
assertLlmRoute,
|
|
10778
11158
|
assertRealBackend,
|
|
@@ -10816,6 +11196,7 @@ export {
|
|
|
10816
11196
|
cohensD,
|
|
10817
11197
|
coherenceJudge,
|
|
10818
11198
|
collectionPreserved,
|
|
11199
|
+
commentsForSource,
|
|
10819
11200
|
commitBisect,
|
|
10820
11201
|
compareReferenceReplay,
|
|
10821
11202
|
compareToBaseline,
|
|
@@ -10866,6 +11247,7 @@ export {
|
|
|
10866
11247
|
deployGateLayer,
|
|
10867
11248
|
describeTraceInsightScope,
|
|
10868
11249
|
diffFindings,
|
|
11250
|
+
diffScorecard,
|
|
10869
11251
|
discoverPersonas,
|
|
10870
11252
|
distillPlaybook,
|
|
10871
11253
|
domainEvidencePattern,
|
|
@@ -10901,6 +11283,7 @@ export {
|
|
|
10901
11283
|
formatBenchmarkReport,
|
|
10902
11284
|
formatDriverReport,
|
|
10903
11285
|
formatFindings,
|
|
11286
|
+
formatScorecardDiff,
|
|
10904
11287
|
gainHistogram,
|
|
10905
11288
|
ghCliClient,
|
|
10906
11289
|
precision as goldenPrecision,
|
|
@@ -10943,6 +11326,7 @@ export {
|
|
|
10943
11326
|
linterJudge,
|
|
10944
11327
|
llmSpanFromProvider,
|
|
10945
11328
|
llmSpans,
|
|
11329
|
+
loadScorecard,
|
|
10946
11330
|
loadScorerFromGrader,
|
|
10947
11331
|
localCommandRunner,
|
|
10948
11332
|
lowercaseMutator,
|
|
@@ -10984,6 +11368,8 @@ export {
|
|
|
10984
11368
|
proposeSynthesisTargets,
|
|
10985
11369
|
providerFromBaseUrl,
|
|
10986
11370
|
pytestTestParser,
|
|
11371
|
+
recordRuns,
|
|
11372
|
+
recordRunsToScorecard,
|
|
10987
11373
|
redTeamDataset,
|
|
10988
11374
|
redTeamReport,
|
|
10989
11375
|
redactString,
|
|
@@ -11042,6 +11428,8 @@ export {
|
|
|
11042
11428
|
scoreContinuity,
|
|
11043
11429
|
scoreFromEvals,
|
|
11044
11430
|
scoreKnowledgeReadiness,
|
|
11431
|
+
scorePrReviewComments,
|
|
11432
|
+
scorePrReviewSource,
|
|
11045
11433
|
scoreRedTeamOutput,
|
|
11046
11434
|
scoreReferenceReplay,
|
|
11047
11435
|
scoreTraceInsightReadiness,
|
|
@@ -11060,6 +11448,7 @@ export {
|
|
|
11060
11448
|
summarize,
|
|
11061
11449
|
summarizeBackendIntegrity,
|
|
11062
11450
|
summarizeHarnessResults,
|
|
11451
|
+
summarizePrReviewBenchmark,
|
|
11063
11452
|
summarizePreferenceMemory,
|
|
11064
11453
|
summaryTable,
|
|
11065
11454
|
testJudge,
|