@tangle-network/agent-eval 0.19.1 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +244 -4
- package/dist/index.js +317 -14
- package/dist/index.js.map +1 -1
- package/docs/knowledge-readiness.md +84 -0
- package/docs/multi-shot-optimization.md +7 -0
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -2251,6 +2251,151 @@ async function finish(emitter, result) {
|
|
|
2251
2251
|
return result;
|
|
2252
2252
|
}
|
|
2253
2253
|
|
|
2254
|
+
// src/knowledge/readiness.ts
|
|
2255
|
+
/**
 * Build a knowledge-readiness report for a task.
 *
 * Requirements are normalized first; a requirement counts as "missing" when its
 * current confidence is below the confidence it needs. Missing requirements are
 * split into blocking and non-blocking gaps, a weighted readiness score is
 * computed over all requirements, and a recommended action, severity and
 * human-readable reason are derived from the two gap lists.
 *
 * @param {object} options - Task id, requirements, and optional evidence/claim/wiki ids,
 *   user answers and metadata.
 * @returns {object} Report with `taskId`, `readinessScore`, both gap lists,
 *   `recommendedAction`, the assembled `bundle`, `severity` and `reason`.
 */
function scoreKnowledgeReadiness(options) {
  const requirements = options.requirements.map(normalizeRequirement);
  const missing = requirements.filter((req) => req.currentConfidence < req.confidenceNeeded);

  // Partition the missing requirements in a single pass.
  const blockingMissingRequirements = [];
  const nonBlockingGaps = [];
  for (const req of missing) {
    if (isBlockingGap(req)) {
      blockingMissingRequirements.push(req);
    } else {
      nonBlockingGaps.push(req);
    }
  }

  const readinessScore = weightedReadiness(requirements);

  // Evidence ids supplied by the caller come first, followed by per-requirement ids.
  const callerEvidence = options.evidenceIds ?? [];
  const requirementEvidence = requirements.flatMap((r) => r.evidenceIds);
  const bundle = {
    taskId: options.taskId,
    requirements,
    evidenceIds: unique([...callerEvidence, ...requirementEvidence]),
    claimIds: unique(options.claimIds ?? []),
    wikiPageIds: unique(options.wikiPageIds ?? []),
    userAnswers: options.userAnswers ?? {},
    missing,
    readinessScore,
    metadata: options.metadata
  };

  const recommendedAction = chooseRecommendedAction(blockingMissingRequirements, nonBlockingGaps);

  let severity;
  let reason;
  if (blockingMissingRequirements.length > 0) {
    severity = "critical";
    reason = `${blockingMissingRequirements.length} blocking knowledge requirement(s) are missing.`;
  } else {
    severity = nonBlockingGaps.some((gap) => gap.importance === "high") ? "warning" : "info";
    reason = nonBlockingGaps.length > 0
      ? `${nonBlockingGaps.length} non-blocking knowledge gap(s) remain.`
      : "All declared knowledge requirements are ready.";
  }

  return {
    taskId: options.taskId,
    readinessScore,
    blockingMissingRequirements,
    nonBlockingGaps,
    recommendedAction,
    bundle,
    severity,
    reason
  };
}
|
|
2286
|
+
/**
 * Turn a knowledge-readiness report into an objective eval result.
 *
 * The eval passes only when the report has no blocking missing requirements and
 * its readiness score reaches `options.minimumScore` (0.7 when not provided).
 *
 * @param {object} report - A report as produced by `scoreKnowledgeReadiness`.
 * @param {object} [options] - Optional `id` (defaults to "knowledge-ready") and `minimumScore`.
 * @returns {*} Whatever `objectiveEval` returns for the assembled payload.
 */
function blockingKnowledgeEval(report, options = {}) {
  const threshold = options.minimumScore ?? 0.7;
  const hasBlockers = report.blockingMissingRequirements.length !== 0;
  const passed = !hasBlockers && report.readinessScore >= threshold;

  // Comma-separated ids of the blocking requirements; omitted when there are none.
  const blockerIds = report.blockingMissingRequirements.map((r) => r.id).join(", ");

  return objectiveEval({
    id: options.id ?? "knowledge-ready",
    passed,
    score: report.readinessScore,
    severity: passed ? "info" : report.severity,
    detail: report.reason,
    evidence: blockerIds || void 0,
    metadata: { knowledgeReadiness: report }
  });
}
|
|
2299
|
+
/**
 * Produce one user-facing question per gap that should be resolved by asking.
 *
 * A gap qualifies when its acquisition mode is "ask_user" or its fallback
 * policy is "ask". Secret-sensitivity gaps are typed as credential questions.
 *
 * @param {Array<object>} gaps - Knowledge gaps (requirements below needed confidence).
 * @returns {Array<object>} Question descriptors, in the same order as the qualifying gaps.
 */
function userQuestionsForKnowledgeGaps(gaps) {
  const questions = [];
  for (const gap of gaps) {
    const shouldAsk = gap.acquisitionMode === "ask_user" || gap.fallbackPolicy === "ask";
    if (!shouldAsk) continue;
    const requiredFor = gap.requiredFor.join(", ") || "the task";
    questions.push({
      id: `question_${gap.id}`,
      question: `Please provide: ${gap.description}`,
      reason: `Required for ${requiredFor}.`,
      requirementId: gap.id,
      importance: gap.importance,
      answerType: gap.sensitivity === "secret" ? "credential" : "free_text",
      impactIfUnknown: impactFor(gap)
    });
  }
  return questions;
}
|
|
2310
|
+
/**
 * Group knowledge gaps by acquisition mode and emit one plan per mode.
 *
 * Gaps whose mode `planMode` maps to a falsy value are left out. Plans appear in
 * the order in which each mode is first seen. Only the "ask_user" plan carries
 * a `questions` list.
 *
 * @param {Array<object>} gaps - Knowledge gaps to plan for.
 * @returns {Array<object>} Plans with `id`, `requirementIds`, `mode`, `description`,
 *   `priority` and (for "ask_user") `questions`.
 */
function acquisitionPlansForKnowledgeGaps(gaps) {
  const grouped = /* @__PURE__ */ new Map();
  for (const gap of gaps) {
    const mode = planMode(gap.acquisitionMode);
    if (!mode) continue;
    if (!grouped.has(mode)) grouped.set(mode, []);
    grouped.get(mode).push(gap);
  }

  return Array.from(grouped, ([mode, requirements]) => {
    const importances = requirements.map((r) => r.importance);
    return {
      id: `acquire_${mode}`,
      requirementIds: requirements.map((r) => r.id),
      mode,
      description: descriptionForPlan(mode, requirements),
      priority: maxImportance(importances),
      questions: mode === "ask_user" ? userQuestionsForKnowledgeGaps(requirements) : void 0
    };
  });
}
|
|
2328
|
+
/**
 * Return a shallow copy of a requirement with both confidence values clamped
 * to [0, 1] and its evidence ids de-duplicated. The input is not mutated.
 *
 * @param {object} requirement - Raw knowledge requirement.
 * @returns {object} Normalized copy.
 */
function normalizeRequirement(requirement) {
  const normalized = { ...requirement };
  normalized.confidenceNeeded = clamp01(requirement.confidenceNeeded);
  normalized.currentConfidence = clamp01(requirement.currentConfidence);
  normalized.evidenceIds = unique(requirement.evidenceIds);
  return normalized;
}
|
|
2336
|
+
/**
 * Importance-weighted average of how far each requirement is toward the
 * confidence it needs.
 *
 * Each requirement contributes `min(1, current / needed)`; a requirement that
 * needs no confidence (`confidenceNeeded <= 0`) counts as fully satisfied.
 * An empty requirement list is considered fully ready.
 *
 * @param {Array<object>} requirements - Normalized requirements.
 * @returns {number} Readiness in [0, 1].
 */
function weightedReadiness(requirements) {
  if (requirements.length === 0) return 1;

  let totalWeight = 0;
  let weightedTotal = 0;
  for (let i = 0; i < requirements.length; i++) {
    const { importance, confidenceNeeded, currentConfidence } = requirements[i];
    const weight = importanceWeight(importance);
    let fulfilment;
    if (confidenceNeeded <= 0) {
      fulfilment = 1;
    } else {
      fulfilment = Math.min(1, currentConfidence / confidenceNeeded);
    }
    totalWeight += weight;
    weightedTotal += weight * fulfilment;
  }
  return clamp01(weightedTotal / totalWeight);
}
|
|
2348
|
+
/**
 * Decide whether an unmet requirement blocks execution.
 *
 * A gap blocks when its importance is "blocking", its fallback policy is
 * "block", or its sensitivity is "secret".
 *
 * @param {object} requirement - Requirement to classify.
 * @returns {boolean} True when the gap is blocking.
 */
function isBlockingGap(requirement) {
  if (requirement.importance === "blocking") return true;
  if (requirement.fallbackPolicy === "block") return true;
  return requirement.sensitivity === "secret";
}
|
|
2351
|
+
/**
 * Pick the next action for a readiness report.
 *
 * With no gaps at all the agent can run. When blocking gaps exist, the action
 * is chosen from their acquisition routes in a fixed priority order
 * (ask user, connectors, repo/commands, web, not available). When only
 * non-blocking gaps exist, the agent either builds wiki evidence (if any gap is
 * of high importance) or continues with a caveat.
 *
 * Fix over the previous version: a blocking gap whose acquisition mode matched
 * none of the listed routes used to fall through to the non-blocking branch and
 * could yield "continue_with_caveat", contradicting the blocking status of the
 * gap. Such gaps now resolve to "build_domain_wiki", i.e. an acquisition step
 * rather than permission to proceed.
 * NOTE(review): "build_domain_wiki" is chosen as the residual acquisition
 * action; confirm this is the desired policy for unrouted blocking gaps.
 *
 * @param {Array<object>} blocking - Blocking missing requirements.
 * @param {Array<object>} nonBlocking - Non-blocking gaps.
 * @returns {string} One of "run_agent", "ask_user", "query_connectors",
 *   "inspect_repo", "collect_web_data", "abort_or_rescope",
 *   "build_domain_wiki", "continue_with_caveat".
 */
function chooseRecommendedAction(blocking, nonBlocking) {
  if (blocking.length === 0 && nonBlocking.length === 0) return "run_agent";

  if (blocking.length > 0) {
    const anyBlockingUses = (...modes) => blocking.some((gap) => modes.includes(gap.acquisitionMode));

    if (anyBlockingUses("ask_user") || blocking.some((gap) => gap.fallbackPolicy === "ask")) return "ask_user";
    if (anyBlockingUses("query_connector")) return "query_connectors";
    if (anyBlockingUses("inspect_repo", "run_command")) return "inspect_repo";
    if (anyBlockingUses("search_web")) return "collect_web_data";
    if (anyBlockingUses("not_available")) return "abort_or_rescope";
    // Blocking gaps remain but none has a dedicated route: never downgrade to a caveat.
    return "build_domain_wiki";
  }

  if (nonBlocking.some((gap) => gap.importance === "high")) return "build_domain_wiki";
  return "continue_with_caveat";
}
|
|
2362
|
+
/**
 * Map an acquisition mode to the mode used for planning.
 *
 * "infer_low_confidence" and "not_available" produce no plan (null); every
 * other value is returned unchanged.
 *
 * @param {string} mode - Acquisition mode of a gap.
 * @returns {string|null} The mode to plan under, or null when no plan applies.
 */
function planMode(mode) {
  switch (mode) {
    case "infer_low_confidence":
    case "not_available":
      return null;
    default:
      return mode;
  }
}
|
|
2366
|
+
/**
 * Human-readable description of an acquisition plan.
 *
 * The requirement descriptions are joined with "; " and prefixed with a phrase
 * specific to the mode; unrecognized modes use the domain-wiki phrasing.
 *
 * @param {string} mode - Plan mode.
 * @param {Array<object>} requirements - Requirements covered by the plan.
 * @returns {string} Description text.
 */
function descriptionForPlan(mode, requirements) {
  const labels = requirements.map((r) => r.description).join("; ");
  switch (mode) {
    case "ask_user":
      return `Ask the user for: ${labels}`;
    case "search_web":
      return `Search web or documentation sources for: ${labels}`;
    case "query_connector":
      return `Query configured connectors for: ${labels}`;
    case "inspect_repo":
      return `Inspect repository context for: ${labels}`;
    case "run_command":
      return `Run local commands to collect: ${labels}`;
    default:
      return `Build domain wiki evidence for: ${labels}`;
  }
}
|
|
2375
|
+
/**
 * Explain what happens if a requirement stays unknown, based on its fallback policy.
 *
 * @param {object} requirement - Requirement whose `fallbackPolicy` is inspected.
 * @returns {string} Sentence describing the impact; policies other than
 *   "block", "continue_with_caveat" and "use_default" get the ask-first wording.
 */
function impactFor(requirement) {
  switch (requirement.fallbackPolicy) {
    case "block":
      return "The agent should not run until this is known.";
    case "continue_with_caveat":
      return "The agent may continue, but must disclose uncertainty.";
    case "use_default":
      return "The agent will use the configured default if skipped.";
    default:
      return "The agent should ask before continuing.";
  }
}
|
|
2381
|
+
/**
 * Return the highest importance level present in `values`.
 *
 * Levels are ranked "blocking" > "high" > "medium" > "low"; when none of them
 * is present the result is "low".
 *
 * @param {Array<string>} values - Importance levels.
 * @returns {string} The highest-ranked level found.
 */
function maxImportance(values) {
  for (const level of ["blocking", "high", "medium", "low"]) {
    if (values.includes(level)) return level;
  }
  return "low";
}
|
|
2385
|
+
/**
 * Numeric weight of an importance level in the readiness average.
 *
 * @param {string} importance - Importance level.
 * @returns {number} 8 for "blocking", 4 for "high", 2 for "medium", 1 otherwise.
 */
function importanceWeight(importance) {
  switch (importance) {
    case "blocking":
      return 8;
    case "high":
      return 4;
    case "medium":
      return 2;
    default:
      return 1;
  }
}
|
|
2391
|
+
/**
 * Clamp a number into [0, 1]; anything that is not a finite number becomes 0.
 *
 * @param {*} value - Candidate number.
 * @returns {number} Value limited to the unit interval.
 */
function clamp01(value) {
  if (Number.isFinite(value)) {
    const capped = Math.min(1, value);
    return Math.max(0, capped);
  }
  return 0;
}
|
|
2395
|
+
/**
 * De-duplicate an iterable while keeping first-seen order.
 *
 * @param {Iterable<*>} [items] - Values to de-duplicate; a nullish input yields an empty array.
 * @returns {Array<*>} New array of distinct values.
 */
function unique(items) {
  const seen = new Set(items);
  return Array.from(seen);
}
|
|
2398
|
+
|
|
2254
2399
|
// src/feedback-trajectory.ts
|
|
2255
2400
|
var DEFAULT_SPLIT_POLICY = {
|
|
2256
2401
|
trainPct: 70,
|
|
@@ -3521,9 +3666,9 @@ var DEFAULT_RUN_SCORE_WEIGHTS = {
|
|
|
3521
3666
|
};
|
|
3522
3667
|
function aggregateRunScore(score, weights = {}) {
|
|
3523
3668
|
const w = { ...DEFAULT_RUN_SCORE_WEIGHTS, ...weights };
|
|
3524
|
-
return w.success *
|
|
3669
|
+
return w.success * clamp012(score.success) + w.goalProgress * clamp012(score.goalProgress) + w.repoGroundedness * clamp012(score.repoGroundedness) + w.driftPenalty * clamp012(score.driftPenalty) + w.toolUseQuality * clamp012(score.toolUseQuality) + w.patchQuality * clamp012(score.patchQuality) + w.testReality * clamp012(score.testReality) + w.finalGate * clamp012(score.finalGate) + w.reviewerBlockers * clamp012(score.reviewerBlockers) + w.costUsd * Math.max(0, score.costUsd) + w.wallSeconds * Math.max(0, score.wallSeconds / 60);
|
|
3525
3670
|
}
|
|
3526
|
-
function
|
|
3671
|
+
function clamp012(value) {
|
|
3527
3672
|
if (!Number.isFinite(value)) return 0;
|
|
3528
3673
|
return Math.max(0, Math.min(1, value));
|
|
3529
3674
|
}
|
|
@@ -3567,13 +3712,13 @@ var RunCritic = class {
|
|
|
3567
3712
|
const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
|
|
3568
3713
|
if (!success) notes.push("run did not complete with pass=true");
|
|
3569
3714
|
const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum2, span) => sum2 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
|
|
3570
|
-
const outcomeScore = typeof trace.run.outcome?.score === "number" ?
|
|
3715
|
+
const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp012(trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score) : void 0;
|
|
3571
3716
|
const goalProgress = outcomeScore ?? judgeAverage ?? success;
|
|
3572
3717
|
const successfulTools = toolSpans2.filter((span) => span.status !== "error").length;
|
|
3573
3718
|
const toolUseQuality = toolSpans2.length === 0 ? 0 : successfulTools / toolSpans2.length;
|
|
3574
3719
|
if (toolSpans2.length === 0) notes.push("no tool spans recorded");
|
|
3575
3720
|
const patchEvidence = trace.artifacts.length + toolSpans2.filter((span) => /write|edit|patch|apply/i.test(span.toolName)).length;
|
|
3576
|
-
const patchQuality = patchEvidence > 0 ?
|
|
3721
|
+
const patchQuality = patchEvidence > 0 ? clamp012(patchEvidence / 4) : 0;
|
|
3577
3722
|
if (!patchQuality) notes.push("no artifact or edit evidence recorded");
|
|
3578
3723
|
const sandboxTests = sandboxSpans.filter((span) => typeof span.testsTotal === "number" && span.testsTotal > 0);
|
|
3579
3724
|
const testReality = sandboxTests.length ? sandboxTests.reduce((sum2, span) => sum2 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1), 0) / sandboxTests.length : toolSpans2.some((span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))) ? 0.4 : 0;
|
|
@@ -3617,7 +3762,7 @@ var RunCritic = class {
|
|
|
3617
3762
|
}
|
|
3618
3763
|
};
|
|
3619
3764
|
function normalizeJudgeScore(score) {
|
|
3620
|
-
return score > 1 ?
|
|
3765
|
+
return score > 1 ? clamp012(score / 10) : clamp012(score);
|
|
3621
3766
|
}
|
|
3622
3767
|
function looksRepoGrounded(text) {
|
|
3623
3768
|
return /(?:src\/|tests?\/|package\.json|tsconfig|\.ts\b|\.tsx\b|git status|pnpm |npm |vitest|pytest|jest)/i.test(text);
|
|
@@ -4973,6 +5118,17 @@ var FAILURE_CLASSES = [
|
|
|
4973
5118
|
"cost_overrun",
|
|
4974
5119
|
"timeout",
|
|
4975
5120
|
"sandbox_failure",
|
|
5121
|
+
"missing_user_data",
|
|
5122
|
+
"missing_domain_data",
|
|
5123
|
+
"missing_codebase_context",
|
|
5124
|
+
"missing_runtime_context",
|
|
5125
|
+
"missing_credentials",
|
|
5126
|
+
"stale_external_data",
|
|
5127
|
+
"bad_retrieval",
|
|
5128
|
+
"insufficient_evidence",
|
|
5129
|
+
"contradictory_evidence",
|
|
5130
|
+
"ambiguous_user_intent",
|
|
5131
|
+
"knowledge_readiness_blocked",
|
|
4976
5132
|
"unknown"
|
|
4977
5133
|
];
|
|
4978
5134
|
function isLlmSpan(s) {
|
|
@@ -5329,6 +5485,62 @@ var DEFAULT_RULES = [
|
|
|
5329
5485
|
return null;
|
|
5330
5486
|
}
|
|
5331
5487
|
},
|
|
5488
|
+
{
|
|
5489
|
+
id: "knowledge-readiness-blocked",
|
|
5490
|
+
match: ({ events }) => {
|
|
5491
|
+
const event = events.find((e) => e.kind === "custom" && e.payload.kind === "readiness_scored" && e.payload.passed === false);
|
|
5492
|
+
return event ? {
|
|
5493
|
+
failureClass: "knowledge_readiness_blocked",
|
|
5494
|
+
reason: "knowledge readiness report blocked execution",
|
|
5495
|
+
triggerEventId: event.eventId
|
|
5496
|
+
} : null;
|
|
5497
|
+
}
|
|
5498
|
+
},
|
|
5499
|
+
{
|
|
5500
|
+
id: "missing-credentials",
|
|
5501
|
+
match: ({ events }) => {
|
|
5502
|
+
const event = events.find((e) => e.kind === "custom" && e.payload.kind === "knowledge_gap" && e.payload.category === "credential_or_secret");
|
|
5503
|
+
return event ? {
|
|
5504
|
+
failureClass: "missing_credentials",
|
|
5505
|
+
reason: "required credential or secret was missing",
|
|
5506
|
+
triggerEventId: event.eventId
|
|
5507
|
+
} : null;
|
|
5508
|
+
}
|
|
5509
|
+
},
|
|
5510
|
+
{
|
|
5511
|
+
id: "bad-retrieval",
|
|
5512
|
+
match: ({ run, spans }) => {
|
|
5513
|
+
if (run.outcome?.pass !== false) return null;
|
|
5514
|
+
const retrieval = spans.find((s) => s.kind === "retrieval" && (s.hits.length === 0 || s.hits.every((hit) => hit.score <= 0)));
|
|
5515
|
+
return retrieval ? {
|
|
5516
|
+
failureClass: "bad_retrieval",
|
|
5517
|
+
reason: "retrieval returned no useful hits for a failed run",
|
|
5518
|
+
triggerSpanId: retrieval.spanId
|
|
5519
|
+
} : null;
|
|
5520
|
+
}
|
|
5521
|
+
},
|
|
5522
|
+
{
|
|
5523
|
+
id: "insufficient-evidence",
|
|
5524
|
+
match: ({ events }) => {
|
|
5525
|
+
const event = events.find((e) => e.kind === "custom" && e.payload.kind === "knowledge_gap" && e.payload.reason === "insufficient_evidence");
|
|
5526
|
+
return event ? {
|
|
5527
|
+
failureClass: "insufficient_evidence",
|
|
5528
|
+
reason: "task proceeded with insufficient supporting evidence",
|
|
5529
|
+
triggerEventId: event.eventId
|
|
5530
|
+
} : null;
|
|
5531
|
+
}
|
|
5532
|
+
},
|
|
5533
|
+
{
|
|
5534
|
+
id: "contradictory-evidence",
|
|
5535
|
+
match: ({ events }) => {
|
|
5536
|
+
const event = events.find((e) => e.kind === "custom" && e.payload.kind === "knowledge_gap" && e.payload.reason === "contradictory_evidence");
|
|
5537
|
+
return event ? {
|
|
5538
|
+
failureClass: "contradictory_evidence",
|
|
5539
|
+
reason: "supporting evidence contradicted itself",
|
|
5540
|
+
triggerEventId: event.eventId
|
|
5541
|
+
} : null;
|
|
5542
|
+
}
|
|
5543
|
+
},
|
|
5332
5544
|
// Budget breach events
|
|
5333
5545
|
{
|
|
5334
5546
|
id: "budget-breach",
|
|
@@ -5667,11 +5879,14 @@ async function failureClusterView(store, options = {}) {
|
|
|
5667
5879
|
const cls = classifyFailure({ run, spans, events }, rules);
|
|
5668
5880
|
let toolName;
|
|
5669
5881
|
let argPrefix;
|
|
5882
|
+
let dimension;
|
|
5670
5883
|
if (cls.triggerSpanId) {
|
|
5671
5884
|
const trig = spans.find((s) => s.spanId === cls.triggerSpanId);
|
|
5672
5885
|
if (trig?.kind === "tool") {
|
|
5673
5886
|
toolName = trig.toolName;
|
|
5674
5887
|
argPrefix = argHash(trig.args).slice(0, 16);
|
|
5888
|
+
} else if (trig?.kind === "judge") {
|
|
5889
|
+
dimension = trig.dimension;
|
|
5675
5890
|
}
|
|
5676
5891
|
}
|
|
5677
5892
|
if (!toolName) {
|
|
@@ -5682,13 +5897,18 @@ async function failureClusterView(store, options = {}) {
|
|
|
5682
5897
|
argPrefix = argHash(errored.args).slice(0, 16);
|
|
5683
5898
|
}
|
|
5684
5899
|
}
|
|
5685
|
-
|
|
5900
|
+
if (!dimension) {
|
|
5901
|
+
const judge = spans.find((s) => s.kind === "judge" && typeof s.dimension === "string");
|
|
5902
|
+
if (judge?.kind === "judge") dimension = judge.dimension;
|
|
5903
|
+
}
|
|
5904
|
+
const key = `${cls.failureClass}|${toolName ?? ""}|${argPrefix ?? ""}|${dimension ?? ""}`;
|
|
5686
5905
|
let cluster = clusters.get(key);
|
|
5687
5906
|
if (!cluster) {
|
|
5688
5907
|
cluster = {
|
|
5689
5908
|
failureClass: cls.failureClass,
|
|
5690
5909
|
toolName,
|
|
5691
5910
|
argPrefix,
|
|
5911
|
+
dimension,
|
|
5692
5912
|
runCount: 0,
|
|
5693
5913
|
scenarioIds: [],
|
|
5694
5914
|
exampleRunId: run.runId,
|
|
@@ -6673,6 +6893,46 @@ function checkCanaries(output, scenarios) {
|
|
|
6673
6893
|
}
|
|
6674
6894
|
return leaks;
|
|
6675
6895
|
}
|
|
6896
|
+
/**
 * Check one output against a scenario's forbidden pattern.
 *
 * The pattern is `scenario.forbiddenPattern`, falling back to `scenario.canary`.
 * When there is no pattern, or the pattern does not hit, the result is null.
 *
 * @param {string} output - Text produced by the system under test.
 * @param {object} scenario - Scenario carrying `id` and a pattern.
 * @returns {object|null} `{ scenarioId, canary, evidence }` on a hit, otherwise null.
 */
function checkBehavioralCanary(output, scenario) {
  const pattern = scenario.forbiddenPattern ?? scenario.canary;
  const hit = pattern ? matchForbidden(output, pattern) : null;
  if (!hit) return null;
  return {
    scenarioId: scenario.id,
    canary: pattern,
    evidence: excerpt2(output, hit)
  };
}
|
|
6907
|
+
/**
 * Run the behavioral canary check over a batch of cases and collect the hits.
 *
 * @param {Iterable<object>} cases - Items with `output`, `scenario` and optional `runId`.
 * @returns {Array<object>} One leak record per case that hit, each tagged with the
 *   case's `runId` when it has one.
 */
function runBehavioralCanaries(cases) {
  const found = [];
  for (const entry of cases) {
    const leak = checkBehavioralCanary(entry.output, entry.scenario);
    if (!leak) continue;
    const runId = entry.runId ?? leak.runId;
    found.push({ ...leak, runId });
  }
  return found;
}
|
|
6915
|
+
/**
 * Find the first non-empty occurrence of a forbidden pattern in `output`.
 *
 * `pattern` is interpreted as a regular expression when it has the form
 * `/body/flags` (see `tryParseRegex`); otherwise it is matched as a literal
 * substring.
 *
 * Empty regex matches are ignored so that a pattern able to match the empty
 * string does not flag every output. Unlike the previous version, which only
 * looked at the first match, all matches are scanned: `/a*\/` against "baa" now
 * reports "aa" instead of reporting nothing because the match at index 0 is empty.
 *
 * @param {string} output - Text to scan.
 * @param {string} pattern - Literal text or `/body/flags` regex source.
 * @returns {string|null} The matched text, or null when nothing (non-empty) matched.
 */
function matchForbidden(output, pattern) {
  const re = tryParseRegex(pattern);
  if (!re) {
    return output.includes(pattern) ? pattern : null;
  }
  // matchAll requires the global flag; add it without touching the caller's other flags.
  const scanner = re.global ? re : new RegExp(re.source, `${re.flags}g`);
  for (const match of output.matchAll(scanner)) {
    if (match[0].length > 0) return match[0];
  }
  return null;
}

/**
 * Parse a `/body/flags` string into a RegExp.
 *
 * Returns null, meaning "treat the pattern as a literal", when the string
 * does not start with "/", has no closing "/", carries characters that are not
 * regex flags, or does not compile. The flag whitelist now includes `d` and
 * `v`, which are valid JavaScript regex flags; on runtimes that do not support
 * them the RegExp constructor throws and the pattern falls back to literal
 * matching, exactly as before.
 *
 * @param {string} pattern - Candidate regex source.
 * @returns {RegExp|null} Compiled expression, or null.
 */
function tryParseRegex(pattern) {
  if (pattern.length < 2 || pattern[0] !== "/") return null;
  const last = pattern.lastIndexOf("/");
  if (last <= 0) return null;
  const body = pattern.slice(1, last);
  const flags = pattern.slice(last + 1);
  if (!/^[dgimsuvy]*$/.test(flags)) return null;
  try {
    return new RegExp(body, flags);
  } catch {
    // Deliberate fallback: an uncompilable pattern is matched as literal text by the caller.
    return null;
  }
}
|
|
6676
6936
|
async function canaryLeakView(store, scenarios) {
|
|
6677
6937
|
const targets = scenarios.filter((s) => !!s.canary);
|
|
6678
6938
|
if (targets.length === 0) return [];
|
|
@@ -7519,6 +7779,41 @@ var DEFAULT_MUTATORS = [
|
|
|
7519
7779
|
{ id: "politeness-prefix", fn: politenessPrefixMutator },
|
|
7520
7780
|
{ id: "whitespace-collapse", fn: whitespaceCollapseMutator }
|
|
7521
7781
|
];
|
|
7782
|
+
/**
 * Measure how much scenario scores change when user turns are paraphrased.
 *
 * Each scenario is run once unmodified (baseline) and then `reps` times per
 * mutator with every user turn passed through that mutator. The overall score
 * is the ratio of the mean paraphrased score to the mean baseline score,
 * limited to [0, 1]; it is 0 when the mean baseline is not positive.
 *
 * Changes over the previous version:
 * - `reps` is normalized to a finite integer >= 1. Previously NaN produced zero
 *   runs and NaN deltas, and Infinity never terminated. Fractional values
 *   still round up, as the original loop bound did.
 * - A non-finite ratio (e.g. when `runScenario` returns a non-numeric score)
 *   yields an overall score of 0 instead of NaN, matching how the clamp helpers
 *   elsewhere in this file treat non-finite input.
 *
 * Runs are awaited one at a time, as in the original; SOURCE does not show
 * whether `runScenario` tolerates concurrent calls.
 *
 * @param {object} args
 * @param {Array<{id: string, userTurns: Array<string>}>} args.scenarios - Scenarios to evaluate.
 * @param {Array<{name: string, mutator: function(string): string}>} args.mutators - Paraphrase mutators.
 * @param {function(object): Promise<{score: number}>} args.runScenario - Runs one scenario.
 * @param {number} [args.reps=1] - Repetitions per mutator.
 * @returns {Promise<{score: number, perScenario: Array<object>, mutators: Array<string>}>}
 */
async function paraphraseRobustnessScenarios(args) {
  const requestedReps = Number(args.reps ?? 1);
  const reps = Number.isFinite(requestedReps) ? Math.max(1, Math.ceil(requestedReps)) : 1;
  const mean = (values) => values.reduce((a, b) => a + b, 0) / values.length;

  const mutatorNames = args.mutators.map((m) => m.name);
  const perScenario = [];

  for (const scenario of args.scenarios) {
    const baseline = await args.runScenario({
      id: scenario.id,
      userTurns: scenario.userTurns
    });
    const originalScore = baseline.score;
    const deltas = {};
    const paraphrasedAll = [];

    for (const m of args.mutators) {
      const scores = [];
      for (let r = 0; r < reps; r++) {
        const mutatedTurns = scenario.userTurns.map((t) => m.mutator(t));
        const out = await args.runScenario({
          id: scenario.id,
          userTurns: mutatedTurns
        });
        scores.push(out.score);
      }
      deltas[m.name] = mean(scores) - originalScore;
      paraphrasedAll.push(...scores);
    }

    // With no mutators there is nothing to compare against; report the baseline itself.
    const paraphrasedMean = paraphrasedAll.length === 0 ? originalScore : mean(paraphrasedAll);
    perScenario.push({ id: scenario.id, originalScore, paraphrasedMean, deltas });
  }

  const meanOriginal = perScenario.length === 0 ? 0 : mean(perScenario.map((p) => p.originalScore));
  const meanParaphrased = perScenario.length === 0 ? 0 : mean(perScenario.map((p) => p.paraphrasedMean));
  const ratio = meanOriginal <= 0 ? 0 : meanParaphrased / meanOriginal;
  const score = Number.isFinite(ratio) ? Math.max(0, Math.min(1, ratio)) : 0;
  return { score, perScenario, mutators: mutatorNames };
}
|
|
7522
7817
|
|
|
7523
7818
|
// src/visual-diff.ts
|
|
7524
7819
|
function visualDiff(a, b, options = {}) {
|
|
@@ -8747,10 +9042,11 @@ async function signManifest(m) {
|
|
|
8747
9042
|
const bytes = new TextEncoder().encode(JSON.stringify(canonical));
|
|
8748
9043
|
const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
|
|
8749
9044
|
const hash = Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
|
|
8750
|
-
return { ...m, contentHash: hash };
|
|
9045
|
+
return { ...m, contentHash: hash, algo: "sha256-content" };
|
|
8751
9046
|
}
|
|
8752
9047
|
async function verifyManifest(m) {
|
|
8753
|
-
const { contentHash, ...rest } = m;
|
|
9048
|
+
const { contentHash, algo: _algo, ...rest } = m;
|
|
9049
|
+
void _algo;
|
|
8754
9050
|
const resigned = await signManifest(rest);
|
|
8755
9051
|
return resigned.contentHash === contentHash;
|
|
8756
9052
|
}
|
|
@@ -10989,7 +11285,7 @@ function defaultReferenceReplayMatcher(reference, candidate) {
|
|
|
10989
11285
|
const textScore = tokenJaccard(referenceText, candidateText);
|
|
10990
11286
|
const severityScore = reference.severity && candidate.severity ? normalize(reference.severity) === normalize(candidate.severity) ? 0.1 : -0.05 : 0;
|
|
10991
11287
|
const tagScore = tagOverlap(reference.tags, candidate.tags) * 0.15;
|
|
10992
|
-
const score =
|
|
11288
|
+
const score = clamp013(textScore * 0.85 + tagScore + severityScore);
|
|
10993
11289
|
return { score, reason: `token=${textScore.toFixed(2)} tags=${tagScore.toFixed(2)} severity=${severityScore.toFixed(2)}` };
|
|
10994
11290
|
}
|
|
10995
11291
|
function scoreScenario(scenario, matcher, threshold, matchStrategy) {
|
|
@@ -11089,7 +11385,7 @@ function scorePair(scenario, matcher, reference, candidate) {
|
|
|
11089
11385
|
if (!Number.isFinite(result.score)) {
|
|
11090
11386
|
throw new Error(`reference replay matcher returned non-finite score for ${scenario.id}:${reference.id}:${candidate.id}`);
|
|
11091
11387
|
}
|
|
11092
|
-
return { score:
|
|
11388
|
+
return { score: clamp013(result.score), reason: result.reason ?? "" };
|
|
11093
11389
|
}
|
|
11094
11390
|
function buildScenarioScore(scenario, matches2, falsePositives) {
|
|
11095
11391
|
const matched = matches2.filter((match) => match.matched).length;
|
|
@@ -11188,7 +11484,7 @@ function tokens(text) {
|
|
|
11188
11484
|
/**
 * Canonicalize text for comparison: lower-case it, turn every run of characters
 * outside a-z / 0-9 into a single space, and trim the ends.
 *
 * @param {string} text - Input text.
 * @returns {string} Normalized text.
 */
function normalize(text) {
  const lowered = text.toLowerCase();
  const spaced = lowered.replace(/[^a-z0-9]+/g, " ");
  return spaced.trim();
}
|
|
11191
|
-
function
|
|
11487
|
+
function clamp013(value) {
|
|
11192
11488
|
if (!Number.isFinite(value)) return 0;
|
|
11193
11489
|
return Math.max(0, Math.min(1, value));
|
|
11194
11490
|
}
|
|
@@ -12653,7 +12949,7 @@ async function scoreOne(config, variant, scenarioId, rep, split) {
|
|
|
12653
12949
|
scenarioId,
|
|
12654
12950
|
rep,
|
|
12655
12951
|
ok: scored.ok ?? true,
|
|
12656
|
-
score:
|
|
12952
|
+
score: clamp014(scored.score),
|
|
12657
12953
|
cost: scored.costUsd ?? run.costUsd ?? 0,
|
|
12658
12954
|
durationMs: scored.durationMs ?? run.durationMs ?? 0,
|
|
12659
12955
|
metrics: {
|
|
@@ -12765,7 +13061,7 @@ function stableHash2(input) {
|
|
|
12765
13061
|
}
|
|
12766
13062
|
return h >>> 0;
|
|
12767
13063
|
}
|
|
12768
|
-
function
|
|
13064
|
+
function clamp014(n) {
|
|
12769
13065
|
if (!Number.isFinite(n)) return 0;
|
|
12770
13066
|
return Math.max(0, Math.min(1, n));
|
|
12771
13067
|
}
|
|
@@ -14148,6 +14444,7 @@ export {
|
|
|
14148
14444
|
TraceEmitter,
|
|
14149
14445
|
TrialTelemetry,
|
|
14150
14446
|
UNIVERSAL_FINDERS,
|
|
14447
|
+
acquisitionPlansForKnowledgeGaps,
|
|
14151
14448
|
adversarialJudge,
|
|
14152
14449
|
aggregateLlm,
|
|
14153
14450
|
aggregateRunScore,
|
|
@@ -14163,6 +14460,7 @@ export {
|
|
|
14163
14460
|
benjaminiHochberg,
|
|
14164
14461
|
bhAdjust,
|
|
14165
14462
|
bisect,
|
|
14463
|
+
blockingKnowledgeEval,
|
|
14166
14464
|
bonferroni,
|
|
14167
14465
|
bootstrapCi,
|
|
14168
14466
|
budgetBreachView,
|
|
@@ -14176,9 +14474,10 @@ export {
|
|
|
14176
14474
|
callLlmJson,
|
|
14177
14475
|
canaryLeakView,
|
|
14178
14476
|
causalAttribution,
|
|
14477
|
+
checkBehavioralCanary,
|
|
14179
14478
|
checkCanaries,
|
|
14180
14479
|
checkSlos,
|
|
14181
|
-
clamp01,
|
|
14480
|
+
clamp012 as clamp01,
|
|
14182
14481
|
classifyEuAiRisk,
|
|
14183
14482
|
classifyFailure,
|
|
14184
14483
|
codeExecutionJudge,
|
|
@@ -14299,6 +14598,7 @@ export {
|
|
|
14299
14598
|
pairedTTest,
|
|
14300
14599
|
pairedWilcoxon,
|
|
14301
14600
|
paraphraseRobustness,
|
|
14601
|
+
paraphraseRobustnessScenarios,
|
|
14302
14602
|
paretoChart,
|
|
14303
14603
|
paretoFrontier,
|
|
14304
14604
|
paretoFrontierWithCrowding,
|
|
@@ -14344,6 +14644,7 @@ export {
|
|
|
14344
14644
|
rowWhere,
|
|
14345
14645
|
runAgentControlLoop,
|
|
14346
14646
|
runAssertions,
|
|
14647
|
+
runBehavioralCanaries,
|
|
14347
14648
|
runCanaries,
|
|
14348
14649
|
runCounterfactual,
|
|
14349
14650
|
runE2EWorkflow,
|
|
@@ -14367,6 +14668,7 @@ export {
|
|
|
14367
14668
|
scanForMuffledGates,
|
|
14368
14669
|
scoreAllProjects,
|
|
14369
14670
|
scoreContinuity,
|
|
14671
|
+
scoreKnowledgeReadiness,
|
|
14370
14672
|
scoreProject,
|
|
14371
14673
|
scoreRedTeamOutput,
|
|
14372
14674
|
scoreReferenceReplay,
|
|
@@ -14401,6 +14703,7 @@ export {
|
|
|
14401
14703
|
trialTraceFromMultiShotTrial,
|
|
14402
14704
|
typoMutator,
|
|
14403
14705
|
urlContains,
|
|
14706
|
+
userQuestionsForKnowledgeGaps,
|
|
14404
14707
|
validateRunRecord,
|
|
14405
14708
|
verbosityBias,
|
|
14406
14709
|
verifyManifest,
|