@tangle-network/agent-eval 0.19.1 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -2251,6 +2251,151 @@ async function finish(emitter, result) {
2251
2251
  return result;
2252
2252
  }
2253
2253
 
2254
+ // src/knowledge/readiness.ts
2255
/**
 * Builds a knowledge-readiness report for a task from its declared requirements.
 *
 * Every requirement is normalized first. A requirement is "missing" when its
 * current confidence is below the confidence it needs; missing requirements
 * are then split into blocking gaps and non-blocking gaps.
 *
 * @param {object} options
 * @param {*} options.taskId - Copied onto both the report and the bundle.
 * @param {Array<object>} options.requirements - Declared knowledge requirements.
 * @param {Iterable<*>} [options.evidenceIds] - Merged (de-duplicated) with the requirements' own evidence ids.
 * @param {Iterable<*>} [options.claimIds] - De-duplicated; defaults to empty.
 * @param {Iterable<*>} [options.wikiPageIds] - De-duplicated; defaults to empty.
 * @param {object} [options.userAnswers] - Defaults to `{}`.
 * @param {*} [options.metadata] - Passed through unchanged.
 * @returns {object} `{ taskId, readinessScore, blockingMissingRequirements,
 *   nonBlockingGaps, recommendedAction, bundle, severity, reason }`.
 */
function scoreKnowledgeReadiness(options) {
  const { taskId } = options;
  const requirements = options.requirements.map(normalizeRequirement);

  // Partition unmet requirements in a single pass.
  const missing = [];
  const blockingMissingRequirements = [];
  const nonBlockingGaps = [];
  for (const requirement of requirements) {
    if (!(requirement.currentConfidence < requirement.confidenceNeeded)) continue;
    missing.push(requirement);
    if (isBlockingGap(requirement)) {
      blockingMissingRequirements.push(requirement);
    } else {
      nonBlockingGaps.push(requirement);
    }
  }

  const readinessScore = weightedReadiness(requirements);
  const evidenceIds = unique([
    ...(options.evidenceIds ?? []),
    ...requirements.flatMap(({ evidenceIds: ids }) => ids)
  ]);
  const bundle = {
    taskId,
    requirements,
    evidenceIds,
    claimIds: unique(options.claimIds ?? []),
    wikiPageIds: unique(options.wikiPageIds ?? []),
    userAnswers: options.userAnswers ?? {},
    missing,
    readinessScore,
    metadata: options.metadata
  };
  const recommendedAction = chooseRecommendedAction(blockingMissingRequirements, nonBlockingGaps);

  // Blocking gaps dominate; otherwise a high-importance gap raises a warning.
  let severity = "info";
  let reason = "All declared knowledge requirements are ready.";
  if (blockingMissingRequirements.length > 0) {
    severity = "critical";
    reason = `${blockingMissingRequirements.length} blocking knowledge requirement(s) are missing.`;
  } else {
    if (nonBlockingGaps.some((gap) => gap.importance === "high")) {
      severity = "warning";
    }
    if (nonBlockingGaps.length > 0) {
      reason = `${nonBlockingGaps.length} non-blocking knowledge gap(s) remain.`;
    }
  }

  return {
    taskId,
    readinessScore,
    blockingMissingRequirements,
    nonBlockingGaps,
    recommendedAction,
    bundle,
    severity,
    reason
  };
}
2286
/**
 * Turns a knowledge-readiness report into an objective eval result.
 *
 * The eval passes only when the report has no blocking missing requirements
 * and its readiness score reaches the minimum score.
 *
 * @param {object} report - A report shaped like the output of `scoreKnowledgeReadiness`.
 * @param {object} [options]
 * @param {number} [options.minimumScore] - Pass threshold; 0.7 when null/undefined.
 * @param {string} [options.id] - Eval id; "knowledge-ready" when null/undefined.
 * @returns {*} Whatever `objectiveEval` returns for the assembled input.
 */
function blockingKnowledgeEval(report, options = {}) {
  const threshold = options.minimumScore ?? 0.7;
  const blockers = report.blockingMissingRequirements;
  const passed = blockers.length === 0 && report.readinessScore >= threshold;
  const blockerIds = blockers.map((requirement) => requirement.id).join(", ");
  return objectiveEval({
    id: options.id ?? "knowledge-ready",
    passed,
    score: report.readinessScore,
    // A passing eval is always reported as "info", whatever the report says.
    severity: passed ? "info" : report.severity,
    detail: report.reason,
    // An empty id list is reported as "no evidence" rather than an empty string.
    evidence: blockerIds || void 0,
    metadata: { knowledgeReadiness: report }
  });
}
2299
/**
 * Builds one user-facing question for every gap that should be resolved by
 * asking the user (acquisition mode "ask_user" or fallback policy "ask").
 *
 * A gap without a `requiredFor` array is tolerated and falls back to the
 * generic "the task" label, the same label already used for an empty list;
 * previously such a gap threw a TypeError.
 *
 * @param {Array<object>} gaps - Knowledge requirements that are still unmet.
 * @returns {Array<object>} Questions shaped as `{ id, question, reason,
 *   requirementId, importance, answerType, impactIfUnknown }`.
 */
function userQuestionsForKnowledgeGaps(gaps) {
  return gaps
    .filter((gap) => gap.acquisitionMode === "ask_user" || gap.fallbackPolicy === "ask")
    .map((gap) => {
      const requiredFor = Array.isArray(gap.requiredFor) ? gap.requiredFor.join(", ") : "";
      return {
        id: `question_${gap.id}`,
        question: `Please provide: ${gap.description}`,
        reason: `Required for ${requiredFor || "the task"}.`,
        requirementId: gap.id,
        importance: gap.importance,
        // Secrets are requested as credentials; everything else is free text.
        answerType: gap.sensitivity === "secret" ? "credential" : "free_text",
        impactIfUnknown: impactFor(gap)
      };
    });
}
2310
/**
 * Groups knowledge gaps by acquisition mode and emits one plan per mode.
 *
 * Gaps whose mode `planMode` maps to a falsy value are left out. Plans come
 * out in the order in which each mode was first seen.
 *
 * @param {Iterable<object>} gaps - Unmet knowledge requirements.
 * @returns {Array<object>} Plans shaped as `{ id, requirementIds, mode,
 *   description, priority, questions }`; `questions` is only populated for
 *   the "ask_user" mode.
 */
function acquisitionPlansForKnowledgeGaps(gaps) {
  const grouped = /* @__PURE__ */ new Map();
  for (const gap of gaps) {
    const mode = planMode(gap.acquisitionMode);
    if (!mode) continue;
    if (grouped.has(mode)) {
      grouped.get(mode).push(gap);
    } else {
      grouped.set(mode, [gap]);
    }
  }

  const plans = [];
  for (const [mode, members] of grouped) {
    plans.push({
      id: `acquire_${mode}`,
      requirementIds: members.map((member) => member.id),
      mode,
      description: descriptionForPlan(mode, members),
      priority: maxImportance(members.map((member) => member.importance)),
      questions: mode === "ask_user" ? userQuestionsForKnowledgeGaps(members) : void 0
    });
  }
  return plans;
}
2328
/**
 * Returns a copy of a requirement with both confidence values clamped through
 * `clamp01` and its evidence ids de-duplicated through `unique`.
 *
 * @param {object} requirement - Raw requirement; not mutated.
 * @returns {object} Normalized shallow copy.
 */
function normalizeRequirement(requirement) {
  const { confidenceNeeded, currentConfidence, evidenceIds } = requirement;
  return Object.assign({}, requirement, {
    confidenceNeeded: clamp01(confidenceNeeded),
    currentConfidence: clamp01(currentConfidence),
    evidenceIds: unique(evidenceIds)
  });
}
2336
/**
 * Importance-weighted average of how far each requirement is towards the
 * confidence it needs.
 *
 * Each requirement contributes `min(1, current / needed)`, or 1 when nothing
 * is needed, weighted by `importanceWeight`. An empty list counts as fully
 * ready.
 *
 * @param {Array<object>} requirements - Normalized requirements.
 * @returns {number} The weighted average, passed through `clamp01`.
 */
function weightedReadiness(requirements) {
  if (requirements.length === 0) return 1;
  const totals = requirements.reduce(
    (acc, { importance, confidenceNeeded, currentConfidence }) => {
      const weight = importanceWeight(importance);
      const progress = confidenceNeeded <= 0 ? 1 : Math.min(1, currentConfidence / confidenceNeeded);
      acc.weight += weight;
      acc.score += weight * progress;
      return acc;
    },
    { weight: 0, score: 0 }
  );
  return clamp01(totals.score / totals.weight);
}
2348
/**
 * Tells whether an unmet requirement must stop the run: it is marked
 * "blocking", its fallback policy is "block", or it is a secret.
 *
 * @param {object} requirement
 * @returns {boolean}
 */
function isBlockingGap(requirement) {
  if (requirement.importance === "blocking") return true;
  if (requirement.fallbackPolicy === "block") return true;
  return requirement.sensitivity === "secret";
}
2351
/**
 * Picks the next action for a readiness report.
 *
 * With no gaps at all the agent can run. Otherwise the first rule matched by
 * any blocking gap wins, in the priority order listed below. Only blocking
 * gaps are checked against those rules; non-blocking gaps influence the
 * result solely through the final high-importance check.
 *
 * @param {Array<object>} blocking - Blocking missing requirements.
 * @param {Array<object>} nonBlocking - Remaining, non-blocking gaps.
 * @returns {string} One of "run_agent", "ask_user", "query_connectors",
 *   "inspect_repo", "collect_web_data", "abort_or_rescope",
 *   "build_domain_wiki" or "continue_with_caveat".
 */
function chooseRecommendedAction(blocking, nonBlocking) {
  if (blocking.length === 0 && nonBlocking.length === 0) return "run_agent";

  // Ordered by priority: the first rule any blocking gap satisfies decides.
  const rules = [
    [(gap) => gap.acquisitionMode === "ask_user" || gap.fallbackPolicy === "ask", "ask_user"],
    [(gap) => gap.acquisitionMode === "query_connector", "query_connectors"],
    [(gap) => gap.acquisitionMode === "inspect_repo" || gap.acquisitionMode === "run_command", "inspect_repo"],
    [(gap) => gap.acquisitionMode === "search_web", "collect_web_data"],
    [(gap) => gap.acquisitionMode === "not_available", "abort_or_rescope"]
  ];
  for (const [matches, action] of rules) {
    if (blocking.some((gap) => matches(gap))) return action;
  }

  const hasHighImportanceGap = nonBlocking.some((gap) => gap.importance === "high");
  return hasHighImportanceGap ? "build_domain_wiki" : "continue_with_caveat";
}
2362
/**
 * Maps an acquisition mode to the mode used for planning.
 *
 * "infer_low_confidence" and "not_available" yield no plan (null); every
 * other value is returned unchanged.
 *
 * @param {*} mode - Acquisition mode of a gap.
 * @returns {*} `null` for the two unplannable modes, otherwise `mode`.
 */
function planMode(mode) {
  switch (mode) {
    case "infer_low_confidence":
    case "not_available":
      return null;
    default:
      return mode;
  }
}
2366
/**
 * Produces the human-readable description of an acquisition plan.
 *
 * @param {string} mode - Plan mode; unrecognized modes get the domain-wiki wording.
 * @param {Array<object>} requirements - Requirements covered by the plan; their
 *   `description` fields are joined with "; ".
 * @returns {string}
 */
function descriptionForPlan(mode, requirements) {
  const labels = requirements.map((requirement) => requirement.description).join("; ");
  switch (mode) {
    case "ask_user":
      return `Ask the user for: ${labels}`;
    case "search_web":
      return `Search web or documentation sources for: ${labels}`;
    case "query_connector":
      return `Query configured connectors for: ${labels}`;
    case "inspect_repo":
      return `Inspect repository context for: ${labels}`;
    case "run_command":
      return `Run local commands to collect: ${labels}`;
    default:
      return `Build domain wiki evidence for: ${labels}`;
  }
}
2375
/**
 * Explains what happens when a requirement stays unknown, based on its
 * fallback policy. Any policy not listed is treated as "ask first".
 *
 * @param {object} requirement
 * @returns {string}
 */
function impactFor(requirement) {
  switch (requirement.fallbackPolicy) {
    case "block":
      return "The agent should not run until this is known.";
    case "continue_with_caveat":
      return "The agent may continue, but must disclose uncertainty.";
    case "use_default":
      return "The agent will use the configured default if skipped.";
    default:
      return "The agent should ask before continuing.";
  }
}
2381
/**
 * Returns the most severe importance level present in `values`.
 *
 * Levels are ranked "blocking" > "high" > "medium" > "low"; when none of
 * them is present the result is "low".
 *
 * @param {Array<string>} values
 * @returns {string}
 */
function maxImportance(values) {
  for (const level of ["blocking", "high", "medium", "low"]) {
    if (values.includes(level)) return level;
  }
  return "low";
}
2385
/**
 * Numeric weight of an importance level: blocking 8, high 4, medium 2,
 * anything else 1.
 *
 * @param {*} importance
 * @returns {number}
 */
function importanceWeight(importance) {
  switch (importance) {
    case "blocking":
      return 8;
    case "high":
      return 4;
    case "medium":
      return 2;
    default:
      return 1;
  }
}
2391
/**
 * Clamps a value into [0, 1]. Anything that is not a finite number
 * (NaN, ±Infinity, non-numbers) becomes 0.
 *
 * @param {*} value
 * @returns {number}
 */
function clamp01(value) {
  if (!Number.isFinite(value)) return 0;
  // `<=` also folds -0 into +0.
  if (value <= 0) return 0;
  if (value >= 1) return 1;
  return value;
}
2395
/**
 * Returns the distinct items of an iterable as a new array, keeping the
 * order of first occurrence. `undefined`/`null` yields an empty array.
 *
 * @param {Iterable<*>} [items]
 * @returns {Array<*>}
 */
function unique(items) {
  return Array.from(new Set(items));
}
2398
+
2254
2399
  // src/feedback-trajectory.ts
2255
2400
  var DEFAULT_SPLIT_POLICY = {
2256
2401
  trainPct: 70,
@@ -3521,9 +3666,9 @@ var DEFAULT_RUN_SCORE_WEIGHTS = {
3521
3666
  };
3522
3667
  function aggregateRunScore(score, weights = {}) {
3523
3668
  const w = { ...DEFAULT_RUN_SCORE_WEIGHTS, ...weights };
3524
- return w.success * clamp01(score.success) + w.goalProgress * clamp01(score.goalProgress) + w.repoGroundedness * clamp01(score.repoGroundedness) + w.driftPenalty * clamp01(score.driftPenalty) + w.toolUseQuality * clamp01(score.toolUseQuality) + w.patchQuality * clamp01(score.patchQuality) + w.testReality * clamp01(score.testReality) + w.finalGate * clamp01(score.finalGate) + w.reviewerBlockers * clamp01(score.reviewerBlockers) + w.costUsd * Math.max(0, score.costUsd) + w.wallSeconds * Math.max(0, score.wallSeconds / 60);
3669
+ return w.success * clamp012(score.success) + w.goalProgress * clamp012(score.goalProgress) + w.repoGroundedness * clamp012(score.repoGroundedness) + w.driftPenalty * clamp012(score.driftPenalty) + w.toolUseQuality * clamp012(score.toolUseQuality) + w.patchQuality * clamp012(score.patchQuality) + w.testReality * clamp012(score.testReality) + w.finalGate * clamp012(score.finalGate) + w.reviewerBlockers * clamp012(score.reviewerBlockers) + w.costUsd * Math.max(0, score.costUsd) + w.wallSeconds * Math.max(0, score.wallSeconds / 60);
3525
3670
  }
3526
/**
 * Clamps a value into [0, 1]; non-finite input (NaN, ±Infinity,
 * non-numbers) becomes 0.
 *
 * @param {*} value
 * @returns {number}
 */
function clamp012(value) {
  return Number.isFinite(value) ? Math.min(Math.max(value, 0), 1) : 0;
}
@@ -3567,13 +3712,13 @@ var RunCritic = class {
3567
3712
  const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
3568
3713
  if (!success) notes.push("run did not complete with pass=true");
3569
3714
  const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum2, span) => sum2 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
3570
- const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp01(trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score) : void 0;
3715
+ const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp012(trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score) : void 0;
3571
3716
  const goalProgress = outcomeScore ?? judgeAverage ?? success;
3572
3717
  const successfulTools = toolSpans2.filter((span) => span.status !== "error").length;
3573
3718
  const toolUseQuality = toolSpans2.length === 0 ? 0 : successfulTools / toolSpans2.length;
3574
3719
  if (toolSpans2.length === 0) notes.push("no tool spans recorded");
3575
3720
  const patchEvidence = trace.artifacts.length + toolSpans2.filter((span) => /write|edit|patch|apply/i.test(span.toolName)).length;
3576
- const patchQuality = patchEvidence > 0 ? clamp01(patchEvidence / 4) : 0;
3721
+ const patchQuality = patchEvidence > 0 ? clamp012(patchEvidence / 4) : 0;
3577
3722
  if (!patchQuality) notes.push("no artifact or edit evidence recorded");
3578
3723
  const sandboxTests = sandboxSpans.filter((span) => typeof span.testsTotal === "number" && span.testsTotal > 0);
3579
3724
  const testReality = sandboxTests.length ? sandboxTests.reduce((sum2, span) => sum2 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1), 0) / sandboxTests.length : toolSpans2.some((span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))) ? 0.4 : 0;
@@ -3617,7 +3762,7 @@ var RunCritic = class {
3617
3762
  }
3618
3763
  };
3619
3764
  function normalizeJudgeScore(score) {
3620
- return score > 1 ? clamp01(score / 10) : clamp01(score);
3765
+ return score > 1 ? clamp012(score / 10) : clamp012(score);
3621
3766
  }
3622
3767
  function looksRepoGrounded(text) {
3623
3768
  return /(?:src\/|tests?\/|package\.json|tsconfig|\.ts\b|\.tsx\b|git status|pnpm |npm |vitest|pytest|jest)/i.test(text);
@@ -4973,6 +5118,17 @@ var FAILURE_CLASSES = [
4973
5118
  "cost_overrun",
4974
5119
  "timeout",
4975
5120
  "sandbox_failure",
5121
+ "missing_user_data",
5122
+ "missing_domain_data",
5123
+ "missing_codebase_context",
5124
+ "missing_runtime_context",
5125
+ "missing_credentials",
5126
+ "stale_external_data",
5127
+ "bad_retrieval",
5128
+ "insufficient_evidence",
5129
+ "contradictory_evidence",
5130
+ "ambiguous_user_intent",
5131
+ "knowledge_readiness_blocked",
4976
5132
  "unknown"
4977
5133
  ];
4978
5134
  function isLlmSpan(s) {
@@ -5329,6 +5485,62 @@ var DEFAULT_RULES = [
5329
5485
  return null;
5330
5486
  }
5331
5487
  },
5488
+ {
5489
+ id: "knowledge-readiness-blocked",
5490
+ match: ({ events }) => {
5491
+ const event = events.find((e) => e.kind === "custom" && e.payload.kind === "readiness_scored" && e.payload.passed === false);
5492
+ return event ? {
5493
+ failureClass: "knowledge_readiness_blocked",
5494
+ reason: "knowledge readiness report blocked execution",
5495
+ triggerEventId: event.eventId
5496
+ } : null;
5497
+ }
5498
+ },
5499
+ {
5500
+ id: "missing-credentials",
5501
+ match: ({ events }) => {
5502
+ const event = events.find((e) => e.kind === "custom" && e.payload.kind === "knowledge_gap" && e.payload.category === "credential_or_secret");
5503
+ return event ? {
5504
+ failureClass: "missing_credentials",
5505
+ reason: "required credential or secret was missing",
5506
+ triggerEventId: event.eventId
5507
+ } : null;
5508
+ }
5509
+ },
5510
+ {
5511
+ id: "bad-retrieval",
5512
+ match: ({ run, spans }) => {
5513
+ if (run.outcome?.pass !== false) return null;
5514
+ const retrieval = spans.find((s) => s.kind === "retrieval" && (s.hits.length === 0 || s.hits.every((hit) => hit.score <= 0)));
5515
+ return retrieval ? {
5516
+ failureClass: "bad_retrieval",
5517
+ reason: "retrieval returned no useful hits for a failed run",
5518
+ triggerSpanId: retrieval.spanId
5519
+ } : null;
5520
+ }
5521
+ },
5522
+ {
5523
+ id: "insufficient-evidence",
5524
+ match: ({ events }) => {
5525
+ const event = events.find((e) => e.kind === "custom" && e.payload.kind === "knowledge_gap" && e.payload.reason === "insufficient_evidence");
5526
+ return event ? {
5527
+ failureClass: "insufficient_evidence",
5528
+ reason: "task proceeded with insufficient supporting evidence",
5529
+ triggerEventId: event.eventId
5530
+ } : null;
5531
+ }
5532
+ },
5533
+ {
5534
+ id: "contradictory-evidence",
5535
+ match: ({ events }) => {
5536
+ const event = events.find((e) => e.kind === "custom" && e.payload.kind === "knowledge_gap" && e.payload.reason === "contradictory_evidence");
5537
+ return event ? {
5538
+ failureClass: "contradictory_evidence",
5539
+ reason: "supporting evidence contradicted itself",
5540
+ triggerEventId: event.eventId
5541
+ } : null;
5542
+ }
5543
+ },
5332
5544
  // Budget breach events
5333
5545
  {
5334
5546
  id: "budget-breach",
@@ -5667,11 +5879,14 @@ async function failureClusterView(store, options = {}) {
5667
5879
  const cls = classifyFailure({ run, spans, events }, rules);
5668
5880
  let toolName;
5669
5881
  let argPrefix;
5882
+ let dimension;
5670
5883
  if (cls.triggerSpanId) {
5671
5884
  const trig = spans.find((s) => s.spanId === cls.triggerSpanId);
5672
5885
  if (trig?.kind === "tool") {
5673
5886
  toolName = trig.toolName;
5674
5887
  argPrefix = argHash(trig.args).slice(0, 16);
5888
+ } else if (trig?.kind === "judge") {
5889
+ dimension = trig.dimension;
5675
5890
  }
5676
5891
  }
5677
5892
  if (!toolName) {
@@ -5682,13 +5897,18 @@ async function failureClusterView(store, options = {}) {
5682
5897
  argPrefix = argHash(errored.args).slice(0, 16);
5683
5898
  }
5684
5899
  }
5685
- const key = `${cls.failureClass}|${toolName ?? ""}|${argPrefix ?? ""}`;
5900
+ if (!dimension) {
5901
+ const judge = spans.find((s) => s.kind === "judge" && typeof s.dimension === "string");
5902
+ if (judge?.kind === "judge") dimension = judge.dimension;
5903
+ }
5904
+ const key = `${cls.failureClass}|${toolName ?? ""}|${argPrefix ?? ""}|${dimension ?? ""}`;
5686
5905
  let cluster = clusters.get(key);
5687
5906
  if (!cluster) {
5688
5907
  cluster = {
5689
5908
  failureClass: cls.failureClass,
5690
5909
  toolName,
5691
5910
  argPrefix,
5911
+ dimension,
5692
5912
  runCount: 0,
5693
5913
  scenarioIds: [],
5694
5914
  exampleRunId: run.runId,
@@ -6673,6 +6893,46 @@ function checkCanaries(output, scenarios) {
6673
6893
  }
6674
6894
  return leaks;
6675
6895
  }
6896
/**
 * Checks one output against a scenario's forbidden pattern (falling back to
 * its `canary` when `forbiddenPattern` is null/undefined).
 *
 * @param {string} output - Text produced for the scenario.
 * @param {object} scenario - Provides `id` and `forbiddenPattern` and/or `canary`.
 * @returns {object|null} `{ scenarioId, canary, evidence }` when the pattern
 *   is found, otherwise `null` (also when the scenario declares no pattern).
 */
function checkBehavioralCanary(output, scenario) {
  const pattern = scenario.forbiddenPattern ?? scenario.canary;
  const hit = pattern ? matchForbidden(output, pattern) : null;
  if (!hit) return null;
  return {
    scenarioId: scenario.id,
    canary: pattern,
    evidence: excerpt2(output, hit)
  };
}
6907
/**
 * Runs `checkBehavioralCanary` over a list of cases and collects the leaks.
 *
 * @param {Iterable<object>} cases - Items shaped as `{ output, scenario, runId? }`.
 * @returns {Array<object>} One entry per leaking case, each carrying a `runId`
 *   key (the case's run id when provided).
 */
function runBehavioralCanaries(cases) {
  const leaks = [];
  for (const { output, scenario, runId } of cases) {
    const leak = checkBehavioralCanary(output, scenario);
    if (!leak) continue;
    leaks.push({ ...leak, runId: runId ?? leak.runId });
  }
  return leaks;
}
6915
/**
 * Looks for a forbidden pattern in `output`.
 *
 * A pattern written as `/body/flags` that compiles as a regular expression is
 * matched as a regex; anything else is matched as a literal substring.
 * Empty regex matches never count as a hit. Every match position is scanned,
 * so a pattern that matches the empty string early on (for example
 * `/(secret)?/`) still reports a later non-empty match instead of silently
 * reporting "no leak".
 *
 * @param {string} output - Text to scan.
 * @param {string} pattern - Literal text or `/regex/flags` source.
 * @returns {string|null} The first non-empty matched text (or the literal
 *   pattern itself), or `null` when nothing matched.
 */
function matchForbidden(output, pattern) {
  const re = tryParseRegex(pattern);
  if (!re) {
    return output.includes(pattern) ? pattern : null;
  }
  if (re.sticky) {
    // Sticky patterns are anchored to the start on purpose; keep the single attempt.
    const anchored = output.match(re);
    return anchored && anchored[0].length > 0 ? anchored[0] : null;
  }
  // matchAll needs the global flag; add it without altering the other flags.
  const scanner = re.global ? re : new RegExp(re.source, `${re.flags}g`);
  for (const match of output.matchAll(scanner)) {
    if (match[0].length > 0) return match[0];
  }
  return null;
}

/**
 * Parses a `/body/flags` string into a RegExp.
 *
 * @param {string} pattern - Candidate regex source.
 * @returns {RegExp|null} The compiled expression, or `null` when the string
 *   is not in `/body/flags` form, uses flags outside `gimsuy`, or fails to
 *   compile.
 */
function tryParseRegex(pattern) {
  if (pattern.length < 2 || pattern[0] !== "/") return null;
  const closingSlash = pattern.lastIndexOf("/");
  if (closingSlash <= 0) return null;
  const body = pattern.slice(1, closingSlash);
  const flags = pattern.slice(closingSlash + 1);
  // Trailing text that is not a flag list (e.g. "/etc/passwd") means a literal, not a regex.
  if (!/^[gimsuy]*$/.test(flags)) return null;
  try {
    return new RegExp(body, flags);
  } catch {
    // An invalid expression is treated as a literal by the caller.
    return null;
  }
}
6676
6936
  async function canaryLeakView(store, scenarios) {
6677
6937
  const targets = scenarios.filter((s) => !!s.canary);
6678
6938
  if (targets.length === 0) return [];
@@ -7519,6 +7779,41 @@ var DEFAULT_MUTATORS = [
7519
7779
  { id: "politeness-prefix", fn: politenessPrefixMutator },
7520
7780
  { id: "whitespace-collapse", fn: whitespaceCollapseMutator }
7521
7781
  ];
7782
/**
 * Measures how much scenario scores degrade when user turns are paraphrased.
 *
 * Each scenario is run once unmodified (baseline) and then `reps` times per
 * mutator with every user turn passed through that mutator. Runs are awaited
 * one after another.
 *
 * @param {object} args
 * @param {Array<object>} args.scenarios - Items shaped as `{ id, userTurns }`.
 * @param {Array<object>} args.mutators - Items shaped as `{ name, mutator }`,
 *   where `mutator(turn)` returns the rewritten turn.
 *   NOTE(review): DEFAULT_MUTATORS entries in this file use `{ id, fn }` —
 *   confirm callers adapt them before passing them here.
 * @param {Function} args.runScenario - Async `({ id, userTurns }) => { score }`.
 * @param {number} [args.reps=1] - Repetitions per mutator. Rounded up to a
 *   whole number and floored at 1; non-finite values (NaN, Infinity) fall
 *   back to 1 instead of skipping every run or looping forever.
 * @returns {Promise<object>} `{ score, perScenario, mutators }`, where `score`
 *   is the ratio of mean paraphrased score to mean original score clamped to
 *   [0, 1] (0 when the mean original score is not positive).
 */
async function paraphraseRobustnessScenarios(args) {
  const requestedReps = Number(args.reps ?? 1);
  const reps = Number.isFinite(requestedReps) ? Math.max(1, Math.ceil(requestedReps)) : 1;
  const mean = (values) => values.reduce((a, b) => a + b, 0) / values.length;

  const mutatorNames = args.mutators.map((m) => m.name);
  const perScenario = [];
  for (const scenario of args.scenarios) {
    const baseline = await args.runScenario({
      id: scenario.id,
      userTurns: scenario.userTurns
    });
    const originalScore = baseline.score;
    const deltas = {};
    const paraphrasedAll = [];
    for (const m of args.mutators) {
      const scores = [];
      for (let r = 0; r < reps; r++) {
        const mutatedTurns = scenario.userTurns.map((t) => m.mutator(t));
        const out = await args.runScenario({
          id: scenario.id,
          userTurns: mutatedTurns
        });
        scores.push(out.score);
      }
      deltas[m.name] = mean(scores) - originalScore;
      paraphrasedAll.push(...scores);
    }
    // With no mutators there is nothing to compare against, so the baseline stands in.
    const paraphrasedMean = paraphrasedAll.length === 0 ? originalScore : mean(paraphrasedAll);
    perScenario.push({ id: scenario.id, originalScore, paraphrasedMean, deltas });
  }

  const meanOriginal = perScenario.length === 0 ? 0 : mean(perScenario.map((p) => p.originalScore));
  const meanParaphrased = perScenario.length === 0 ? 0 : mean(perScenario.map((p) => p.paraphrasedMean));
  const ratio = meanOriginal <= 0 ? 0 : meanParaphrased / meanOriginal;
  const score = Math.max(0, Math.min(1, ratio));
  return { score, perScenario, mutators: mutatorNames };
}
7522
7817
 
7523
7818
  // src/visual-diff.ts
7524
7819
  function visualDiff(a, b, options = {}) {
@@ -8747,10 +9042,11 @@ async function signManifest(m) {
8747
9042
  const bytes = new TextEncoder().encode(JSON.stringify(canonical));
8748
9043
  const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
8749
9044
  const hash = Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
8750
- return { ...m, contentHash: hash };
9045
+ return { ...m, contentHash: hash, algo: "sha256-content" };
8751
9046
  }
8752
9047
  async function verifyManifest(m) {
8753
- const { contentHash, ...rest } = m;
9048
+ const { contentHash, algo: _algo, ...rest } = m;
9049
+ void _algo;
8754
9050
  const resigned = await signManifest(rest);
8755
9051
  return resigned.contentHash === contentHash;
8756
9052
  }
@@ -10989,7 +11285,7 @@ function defaultReferenceReplayMatcher(reference, candidate) {
10989
11285
  const textScore = tokenJaccard(referenceText, candidateText);
10990
11286
  const severityScore = reference.severity && candidate.severity ? normalize(reference.severity) === normalize(candidate.severity) ? 0.1 : -0.05 : 0;
10991
11287
  const tagScore = tagOverlap(reference.tags, candidate.tags) * 0.15;
10992
- const score = clamp012(textScore * 0.85 + tagScore + severityScore);
11288
+ const score = clamp013(textScore * 0.85 + tagScore + severityScore);
10993
11289
  return { score, reason: `token=${textScore.toFixed(2)} tags=${tagScore.toFixed(2)} severity=${severityScore.toFixed(2)}` };
10994
11290
  }
10995
11291
  function scoreScenario(scenario, matcher, threshold, matchStrategy) {
@@ -11089,7 +11385,7 @@ function scorePair(scenario, matcher, reference, candidate) {
11089
11385
  if (!Number.isFinite(result.score)) {
11090
11386
  throw new Error(`reference replay matcher returned non-finite score for ${scenario.id}:${reference.id}:${candidate.id}`);
11091
11387
  }
11092
- return { score: clamp012(result.score), reason: result.reason ?? "" };
11388
+ return { score: clamp013(result.score), reason: result.reason ?? "" };
11093
11389
  }
11094
11390
  function buildScenarioScore(scenario, matches2, falsePositives) {
11095
11391
  const matched = matches2.filter((match) => match.matched).length;
@@ -11188,7 +11484,7 @@ function tokens(text) {
11188
11484
  function normalize(text) {
11189
11485
  return text.toLowerCase().replace(/[^a-z0-9]+/g, " ").trim();
11190
11486
  }
11191
/**
 * Clamps a value into [0, 1]; non-finite input (NaN, ±Infinity,
 * non-numbers) becomes 0.
 *
 * @param {*} value
 * @returns {number}
 */
function clamp013(value) {
  if (!Number.isFinite(value)) return 0;
  const capped = value > 1 ? 1 : value;
  // Anything not strictly positive (including -0) collapses to +0.
  return capped > 0 ? capped : 0;
}
@@ -12653,7 +12949,7 @@ async function scoreOne(config, variant, scenarioId, rep, split) {
12653
12949
  scenarioId,
12654
12950
  rep,
12655
12951
  ok: scored.ok ?? true,
12656
- score: clamp013(scored.score),
12952
+ score: clamp014(scored.score),
12657
12953
  cost: scored.costUsd ?? run.costUsd ?? 0,
12658
12954
  durationMs: scored.durationMs ?? run.durationMs ?? 0,
12659
12955
  metrics: {
@@ -12765,7 +13061,7 @@ function stableHash2(input) {
12765
13061
  }
12766
13062
  return h >>> 0;
12767
13063
  }
12768
/**
 * Clamps a number into [0, 1]; non-finite input (NaN, ±Infinity,
 * non-numbers) becomes 0.
 *
 * @param {*} n
 * @returns {number}
 */
function clamp014(n) {
  if (!Number.isFinite(n) || n <= 0) return 0;
  return n < 1 ? n : 1;
}
@@ -14148,6 +14444,7 @@ export {
14148
14444
  TraceEmitter,
14149
14445
  TrialTelemetry,
14150
14446
  UNIVERSAL_FINDERS,
14447
+ acquisitionPlansForKnowledgeGaps,
14151
14448
  adversarialJudge,
14152
14449
  aggregateLlm,
14153
14450
  aggregateRunScore,
@@ -14163,6 +14460,7 @@ export {
14163
14460
  benjaminiHochberg,
14164
14461
  bhAdjust,
14165
14462
  bisect,
14463
+ blockingKnowledgeEval,
14166
14464
  bonferroni,
14167
14465
  bootstrapCi,
14168
14466
  budgetBreachView,
@@ -14176,9 +14474,10 @@ export {
14176
14474
  callLlmJson,
14177
14475
  canaryLeakView,
14178
14476
  causalAttribution,
14477
+ checkBehavioralCanary,
14179
14478
  checkCanaries,
14180
14479
  checkSlos,
14181
- clamp01,
14480
+ clamp012 as clamp01,
14182
14481
  classifyEuAiRisk,
14183
14482
  classifyFailure,
14184
14483
  codeExecutionJudge,
@@ -14299,6 +14598,7 @@ export {
14299
14598
  pairedTTest,
14300
14599
  pairedWilcoxon,
14301
14600
  paraphraseRobustness,
14601
+ paraphraseRobustnessScenarios,
14302
14602
  paretoChart,
14303
14603
  paretoFrontier,
14304
14604
  paretoFrontierWithCrowding,
@@ -14344,6 +14644,7 @@ export {
14344
14644
  rowWhere,
14345
14645
  runAgentControlLoop,
14346
14646
  runAssertions,
14647
+ runBehavioralCanaries,
14347
14648
  runCanaries,
14348
14649
  runCounterfactual,
14349
14650
  runE2EWorkflow,
@@ -14367,6 +14668,7 @@ export {
14367
14668
  scanForMuffledGates,
14368
14669
  scoreAllProjects,
14369
14670
  scoreContinuity,
14671
+ scoreKnowledgeReadiness,
14370
14672
  scoreProject,
14371
14673
  scoreRedTeamOutput,
14372
14674
  scoreReferenceReplay,
@@ -14401,6 +14703,7 @@ export {
14401
14703
  trialTraceFromMultiShotTrial,
14402
14704
  typoMutator,
14403
14705
  urlContains,
14706
+ userQuestionsForKnowledgeGaps,
14404
14707
  validateRunRecord,
14405
14708
  verbosityBias,
14406
14709
  verifyManifest,