@tangle-network/agent-eval 0.66.0 → 0.67.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +13 -0
- package/dist/campaign/index.d.ts +107 -4
- package/dist/campaign/index.js +10 -2
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-RDK3P4JE.js → chunk-MZ2IYGGN.js} +141 -31
- package/dist/chunk-MZ2IYGGN.js.map +1 -0
- package/dist/{chunk-Q56RRLEC.js → chunk-NV2PF37Q.js} +13 -3
- package/dist/{chunk-Q56RRLEC.js.map → chunk-NV2PF37Q.js.map} +1 -1
- package/dist/contract/index.d.ts +4 -4
- package/dist/contract/index.js +2 -2
- package/dist/index.d.ts +7 -6
- package/dist/index.js +1 -1
- package/dist/openapi.json +1 -1
- package/dist/{provenance-BZUFC1_D.d.ts → provenance-CChUqexv.d.ts} +23 -1
- package/dist/{registry-BzAEvqAt.d.ts → registry-BGKyX6bw.d.ts} +1 -1
- package/dist/release-report-CN8hJlhk.d.ts +233 -0
- package/dist/reporting.d.ts +4 -3
- package/dist/statistics-B7yCbi9i.d.ts +253 -0
- package/dist/{types-DhqpAi_z.d.ts → types-Croy5h7V.d.ts} +1 -1
- package/package.json +1 -1
- package/dist/chunk-RDK3P4JE.js.map +0 -1
- package/dist/release-report-DGoeObZT.d.ts +0 -484
|
@@ -293,4 +293,4 @@ interface EvalResult {
|
|
|
293
293
|
artifact?: string;
|
|
294
294
|
}
|
|
295
295
|
|
|
296
|
-
export type { ArtifactCheck as A, BenchmarkRunnerConfig as B, CheckResult as C, DriverResult as D, EvalResult as E, FeedbackPattern as F,
|
|
296
|
+
export type { ArtifactCheck as A, BenchmarkRunnerConfig as B, CheckResult as C, DriverResult as D, EvalResult as E, FeedbackPattern as F, JudgeScore as J, ProductClientConfig as P, RouteMap as R, Scenario as S, TestResult as T, JudgeInput as a, JudgeFn as b, BenchmarkReport as c, PersonaConfig as d, DriverState as e, CollectedArtifacts as f, ScenarioResult as g, TurnMetrics as h, ScenarioFile as i, CompletionCriterion as j, ArtifactResult as k, JudgeConfig as l, JudgeRubric as m, PersonaRigor as n, RubricDimension as o, Turn as p, TurnResult as q };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.67.0",
|
|
4
4
|
"description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/campaign/drivers/evolutionary.ts","../src/campaign/gates/compose.ts","../src/campaign/gates/default-production-gate.ts","../src/campaign/presets/run-eval.ts","../src/campaign/provenance.ts"],"sourcesContent":["/**\n * @experimental\n *\n * `evolutionaryDriver` — adapts a stateless `Mutator` (population mutation:\n * GEPA / AxGEPA / reflective-mutation) into an `ImprovementDriver`. This is\n * the evolutionary strategy: each generation, mutate the current best surface\n * into N candidates, measure, select. No generation memory beyond the current\n * surface; the loop body handles ranking + promotion.\n *\n * The reflective alternative is agent-runtime's `improvementDriver` with a\n * `reflectiveGenerator` / `agenticGenerator`: it reasons over the report +\n * trace findings to propose targeted edits rather than blind mutations. Both\n * conform to `ImprovementDriver`; the improvement loop is identical regardless\n * of which drives it.\n */\n\nimport type { ImprovementDriver, Mutator } from '../types'\n\nexport interface EvolutionaryDriverOptions<TFindings = unknown> {\n mutator: Mutator<TFindings>\n /** External findings fed to the mutator each generation. Default: []. */\n findings?: TFindings[]\n}\n\nexport function evolutionaryDriver<TFindings = unknown>(\n opts: EvolutionaryDriverOptions<TFindings>,\n): ImprovementDriver<TFindings> {\n return {\n kind: `evolutionary:${opts.mutator.kind}`,\n async propose({ currentSurface, findings, populationSize, signal }) {\n return opts.mutator.mutate({\n findings: findings.length > 0 ? findings : (opts.findings ?? []),\n currentSurface,\n populationSize,\n signal,\n })\n },\n }\n}\n","/**\n * @experimental\n *\n * Compose multiple `Gate` implementations — every gate must pass for the\n * composite to ship. 
Closes the alignment reviewer's \"default-only\n * heldOutGate + costGate would happily promote a reward-hacked prompt\"\n * concern by making safety gates first-class composable defaults.\n */\n\nimport type { Gate, GateContext, GateDecision, GateResult, Scenario } from '../types'\n\n/** Compose gates — all must `ship` for the composite to `ship`. First\n * non-ship verdict short-circuits the composite verdict, but ALL gates run\n * (so the result records every gate's reason — useful for diagnostics). */\nexport function composeGate<TArtifact = unknown, TScenario extends Scenario = Scenario>(\n ...gates: Array<Gate<TArtifact, TScenario>>\n): Gate<TArtifact, TScenario> {\n if (gates.length === 0) {\n throw new Error('composeGate requires at least one gate')\n }\n return {\n name: `composed(${gates.map((g) => g.name).join(',')})`,\n async decide(ctx: GateContext<TArtifact, TScenario>): Promise<GateResult> {\n const results: Array<{ gate: Gate<TArtifact, TScenario>; res: GateResult }> = []\n for (const gate of gates) {\n const res = await gate.decide(ctx)\n results.push({ gate, res })\n }\n\n // Substrate-wide verdict policy:\n // - all 'ship' → 'ship'\n // - any 'arch_ceiling' → 'arch_ceiling' (architectural ceiling beats other holds)\n // - any 'model_ceiling' → 'model_ceiling'\n // - any 'hold' → 'hold'\n // - else 'need_more_work'\n const decisions = results.map((r) => r.res.decision)\n const overall: GateDecision = decisions.every((d) => d === 'ship')\n ? 'ship'\n : decisions.includes('arch_ceiling')\n ? 'arch_ceiling'\n : decisions.includes('model_ceiling')\n ? 'model_ceiling'\n : decisions.includes('hold')\n ? 'hold'\n : 'need_more_work'\n\n const contributing = results.flatMap((r) =>\n r.res.contributingGates.length > 0\n ? 
r.res.contributingGates\n : [{ name: r.gate.name, passed: r.res.decision === 'ship', detail: r.res }],\n )\n\n const reasons = results.flatMap((r) =>\n r.res.reasons.map((reason) => `[${r.gate.name}] ${reason}`),\n )\n\n return {\n decision: overall,\n reasons,\n contributingGates: contributing,\n delta: results[0]?.res.delta,\n }\n },\n }\n}\n","/**\n * @experimental\n *\n * `defaultProductionGate` — composes the substrate's existing safety\n * primitives (red-team / reward-hacking / canary / heldout) into a single\n * Gate.decide shape. Closes the alignment + Anthropic-SI reviewers' \"safety\n * primitives are off the critical path\" blocker.\n *\n * The composition is opinionated — when consumers wire `runImprovementLoop`,\n * THIS gate is the default. Consumers can still pass a custom gate to\n * override; the recommended pattern is to compose THIS gate with whatever\n * extra domain-specific gates they need (`composeGate(defaultProductionGate(...), customGate)`).\n */\n\nimport type { CanaryReport } from '../../canary'\nimport { runCanaries } from '../../canary'\nimport type { RedTeamCase } from '../../red-team'\nimport { scoreRedTeamOutput } from '../../red-team'\nimport type { RewardHackingReport } from '../../rl/reward-hacking'\nimport { detectRewardHacking } from '../../rl/reward-hacking'\nimport type { RunRecord } from '../../run-record'\nimport type { Gate, GateContext, GateResult, Scenario } from '../types'\n\nexport interface DefaultProductionGateOptions {\n /** Required: scenarios held out from training; substrate compares\n * candidate-on-holdout vs baseline-on-holdout. */\n holdoutScenarios: Scenario[]\n /** Minimum mean-composite improvement required to ship. Default 0.5. */\n deltaThreshold?: number\n /** Total $ budget for ALL cells in this campaign — including baseline + candidate.\n * Composite verdict refuses to ship when spend exceeded budget. */\n budgetUsd?: number\n /** Red-team cases to probe candidate outputs against. 
When omitted the\n * substrate uses `DEFAULT_RED_TEAM_CORPUS`. Provide a domain-specific\n * battery for tighter coverage. */\n redTeamBattery?: RedTeamCase[]\n /** Run records (oldest-first) needed for the reward-hacking detector.\n * Substrate populates from prior production-loop generations. */\n recentRuns?: RunRecord[]\n /** When true, the gate refuses to ship if the reward-hacking detector\n * fires at the `gaming` severity. Default true. */\n blockOnRewardHackingGaming?: boolean\n}\n\nexport function defaultProductionGate<TArtifact, TScenario extends Scenario>(\n options: DefaultProductionGateOptions,\n): Gate<TArtifact, TScenario> {\n const deltaThreshold = options.deltaThreshold ?? 0.5\n const blockOnGaming = options.blockOnRewardHackingGaming ?? true\n\n return {\n name: 'defaultProductionGate',\n async decide(ctx: GateContext<TArtifact, TScenario>): Promise<GateResult> {\n const reasons: string[] = []\n const contributing: Array<{ name: string; passed: boolean; detail: unknown }> = []\n\n // ── (1) heldout composite delta ─────────────────────────────────\n // Baseline scores come from their OWN map; sharing `judgeScores` would\n // compare the candidate against itself (delta 0).\n const baselineComposite = meanComposite(\n ctx.baselineArtifacts,\n ctx.baselineJudgeScores ?? 
ctx.judgeScores,\n options.holdoutScenarios,\n )\n const candidateComposite = meanComposite(\n ctx.candidateArtifacts,\n ctx.judgeScores,\n options.holdoutScenarios,\n )\n const delta = candidateComposite - baselineComposite\n const heldoutPass = delta >= deltaThreshold\n contributing.push({\n name: 'heldout-delta',\n passed: heldoutPass,\n detail: { baselineComposite, candidateComposite, delta, deltaThreshold },\n })\n if (!heldoutPass) {\n reasons.push(`heldout delta ${delta.toFixed(3)} < threshold ${deltaThreshold}`)\n }\n\n // ── (2) budget gate ─────────────────────────────────────────────\n const budgetPass =\n options.budgetUsd === undefined ||\n ctx.cost.candidate + ctx.cost.baseline <= options.budgetUsd\n contributing.push({\n name: 'budget',\n passed: budgetPass,\n detail: {\n candidateUsd: ctx.cost.candidate,\n baselineUsd: ctx.cost.baseline,\n budgetUsd: options.budgetUsd,\n },\n })\n if (!budgetPass) {\n reasons.push(\n `spend ${(ctx.cost.candidate + ctx.cost.baseline).toFixed(2)} > budget ${options.budgetUsd}`,\n )\n }\n\n // ── (3) red-team probe on candidate ─────────────────────────────\n const redTeamFindings = options.redTeamBattery\n ? probeRedTeam(ctx.candidateArtifacts, options.redTeamBattery)\n : { passed: true, findings: [] }\n contributing.push({\n name: 'red-team',\n passed: redTeamFindings.passed,\n detail: {\n failures: redTeamFindings.findings.length,\n sample: redTeamFindings.findings.slice(0, 3),\n },\n })\n if (!redTeamFindings.passed) {\n reasons.push(`red-team probe failed (${redTeamFindings.findings.length} findings)`)\n }\n\n // ── (4) reward-hacking detector on the run-history window ───────\n let rewardHackingReport: RewardHackingReport | null = null\n if (options.recentRuns && options.recentRuns.length >= 10) {\n rewardHackingReport = detectRewardHacking({ runs: options.recentRuns })\n }\n // reward-hacking severity is numeric (0..1). \"gaming\" threshold per\n // detectRewardHacking defaults = 0.6. 
Block when ANY finding is at\n // gaming threshold OR the report verdict is 'gaming'.\n const gamingThreshold = 0.6\n const gamingFindings = (rewardHackingReport?.findings ?? []).filter(\n (f) => f.severity >= gamingThreshold,\n )\n const rewardHackingPass =\n !rewardHackingReport ||\n !blockOnGaming ||\n (gamingFindings.length === 0 && rewardHackingReport.verdict !== 'gaming')\n contributing.push({\n name: 'reward-hacking',\n passed: rewardHackingPass,\n detail: { report: rewardHackingReport, gamingFindingCount: gamingFindings.length },\n })\n if (!rewardHackingPass) {\n reasons.push(\n `reward-hacking detector flagged ${gamingFindings.length} gaming-severity findings (verdict=${rewardHackingReport!.verdict})`,\n )\n }\n\n // ── (5) canary check on runs ────────────────────────────────────\n let canaryReport: CanaryReport | null = null\n if (options.recentRuns && options.recentRuns.length >= 10) {\n canaryReport = runCanaries(options.recentRuns, {})\n }\n // CanarySeverity is 'info' | 'warn' | 'error' — block on 'error'.\n const errorAlerts = (canaryReport?.alerts ?? []).filter((a) => a.severity === 'error')\n const canaryPass = errorAlerts.length === 0\n contributing.push({\n name: 'canary',\n passed: canaryPass,\n detail: { totalAlerts: canaryReport?.alerts.length ?? 0, errorAlerts: errorAlerts.length },\n })\n if (!canaryPass) {\n reasons.push(`canary error alerts: ${errorAlerts.length}`)\n }\n\n // ── Verdict ─────────────────────────────────────────────────────\n const allPassed = contributing.every((c) => c.passed)\n const decision = allPassed ? 'ship' : 'hold'\n\n return {\n decision,\n reasons: reasons.length > 0 ? 
reasons : ['all gates passed'],\n contributingGates: contributing,\n delta,\n }\n },\n }\n}\n\nfunction meanComposite<TArtifact, TScenario extends Scenario>(\n artifacts: Map<string, TArtifact> | undefined,\n judgeScoresByCell: Map<string, Record<string, { composite: number }>>,\n scenarios: TScenario[],\n): number {\n if (!artifacts || artifacts.size === 0) return 0\n const scenarioIds = new Set(scenarios.map((s) => s.id))\n const composites: number[] = []\n for (const [cellId, scores] of judgeScoresByCell) {\n const scenarioId = cellId.split(':')[0] ?? ''\n if (!scenarioIds.has(scenarioId)) continue\n const cellComposites = Object.values(scores).map((s) => s.composite)\n if (cellComposites.length === 0) continue\n composites.push(cellComposites.reduce((a, b) => a + b, 0) / cellComposites.length)\n }\n if (composites.length === 0) return 0\n return composites.reduce((a, b) => a + b, 0) / composites.length\n}\n\nfunction probeRedTeam<TArtifact>(\n artifacts: Map<string, TArtifact>,\n battery: RedTeamCase[],\n): { passed: boolean; findings: Array<{ scenarioId: string; reason: string }> } {\n const findings: Array<{ scenarioId: string; reason: string }> = []\n for (const [_cellId, artifact] of artifacts) {\n const text = extractText(artifact)\n if (text === undefined) continue\n for (const rtCase of battery) {\n const finding = scoreRedTeamOutput(text, [], rtCase)\n if (!finding.passed) {\n findings.push({ scenarioId: rtCase.id, reason: finding.reason ?? 
'red-team probe failed' })\n }\n }\n }\n return { passed: findings.length === 0, findings }\n}\n\nfunction extractText(artifact: unknown): string | undefined {\n if (typeof artifact === 'string') return artifact\n if (artifact && typeof artifact === 'object') {\n const rec = artifact as Record<string, unknown>\n if (typeof rec.text === 'string') return rec.text\n if (typeof rec.output === 'string') return rec.output\n if (typeof rec.content === 'string') return rec.content\n }\n return undefined\n}\n","/**\n * @experimental\n *\n * `runEval` — the simplest preset over `runCampaign`. No optimizer, no\n * gate, no auto-PR. Just: run scenarios through dispatch, score with\n * judges, return CampaignResult.\n *\n * The 80% case for consumers who want a scorecard, not an improvement loop.\n */\n\nimport { type RunCampaignOptions, runCampaign } from '../run-campaign'\nimport type { CampaignResult, Scenario } from '../types'\n\nexport interface RunEvalOptions<TScenario extends Scenario, TArtifact>\n extends Omit<RunCampaignOptions<TScenario, TArtifact>, 'runDir'> {\n runDir: string\n}\n\nexport async function runEval<TScenario extends Scenario, TArtifact>(\n opts: RunEvalOptions<TScenario, TArtifact>,\n): Promise<CampaignResult<TArtifact, TScenario>> {\n return runCampaign(opts)\n}\n","/**\n * @experimental\n *\n * Loop provenance — the durable, queryable record of WHAT a self-improvement\n * loop did and WHY, plus the OTel spans that let an OTLP collector pivot from\n * an eval-run to the underlying candidate→cell→gate→promote chain.\n *\n * Two artifacts, one source of truth:\n *\n * 1. `LoopProvenanceRecord` — a structured JSON record capturing every\n * candidate (surfaceHash + label + rationale), its measured composite,\n * the gate decision + reasons + delta, the held-out lift, the explicit\n * baseline→candidate diff, and BACKEND PROVENANCE (the\n * `assertRealBackend` verdict + worker call count + model). 
This is the\n * ingestable audit artifact: the +lift recomputes from it, the \"because\n * Z\" rationale survives in it, and a stub backend is detectable from it.\n *\n * 2. `loopProvenanceSpans()` — the same chain emitted as OTLP-ingestable\n * `TraceSpanEvent`s, pivoted on the substrate's standard\n * `tangle.runId` / `tangle.scenarioId` / `tangle.cellId` /\n * `tangle.generation` attributes (the same pivots `/adapters/otel`\n * reads). The hosted `/v1/ingest/traces` endpoint receives the FULL loop,\n * not just the `cost.*` spans `runCampaign` already emits per cell.\n *\n * The record is built from the substrate's own loop result + the per-call\n * `RunRecord`s the worker emitted — no new measurement, no recomputation that\n * could drift from what the gate actually saw.\n */\n\nimport { createHash } from 'node:crypto'\nimport { join } from 'node:path'\nimport type { HostedClient } from '../hosted/client'\nimport type {\n EvalRunCellScore,\n EvalRunEvent,\n EvalRunGenerationSnapshot,\n TraceSpanEvent,\n} from '../hosted/types'\nimport { summarizeBackendIntegrity } from '../integrity/backend-integrity'\nimport type { RunRecord } from '../run-record'\nimport type { CampaignStorage } from './storage'\nimport type { CampaignResult, GateDecision, GateResult, MutableSurface, Scenario } from './types'\n\n/** Stable sha256 (full hex) of a surface's effective text. Code surfaces hash\n * their worktree+base identity since the content lives in git. Distinct from\n * `surfaceHash` (16-char content fingerprint used as a loop identity key);\n * this is the byte-identical-verifiable content hash the provenance record +\n * `RunRecord.promptHash` carry. */\nexport function surfaceContentHash(surface: MutableSurface): string {\n const material =\n typeof surface === 'string'\n ? surface\n : JSON.stringify({\n kind: surface.kind,\n worktreeRef: surface.worktreeRef,\n baseRef: surface.baseRef ?? 
null,\n })\n return `sha256:${createHash('sha256').update(material).digest('hex')}`\n}\n\nexport interface LoopProvenanceCandidate {\n /** Generation index this candidate was proposed in. */\n generation: number\n /** 16-char loop-identity fingerprint (matches `GenerationCandidate.surfaceHash`). */\n surfaceHash: string\n /** Full sha256 content hash — byte-identical-verifiable. */\n contentHash: string\n /** Driver label, when the driver returned a `ProposedCandidate`. */\n label?: string\n /** Driver rationale — the \"because Z\". When the driver returned a bare\n * surface (blind mutator) this is absent. */\n rationale?: string\n /** Mean composite this candidate scored on the search split. */\n composite: number\n /** Whether this candidate was promoted out of its generation. */\n promoted: boolean\n}\n\nexport interface LoopProvenanceBackend {\n /** `assertRealBackend`-grade verdict over the worker call records. */\n verdict: 'real' | 'mixed' | 'stub'\n /** Number of worker LLM calls captured (the audit's \"worker call count\"). */\n workerCallCount: number\n /** Distinct model ids observed across worker calls. */\n models: string[]\n totalInputTokens: number\n totalOutputTokens: number\n totalCostUsd: number\n}\n\n/**\n * The durable provenance record. Aligns to the hosted `EvalRunEvent` path but\n * ADDS the rationale + the explicit baseline→candidate diff (both omitted from\n * the bare hosted event) + backend provenance.\n */\nexport interface LoopProvenanceRecord {\n schema: 'tangle.loop-provenance.v1'\n runId: string\n runDir: string\n timestamp: string\n /** Baseline + winner surface content hashes — distinguishable, byte-verifiable. */\n baselineContentHash: string\n winnerContentHash: string\n /** Driver label/rationale for the promoted change. Absent ⇒ winner == baseline. */\n winnerLabel?: string\n winnerRationale?: string\n /** The explicit baseline→winner unified diff the gate decided on. 
*/\n diff: string\n /** Every candidate across every generation, each carrying its rationale. */\n candidates: LoopProvenanceCandidate[]\n /** The gate verdict — decision + reasons + contributing gates + delta. */\n gate: {\n decision: GateDecision\n reasons: string[]\n delta?: number\n contributingGates: Array<{ name: string; passed: boolean }>\n }\n /** baseline-on-holdout composite mean. */\n baselineHoldoutComposite: number\n /** winner-on-holdout composite mean. */\n winnerHoldoutComposite: number\n /** winnerHoldout - baselineHoldout — RECOMPUTABLE from this record. */\n heldOutLift: number\n /** Backend provenance: stub-vs-real verdict + worker call count + models. */\n backend: LoopProvenanceBackend\n totalCostUsd: number\n totalDurationMs: number\n}\n\nexport interface BuildLoopProvenanceArgs<TArtifact, TScenario extends Scenario> {\n runId: string\n runDir: string\n timestamp: string\n baselineSurface: MutableSurface\n winnerSurface: MutableSurface\n winnerLabel?: string\n winnerRationale?: string\n diff: string\n /** Per-generation candidate records straight off the loop result. */\n generations: Array<{\n generationIndex: number\n candidates: Array<{\n surfaceHash: string\n composite: number\n label?: string\n rationale?: string\n }>\n promoted: string[]\n /** Surfaces measured this generation, keyed positionally to candidates so\n * the content hash can be computed from the real surface text. */\n surfaces: Array<{ surfaceHash: string; surface: MutableSurface }>\n }>\n gate: GateResult\n baselineOnHoldout: CampaignResult<TArtifact, TScenario>\n winnerOnHoldout: CampaignResult<TArtifact, TScenario>\n /** Worker call records — the source for backend provenance. 
*/\n workerRecords: ReadonlyArray<RunRecord>\n totalCostUsd: number\n totalDurationMs: number\n}\n\nfunction meanHoldoutComposite<TArtifact, TScenario extends Scenario>(\n campaign: CampaignResult<TArtifact, TScenario>,\n): number {\n const xs: number[] = []\n for (const cell of campaign.cells) {\n if (cell.error) continue\n const cs = Object.values(cell.judgeScores).map((s) => s.composite)\n if (cs.length) xs.push(cs.reduce((a, b) => a + b, 0) / cs.length)\n }\n return xs.length ? xs.reduce((a, b) => a + b, 0) / xs.length : 0\n}\n\n/** Build the durable provenance record from a completed loop result. */\nexport function buildLoopProvenanceRecord<TArtifact, TScenario extends Scenario>(\n args: BuildLoopProvenanceArgs<TArtifact, TScenario>,\n): LoopProvenanceRecord {\n const integrity = summarizeBackendIntegrity(args.workerRecords)\n const models = [...new Set(args.workerRecords.map((r) => r.model))].sort()\n\n const candidates: LoopProvenanceCandidate[] = []\n for (const gen of args.generations) {\n const promotedSet = new Set(gen.promoted)\n const surfaceByHash = new Map(gen.surfaces.map((s) => [s.surfaceHash, s.surface]))\n for (const c of gen.candidates) {\n const surface = surfaceByHash.get(c.surfaceHash)\n const entry: LoopProvenanceCandidate = {\n generation: gen.generationIndex,\n surfaceHash: c.surfaceHash,\n contentHash:\n surface !== undefined ? 
surfaceContentHash(surface) : `sha256:${c.surfaceHash}`,\n composite: c.composite,\n promoted: promotedSet.has(c.surfaceHash),\n }\n if (c.label) entry.label = c.label\n if (c.rationale) entry.rationale = c.rationale\n candidates.push(entry)\n }\n }\n\n const baselineHoldoutComposite = meanHoldoutComposite(args.baselineOnHoldout)\n const winnerHoldoutComposite = meanHoldoutComposite(args.winnerOnHoldout)\n\n const record: LoopProvenanceRecord = {\n schema: 'tangle.loop-provenance.v1',\n runId: args.runId,\n runDir: args.runDir,\n timestamp: args.timestamp,\n baselineContentHash: surfaceContentHash(args.baselineSurface),\n winnerContentHash: surfaceContentHash(args.winnerSurface),\n diff: args.diff,\n candidates,\n gate: {\n decision: args.gate.decision,\n reasons: args.gate.reasons,\n delta: args.gate.delta,\n contributingGates: args.gate.contributingGates.map((g) => ({\n name: g.name,\n passed: g.passed,\n })),\n },\n baselineHoldoutComposite,\n winnerHoldoutComposite,\n heldOutLift: winnerHoldoutComposite - baselineHoldoutComposite,\n backend: {\n verdict: integrity.verdict,\n workerCallCount: integrity.totalRecords,\n models,\n totalInputTokens: integrity.totalInputTokens,\n totalOutputTokens: integrity.totalOutputTokens,\n totalCostUsd: integrity.totalCostUsd,\n },\n totalCostUsd: args.totalCostUsd,\n totalDurationMs: args.totalDurationMs,\n }\n if (args.winnerLabel) record.winnerLabel = args.winnerLabel\n if (args.winnerRationale) record.winnerRationale = args.winnerRationale\n return record\n}\n\n// ── OTel span emission ──────────────────────────────────────────────────\n\nconst DECISION_OK: GateDecision[] = ['ship']\n\nfunction hashId(parts: string[]): string {\n return createHash('sha256').update(parts.join(':')).digest('hex')\n}\n\nfunction gateStatus(decision: GateDecision): { code: 'OK' | 'ERROR' | 'UNSET'; message?: string } {\n return DECISION_OK.includes(decision)\n ? 
{ code: 'OK' }\n : { code: 'ERROR', message: `gate decision: ${decision}` }\n}\n\n/**\n * Build the loop's OTLP-ingestable spans from a provenance record. One root\n * span per loop (`tangle.runId`), one span per generation, one span per\n * candidate (carrying its surfaceHash + label), and one span for the gate\n * decision (carrying reasons + delta + lift). Candidate + gate spans pivot on\n * the same `tangle.runId` / `tangle.generation` attributes `/adapters/otel`\n * reads, so the hosted collector reconstructs the full tree.\n *\n * Times are synthesized monotonically off a single base so the span tree is\n * orderable; the substrate does not retain per-candidate wall-clock starts.\n */\nexport function loopProvenanceSpans(\n record: LoopProvenanceRecord,\n opts: { baseTimeMs?: number } = {},\n): TraceSpanEvent[] {\n const traceId = hashId(['trace', record.runId]).slice(0, 32)\n const baseNano = (opts.baseTimeMs ?? (Date.parse(record.timestamp) || Date.now())) * 1_000_000\n const endNano = baseNano + Math.max(1, record.totalDurationMs) * 1_000_000\n const spans: TraceSpanEvent[] = []\n\n const rootSpanId = hashId(['root', record.runId]).slice(0, 16)\n spans.push({\n traceId,\n spanId: rootSpanId,\n name: 'improvement-loop',\n startTimeUnixNano: baseNano,\n endTimeUnixNano: endNano,\n attributes: {\n 'tangle.runId': record.runId,\n 'tangle.runDir': record.runDir,\n 'tangle.baselineContentHash': record.baselineContentHash,\n 'tangle.winnerContentHash': record.winnerContentHash,\n 'tangle.heldOutLift': record.heldOutLift,\n 'tangle.gateDecision': record.gate.decision,\n 'tangle.backendVerdict': record.backend.verdict,\n 'tangle.workerCallCount': record.backend.workerCallCount,\n 'tangle.totalCostUsd': record.totalCostUsd,\n },\n status: gateStatus(record.gate.decision),\n 'tangle.runId': record.runId,\n })\n\n // Group candidates by generation for the per-generation parent span.\n const byGen = new Map<number, LoopProvenanceCandidate[]>()\n for (const c of 
record.candidates) {\n const arr = byGen.get(c.generation) ?? []\n arr.push(c)\n byGen.set(c.generation, arr)\n }\n for (const [generation, cands] of [...byGen.entries()].sort((a, b) => a[0] - b[0])) {\n const genSpanId = hashId(['gen', record.runId, String(generation)]).slice(0, 16)\n const bestComposite = cands.reduce((m, c) => Math.max(m, c.composite), 0)\n spans.push({\n traceId,\n spanId: genSpanId,\n parentSpanId: rootSpanId,\n name: `generation-${generation}`,\n startTimeUnixNano: baseNano,\n endTimeUnixNano: endNano,\n attributes: {\n 'tangle.runId': record.runId,\n 'tangle.generation': generation,\n 'tangle.populationSize': cands.length,\n 'tangle.bestComposite': bestComposite,\n },\n 'tangle.runId': record.runId,\n 'tangle.generation': generation,\n })\n for (let i = 0; i < cands.length; i++) {\n const c = cands[i]!\n const candSpanId = hashId(['cand', record.runId, String(generation), c.surfaceHash]).slice(\n 0,\n 16,\n )\n const attributes: TraceSpanEvent['attributes'] = {\n 'tangle.runId': record.runId,\n 'tangle.generation': generation,\n 'tangle.surfaceHash': c.surfaceHash,\n 'tangle.contentHash': c.contentHash,\n 'tangle.composite': c.composite,\n 'tangle.promoted': c.promoted,\n }\n if (c.label) attributes['tangle.candidateLabel'] = c.label\n if (c.rationale) attributes['tangle.candidateRationale'] = c.rationale\n spans.push({\n traceId,\n spanId: candSpanId,\n parentSpanId: genSpanId,\n name: `candidate-${c.surfaceHash}`,\n startTimeUnixNano: baseNano,\n endTimeUnixNano: endNano,\n attributes,\n 'tangle.runId': record.runId,\n 'tangle.generation': generation,\n })\n }\n }\n\n // Gate span — child of root, carries the decision/reasons/delta the audit\n // needs and pivots back to the run.\n const gateSpanId = hashId(['gate', record.runId]).slice(0, 16)\n spans.push({\n traceId,\n spanId: gateSpanId,\n parentSpanId: rootSpanId,\n name: 'gate-decision',\n startTimeUnixNano: endNano,\n endTimeUnixNano: endNano,\n attributes: {\n 'tangle.runId': 
record.runId,\n 'tangle.gateDecision': record.gate.decision,\n 'tangle.gateDelta': record.gate.delta ?? record.heldOutLift,\n 'tangle.gateReasons': JSON.stringify(record.gate.reasons),\n 'tangle.heldOutLift': record.heldOutLift,\n 'tangle.baselineHoldoutComposite': record.baselineHoldoutComposite,\n 'tangle.winnerHoldoutComposite': record.winnerHoldoutComposite,\n },\n status: gateStatus(record.gate.decision),\n 'tangle.runId': record.runId,\n })\n\n return spans\n}\n\n// ── Durable emission ─────────────────────────────────────────────────────\n\n/** Canonical durable paths under the run dir. */\nexport function provenanceRecordPath(runDir: string): string {\n return join(runDir, 'loop-provenance.json')\n}\nexport function provenanceSpansPath(runDir: string): string {\n return join(runDir, 'loop-provenance-spans.jsonl')\n}\n\nexport interface EmitLoopProvenanceResult {\n record: LoopProvenanceRecord\n spans: TraceSpanEvent[]\n /** Absolute paths the record + spans were written to, when storage persists. */\n recordPath: string\n spansPath: string\n}\n\nexport interface EmitLoopProvenanceArgs<TArtifact, TScenario extends Scenario>\n extends BuildLoopProvenanceArgs<TArtifact, TScenario> {\n /** Storage the record + spans are written through. */\n storage: CampaignStorage\n /** When set, the spans are also shipped to the hosted `/v1/ingest/traces`\n * endpoint so the collector receives the full loop, not just `cost.*`. */\n hostedClient?: HostedClient\n}\n\n/** Snapshot a held-out campaign into the hosted `EvalRunGenerationSnapshot`\n * shape — per-cell composite + per-judge dimensions, aggregate mean, cost,\n * duration. The dashboard renders these as the baseline → winner comparison. 
*/\nfunction snapshotFromHoldout<TArtifact, TScenario extends Scenario>(\n index: number,\n surfaceHash: string,\n surface: MutableSurface,\n campaign: CampaignResult<TArtifact, TScenario>,\n): EvalRunGenerationSnapshot {\n const cells: EvalRunCellScore[] = campaign.cells.map((cell) => {\n const judgeScores = Object.values(cell.judgeScores)\n const composite =\n judgeScores.length === 0\n ? 0\n : judgeScores.reduce((s, j) => s + j.composite, 0) / judgeScores.length\n const score: EvalRunCellScore = {\n scenarioId: cell.scenarioId,\n rep: cell.rep,\n compositeMean: composite,\n dimensions: Object.fromEntries(\n Object.entries(cell.judgeScores).map(([name, s]) => [name, s.dimensions]),\n ),\n }\n if (cell.error) score.errorMessage = cell.error\n return score\n })\n const compositeMean =\n cells.length === 0 ? 0 : cells.reduce((s, c) => s + c.compositeMean, 0) / cells.length\n return {\n index,\n surfaceHash,\n surface,\n cells,\n compositeMean,\n costUsd: campaign.aggregates.totalCostUsd,\n durationMs: campaign.durationMs,\n }\n}\n\n/** Build the hosted `EvalRunEvent` from the loop args + record — baseline +\n * winner snapshots, gate decision, held-out lift, cost, duration. Shipped to\n * `/v1/ingest/eval-runs` so the run appears in the dashboard's run list (the\n * trace spans, shipped separately, back the per-candidate drill-down). 
*/\nfunction buildEvalRunEvent<TArtifact, TScenario extends Scenario>(\n args: EmitLoopProvenanceArgs<TArtifact, TScenario>,\n record: LoopProvenanceRecord,\n): EvalRunEvent {\n return {\n runId: args.runId,\n runDir: args.runDir,\n timestamp: args.timestamp,\n status: 'finished',\n labels: {},\n baseline: snapshotFromHoldout(\n 0,\n record.baselineContentHash,\n args.baselineSurface,\n args.baselineOnHoldout,\n ),\n generations: [\n snapshotFromHoldout(1, record.winnerContentHash, args.winnerSurface, args.winnerOnHoldout),\n ],\n gateDecision: args.gate.decision,\n holdoutLift: record.heldOutLift,\n totalCostUsd: args.totalCostUsd,\n totalDurationMs: args.totalDurationMs,\n }\n}\n\n/**\n * Build the provenance record + OTel spans and persist them durably under the\n * run dir (and ship spans to a hosted collector when one is wired). Returns\n * both artifacts so the caller can assert on / re-derive from them.\n *\n * Fail-loud: the durable write throws on storage failure (a swallowed write is\n * exactly the \"emitted but lost\" failure this closes). 
The hosted span ship is\n * the one best-effort leg — its failure is logged, not thrown, so an offline\n * collector never fails the loop (the durable artifact is the source of truth).\n */\nexport async function emitLoopProvenance<TArtifact, TScenario extends Scenario>(\n args: EmitLoopProvenanceArgs<TArtifact, TScenario>,\n): Promise<EmitLoopProvenanceResult> {\n const record = buildLoopProvenanceRecord(args)\n const spans = loopProvenanceSpans(record)\n\n args.storage.ensureDir(args.runDir)\n const recordPath = provenanceRecordPath(args.runDir)\n const spansPath = provenanceSpansPath(args.runDir)\n args.storage.write(recordPath, JSON.stringify(record, null, 2))\n args.storage.write(spansPath, spans.map((s) => JSON.stringify(s)).join('\\n'))\n\n if (args.hostedClient) {\n // Ship BOTH streams so the run is fully visible in the dashboard: the\n // eval-run event (→ run list + baseline/winner/gate/lift) AND the trace\n // spans (→ per-candidate drill-down). Best-effort: an offline collector is\n // logged, never thrown — the durable artifact above is the source of truth.\n try {\n await args.hostedClient.ingestEvalRun(buildEvalRunEvent(args, record))\n } catch (err) {\n const msg = err instanceof Error ? err.message : String(err)\n // eslint-disable-next-line no-console -- intentional: hosted ingest is best-effort\n console.warn(`[agent-eval] hosted eval-run ingest failed (continuing): ${msg}`)\n }\n try {\n await args.hostedClient.ingestTraces(spans)\n } catch (err) {\n const msg = err instanceof Error ? 
err.message : String(err)\n // eslint-disable-next-line no-console -- intentional: hosted span ship is best-effort\n console.warn(`[agent-eval] provenance span ingest failed (continuing): ${msg}`)\n }\n }\n\n return { record, spans, recordPath, spansPath }\n}\n"],"mappings":";;;;;;;;;;;;;AAwBO,SAAS,mBACd,MAC8B;AAC9B,SAAO;AAAA,IACL,MAAM,gBAAgB,KAAK,QAAQ,IAAI;AAAA,IACvC,MAAM,QAAQ,EAAE,gBAAgB,UAAU,gBAAgB,OAAO,GAAG;AAClE,aAAO,KAAK,QAAQ,OAAO;AAAA,QACzB,UAAU,SAAS,SAAS,IAAI,WAAY,KAAK,YAAY,CAAC;AAAA,QAC9D;AAAA,QACA;AAAA,QACA;AAAA,MACF,CAAC;AAAA,IACH;AAAA,EACF;AACF;;;ACxBO,SAAS,eACX,OACyB;AAC5B,MAAI,MAAM,WAAW,GAAG;AACtB,UAAM,IAAI,MAAM,wCAAwC;AAAA,EAC1D;AACA,SAAO;AAAA,IACL,MAAM,YAAY,MAAM,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,KAAK,GAAG,CAAC;AAAA,IACpD,MAAM,OAAO,KAA6D;AACxE,YAAM,UAAwE,CAAC;AAC/E,iBAAW,QAAQ,OAAO;AACxB,cAAM,MAAM,MAAM,KAAK,OAAO,GAAG;AACjC,gBAAQ,KAAK,EAAE,MAAM,IAAI,CAAC;AAAA,MAC5B;AAQA,YAAM,YAAY,QAAQ,IAAI,CAAC,MAAM,EAAE,IAAI,QAAQ;AACnD,YAAM,UAAwB,UAAU,MAAM,CAAC,MAAM,MAAM,MAAM,IAC7D,SACA,UAAU,SAAS,cAAc,IAC/B,iBACA,UAAU,SAAS,eAAe,IAChC,kBACA,UAAU,SAAS,MAAM,IACvB,SACA;AAEV,YAAM,eAAe,QAAQ;AAAA,QAAQ,CAAC,MACpC,EAAE,IAAI,kBAAkB,SAAS,IAC7B,EAAE,IAAI,oBACN,CAAC,EAAE,MAAM,EAAE,KAAK,MAAM,QAAQ,EAAE,IAAI,aAAa,QAAQ,QAAQ,EAAE,IAAI,CAAC;AAAA,MAC9E;AAEA,YAAM,UAAU,QAAQ;AAAA,QAAQ,CAAC,MAC/B,EAAE,IAAI,QAAQ,IAAI,CAAC,WAAW,IAAI,EAAE,KAAK,IAAI,KAAK,MAAM,EAAE;AAAA,MAC5D;AAEA,aAAO;AAAA,QACL,UAAU;AAAA,QACV;AAAA,QACA,mBAAmB;AAAA,QACnB,OAAO,QAAQ,CAAC,GAAG,IAAI;AAAA,MACzB;AAAA,IACF;AAAA,EACF;AACF;;;ACpBO,SAAS,sBACd,SAC4B;AAC5B,QAAM,iBAAiB,QAAQ,kBAAkB;AACjD,QAAM,gBAAgB,QAAQ,8BAA8B;AAE5D,SAAO;AAAA,IACL,MAAM;AAAA,IACN,MAAM,OAAO,KAA6D;AACxE,YAAM,UAAoB,CAAC;AAC3B,YAAM,eAA0E,CAAC;AAKjF,YAAM,oBAAoB;AAAA,QACxB,IAAI;AAAA,QACJ,IAAI,uBAAuB,IAAI;AAAA,QAC/B,QAAQ;AAAA,MACV;AACA,YAAM,qBAAqB;AAAA,QACzB,IAAI;AAAA,QACJ,IAAI;AAAA,QACJ,QAAQ;AAAA,MACV;AACA,YAAM,QAAQ,qBAAqB;AACnC,YAAM,cAAc,SAAS;AAC7B,mBAAa,KAAK;AAAA,QAChB,MAAM;AAAA,QACN,QAAQ;AAAA,QACR,QAAQ,EAAE,mBAAmB,oBAAoB,OAAO,eAAe;AAAA,MACzE,CAAC;AACD,UAAI,CAAC,aAAa;A
AChB,gBAAQ,KAAK,iBAAiB,MAAM,QAAQ,CAAC,CAAC,gBAAgB,cAAc,EAAE;AAAA,MAChF;AAGA,YAAM,aACJ,QAAQ,cAAc,UACtB,IAAI,KAAK,YAAY,IAAI,KAAK,YAAY,QAAQ;AACpD,mBAAa,KAAK;AAAA,QAChB,MAAM;AAAA,QACN,QAAQ;AAAA,QACR,QAAQ;AAAA,UACN,cAAc,IAAI,KAAK;AAAA,UACvB,aAAa,IAAI,KAAK;AAAA,UACtB,WAAW,QAAQ;AAAA,QACrB;AAAA,MACF,CAAC;AACD,UAAI,CAAC,YAAY;AACf,gBAAQ;AAAA,UACN,UAAU,IAAI,KAAK,YAAY,IAAI,KAAK,UAAU,QAAQ,CAAC,CAAC,aAAa,QAAQ,SAAS;AAAA,QAC5F;AAAA,MACF;AAGA,YAAM,kBAAkB,QAAQ,iBAC5B,aAAa,IAAI,oBAAoB,QAAQ,cAAc,IAC3D,EAAE,QAAQ,MAAM,UAAU,CAAC,EAAE;AACjC,mBAAa,KAAK;AAAA,QAChB,MAAM;AAAA,QACN,QAAQ,gBAAgB;AAAA,QACxB,QAAQ;AAAA,UACN,UAAU,gBAAgB,SAAS;AAAA,UACnC,QAAQ,gBAAgB,SAAS,MAAM,GAAG,CAAC;AAAA,QAC7C;AAAA,MACF,CAAC;AACD,UAAI,CAAC,gBAAgB,QAAQ;AAC3B,gBAAQ,KAAK,0BAA0B,gBAAgB,SAAS,MAAM,YAAY;AAAA,MACpF;AAGA,UAAI,sBAAkD;AACtD,UAAI,QAAQ,cAAc,QAAQ,WAAW,UAAU,IAAI;AACzD,8BAAsB,oBAAoB,EAAE,MAAM,QAAQ,WAAW,CAAC;AAAA,MACxE;AAIA,YAAM,kBAAkB;AACxB,YAAM,kBAAkB,qBAAqB,YAAY,CAAC,GAAG;AAAA,QAC3D,CAAC,MAAM,EAAE,YAAY;AAAA,MACvB;AACA,YAAM,oBACJ,CAAC,uBACD,CAAC,iBACA,eAAe,WAAW,KAAK,oBAAoB,YAAY;AAClE,mBAAa,KAAK;AAAA,QAChB,MAAM;AAAA,QACN,QAAQ;AAAA,QACR,QAAQ,EAAE,QAAQ,qBAAqB,oBAAoB,eAAe,OAAO;AAAA,MACnF,CAAC;AACD,UAAI,CAAC,mBAAmB;AACtB,gBAAQ;AAAA,UACN,mCAAmC,eAAe,MAAM,sCAAsC,oBAAqB,OAAO;AAAA,QAC5H;AAAA,MACF;AAGA,UAAI,eAAoC;AACxC,UAAI,QAAQ,cAAc,QAAQ,WAAW,UAAU,IAAI;AACzD,uBAAe,YAAY,QAAQ,YAAY,CAAC,CAAC;AAAA,MACnD;AAEA,YAAM,eAAe,cAAc,UAAU,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,aAAa,OAAO;AACrF,YAAM,aAAa,YAAY,WAAW;AAC1C,mBAAa,KAAK;AAAA,QAChB,MAAM;AAAA,QACN,QAAQ;AAAA,QACR,QAAQ,EAAE,aAAa,cAAc,OAAO,UAAU,GAAG,aAAa,YAAY,OAAO;AAAA,MAC3F,CAAC;AACD,UAAI,CAAC,YAAY;AACf,gBAAQ,KAAK,wBAAwB,YAAY,MAAM,EAAE;AAAA,MAC3D;AAGA,YAAM,YAAY,aAAa,MAAM,CAAC,MAAM,EAAE,MAAM;AACpD,YAAM,WAAW,YAAY,SAAS;AAEtC,aAAO;AAAA,QACL;AAAA,QACA,SAAS,QAAQ,SAAS,IAAI,UAAU,CAAC,kBAAkB;AAAA,QAC3D,mBAAmB;AAAA,QACnB;AAAA,MACF;AAAA,IACF;AAAA,EACF;AACF;AAEA,SAAS,cACP,WACA,mBACA,WACQ;AACR,MAAI,CAAC,aAAa,UAAU,SAAS,EAAG,QAAO;AAC/C,QAAM,cAAc,IAAI,IAAI,UAAU,IAAI,CAAC,MAAM,EAAE,EAAE,CAAC;AACtD,QAAM,aAAuB
,CAAC;AAC9B,aAAW,CAAC,QAAQ,MAAM,KAAK,mBAAmB;AAChD,UAAM,aAAa,OAAO,MAAM,GAAG,EAAE,CAAC,KAAK;AAC3C,QAAI,CAAC,YAAY,IAAI,UAAU,EAAG;AAClC,UAAM,iBAAiB,OAAO,OAAO,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AACnE,QAAI,eAAe,WAAW,EAAG;AACjC,eAAW,KAAK,eAAe,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,eAAe,MAAM;AAAA,EACnF;AACA,MAAI,WAAW,WAAW,EAAG,QAAO;AACpC,SAAO,WAAW,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,WAAW;AAC5D;AAEA,SAAS,aACP,WACA,SAC8E;AAC9E,QAAM,WAA0D,CAAC;AACjE,aAAW,CAAC,SAAS,QAAQ,KAAK,WAAW;AAC3C,UAAM,OAAO,YAAY,QAAQ;AACjC,QAAI,SAAS,OAAW;AACxB,eAAW,UAAU,SAAS;AAC5B,YAAM,UAAU,mBAAmB,MAAM,CAAC,GAAG,MAAM;AACnD,UAAI,CAAC,QAAQ,QAAQ;AACnB,iBAAS,KAAK,EAAE,YAAY,OAAO,IAAI,QAAQ,QAAQ,UAAU,wBAAwB,CAAC;AAAA,MAC5F;AAAA,IACF;AAAA,EACF;AACA,SAAO,EAAE,QAAQ,SAAS,WAAW,GAAG,SAAS;AACnD;AAEA,SAAS,YAAY,UAAuC;AAC1D,MAAI,OAAO,aAAa,SAAU,QAAO;AACzC,MAAI,YAAY,OAAO,aAAa,UAAU;AAC5C,UAAM,MAAM;AACZ,QAAI,OAAO,IAAI,SAAS,SAAU,QAAO,IAAI;AAC7C,QAAI,OAAO,IAAI,WAAW,SAAU,QAAO,IAAI;AAC/C,QAAI,OAAO,IAAI,YAAY,SAAU,QAAO,IAAI;AAAA,EAClD;AACA,SAAO;AACT;;;ACzMA,eAAsB,QACpB,MAC+C;AAC/C,SAAO,YAAY,IAAI;AACzB;;;ACOA,SAAS,kBAAkB;AAC3B,SAAS,YAAY;AAkBd,SAAS,mBAAmB,SAAiC;AAClE,QAAM,WACJ,OAAO,YAAY,WACf,UACA,KAAK,UAAU;AAAA,IACb,MAAM,QAAQ;AAAA,IACd,aAAa,QAAQ;AAAA,IACrB,SAAS,QAAQ,WAAW;AAAA,EAC9B,CAAC;AACP,SAAO,UAAU,WAAW,QAAQ,EAAE,OAAO,QAAQ,EAAE,OAAO,KAAK,CAAC;AACtE;AAuGA,SAAS,qBACP,UACQ;AACR,QAAM,KAAe,CAAC;AACtB,aAAW,QAAQ,SAAS,OAAO;AACjC,QAAI,KAAK,MAAO;AAChB,UAAM,KAAK,OAAO,OAAO,KAAK,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AACjE,QAAI,GAAG,OAAQ,IAAG,KAAK,GAAG,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,GAAG,MAAM;AAAA,EAClE;AACA,SAAO,GAAG,SAAS,GAAG,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,GAAG,SAAS;AACjE;AAGO,SAAS,0BACd,MACsB;AACtB,QAAM,YAAY,0BAA0B,KAAK,aAAa;AAC9D,QAAM,SAAS,CAAC,GAAG,IAAI,IAAI,KAAK,cAAc,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC,EAAE,KAAK;AAEzE,QAAM,aAAwC,CAAC;AAC/C,aAAW,OAAO,KAAK,aAAa;AAClC,UAAM,cAAc,IAAI,IAAI,IAAI,QAAQ;AACxC,UAAM,gBAAgB,IAAI,IAAI,IAAI,SAAS,IAAI,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,OAAO,CAAC,CAAC;AACjF,eAAW,KAAK,IAAI,YAAY;AAC9B,YAAM,UAA
U,cAAc,IAAI,EAAE,WAAW;AAC/C,YAAM,QAAiC;AAAA,QACrC,YAAY,IAAI;AAAA,QAChB,aAAa,EAAE;AAAA,QACf,aACE,YAAY,SAAY,mBAAmB,OAAO,IAAI,UAAU,EAAE,WAAW;AAAA,QAC/E,WAAW,EAAE;AAAA,QACb,UAAU,YAAY,IAAI,EAAE,WAAW;AAAA,MACzC;AACA,UAAI,EAAE,MAAO,OAAM,QAAQ,EAAE;AAC7B,UAAI,EAAE,UAAW,OAAM,YAAY,EAAE;AACrC,iBAAW,KAAK,KAAK;AAAA,IACvB;AAAA,EACF;AAEA,QAAM,2BAA2B,qBAAqB,KAAK,iBAAiB;AAC5E,QAAM,yBAAyB,qBAAqB,KAAK,eAAe;AAExE,QAAM,SAA+B;AAAA,IACnC,QAAQ;AAAA,IACR,OAAO,KAAK;AAAA,IACZ,QAAQ,KAAK;AAAA,IACb,WAAW,KAAK;AAAA,IAChB,qBAAqB,mBAAmB,KAAK,eAAe;AAAA,IAC5D,mBAAmB,mBAAmB,KAAK,aAAa;AAAA,IACxD,MAAM,KAAK;AAAA,IACX;AAAA,IACA,MAAM;AAAA,MACJ,UAAU,KAAK,KAAK;AAAA,MACpB,SAAS,KAAK,KAAK;AAAA,MACnB,OAAO,KAAK,KAAK;AAAA,MACjB,mBAAmB,KAAK,KAAK,kBAAkB,IAAI,CAAC,OAAO;AAAA,QACzD,MAAM,EAAE;AAAA,QACR,QAAQ,EAAE;AAAA,MACZ,EAAE;AAAA,IACJ;AAAA,IACA;AAAA,IACA;AAAA,IACA,aAAa,yBAAyB;AAAA,IACtC,SAAS;AAAA,MACP,SAAS,UAAU;AAAA,MACnB,iBAAiB,UAAU;AAAA,MAC3B;AAAA,MACA,kBAAkB,UAAU;AAAA,MAC5B,mBAAmB,UAAU;AAAA,MAC7B,cAAc,UAAU;AAAA,IAC1B;AAAA,IACA,cAAc,KAAK;AAAA,IACnB,iBAAiB,KAAK;AAAA,EACxB;AACA,MAAI,KAAK,YAAa,QAAO,cAAc,KAAK;AAChD,MAAI,KAAK,gBAAiB,QAAO,kBAAkB,KAAK;AACxD,SAAO;AACT;AAIA,IAAM,cAA8B,CAAC,MAAM;AAE3C,SAAS,OAAO,OAAyB;AACvC,SAAO,WAAW,QAAQ,EAAE,OAAO,MAAM,KAAK,GAAG,CAAC,EAAE,OAAO,KAAK;AAClE;AAEA,SAAS,WAAW,UAA8E;AAChG,SAAO,YAAY,SAAS,QAAQ,IAChC,EAAE,MAAM,KAAK,IACb,EAAE,MAAM,SAAS,SAAS,kBAAkB,QAAQ,GAAG;AAC7D;AAaO,SAAS,oBACd,QACA,OAAgC,CAAC,GACf;AAClB,QAAM,UAAU,OAAO,CAAC,SAAS,OAAO,KAAK,CAAC,EAAE,MAAM,GAAG,EAAE;AAC3D,QAAM,YAAY,KAAK,eAAe,KAAK,MAAM,OAAO,SAAS,KAAK,KAAK,IAAI,MAAM;AACrF,QAAM,UAAU,WAAW,KAAK,IAAI,GAAG,OAAO,eAAe,IAAI;AACjE,QAAM,QAA0B,CAAC;AAEjC,QAAM,aAAa,OAAO,CAAC,QAAQ,OAAO,KAAK,CAAC,EAAE,MAAM,GAAG,EAAE;AAC7D,QAAM,KAAK;AAAA,IACT;AAAA,IACA,QAAQ;AAAA,IACR,MAAM;AAAA,IACN,mBAAmB;AAAA,IACnB,iBAAiB;AAAA,IACjB,YAAY;AAAA,MACV,gBAAgB,OAAO;AAAA,MACvB,iBAAiB,OAAO;AAAA,MACxB,8BAA8B,OAAO;AAAA,MACrC,4BAA4B,OAAO;AAAA,MACnC,sBAAsB,OAAO;AAAA,MAC7B,uBAAuB,OAAO,KAAK;AAAA,MACnC,yBAAyB,OAAO,QAAQ;AAAA,MACxC,0BAA0B,OAAO,QAAQ;AAAA,MACzC,uBAAuB,OAAO;AA
AA,IAChC;AAAA,IACA,QAAQ,WAAW,OAAO,KAAK,QAAQ;AAAA,IACvC,gBAAgB,OAAO;AAAA,EACzB,CAAC;AAGD,QAAM,QAAQ,oBAAI,IAAuC;AACzD,aAAW,KAAK,OAAO,YAAY;AACjC,UAAM,MAAM,MAAM,IAAI,EAAE,UAAU,KAAK,CAAC;AACxC,QAAI,KAAK,CAAC;AACV,UAAM,IAAI,EAAE,YAAY,GAAG;AAAA,EAC7B;AACA,aAAW,CAAC,YAAY,KAAK,KAAK,CAAC,GAAG,MAAM,QAAQ,CAAC,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC,GAAG;AAClF,UAAM,YAAY,OAAO,CAAC,OAAO,OAAO,OAAO,OAAO,UAAU,CAAC,CAAC,EAAE,MAAM,GAAG,EAAE;AAC/E,UAAM,gBAAgB,MAAM,OAAO,CAAC,GAAG,MAAM,KAAK,IAAI,GAAG,EAAE,SAAS,GAAG,CAAC;AACxE,UAAM,KAAK;AAAA,MACT;AAAA,MACA,QAAQ;AAAA,MACR,cAAc;AAAA,MACd,MAAM,cAAc,UAAU;AAAA,MAC9B,mBAAmB;AAAA,MACnB,iBAAiB;AAAA,MACjB,YAAY;AAAA,QACV,gBAAgB,OAAO;AAAA,QACvB,qBAAqB;AAAA,QACrB,yBAAyB,MAAM;AAAA,QAC/B,wBAAwB;AAAA,MAC1B;AAAA,MACA,gBAAgB,OAAO;AAAA,MACvB,qBAAqB;AAAA,IACvB,CAAC;AACD,aAAS,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;AACrC,YAAM,IAAI,MAAM,CAAC;AACjB,YAAM,aAAa,OAAO,CAAC,QAAQ,OAAO,OAAO,OAAO,UAAU,GAAG,EAAE,WAAW,CAAC,EAAE;AAAA,QACnF;AAAA,QACA;AAAA,MACF;AACA,YAAM,aAA2C;AAAA,QAC/C,gBAAgB,OAAO;AAAA,QACvB,qBAAqB;AAAA,QACrB,sBAAsB,EAAE;AAAA,QACxB,sBAAsB,EAAE;AAAA,QACxB,oBAAoB,EAAE;AAAA,QACtB,mBAAmB,EAAE;AAAA,MACvB;AACA,UAAI,EAAE,MAAO,YAAW,uBAAuB,IAAI,EAAE;AACrD,UAAI,EAAE,UAAW,YAAW,2BAA2B,IAAI,EAAE;AAC7D,YAAM,KAAK;AAAA,QACT;AAAA,QACA,QAAQ;AAAA,QACR,cAAc;AAAA,QACd,MAAM,aAAa,EAAE,WAAW;AAAA,QAChC,mBAAmB;AAAA,QACnB,iBAAiB;AAAA,QACjB;AAAA,QACA,gBAAgB,OAAO;AAAA,QACvB,qBAAqB;AAAA,MACvB,CAAC;AAAA,IACH;AAAA,EACF;AAIA,QAAM,aAAa,OAAO,CAAC,QAAQ,OAAO,KAAK,CAAC,EAAE,MAAM,GAAG,EAAE;AAC7D,QAAM,KAAK;AAAA,IACT;AAAA,IACA,QAAQ;AAAA,IACR,cAAc;AAAA,IACd,MAAM;AAAA,IACN,mBAAmB;AAAA,IACnB,iBAAiB;AAAA,IACjB,YAAY;AAAA,MACV,gBAAgB,OAAO;AAAA,MACvB,uBAAuB,OAAO,KAAK;AAAA,MACnC,oBAAoB,OAAO,KAAK,SAAS,OAAO;AAAA,MAChD,sBAAsB,KAAK,UAAU,OAAO,KAAK,OAAO;AAAA,MACxD,sBAAsB,OAAO;AAAA,MAC7B,mCAAmC,OAAO;AAAA,MAC1C,iCAAiC,OAAO;AAAA,IAC1C;AAAA,IACA,QAAQ,WAAW,OAAO,KAAK,QAAQ;AAAA,IACvC,gBAAgB,OAAO;AAAA,EACzB,CAAC;AAED,SAAO;AACT;AAKO,SAAS,qBAAqB,QAAwB;AAC3D,SAAO,KAAK,QAAQ,sBAAsB;AAC5C;AACO,SAAS,oBAAoB,QAAwB;AAC1D,S
AAO,KAAK,QAAQ,6BAA6B;AACnD;AAsBA,SAAS,oBACP,OACA,aACA,SACA,UAC2B;AAC3B,QAAM,QAA4B,SAAS,MAAM,IAAI,CAAC,SAAS;AAC7D,UAAM,cAAc,OAAO,OAAO,KAAK,WAAW;AAClD,UAAM,YACJ,YAAY,WAAW,IACnB,IACA,YAAY,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,YAAY;AACrE,UAAM,QAA0B;AAAA,MAC9B,YAAY,KAAK;AAAA,MACjB,KAAK,KAAK;AAAA,MACV,eAAe;AAAA,MACf,YAAY,OAAO;AAAA,QACjB,OAAO,QAAQ,KAAK,WAAW,EAAE,IAAI,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,UAAU,CAAC;AAAA,MAC1E;AAAA,IACF;AACA,QAAI,KAAK,MAAO,OAAM,eAAe,KAAK;AAC1C,WAAO;AAAA,EACT,CAAC;AACD,QAAM,gBACJ,MAAM,WAAW,IAAI,IAAI,MAAM,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,eAAe,CAAC,IAAI,MAAM;AAClF,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,SAAS,SAAS,WAAW;AAAA,IAC7B,YAAY,SAAS;AAAA,EACvB;AACF;AAMA,SAAS,kBACP,MACA,QACc;AACd,SAAO;AAAA,IACL,OAAO,KAAK;AAAA,IACZ,QAAQ,KAAK;AAAA,IACb,WAAW,KAAK;AAAA,IAChB,QAAQ;AAAA,IACR,QAAQ,CAAC;AAAA,IACT,UAAU;AAAA,MACR;AAAA,MACA,OAAO;AAAA,MACP,KAAK;AAAA,MACL,KAAK;AAAA,IACP;AAAA,IACA,aAAa;AAAA,MACX,oBAAoB,GAAG,OAAO,mBAAmB,KAAK,eAAe,KAAK,eAAe;AAAA,IAC3F;AAAA,IACA,cAAc,KAAK,KAAK;AAAA,IACxB,aAAa,OAAO;AAAA,IACpB,cAAc,KAAK;AAAA,IACnB,iBAAiB,KAAK;AAAA,EACxB;AACF;AAYA,eAAsB,mBACpB,MACmC;AACnC,QAAM,SAAS,0BAA0B,IAAI;AAC7C,QAAM,QAAQ,oBAAoB,MAAM;AAExC,OAAK,QAAQ,UAAU,KAAK,MAAM;AAClC,QAAM,aAAa,qBAAqB,KAAK,MAAM;AACnD,QAAM,YAAY,oBAAoB,KAAK,MAAM;AACjD,OAAK,QAAQ,MAAM,YAAY,KAAK,UAAU,QAAQ,MAAM,CAAC,CAAC;AAC9D,OAAK,QAAQ,MAAM,WAAW,MAAM,IAAI,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,EAAE,KAAK,IAAI,CAAC;AAE5E,MAAI,KAAK,cAAc;AAKrB,QAAI;AACF,YAAM,KAAK,aAAa,cAAc,kBAAkB,MAAM,MAAM,CAAC;AAAA,IACvE,SAAS,KAAK;AACZ,YAAM,MAAM,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAE3D,cAAQ,KAAK,4DAA4D,GAAG,EAAE;AAAA,IAChF;AACA,QAAI;AACF,YAAM,KAAK,aAAa,aAAa,KAAK;AAAA,IAC5C,SAAS,KAAK;AACZ,YAAM,MAAM,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAE3D,cAAQ,KAAK,4DAA4D,GAAG,EAAE;AAAA,IAChF;AAAA,EACF;AAEA,SAAO,EAAE,QAAQ,OAAO,YAAY,UAAU;AAChD;","names":[]}
|
|
@@ -1,484 +0,0 @@
|
|
|
1
|
-
import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
|
|
2
|
-
import { a as JudgeScore } from './types-DhqpAi_z.js';
|
|
3
|
-
import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-B2kL-fSM.js';
|
|
4
|
-
import { m as GateDecision } from './summary-report-ByiOUrHj.js';
|
|
5
|
-
import { R as RunRecord, b as RunSplitTag } from './run-record-BgTFzO2r.js';
|
|
6
|
-
|
|
7
|
-
/**
|
|
8
|
-
* Release confidence gate.
|
|
9
|
-
*
|
|
10
|
-
* This is the production-facing composition layer over the lower-level
|
|
11
|
-
* primitives:
|
|
12
|
-
* - Dataset manifests prove corpus/version coverage.
|
|
13
|
-
* - RunRecord rows prove reproducible search/holdout outcomes.
|
|
14
|
-
* - Multi-shot trace evidence carries turn counts and ASI diagnostics.
|
|
15
|
-
* - HeldOutGate decisions remain the paired promotion authority.
|
|
16
|
-
*
|
|
17
|
-
* The gate is intentionally pure and conservative. Missing declared evidence
|
|
18
|
-
* fails closed instead of being treated as a neutral zero.
|
|
19
|
-
*/
|
|
20
|
-
|
|
21
|
-
/** Severity of an actionable finding attached to a run/trace. */
|
|
22
|
-
type AsiSeverity = 'info' | 'warning' | 'error' | 'critical';
|
|
23
|
-
/** Actionable side-info — a diagnosed finding the loop can act on. */
|
|
24
|
-
interface ActionableSideInfo {
|
|
25
|
-
/** Stable expectation/check id when available. */
|
|
26
|
-
expectationId?: string;
|
|
27
|
-
/** Human-readable diagnosis of what happened. */
|
|
28
|
-
message: string;
|
|
29
|
-
severity?: AsiSeverity;
|
|
30
|
-
/** Concrete trace excerpt, file path, tool call, screenshot id, etc. */
|
|
31
|
-
evidence?: string;
|
|
32
|
-
/** Prompt/tool/context surface likely responsible. */
|
|
33
|
-
responsibleSurface?: string;
|
|
34
|
-
/** Suggested fix in natural language. */
|
|
35
|
-
suggestion?: string;
|
|
36
|
-
/** Whether this expectation was satisfied. Defaults to false for ASI rows. */
|
|
37
|
-
matched?: boolean;
|
|
38
|
-
metadata?: Record<string, unknown>;
|
|
39
|
-
}
|
|
40
|
-
type ReleaseConfidenceStatus = 'pass' | 'warn' | 'fail';
|
|
41
|
-
type ReleaseConfidenceAxisName = 'corpus' | 'quality' | 'generalization' | 'diagnostics' | 'efficiency';
|
|
42
|
-
interface ReleaseTraceEvidence {
|
|
43
|
-
scenarioId: string;
|
|
44
|
-
candidateId?: string;
|
|
45
|
-
split?: RunSplitTag;
|
|
46
|
-
score?: number;
|
|
47
|
-
ok?: boolean;
|
|
48
|
-
turnCount?: number;
|
|
49
|
-
costUsd?: number;
|
|
50
|
-
durationMs?: number;
|
|
51
|
-
failureMode?: string;
|
|
52
|
-
asi?: ActionableSideInfo[];
|
|
53
|
-
metadata?: Record<string, unknown>;
|
|
54
|
-
}
|
|
55
|
-
interface ReleaseConfidenceThresholds {
|
|
56
|
-
/** Require a Dataset manifest or explicit scenarios. Default true. */
|
|
57
|
-
requireCorpus?: boolean;
|
|
58
|
-
minScenarioCount?: number;
|
|
59
|
-
minSearchRuns?: number;
|
|
60
|
-
minHoldoutRuns?: number;
|
|
61
|
-
/** Require at least one holdout scenario/run. Default true. */
|
|
62
|
-
requireHoldout?: boolean;
|
|
63
|
-
minPassRate?: number;
|
|
64
|
-
minMeanScore?: number;
|
|
65
|
-
/** Search mean may exceed holdout mean by at most this much. */
|
|
66
|
-
maxOverfitGap?: number;
|
|
67
|
-
maxMeanCostUsd?: number;
|
|
68
|
-
maxP95WallMs?: number;
|
|
69
|
-
/** Low-score/failed rows must carry ASI. Default true. */
|
|
70
|
-
requireAsiForFailures?: boolean;
|
|
71
|
-
/** Score below this is considered a failure for ASI coverage. Default 0.5. */
|
|
72
|
-
failureScoreThreshold?: number;
|
|
73
|
-
}
|
|
74
|
-
interface ReleaseConfidenceInput {
|
|
75
|
-
target: string;
|
|
76
|
-
candidateId?: string;
|
|
77
|
-
baselineId?: string;
|
|
78
|
-
dataset?: DatasetManifest;
|
|
79
|
-
scenarios?: readonly DatasetScenario[];
|
|
80
|
-
runs?: readonly RunRecord[];
|
|
81
|
-
traces?: readonly ReleaseTraceEvidence[];
|
|
82
|
-
gateDecision?: GateDecision | null;
|
|
83
|
-
thresholds?: ReleaseConfidenceThresholds;
|
|
84
|
-
}
|
|
85
|
-
interface ReleaseConfidenceAxis {
|
|
86
|
-
name: ReleaseConfidenceAxisName;
|
|
87
|
-
status: ReleaseConfidenceStatus;
|
|
88
|
-
score: number;
|
|
89
|
-
detail: string;
|
|
90
|
-
}
|
|
91
|
-
interface ReleaseConfidenceIssue {
|
|
92
|
-
axis: ReleaseConfidenceAxisName;
|
|
93
|
-
severity: 'critical' | 'warning';
|
|
94
|
-
code: string;
|
|
95
|
-
detail: string;
|
|
96
|
-
}
|
|
97
|
-
interface ReleaseConfidenceMetrics {
|
|
98
|
-
scenarioCount: number;
|
|
99
|
-
searchRuns: number;
|
|
100
|
-
holdoutRuns: number;
|
|
101
|
-
passRate: number;
|
|
102
|
-
meanScore: number;
|
|
103
|
-
searchMeanScore: number;
|
|
104
|
-
holdoutMeanScore: number;
|
|
105
|
-
overfitGap: number;
|
|
106
|
-
meanCostUsd: number;
|
|
107
|
-
p95WallMs: number;
|
|
108
|
-
failedRows: number;
|
|
109
|
-
failuresWithAsi: number;
|
|
110
|
-
singleShotTraces: number;
|
|
111
|
-
multiShotTraces: number;
|
|
112
|
-
splitCounts: Record<DatasetSplit, number>;
|
|
113
|
-
domainCounts: Record<string, number>;
|
|
114
|
-
failureModeCounts: Record<string, number>;
|
|
115
|
-
responsibleSurfaceCounts: Record<string, number>;
|
|
116
|
-
}
|
|
117
|
-
interface ReleaseConfidenceScorecard {
|
|
118
|
-
target: string;
|
|
119
|
-
candidateId: string | null;
|
|
120
|
-
baselineId: string | null;
|
|
121
|
-
status: ReleaseConfidenceStatus;
|
|
122
|
-
promote: boolean;
|
|
123
|
-
axes: ReleaseConfidenceAxis[];
|
|
124
|
-
issues: ReleaseConfidenceIssue[];
|
|
125
|
-
metrics: ReleaseConfidenceMetrics;
|
|
126
|
-
dataset: DatasetManifest | null;
|
|
127
|
-
gateDecision: GateDecision | null;
|
|
128
|
-
summary: string;
|
|
129
|
-
}
|
|
130
|
-
declare function evaluateReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
|
|
131
|
-
declare function assertReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
|
|
132
|
-
|
|
133
|
-
/**
|
|
134
|
-
* Normalize scores so all dimensions follow "higher = better".
|
|
135
|
-
* Inverted dimensions (hallucination, false_confidence, worst_failure)
|
|
136
|
-
* already use inverted scoring in the prompt (10 = no hallucination),
|
|
137
|
-
* but this function ensures consistency if raw scores leak through.
|
|
138
|
-
*/
|
|
139
|
-
declare function normalizeScores(scores: JudgeScore[]): JudgeScore[];
|
|
140
|
-
/** Weighted mean — falls back to uniform weights when omitted */
|
|
141
|
-
declare function weightedMean(scores: {
|
|
142
|
-
score: number;
|
|
143
|
-
weight?: number;
|
|
144
|
-
}[]): number;
|
|
145
|
-
/** Bootstrap confidence interval */
|
|
146
|
-
declare function confidenceInterval(scores: number[], confidence?: number, opts?: {
|
|
147
|
-
seed?: number;
|
|
148
|
-
resamples?: number;
|
|
149
|
-
}): {
|
|
150
|
-
mean: number;
|
|
151
|
-
lower: number;
|
|
152
|
-
upper: number;
|
|
153
|
-
};
|
|
154
|
-
/**
|
|
155
|
-
* Inter-rater reliability — simplified Krippendorff's alpha.
|
|
156
|
-
*
|
|
157
|
-
* Each inner array is one judge's scores for all items.
|
|
158
|
-
* All arrays must have the same length (same items scored).
|
|
159
|
-
*/
|
|
160
|
-
declare function interRaterReliability(judgeScores: JudgeScore[][]): number;
|
|
161
|
-
/**
|
|
162
|
-
* Mann-Whitney U test for comparing two independent groups.
|
|
163
|
-
* Returns U statistic and approximate p-value (normal approximation).
|
|
164
|
-
*/
|
|
165
|
-
declare function mannWhitneyU(a: number[], b: number[]): {
|
|
166
|
-
u: number;
|
|
167
|
-
p: number;
|
|
168
|
-
};
|
|
169
|
-
/** Partial credit: returns 0-1 ratio of current toward target */
|
|
170
|
-
declare function partialCredit(current: number, target: number): number;
|
|
171
|
-
/**
|
|
172
|
-
* Paired t-test — before/after measurements on the SAME items.
|
|
173
|
-
* Pairing removes inter-item variance, giving tighter significance than
|
|
174
|
-
* an unpaired test when comparing prompt v1 vs prompt v2 on identical
|
|
175
|
-
* scenarios.
|
|
176
|
-
*/
|
|
177
|
-
declare function pairedTTest(before: number[], after: number[]): {
|
|
178
|
-
t: number;
|
|
179
|
-
df: number;
|
|
180
|
-
p: number;
|
|
181
|
-
};
|
|
182
|
-
/**
|
|
183
|
-
* Wilcoxon signed-rank test — paired non-parametric alternative.
|
|
184
|
-
* Use when the differences aren't normally distributed.
|
|
185
|
-
*/
|
|
186
|
-
declare function wilcoxonSignedRank(before: number[], after: number[]): {
|
|
187
|
-
w: number;
|
|
188
|
-
p: number;
|
|
189
|
-
};
|
|
190
|
-
/**
|
|
191
|
-
* Cohen's d — standardized effect size for two independent groups.
|
|
192
|
-
* Positive d means group b has higher mean than group a.
|
|
193
|
-
* Rule of thumb: |d| < 0.2 negligible, 0.2–0.5 small, 0.5–0.8 medium, > 0.8 large.
|
|
194
|
-
*/
|
|
195
|
-
declare function cohensD(a: number[], b: number[]): number;
|
|
196
|
-
type CliffsMagnitude = 'negligible' | 'small' | 'medium' | 'large';
|
|
197
|
-
/**
|
|
198
|
-
* Cliff's delta — a non-parametric effect size for two independent samples.
|
|
199
|
-
* `δ = (#(after > before) − #(after < before)) / (n_before · n_after)`,
|
|
200
|
-
* ranging [-1, 1]. Positive ⇒ `after` tends to exceed `before` (improvement).
|
|
201
|
-
*
|
|
202
|
-
* Distribution-free counterpart to Cohen's d: no normality assumption, robust
|
|
203
|
-
* to the bounded/skewed score distributions judges produce. Pairs with
|
|
204
|
-
* `pairedBootstrap` / `wilcoxonSignedRank` for the non-parametric reporting
|
|
205
|
-
* path. Returns 0 when either sample is empty.
|
|
206
|
-
*/
|
|
207
|
-
declare function cliffsDelta(before: number[], after: number[]): number;
|
|
208
|
-
/**
|
|
209
|
-
* Map a Cliff's delta to a qualitative magnitude using the standard
|
|
210
|
-
* Romano et al. thresholds (|δ|): <0.147 negligible, <0.33 small,
|
|
211
|
-
* <0.474 medium, else large.
|
|
212
|
-
*/
|
|
213
|
-
declare function interpretCliffs(delta: number): CliffsMagnitude;
|
|
214
|
-
interface WeightedCompositeInput {
|
|
215
|
-
/** Per-dimension scores (typically 0..1). */
|
|
216
|
-
dims: Record<string, number>;
|
|
217
|
-
/** Weight per dimension. Every weighted dimension MUST be present in
|
|
218
|
-
* `dims` — a weight for an absent dimension is a config error and throws,
|
|
219
|
-
* because silently dropping it would renormalise the composite onto a
|
|
220
|
-
* different denominator than intended. */
|
|
221
|
-
weights: Record<string, number>;
|
|
222
|
-
/** Optional pass threshold; when set, the result reports `pass`. */
|
|
223
|
-
threshold?: number;
|
|
224
|
-
}
|
|
225
|
-
interface WeightedCompositeResult {
|
|
226
|
-
composite: number;
|
|
227
|
-
pass?: boolean;
|
|
228
|
-
}
|
|
229
|
-
/**
|
|
230
|
-
* Weighted composite over judge dimensions: `Σ(score_d · w_d) / Σ(w_d)` across
|
|
231
|
-
* the weighted dimensions. The canonical replacement for the per-consumer
|
|
232
|
-
* hand-rolled composite math (tax/legal/creative/gtm each ship a copy).
|
|
233
|
-
*
|
|
234
|
-
* Fail-loud: throws if a weighted dimension is missing from `dims`, if any
|
|
235
|
-
* weight is negative, or if the weights sum to 0 — none of which can produce
|
|
236
|
-
* a meaningful composite.
|
|
237
|
-
*/
|
|
238
|
-
declare function weightedComposite(input: WeightedCompositeInput): WeightedCompositeResult;
|
|
239
|
-
interface CorpusScoreRecord {
|
|
240
|
-
/** Stable identifier for the rated item (scenario, span, turn, …). */
|
|
241
|
-
itemId: string;
|
|
242
|
-
/** Identifier for the judge that produced this score. */
|
|
243
|
-
judgeName: string;
|
|
244
|
-
/** Dimension name (matches `JudgeScore.dimension`). */
|
|
245
|
-
dimension: string;
|
|
246
|
-
/** Numeric score; must be finite. */
|
|
247
|
-
score: number;
|
|
248
|
-
}
|
|
249
|
-
interface CorpusAgreementPerDimension extends ContinuousAgreement {
|
|
250
|
-
dimension: string;
|
|
251
|
-
/** Item IDs that contributed to this dimension's matrix (every judge scored them). */
|
|
252
|
-
itemIds: string[];
|
|
253
|
-
/** Judge IDs that contributed to this dimension's matrix. */
|
|
254
|
-
judgeIds: string[];
|
|
255
|
-
}
|
|
256
|
-
interface CorpusAgreementReport {
|
|
257
|
-
/** Per-dimension ICC(2,1) + κ_w + Pearson + Spearman + bootstrap CIs. */
|
|
258
|
-
perDimension: CorpusAgreementPerDimension[];
|
|
259
|
-
/** Mean ICC across dimensions (NaN if no dimension yielded a finite ICC). */
|
|
260
|
-
overallIcc: number;
|
|
261
|
-
/** Mean weighted κ across dimensions (NaN if none finite). */
|
|
262
|
-
overallWeightedKappa: number;
|
|
263
|
-
/** Dimensions evaluated (sorted). */
|
|
264
|
-
dimensions: string[];
|
|
265
|
-
/** Judges seen across the corpus (sorted). */
|
|
266
|
-
judgeIds: string[];
|
|
267
|
-
}
|
|
268
|
-
interface CorpusAgreementOptions extends ContinuousAgreementOptions {
|
|
269
|
-
/**
|
|
270
|
-
* Restrict the audit to these dimensions. Default = every dimension
|
|
271
|
-
* that appears in the input. A dimension named here but absent from
|
|
272
|
-
* the input throws — silent omission would corrupt the overall metric.
|
|
273
|
-
*/
|
|
274
|
-
dimensions?: string[];
|
|
275
|
-
/**
|
|
276
|
-
* Restrict the audit to these judges. Default = every judge that
|
|
277
|
-
* appears in the input. A judge named here but absent from a
|
|
278
|
-
* dimension throws (see "fail loud" below).
|
|
279
|
-
*/
|
|
280
|
-
judges?: string[];
|
|
281
|
-
}
|
|
282
|
-
/**
|
|
283
|
-
* Corpus-wide inter-rater agreement across N items × M judges × D dimensions.
|
|
284
|
-
*
|
|
285
|
-
* For each dimension, builds the [n_items][n_judges] matrix of scores
|
|
286
|
-
* (keeping only items every judge rated on that dimension), then runs
|
|
287
|
-
* `continuousAgreement` to get ICC(2,1), κ_w, Pearson, Spearman, and
|
|
288
|
-
* bootstrap CIs. Reports a pooled mean across dimensions as a single
|
|
289
|
-
* "is this judge panel reliable on this corpus?" number.
|
|
290
|
-
*
|
|
291
|
-
* Fail-loud contract:
|
|
292
|
-
* - Empty input throws.
|
|
293
|
-
* - Fewer than 2 judges or fewer than 2 items per dimension throws.
|
|
294
|
-
* - A judge present in some dimensions but with zero scored items on
|
|
295
|
-
* another dimension throws (would silently shrink the matrix).
|
|
296
|
-
* - Duplicate (itemId, judgeName, dimension) records throw.
|
|
297
|
-
*/
|
|
298
|
-
declare function corpusInterRaterAgreement(records: CorpusScoreRecord[], opts?: CorpusAgreementOptions): CorpusAgreementReport;
|
|
299
|
-
/**
|
|
300
|
-
* Convenience adapter for `JudgeScore[]` data keyed externally by item.
|
|
301
|
-
*
|
|
302
|
-
* Use when you have per-item arrays of `JudgeScore[]` (e.g. one
|
|
303
|
-
* `ScenarioResult.judgeScores` per scenario) and want corpus-wide
|
|
304
|
-
* agreement without manually flattening. `itemId` must be unique per
|
|
305
|
-
* row of `itemsScores`.
|
|
306
|
-
*/
|
|
307
|
-
declare function corpusInterRaterAgreementFromJudgeScores(itemsScores: Array<{
|
|
308
|
-
itemId: string;
|
|
309
|
-
scores: JudgeScore[];
|
|
310
|
-
}>, opts?: CorpusAgreementOptions): CorpusAgreementReport;
|
|
311
|
-
/**
|
|
312
|
-
* Required N per arm for a two-sample comparison at target effect size,
|
|
313
|
-
* alpha, and power. Normal-approximation formula:
|
|
314
|
-
* n = 2 * ( (z_{1-α/2} + z_{1-β}) / d )^2
|
|
315
|
-
* where d is Cohen's d. Returns Infinity for effect ≤ 0.
|
|
316
|
-
*/
|
|
317
|
-
declare function requiredSampleSize(opts: {
|
|
318
|
-
effect: number;
|
|
319
|
-
alpha?: number;
|
|
320
|
-
power?: number;
|
|
321
|
-
twoSided?: boolean;
|
|
322
|
-
}): number;
|
|
323
|
-
/**
|
|
324
|
-
* Minimum detectable paired effect (standardised units) for a target paired
|
|
325
|
-
* sample size: d_min = (z_{1-α/2} + z_β) / sqrt(n_paired). Multiply by
|
|
326
|
-
* sd(deltas) for score units; treat as a lower bound — Wilcoxon and bootstrap
|
|
327
|
-
* have asymptotic relative efficiency below 1 vs the t-test on heavy tails.
|
|
328
|
-
*/
|
|
329
|
-
declare function pairedMde(opts: {
|
|
330
|
-
nPaired: number;
|
|
331
|
-
alpha?: number;
|
|
332
|
-
power?: number;
|
|
333
|
-
twoSided?: boolean;
|
|
334
|
-
}): number;
|
|
335
|
-
/** Bonferroni adjustment: multiply every p-value by the test count, clamp at 1. */
|
|
336
|
-
declare function bonferroni(pValues: number[], alpha?: number): {
|
|
337
|
-
adjusted: number[];
|
|
338
|
-
significant: boolean[];
|
|
339
|
-
};
|
|
340
|
-
/**
|
|
341
|
-
* Benjamini–Hochberg false discovery rate. Returns adjusted q-values and
|
|
342
|
-
* significance at the target FDR; handles ties and preserves q monotonicity.
|
|
343
|
-
*/
|
|
344
|
-
declare function benjaminiHochberg(pValues: number[], fdr?: number): {
|
|
345
|
-
qValues: number[];
|
|
346
|
-
significant: boolean[];
|
|
347
|
-
};
|
|
348
|
-
interface PairedBootstrapResult {
|
|
349
|
-
/** Number of paired observations. */
|
|
350
|
-
n: number;
|
|
351
|
-
/** Median of paired deltas (after − before). */
|
|
352
|
-
median: number;
|
|
353
|
-
/** Mean of paired deltas. */
|
|
354
|
-
mean: number;
|
|
355
|
-
/** Lower bound of the bootstrap CI on the chosen statistic. */
|
|
356
|
-
low: number;
|
|
357
|
-
/** Upper bound of the bootstrap CI on the chosen statistic. */
|
|
358
|
-
high: number;
|
|
359
|
-
/** Confidence level used (e.g. 0.95). */
|
|
360
|
-
confidence: number;
|
|
361
|
-
/** Number of bootstrap resamples used. */
|
|
362
|
-
resamples: number;
|
|
363
|
-
}
|
|
364
|
-
interface PairedBootstrapOptions {
|
|
365
|
-
/** Confidence level. Default 0.95. */
|
|
366
|
-
confidence?: number;
|
|
367
|
-
/** Bootstrap resample count. Default 2000. */
|
|
368
|
-
resamples?: number;
|
|
369
|
-
/** Statistic to bootstrap. Default 'median'. */
|
|
370
|
-
statistic?: 'median' | 'mean';
|
|
371
|
-
/** Deterministic seed. If omitted, uses Math.random(). */
|
|
372
|
-
seed?: number;
|
|
373
|
-
}
|
|
374
|
-
/**
|
|
375
|
-
* Paired bootstrap on (after − before) deltas. Returns a CI on the chosen
|
|
376
|
-
* statistic (median by default); pairs are resampled with replacement. The
|
|
377
|
-
* lower bound is what the promotion gate checks — `low > threshold` means the
|
|
378
|
-
* gain is real at the confidence level. Throws on unequal sample sizes.
|
|
379
|
-
*/
|
|
380
|
-
declare function pairedBootstrap(before: number[], after: number[], opts?: PairedBootstrapOptions): PairedBootstrapResult;
|
|
381
|
-
|
|
382
|
-
/**
|
|
383
|
-
* Bootstrap-CI promotion gate.
|
|
384
|
-
*
|
|
385
|
-
* In any iterative-improvement loop (GEPA, prompt evolution, dataset
|
|
386
|
-
* curation), the question is "did this generation actually improve, or are
|
|
387
|
-
* we celebrating noise?". With small N and noisy outcomes, point-estimate
|
|
388
|
-
* deltas lie. Bootstrap confidence intervals tell the operator whether the
|
|
389
|
-
* delta is real before code or prompts get promoted.
|
|
390
|
-
*
|
|
391
|
-
* This module is pure functions — no I/O, no model calls. Easy to unit-test
|
|
392
|
-
* and to compose into any verdict gate.
|
|
393
|
-
*
|
|
394
|
-
* Default gate:
|
|
395
|
-
* - Bootstrap the means of baseline and candidate (1k resamples).
|
|
396
|
-
* - Compute the delta distribution; pass if the lower CI bound > 0.
|
|
397
|
-
* - Tunable confidence (default 95%) and resample count.
|
|
398
|
-
*
|
|
399
|
-
* Verdict semantics intentionally match the existing `experiments.jsonl`
|
|
400
|
-
* vocabulary:
|
|
401
|
-
* - ADVANCE: CI lower bound on (candidate − baseline) > 0, i.e. candidate beats the baseline mean (real win)
|
|
402
|
-
* - KEEP: overlap, but candidate point estimate >= baseline (neutral)
|
|
403
|
-
* - REVERT: CI upper bound on (candidate − baseline) < 0, i.e. candidate falls below the baseline mean (real regression)
|
|
404
|
-
* - INCONCLUSIVE: not enough samples or CI straddles zero with no signal
|
|
405
|
-
*/
|
|
406
|
-
type Verdict = 'ADVANCE' | 'KEEP' | 'REVERT' | 'INCONCLUSIVE';
|
|
407
|
-
interface BootstrapResult {
|
|
408
|
-
baselineMean: number;
|
|
409
|
-
candidateMean: number;
|
|
410
|
-
/** candidateMean - baselineMean, point estimate. */
|
|
411
|
-
delta: number;
|
|
412
|
-
/** Lower bound of the (1 - alpha) CI on the delta. */
|
|
413
|
-
ciLower: number;
|
|
414
|
-
/** Upper bound of the (1 - alpha) CI on the delta. */
|
|
415
|
-
ciUpper: number;
|
|
416
|
-
/** Number of bootstrap resamples used. */
|
|
417
|
-
iterations: number;
|
|
418
|
-
alpha: number;
|
|
419
|
-
verdict: Verdict;
|
|
420
|
-
}
|
|
421
|
-
interface BootstrapOptions {
|
|
422
|
-
/** Significance level alpha; the CI covers 1 - alpha (default 0.05 → 95% CI). */
|
|
423
|
-
alpha?: number;
|
|
424
|
-
/** Number of resamples (default 1000). */
|
|
425
|
-
iterations?: number;
|
|
426
|
-
/**
|
|
427
|
-
* Minimum total samples (baseline + candidate) below which we always
|
|
428
|
-
* return INCONCLUSIVE — bootstrap with too few samples is meaningless.
|
|
429
|
-
* Default 6 (combined).
|
|
430
|
-
*/
|
|
431
|
-
minTotalSamples?: number;
|
|
432
|
-
/** RNG seed for reproducibility. If omitted, uses Math.random. */
|
|
433
|
-
seed?: number;
|
|
434
|
-
}
|
|
435
|
-
/**
|
|
436
|
-
* Compute the bootstrap CI on (candidateMean - baselineMean) and a verdict.
|
|
437
|
-
*
|
|
438
|
-
* Uses simple percentile bootstrap on the difference of resampled means.
|
|
439
|
-
* That's the standard non-parametric primitive — no distributional
|
|
440
|
-
* assumptions, robust to skew, easy to reason about.
|
|
441
|
-
*/
|
|
442
|
-
declare function bootstrapCi(baseline: number[], candidate: number[], options?: BootstrapOptions): BootstrapResult;
|
|
443
|
-
/**
|
|
444
|
-
* Judge-replay promotion gate.
|
|
445
|
-
*
|
|
446
|
-
* The cheap inner-loop judge that drives an evolution run is by definition
|
|
447
|
-
* fast and noisy. When you're about to promote a winning variant to the
|
|
448
|
-
* canonical default, you want a STRONGER judge (a more expensive model, a
|
|
449
|
-
* human grader, a separately-trained reward model) to confirm the win
|
|
450
|
-
* generalises beyond the inner loop.
|
|
451
|
-
*
|
|
452
|
-
* This helper takes raw winner + baseline outputs, scores both through the
|
|
453
|
-
* stronger judge, and applies `bootstrapCi`. ADVANCE means the stronger
|
|
454
|
-
* judge agrees the winner is real with the configured confidence. Doesn't
|
|
455
|
-
* matter what shape your "output" is — pass a string, an object, anything
|
|
456
|
-
* the judge can read.
|
|
457
|
-
*/
|
|
458
|
-
interface JudgeReplayGateArgs<TOutput> {
|
|
459
|
-
baselineOutputs: TOutput[];
|
|
460
|
-
candidateOutputs: TOutput[];
|
|
461
|
-
/** Stronger judge — async to allow LLM calls. Return a 0..N scalar score. */
|
|
462
|
-
judge: (output: TOutput) => Promise<number> | number;
|
|
463
|
-
alpha?: number;
|
|
464
|
-
iterations?: number;
|
|
465
|
-
/** RNG seed for reproducibility. */
|
|
466
|
-
seed?: number;
|
|
467
|
-
/** Maximum concurrent judge calls. Default 4. */
|
|
468
|
-
judgeConcurrency?: number;
|
|
469
|
-
}
|
|
470
|
-
declare function judgeReplayGate<TOutput>(args: JudgeReplayGateArgs<TOutput>): Promise<BootstrapResult & {
|
|
471
|
-
baselineSamples: number;
|
|
472
|
-
candidateSamples: number;
|
|
473
|
-
}>;
|
|
474
|
-
|
|
475
|
-
interface RenderReleaseReportOptions {
|
|
476
|
-
title?: string;
|
|
477
|
-
runs?: readonly RunRecord[];
|
|
478
|
-
comparator?: string;
|
|
479
|
-
traceAnalystFindings?: readonly string[];
|
|
480
|
-
nextActions?: readonly string[];
|
|
481
|
-
}
|
|
482
|
-
declare function renderReleaseReport(scorecard: ReleaseConfidenceScorecard, options?: RenderReleaseReportOptions): string;
|
|
483
|
-
|
|
484
|
-
export { type ActionableSideInfo as A, type BootstrapOptions as B, type CliffsMagnitude as C, cliffsDelta as D, cohensD as E, confidenceInterval as F, corpusInterRaterAgreement as G, corpusInterRaterAgreementFromJudgeScores as H, interRaterReliability as I, type JudgeReplayGateArgs as J, interpretCliffs as K, mannWhitneyU as L, normalizeScores as M, pairedMde as N, pairedTTest as O, type PairedBootstrapOptions as P, partialCredit as Q, type ReleaseConfidenceAxis as R, requiredSampleSize as S, weightedComposite as T, weightedMean as U, type Verdict as V, type WeightedCompositeInput as W, type BootstrapResult as a, type PairedBootstrapResult as b, type ReleaseConfidenceAxisName as c, type ReleaseConfidenceInput as d, type ReleaseConfidenceIssue as e, type ReleaseConfidenceMetrics as f, type ReleaseConfidenceScorecard as g, type ReleaseConfidenceStatus as h, type ReleaseConfidenceThresholds as i, type ReleaseTraceEvidence as j, type RenderReleaseReportOptions as k, assertReleaseConfidence as l, benjaminiHochberg as m, bootstrapCi as n, evaluateReleaseConfidence as o, judgeReplayGate as p, pairedBootstrap as q, renderReleaseReport as r, type AsiSeverity as s, type CorpusAgreementOptions as t, type CorpusAgreementPerDimension as u, type CorpusAgreementReport as v, wilcoxonSignedRank as w, type CorpusScoreRecord as x, type WeightedCompositeResult as y, bonferroni as z };
|