@tangle-network/agent-eval 0.21.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +102 -1
- package/README.md +4 -0
- package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
- package/dist/chunk-4W4NCYM2.js.map +1 -0
- package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
- package/dist/chunk-6M774GY6.js +53 -0
- package/dist/chunk-6M774GY6.js.map +1 -0
- package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
- package/dist/chunk-IOXMGMHQ.js.map +1 -0
- package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
- package/dist/chunk-QUKKGHTZ.js +121 -0
- package/dist/chunk-QUKKGHTZ.js.map +1 -0
- package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
- package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
- package/dist/chunk-UAND2LOT.js +738 -0
- package/dist/chunk-UAND2LOT.js.map +1 -0
- package/dist/{chunk-HRZELXCR.js → chunk-USHQBPMH.js} +283 -7
- package/dist/chunk-USHQBPMH.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/index.d.ts +10 -284
- package/dist/index.js +39 -19
- package/dist/index.js.map +1 -1
- package/dist/integrity-K2oVlF57.d.ts +210 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization-UVDNKaO6.d.ts +574 -0
- package/dist/optimization.d.ts +6 -144
- package/dist/optimization.js +9 -2
- package/dist/reporting-B82RSv9C.d.ts +593 -0
- package/dist/reporting.d.ts +2 -2
- package/dist/reporting.js +15 -8
- package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
- package/dist/traces.d.ts +101 -181
- package/dist/traces.js +16 -5
- package/dist/wire/index.js +3 -3
- package/docs/research-report-methodology.md +19 -4
- package/docs/wire-protocol.md +1 -1
- package/package.json +2 -2
- package/dist/chunk-3IX6QTB7.js.map +0 -1
- package/dist/chunk-HRZELXCR.js.map +0 -1
- package/dist/chunk-KRR4VMH7.js +0 -423
- package/dist/chunk-KRR4VMH7.js.map +0 -1
- package/dist/chunk-WOK2RTWG.js.map +0 -1
- package/dist/reporting-Da2ihlcM.d.ts +0 -672
- /package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
- /package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/release-confidence.ts","../src/meta-eval/rubric-predictive-validity.ts","../src/sequential.ts","../src/release-report.ts","../src/promotion-gate.ts"],"sourcesContent":["/**\n * Release confidence gate.\n *\n * This is the production-facing composition layer over the lower-level\n * primitives:\n * - Dataset manifests prove corpus/version coverage.\n * - RunRecord rows prove reproducible search/holdout outcomes.\n * - Multi-shot trace evidence carries turn counts and ASI diagnostics.\n * - HeldOutGate decisions remain the paired promotion authority.\n *\n * The gate is intentionally pure and conservative. Missing declared evidence\n * fails closed instead of being treated as a neutral zero.\n */\n\nimport type { DatasetManifest, DatasetScenario, DatasetSplit } from './dataset'\nimport type { GateDecision } from './held-out-gate'\nimport type { ActionableSideInfo, MultiShotTrialResult } from './multi-shot-optimization'\nimport type { RunRecord, RunSplitTag } from './run-record'\n\nexport type ReleaseConfidenceStatus = 'pass' | 'warn' | 'fail'\nexport type ReleaseConfidenceAxisName =\n | 'corpus'\n | 'quality'\n | 'generalization'\n | 'diagnostics'\n | 'efficiency'\n\nexport interface ReleaseTraceEvidence {\n scenarioId: string\n candidateId?: string\n split?: RunSplitTag\n score?: number\n ok?: boolean\n turnCount?: number\n costUsd?: number\n durationMs?: number\n failureMode?: string\n asi?: ActionableSideInfo[]\n metadata?: Record<string, unknown>\n}\n\nexport interface ReleaseConfidenceThresholds {\n /** Require a Dataset manifest or explicit scenarios. Default true. */\n requireCorpus?: boolean\n minScenarioCount?: number\n minSearchRuns?: number\n minHoldoutRuns?: number\n /** Require at least one holdout scenario/run. Default true. */\n requireHoldout?: boolean\n minPassRate?: number\n minMeanScore?: number\n /** Search mean may exceed holdout mean by at most this much. 
*/\n maxOverfitGap?: number\n maxMeanCostUsd?: number\n maxP95WallMs?: number\n /** Low-score/failed rows must carry ASI. Default true. */\n requireAsiForFailures?: boolean\n /** Score below this is considered a failure for ASI coverage. Default 0.5. */\n failureScoreThreshold?: number\n}\n\nexport interface ReleaseConfidenceInput {\n target: string\n candidateId?: string\n baselineId?: string\n dataset?: DatasetManifest\n scenarios?: readonly DatasetScenario[]\n runs?: readonly RunRecord[]\n traces?: readonly ReleaseTraceEvidence[]\n gateDecision?: GateDecision | null\n thresholds?: ReleaseConfidenceThresholds\n}\n\nexport interface ReleaseConfidenceAxis {\n name: ReleaseConfidenceAxisName\n status: ReleaseConfidenceStatus\n score: number\n detail: string\n}\n\nexport interface ReleaseConfidenceIssue {\n axis: ReleaseConfidenceAxisName\n severity: 'critical' | 'warning'\n code: string\n detail: string\n}\n\nexport interface ReleaseConfidenceMetrics {\n scenarioCount: number\n searchRuns: number\n holdoutRuns: number\n passRate: number\n meanScore: number\n searchMeanScore: number\n holdoutMeanScore: number\n overfitGap: number\n meanCostUsd: number\n p95WallMs: number\n failedRows: number\n failuresWithAsi: number\n singleShotTraces: number\n multiShotTraces: number\n splitCounts: Record<DatasetSplit, number>\n domainCounts: Record<string, number>\n failureModeCounts: Record<string, number>\n responsibleSurfaceCounts: Record<string, number>\n}\n\nexport interface ReleaseConfidenceScorecard {\n target: string\n candidateId: string | null\n baselineId: string | null\n status: ReleaseConfidenceStatus\n promote: boolean\n axes: ReleaseConfidenceAxis[]\n issues: ReleaseConfidenceIssue[]\n metrics: ReleaseConfidenceMetrics\n dataset: DatasetManifest | null\n gateDecision: GateDecision | null\n summary: string\n}\n\nconst DEFAULT_THRESHOLDS: Required<ReleaseConfidenceThresholds> = {\n requireCorpus: true,\n minScenarioCount: 1,\n minSearchRuns: 1,\n minHoldoutRuns: 1,\n 
requireHoldout: true,\n minPassRate: 0.8,\n minMeanScore: 0.7,\n maxOverfitGap: 0.15,\n maxMeanCostUsd: Number.POSITIVE_INFINITY,\n maxP95WallMs: Number.POSITIVE_INFINITY,\n requireAsiForFailures: true,\n failureScoreThreshold: 0.5,\n}\n\nexport function releaseTraceEvidenceFromMultiShotTrials(\n trials: readonly MultiShotTrialResult[],\n): ReleaseTraceEvidence[] {\n return trials.map((trial) => ({\n scenarioId: trial.scenarioId,\n candidateId: trial.variantId,\n split: trial.split === 'holdout' ? 'holdout' : trial.split === 'dev' ? 'dev' : 'search',\n score: trial.score,\n ok: trial.ok,\n turnCount: Array.isArray(trial.trace?.turns) ? trial.trace.turns.length : undefined,\n costUsd: trial.cost,\n durationMs: trial.durationMs,\n failureMode: trial.error ? 'runtime_error' : undefined,\n asi: trial.asi,\n metadata: trial.metadata,\n }))\n}\n\nexport function evaluateReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard {\n const thresholds = { ...DEFAULT_THRESHOLDS, ...input.thresholds }\n const candidateId = input.candidateId ?? null\n const runs = filterCandidate(input.runs ?? [], candidateId, input.baselineId)\n const traces = filterTraceCandidate(input.traces ?? [], candidateId, input.baselineId)\n const scenarios = input.scenarios ?? []\n const scenarioCount = input.dataset?.scenarioCount ?? scenarios.length\n const splitCounts = input.dataset?.splitCounts ?? countScenarioSplits(scenarios)\n const searchScores = scoresFor(runs, 'search')\n const holdoutScores = scoresFor(runs, 'holdout')\n const allScores = [...searchScores, ...holdoutScores]\n const traceScores = traces.map((t) => t.score).filter(isFiniteNumber)\n const scoreUniverse = allScores.length > 0 ? 
allScores : traceScores\n const searchRuns = runs.filter((r) => r.splitTag === 'search').length\n const holdoutRuns = runs.filter((r) => r.splitTag === 'holdout').length\n const searchMeanScore = mean(searchScores)\n const holdoutMeanScore = mean(holdoutScores)\n const metrics: ReleaseConfidenceMetrics = {\n scenarioCount,\n searchRuns,\n holdoutRuns,\n passRate: passRate(runs, traces, thresholds.failureScoreThreshold),\n meanScore: mean(scoreUniverse),\n searchMeanScore,\n holdoutMeanScore,\n overfitGap: safeDiff(searchMeanScore, holdoutMeanScore),\n meanCostUsd: mean([...runs.map((r) => r.costUsd), ...traces.map((t) => t.costUsd).filter(isFiniteNumber)]),\n p95WallMs: percentile([...runs.map((r) => r.wallMs), ...traces.map((t) => t.durationMs).filter(isFiniteNumber)], 0.95),\n failedRows: failedRows(runs, traces, thresholds.failureScoreThreshold).length,\n failuresWithAsi: failedRows(runs, traces, thresholds.failureScoreThreshold).filter((row) => row.hasAsi).length,\n singleShotTraces: traces.filter((t) => t.turnCount === 1).length,\n multiShotTraces: traces.filter((t) => (t.turnCount ?? 0) > 1).length,\n splitCounts,\n domainCounts: countDomains(scenarios),\n failureModeCounts: countFailureModes(runs, traces, thresholds.failureScoreThreshold),\n responsibleSurfaceCounts: countResponsibleSurfaces(traces),\n }\n\n const issues: ReleaseConfidenceIssue[] = []\n checkCorpus(input, thresholds, metrics, issues)\n checkQuality(thresholds, metrics, issues)\n checkGeneralization(input.gateDecision ?? null, thresholds, metrics, issues)\n checkDiagnostics(thresholds, metrics, issues)\n checkEfficiency(thresholds, metrics, issues)\n\n const axes = buildAxes(metrics, thresholds, input.gateDecision ?? null, issues)\n const status = issues.some((i) => i.severity === 'critical') ? 'fail'\n : issues.length > 0 ? 'warn'\n : 'pass'\n\n return {\n target: input.target,\n candidateId,\n baselineId: input.baselineId ?? 
null,\n status,\n promote: status === 'pass' && (input.gateDecision ? input.gateDecision.promote : true),\n axes,\n issues,\n metrics,\n dataset: input.dataset ?? null,\n gateDecision: input.gateDecision ?? null,\n summary: renderSummary(input.target, status, metrics, issues),\n }\n}\n\nexport function assertReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard {\n const scorecard = evaluateReleaseConfidence(input)\n if (scorecard.status === 'fail') {\n throw new Error(scorecard.summary)\n }\n return scorecard\n}\n\nfunction filterCandidate(\n runs: readonly RunRecord[],\n candidateId: string | null,\n baselineId?: string,\n): RunRecord[] {\n if (candidateId) return runs.filter((r) => r.candidateId === candidateId)\n if (baselineId) return runs.filter((r) => r.candidateId !== baselineId)\n return [...runs]\n}\n\nfunction filterTraceCandidate(\n traces: readonly ReleaseTraceEvidence[],\n candidateId: string | null,\n baselineId?: string,\n): ReleaseTraceEvidence[] {\n if (candidateId) return traces.filter((t) => t.candidateId === undefined || t.candidateId === candidateId)\n if (baselineId) return traces.filter((t) => t.candidateId === undefined || t.candidateId !== baselineId)\n return [...traces]\n}\n\nfunction checkCorpus(\n input: ReleaseConfidenceInput,\n thresholds: Required<ReleaseConfidenceThresholds>,\n metrics: ReleaseConfidenceMetrics,\n issues: ReleaseConfidenceIssue[],\n): void {\n if (thresholds.requireCorpus && !input.dataset && (input.scenarios?.length ?? 0) === 0) {\n issues.push({ axis: 'corpus', severity: 'critical', code: 'missing_corpus', detail: 'No Dataset manifest or scenarios supplied.' 
})\n }\n if (metrics.scenarioCount < thresholds.minScenarioCount) {\n issues.push({ axis: 'corpus', severity: 'critical', code: 'few_scenarios', detail: `${metrics.scenarioCount} scenario(s) < min ${thresholds.minScenarioCount}.` })\n }\n if (thresholds.requireHoldout && metrics.splitCounts.holdout === 0) {\n issues.push({ axis: 'corpus', severity: 'critical', code: 'missing_holdout_split', detail: 'Corpus has no holdout scenarios.' })\n }\n}\n\nfunction checkQuality(\n thresholds: Required<ReleaseConfidenceThresholds>,\n metrics: ReleaseConfidenceMetrics,\n issues: ReleaseConfidenceIssue[],\n): void {\n if (metrics.searchRuns < thresholds.minSearchRuns) {\n issues.push({ axis: 'quality', severity: 'critical', code: 'few_search_runs', detail: `${metrics.searchRuns} search run(s) < min ${thresholds.minSearchRuns}.` })\n }\n if (metrics.passRate < thresholds.minPassRate) {\n issues.push({ axis: 'quality', severity: 'critical', code: 'low_pass_rate', detail: `passRate ${fmt(metrics.passRate)} < ${fmt(thresholds.minPassRate)}.` })\n }\n if (metrics.meanScore < thresholds.minMeanScore) {\n issues.push({ axis: 'quality', severity: 'critical', code: 'low_mean_score', detail: `meanScore ${fmt(metrics.meanScore)} < ${fmt(thresholds.minMeanScore)}.` })\n }\n}\n\nfunction checkGeneralization(\n gateDecision: GateDecision | null,\n thresholds: Required<ReleaseConfidenceThresholds>,\n metrics: ReleaseConfidenceMetrics,\n issues: ReleaseConfidenceIssue[],\n): void {\n if (thresholds.requireHoldout && metrics.holdoutRuns < thresholds.minHoldoutRuns) {\n issues.push({ axis: 'generalization', severity: 'critical', code: 'few_holdout_runs', detail: `${metrics.holdoutRuns} holdout run(s) < min ${thresholds.minHoldoutRuns}.` })\n }\n if (Number.isFinite(metrics.overfitGap) && metrics.overfitGap > thresholds.maxOverfitGap) {\n issues.push({ axis: 'generalization', severity: 'critical', code: 'overfit_gap', detail: `search-holdout gap ${fmt(metrics.overfitGap)} > 
${fmt(thresholds.maxOverfitGap)}.` })\n }\n if (gateDecision && !gateDecision.promote) {\n issues.push({ axis: 'generalization', severity: 'critical', code: `gate_${gateDecision.rejectionCode ?? 'reject'}`, detail: gateDecision.reason })\n }\n}\n\nfunction checkDiagnostics(\n thresholds: Required<ReleaseConfidenceThresholds>,\n metrics: ReleaseConfidenceMetrics,\n issues: ReleaseConfidenceIssue[],\n): void {\n if (!thresholds.requireAsiForFailures) return\n if (metrics.failedRows > metrics.failuresWithAsi) {\n issues.push({\n axis: 'diagnostics',\n severity: 'critical',\n code: 'missing_failure_asi',\n detail: `${metrics.failedRows - metrics.failuresWithAsi} failed row(s) have no actionable side information.`,\n })\n }\n}\n\nfunction checkEfficiency(\n thresholds: Required<ReleaseConfidenceThresholds>,\n metrics: ReleaseConfidenceMetrics,\n issues: ReleaseConfidenceIssue[],\n): void {\n if (metrics.meanCostUsd > thresholds.maxMeanCostUsd) {\n issues.push({ axis: 'efficiency', severity: 'critical', code: 'cost_budget', detail: `meanCostUsd ${fmt(metrics.meanCostUsd)} > ${fmt(thresholds.maxMeanCostUsd)}.` })\n }\n if (metrics.p95WallMs > thresholds.maxP95WallMs) {\n issues.push({ axis: 'efficiency', severity: 'critical', code: 'latency_budget', detail: `p95WallMs ${fmt(metrics.p95WallMs)} > ${fmt(thresholds.maxP95WallMs)}.` })\n }\n}\n\nfunction buildAxes(\n metrics: ReleaseConfidenceMetrics,\n thresholds: Required<ReleaseConfidenceThresholds>,\n gateDecision: GateDecision | null,\n issues: ReleaseConfidenceIssue[],\n): ReleaseConfidenceAxis[] {\n return [\n axis('corpus', issues, bounded(metrics.scenarioCount / Math.max(1, thresholds.minScenarioCount)), `${metrics.scenarioCount} scenarios; holdout=${metrics.splitCounts.holdout}`),\n axis('quality', issues, Math.min(metrics.passRate, metrics.meanScore), `passRate=${fmt(metrics.passRate)} meanScore=${fmt(metrics.meanScore)}`),\n axis('generalization', issues, gateDecision && !gateDecision.promote ? 
0 : gapScore(metrics.overfitGap, thresholds.maxOverfitGap), `holdoutRuns=${metrics.holdoutRuns} overfitGap=${fmt(metrics.overfitGap)}`),\n axis('diagnostics', issues, metrics.failedRows === 0 ? 1 : metrics.failuresWithAsi / metrics.failedRows, `failuresWithAsi=${metrics.failuresWithAsi}/${metrics.failedRows}`),\n axis('efficiency', issues, efficiencyScore(metrics, thresholds), `meanCostUsd=${fmt(metrics.meanCostUsd)} p95WallMs=${fmt(metrics.p95WallMs)}`),\n ]\n}\n\nfunction axis(\n name: ReleaseConfidenceAxisName,\n issues: ReleaseConfidenceIssue[],\n score: number,\n detail: string,\n): ReleaseConfidenceAxis {\n const own = issues.filter((i) => i.axis === name)\n const status = own.some((i) => i.severity === 'critical') ? 'fail'\n : own.length > 0 ? 'warn'\n : 'pass'\n return { name, status, score: bounded(score), detail }\n}\n\nfunction countScenarioSplits(scenarios: readonly DatasetScenario[]): Record<DatasetSplit, number> {\n const counts: Record<DatasetSplit, number> = { train: 0, dev: 0, test: 0, holdout: 0 }\n for (const scenario of scenarios) counts[scenario.split ?? 'train']++\n return counts\n}\n\nfunction countDomains(scenarios: readonly DatasetScenario[]): Record<string, number> {\n const out: Record<string, number> = {}\n for (const scenario of scenarios) {\n const domain = scenario.tags?.domain ?? scenario.tags?.category ?? 'uncategorized'\n out[domain] = (out[domain] ?? 0) + 1\n }\n return out\n}\n\nfunction countFailureModes(\n runs: readonly RunRecord[],\n traces: readonly ReleaseTraceEvidence[],\n threshold: number,\n): Record<string, number> {\n const out: Record<string, number> = {}\n for (const run of runs) {\n const score = run.outcome.holdoutScore ?? run.outcome.searchScore\n if (run.failureMode || (score !== undefined && score < threshold)) {\n const mode = run.failureMode ?? 'low_score'\n out[mode] = (out[mode] ?? 
0) + 1\n }\n }\n for (const trace of traces) {\n if (trace.failureMode || trace.ok === false || (trace.score !== undefined && trace.score < threshold)) {\n const mode = trace.failureMode ?? (trace.ok === false ? 'not_ok' : 'low_score')\n out[mode] = (out[mode] ?? 0) + 1\n }\n }\n return out\n}\n\nfunction countResponsibleSurfaces(traces: readonly ReleaseTraceEvidence[]): Record<string, number> {\n const out: Record<string, number> = {}\n for (const trace of traces) {\n for (const asi of trace.asi ?? []) {\n const surface = asi.responsibleSurface ?? 'unknown'\n out[surface] = (out[surface] ?? 0) + 1\n }\n }\n return out\n}\n\nfunction failedRows(\n runs: readonly RunRecord[],\n traces: readonly ReleaseTraceEvidence[],\n threshold: number,\n): Array<{ hasAsi: boolean }> {\n const out: Array<{ hasAsi: boolean }> = []\n for (const run of runs) {\n const score = run.outcome.holdoutScore ?? run.outcome.searchScore\n if (run.failureMode || (score !== undefined && score < threshold)) {\n const asiMetric = run.outcome.raw.asi\n out.push({ hasAsi: typeof asiMetric === 'number' && asiMetric > 0 })\n }\n }\n for (const trace of traces) {\n if (trace.failureMode || trace.ok === false || (trace.score !== undefined && trace.score < threshold)) {\n out.push({ hasAsi: (trace.asi?.length ?? 0) > 0 })\n }\n }\n return out\n}\n\nfunction passRate(\n runs: readonly RunRecord[],\n traces: readonly ReleaseTraceEvidence[],\n threshold: number,\n): number {\n const outcomes = [\n ...runs.map((run) => {\n const score = run.outcome.holdoutScore ?? 
run.outcome.searchScore\n return !run.failureMode && score !== undefined && score >= threshold\n }),\n ...traces.map((trace) => trace.ok !== false && (trace.score === undefined || trace.score >= threshold)),\n ]\n if (outcomes.length === 0) return 0\n return outcomes.filter(Boolean).length / outcomes.length\n}\n\nfunction scoresFor(runs: readonly RunRecord[], split: RunSplitTag): number[] {\n return runs\n .filter((run) => run.splitTag === split)\n .map((run) => split === 'holdout' ? run.outcome.holdoutScore : run.outcome.searchScore)\n .filter(isFiniteNumber)\n}\n\nfunction mean(xs: readonly number[]): number {\n if (xs.length === 0) return Number.NaN\n return xs.reduce((sum, x) => sum + x, 0) / xs.length\n}\n\nfunction percentile(xs: readonly number[], p: number): number {\n if (xs.length === 0) return Number.NaN\n const sorted = [...xs].sort((a, b) => a - b)\n return sorted[Math.min(sorted.length - 1, Math.max(0, Math.ceil(p * sorted.length) - 1))]!\n}\n\nfunction isFiniteNumber(value: unknown): value is number {\n return typeof value === 'number' && Number.isFinite(value)\n}\n\nfunction safeDiff(a: number, b: number): number {\n if (!Number.isFinite(a) || !Number.isFinite(b)) return Number.NaN\n return a - b\n}\n\nfunction gapScore(gap: number, maxGap: number): number {\n if (!Number.isFinite(gap)) return 0\n if (maxGap <= 0) return gap <= 0 ? 1 : 0\n return bounded(1 - Math.max(0, gap) / maxGap)\n}\n\nfunction efficiencyScore(\n metrics: ReleaseConfidenceMetrics,\n thresholds: Required<ReleaseConfidenceThresholds>,\n): number {\n const cost = Number.isFinite(thresholds.maxMeanCostUsd) && Number.isFinite(metrics.meanCostUsd)\n ? bounded(thresholds.maxMeanCostUsd / Math.max(metrics.meanCostUsd, 1e-12))\n : 1\n const latency = Number.isFinite(thresholds.maxP95WallMs) && Number.isFinite(metrics.p95WallMs)\n ? 
bounded(thresholds.maxP95WallMs / Math.max(metrics.p95WallMs, 1e-12))\n : 1\n return Math.min(cost, latency)\n}\n\nfunction bounded(x: number): number {\n if (!Number.isFinite(x)) return 0\n return Math.max(0, Math.min(1, x))\n}\n\nfunction renderSummary(\n target: string,\n status: ReleaseConfidenceStatus,\n metrics: ReleaseConfidenceMetrics,\n issues: ReleaseConfidenceIssue[],\n): string {\n const prefix = `release confidence ${status}: ${target}`\n const metricText = `scenarios=${metrics.scenarioCount} searchRuns=${metrics.searchRuns} holdoutRuns=${metrics.holdoutRuns} passRate=${fmt(metrics.passRate)} meanScore=${fmt(metrics.meanScore)}`\n if (issues.length === 0) return `${prefix}; ${metricText}`\n return `${prefix}; ${metricText}; issues=${issues.map((i) => i.code).join(',')}`\n}\n\nfunction fmt(x: number): string {\n if (!Number.isFinite(x)) return String(x)\n return x.toFixed(4)\n}\n","/**\n * Rubric predictive validity — does our eval rubric predict deployment\n * outcomes?\n *\n * `correlationStudy` (already in this package) joins a `TraceStore` to an\n * `OutcomeStore` and computes Pearson + Spearman + bootstrap CI for each\n * (eval-metric, outcome-metric) pair. That answers \"does X correlate with\n * Y at all.\" `rubricPredictiveValidity` is the campaign-shaped wrapper\n * around it: take a sequence of `RunRecord`s (the canonical campaign\n * artifact) and a `DeploymentOutcomeStore`, join on `runId`, return a\n * ranked verdict on every rubric whose dimension scores were captured in\n * `outcome.raw`.\n *\n * The point — quoting the methodology doc — is that **without this loop\n * every rubric is faith-based**. 
Once it's wired, you know which rubrics\n * have earned their promotion power and which ones are decoration.\n *\n * const validity = await rubricPredictiveValidity({\n * runs: lastQuarter,\n * outcomes: shipFlagOutcomeStore,\n * outcomeMetrics: ['revenue_lift', 'retention_30d', 'csat'],\n * rubrics: ['anti_slop', 'semantic_concept', 'tool_recovery'],\n * })\n * for (const r of validity.ranked) {\n * console.log(`${r.rubric} → ${r.bestOutcome}: ρ=${r.spearman.toFixed(2)}`)\n * }\n *\n * The function is intentionally read-only. Use the verdict to deprecate\n * decorative rubrics, re-weight composite scores, or trigger a\n * recalibration sweep when predictive validity drops below a threshold.\n */\n\nimport type { RunRecord } from '../run-record'\nimport type { DeploymentOutcome, OutcomeStore } from './outcome-store'\n\nexport interface RubricPredictiveValidityInput {\n /**\n * Canonical campaign output. Each record's `outcome.raw[<rubricId>]`\n * provides the eval score; missing keys are silently skipped per pair.\n */\n runs: RunRecord[]\n outcomes: OutcomeStore\n /**\n * Outcome metric names to evaluate against. Each must appear in at\n * least one `DeploymentOutcome.metrics` keyspace; pairs with too few\n * joined samples are excluded from the result.\n */\n outcomeMetrics: string[]\n /**\n * Rubric ids to evaluate. Must appear as keys in `RunRecord.outcome.raw`.\n * If omitted, every numeric key in `outcome.raw` across the run set is\n * treated as a rubric.\n */\n rubrics?: string[]\n /** Minimum joined-sample count before a pair is reported. Default 8. */\n minSamples?: number\n /** Bootstrap resamples for CI. Default 500. */\n bootstrapResamples?: number\n /** Random seed for the bootstrap (mulberry32). Default unset (Math.random). */\n seed?: number\n /**\n * Reduction when multiple outcomes attach to one runId. 
Default `'latest'`\n * (most recently captured).\n */\n reduction?: 'latest' | 'mean' | 'max'\n}\n\nexport interface RubricOutcomePair {\n rubric: string\n outcome: string\n n: number\n pearson: number\n spearman: number\n ci95: { low: number; high: number }\n /**\n * Verdict bucket. `load_bearing` ≥ 0.7, `informative` ≥ 0.4,\n * `decorative` < 0.4 in absolute correlation. A negative correlation\n * with a desired outcome is also `decorative` — actively misleading\n * is worse than uninformative.\n */\n verdict: 'load_bearing' | 'informative' | 'decorative'\n}\n\nexport interface RubricRanking {\n rubric: string\n /** Outcome metric this rubric correlated best with. */\n bestOutcome: string\n spearman: number\n pearson: number\n n: number\n verdict: RubricOutcomePair['verdict']\n}\n\nexport interface RubricPredictiveValidityReport {\n pairs: RubricOutcomePair[]\n /** Per-rubric best pair, sorted descending by |spearman|. */\n ranked: RubricRanking[]\n joinedSamples: number\n skippedRuns: number\n /** Rubrics that were declared but never produced a usable score. */\n rubricsWithoutData: string[]\n}\n\nexport async function rubricPredictiveValidity(\n input: RubricPredictiveValidityInput,\n): Promise<RubricPredictiveValidityReport> {\n const minSamples = input.minSamples ?? 8\n const reduction = input.reduction ?? 'latest'\n const resamples = input.bootstrapResamples ?? 500\n const rng = makeRng(input.seed)\n\n const outcomes = await input.outcomes.list()\n const outcomesByRun = new Map<string, DeploymentOutcome[]>()\n for (const o of outcomes) {\n const arr = outcomesByRun.get(o.runId) ?? []\n arr.push(o)\n outcomesByRun.set(o.runId, arr)\n }\n\n // Discover rubrics: caller-declared OR every numeric key in outcome.raw\n // observed across runs.\n const observedRubrics = new Set<string>()\n for (const r of input.runs) {\n for (const k of Object.keys(r.outcome.raw)) observedRubrics.add(k)\n }\n const rubrics = input.rubrics ?? 
[...observedRubrics]\n\n // Collect aligned (x, y) pairs per (rubric, outcome).\n type Bucket = { rubric: string; outcome: string; xs: number[]; ys: number[] }\n const buckets: Bucket[] = []\n for (const r of rubrics) {\n for (const o of input.outcomeMetrics) {\n buckets.push({ rubric: r, outcome: o, xs: [], ys: [] })\n }\n }\n\n let joined = 0\n let skipped = 0\n for (const run of input.runs) {\n const os = outcomesByRun.get(run.runId)\n if (!os || os.length === 0) { skipped++; continue }\n let joinedThisRun = false\n for (const r of rubrics) {\n const x = run.outcome.raw[r]\n if (typeof x !== 'number' || !Number.isFinite(x)) continue\n for (const o of input.outcomeMetrics) {\n const values = os\n .map((row) => row.metrics[o])\n .filter((v): v is number => typeof v === 'number' && Number.isFinite(v))\n if (values.length === 0) continue\n const y = reduce(values, os, o, reduction)\n if (y === null) continue\n const bucket = buckets.find((b) => b.rubric === r && b.outcome === o)!\n bucket.xs.push(x)\n bucket.ys.push(y)\n joinedThisRun = true\n }\n }\n if (joinedThisRun) joined++\n }\n\n const pairs: RubricOutcomePair[] = []\n for (const b of buckets) {\n if (b.xs.length < minSamples) continue\n const pearson = pearsonR(b.xs, b.ys)\n const spearman = pearsonR(rankWithTies(b.xs), rankWithTies(b.ys))\n const ci = bootstrapCi(b.xs, b.ys, resamples, rng)\n const verdict: RubricOutcomePair['verdict'] =\n Math.abs(spearman) >= 0.7 ? 'load_bearing'\n : Math.abs(spearman) >= 0.4 ? 'informative'\n : 'decorative'\n pairs.push({\n rubric: b.rubric, outcome: b.outcome, n: b.xs.length,\n pearson, spearman, ci95: ci, verdict,\n })\n }\n\n const byRubric = new Map<string, RubricOutcomePair[]>()\n for (const p of pairs) {\n const arr = byRubric.get(p.rubric) ?? []\n arr.push(p)\n byRubric.set(p.rubric, arr)\n }\n const ranked: RubricRanking[] = [...byRubric.entries()]\n .map(([rubric, ps]) => {\n const best = ps.reduce((a, b) => (Math.abs(b.spearman) > Math.abs(a.spearman) ? 
b : a))\n return {\n rubric,\n bestOutcome: best.outcome,\n spearman: best.spearman,\n pearson: best.pearson,\n n: best.n,\n verdict: best.verdict,\n }\n })\n .sort((a, b) => Math.abs(b.spearman) - Math.abs(a.spearman))\n\n const rubricsWithoutData = rubrics.filter((r) => !byRubric.has(r))\n\n return { pairs, ranked, joinedSamples: joined, skippedRuns: skipped, rubricsWithoutData }\n}\n\n// ── Helpers ──────────────────────────────────────────────────────────────\n\nfunction reduce(\n values: number[],\n outcomes: DeploymentOutcome[],\n metric: string,\n kind: 'latest' | 'mean' | 'max',\n): number | null {\n if (values.length === 0) return null\n if (kind === 'mean') return values.reduce((s, v) => s + v, 0) / values.length\n if (kind === 'max') return Math.max(...values)\n // 'latest'\n const sorted = [...outcomes]\n .filter((o) => typeof o.metrics[metric] === 'number')\n .sort((a, b) => b.capturedAt - a.capturedAt)\n return sorted[0]?.metrics[metric] ?? null\n}\n\nfunction pearsonR(a: number[], b: number[]): number {\n if (a.length !== b.length || a.length < 2) return Number.NaN\n const ma = a.reduce((s, v) => s + v, 0) / a.length\n const mb = b.reduce((s, v) => s + v, 0) / b.length\n let num = 0, da = 0, db = 0\n for (let i = 0; i < a.length; i++) {\n const xa = a[i]! - ma\n const xb = b[i]! - mb\n num += xa * xb; da += xa * xa; db += xb * xb\n }\n if (da === 0 || db === 0) return da === 0 && db === 0 ? 
1 : 0\n return num / Math.sqrt(da * db)\n}\n\nfunction rankWithTies(xs: number[]): number[] {\n const indexed = xs.map((v, i) => ({ v, i })).sort((a, b) => a.v - b.v)\n const r = new Array<number>(xs.length)\n for (let i = 0; i < indexed.length; ) {\n let j = i\n while (j + 1 < indexed.length && indexed[j + 1]!.v === indexed[i]!.v) j++\n const avg = (i + j + 2) / 2\n for (let k = i; k <= j; k++) r[indexed[k]!.i] = avg\n i = j + 1\n }\n return r\n}\n\nfunction bootstrapCi(\n xs: number[],\n ys: number[],\n iterations: number,\n rng: () => number,\n): { low: number; high: number } {\n const n = xs.length\n if (n < 3) return { low: Number.NaN, high: Number.NaN }\n const samples: number[] = []\n for (let b = 0; b < iterations; b++) {\n const rx = new Array<number>(n)\n const ry = new Array<number>(n)\n for (let i = 0; i < n; i++) {\n const idx = Math.floor(rng() * n)\n rx[i] = xs[idx]!\n ry[i] = ys[idx]!\n }\n const r = pearsonR(rx, ry)\n if (Number.isFinite(r)) samples.push(r)\n }\n samples.sort((a, b) => a - b)\n if (samples.length === 0) return { low: Number.NaN, high: Number.NaN }\n return {\n low: samples[Math.floor(0.025 * samples.length)]!,\n high: samples[Math.min(samples.length - 1, Math.floor(0.975 * samples.length))]!,\n }\n}\n\nfunction makeRng(seed?: number): () => number {\n if (seed === undefined) return Math.random\n let s = seed >>> 0\n return () => {\n s = (s + 0x6D2B79F5) >>> 0\n let t = s\n t = Math.imul(t ^ (t >>> 15), t | 1)\n t ^= t + Math.imul(t ^ (t >>> 7), t | 61)\n return ((t ^ (t >>> 14)) >>> 0) / 4294967296\n }\n}\n","/**\n * Always-valid sequential evaluation.\n *\n * `researchReport` (0.21+) assumes a single pre-specified analysis. Real\n * consumers run campaigns weekly / nightly / per-PR; each new run silently\n * inflates the false-discovery rate, because the BH-FDR guarantee was for\n * the *first* look, not the 47th. 
Without time-uniform inference,\n * launch-decision teams either (a) don't peek, which forfeits the cost\n * advantage of stop-when-decisive, or (b) peek and pretend they didn't,\n * which forfeits scientific validity.\n *\n * This module ships **e-value-based confidence sequences** for paired\n * bounded outcomes. The methodology is the predictable plug-in betting\n * martingale of Waudby-Smith & Ramdas (2024) — provably valid at *any*\n * stopping time. Concretely:\n *\n * For paired deltas D_1, D_2, … ∈ [-c, c] with the null H_0: E[D] ≤ 0,\n * a betting fraction λ_i is chosen using only D_{1..i-1} (predictable\n * plug-in), and the running e-value is\n *\n * E_t = ∏_{i=1}^{t} (1 + λ_i · D_i)\n *\n * E_t is a non-negative martingale under H_0 with E[E_t] ≤ 1, so by\n * Ville's inequality, P(∃ t : E_t ≥ 1/α) ≤ α — we can reject the null\n * at any time without inflating the type-I error.\n *\n * Combined with `runEvalCampaign`, every consumer running rolling\n * campaigns gains the ability to ship the moment evidence is decisive,\n * stop-early on dead-on-arrival variants, and accumulate evidence across\n * partial runs without spending the FDR budget. No new sweep is wasted.\n *\n * References:\n * - Howard, S. R., Ramdas, A., McAuliffe, J., Sekhon, J. (2021).\n * Time-uniform, nonparametric, nonasymptotic confidence sequences.\n * Annals of Statistics, 49(2), 1055–1080.\n * - Waudby-Smith, I., Ramdas, A. (2024). Estimating means of bounded\n * random variables by betting. JRSS B, 86(1), 1–27.\n */\n\nexport type SequentialDecision = 'promote_now' | 'continue' | 'reject_now' | 'equivalent'\n\nexport interface PairedEvalueOptions {\n /**\n * Bound on |delta|. Default 1 (matching most score scales). Must satisfy\n * c > 0; deltas outside [-c, c] are clipped with a warning attached to\n * the return value.\n */\n bound?: number\n /** Target Type-I error. Default 0.05. */\n alpha?: number\n /**\n * Region of Practical Equivalence on the *mean* paired delta. 
When\n * supplied, the verdict can return `'equivalent'` once the running\n * confidence sequence on the mean is fully contained in [low, high].\n */\n rope?: { low: number; high: number }\n /** Initial bet shrinkage (0 < scale ≤ 1). Default 0.5 — empirically robust. */\n initialBetShrinkage?: number\n}\n\nexport interface PairedEvalueStep {\n /** 1-indexed observation count. */\n t: number\n delta: number\n /** Running e-value E_t = ∏ (1 + λ_i · D_i). */\n evalue: number\n /** Time-uniform p-value at stopping time t. */\n pValue: number\n /** Lower bound of the empirical Bernstein confidence sequence at level 1-α. */\n csLow: number\n csHigh: number\n /** Verdict at this stopping time. */\n decision: SequentialDecision\n}\n\nexport interface PairedEvalueSequence {\n steps: PairedEvalueStep[]\n /** The decision at the final step. */\n finalDecision: SequentialDecision\n /** Index (1-based) at which a non-`continue` decision first fired, or null. */\n decisionFiredAt: number | null\n /** True if any deltas were clipped to [-bound, bound]. */\n clipped: boolean\n}\n\n/**\n * Run the paired e-value sequence over an in-order delta stream.\n *\n * Use for *streaming* / interim analyses: pass the deltas you have so\n * far, get the verdict at every prefix length. The decision is\n * monotone-stable in the sense that once `'reject_now'` or `'promote_now'`\n * fires, the verdict at later steps remains decisive (the e-value is a\n * non-negative martingale; once it crosses the threshold, it's crossed).\n */\nexport function pairedEvalueSequence(\n deltas: number[],\n opts: PairedEvalueOptions = {},\n): PairedEvalueSequence {\n const c = opts.bound ?? 1\n const alpha = opts.alpha ?? 0.05\n const initialShrink = opts.initialBetShrinkage ?? 0.5\n const rope = opts.rope ?? 
null\n if (c <= 0) throw new Error('pairedEvalueSequence: bound must be > 0')\n if (alpha <= 0 || alpha >= 1) throw new Error('pairedEvalueSequence: alpha must be in (0,1)')\n if (rope && !(Number.isFinite(rope.low) && Number.isFinite(rope.high) && rope.low <= rope.high)) {\n throw new Error('pairedEvalueSequence: rope must satisfy low ≤ high')\n }\n\n const steps: PairedEvalueStep[] = []\n let clipped = false\n let evalue = 1\n let decisionFiredAt: number | null = null\n\n // Running statistics (using only D_{1..i-1} for the bet → predictable plug-in).\n let sum = 0\n let sumSq = 0\n let count = 0\n\n for (let i = 0; i < deltas.length; i++) {\n let d = deltas[i]!\n if (d < -c || d > c) {\n d = Math.max(-c, Math.min(c, d))\n clipped = true\n }\n\n // Predictable plug-in bet (positive λ tests for E[D] > 0; we run a two-sided\n // test by tracking the symmetric e-value via |bet|).\n // λ_i ∝ mean / (variance + bound^2). Shrink early to avoid overbetting.\n const muHat = count === 0 ? 0 : sum / count\n const varHat = count === 0 ? c * c : Math.max(1e-12, sumSq / count - muHat * muHat)\n const t = i + 1\n const shrink = initialShrink * Math.min(1, count / 32) // anneal toward 1\n let lambda = (muHat / (varHat + c * c)) * shrink\n // Clip to ensure 1 + λ·D > 0 for all |D| ≤ c (so the e-value stays non-negative).\n const lambdaMax = 0.99 / c\n if (lambda > lambdaMax) lambda = lambdaMax\n if (lambda < -lambdaMax) lambda = -lambdaMax\n\n evalue = evalue * (1 + lambda * d)\n if (!Number.isFinite(evalue) || evalue < 0) evalue = 0\n\n sum += d\n sumSq += d * d\n count += 1\n\n const pValue = Math.min(1, 1 / Math.max(evalue, 1e-300))\n\n // Empirical Bernstein confidence sequence on the mean. 
Howard et al.\n // (2021), Theorem 4.4 with σ̂² the running sample variance and a\n // calibration constant tuned for two-sided coverage at level 1 - α.\n const cs = empiricalBernsteinCs(sum, sumSq, count, c, alpha)\n\n let decision: SequentialDecision = 'continue'\n if (rope && cs.low >= rope.low && cs.high <= rope.high) decision = 'equivalent'\n else if (evalue >= 2 / alpha && muHat > 0) decision = 'promote_now'\n else if (evalue >= 2 / alpha && muHat < 0) decision = 'reject_now'\n else if (rope && cs.high < rope.low) decision = 'reject_now'\n\n if (decision !== 'continue' && decisionFiredAt === null) decisionFiredAt = t\n\n steps.push({ t, delta: d, evalue, pValue, csLow: cs.low, csHigh: cs.high, decision })\n }\n\n const finalDecision = steps.length === 0 ? 'continue' : steps[steps.length - 1]!.decision\n return { steps, finalDecision, decisionFiredAt, clipped }\n}\n\nexport interface InterimReleaseConfidenceInput {\n /**\n * One delta series per candidate (paired deltas vs comparator). Order\n * within a series is the order the campaigns were run.\n */\n deltaSeries: Array<{ candidateId: string; deltas: number[] }>\n alpha?: number\n bound?: number\n rope?: { low: number; high: number }\n}\n\nexport interface InterimReleaseConfidence {\n candidates: Array<{\n candidateId: string\n decision: SequentialDecision\n decisionFiredAt: number | null\n finalEvalue: number\n finalPValue: number\n pairs: number\n csLow: number\n csHigh: number\n }>\n /**\n * Campaign-level recommendation: pick the strongest 'promote_now', else\n * 'continue' if any candidate is still live, else 'reject_now' if every\n * candidate is dead, else 'equivalent'.\n */\n recommendation: { decision: SequentialDecision; candidateId: string | null }\n}\n\n/**\n * Run interim sequential analyses across many candidates at once,\n * preserving the time-uniform α guarantee for each candidate's series and\n * synthesising a campaign-level recommendation. 
Designed to be called on\n * every campaign tick — the recommendation is anytime-valid.\n */\nexport function evaluateInterimReleaseConfidence(\n input: InterimReleaseConfidenceInput,\n): InterimReleaseConfidence {\n const candidates = input.deltaSeries.map((s) => {\n const seq = pairedEvalueSequence(s.deltas, {\n alpha: input.alpha,\n bound: input.bound,\n rope: input.rope,\n })\n const last = seq.steps[seq.steps.length - 1]\n return {\n candidateId: s.candidateId,\n decision: seq.finalDecision,\n decisionFiredAt: seq.decisionFiredAt,\n finalEvalue: last?.evalue ?? 1,\n finalPValue: last?.pValue ?? 1,\n pairs: seq.steps.length,\n csLow: last?.csLow ?? Number.NEGATIVE_INFINITY,\n csHigh: last?.csHigh ?? Number.POSITIVE_INFINITY,\n }\n })\n\n const promote = candidates.find((c) => c.decision === 'promote_now')\n if (promote) return { candidates, recommendation: { decision: 'promote_now', candidateId: promote.candidateId } }\n const live = candidates.find((c) => c.decision === 'continue')\n if (live) return { candidates, recommendation: { decision: 'continue', candidateId: null } }\n const equiv = candidates.find((c) => c.decision === 'equivalent')\n if (equiv) return { candidates, recommendation: { decision: 'equivalent', candidateId: equiv.candidateId } }\n return { candidates, recommendation: { decision: 'reject_now', candidateId: null } }\n}\n\n// ── Internals ────────────────────────────────────────────────────────────\n\n/**\n * Empirical Bernstein confidence sequence on the mean of bounded variables.\n * Adapted from Howard et al. (2021) §4.4. Provides a time-uniform CI on\n * the running mean; valid at every stopping time.\n */\nfunction empiricalBernsteinCs(\n sum: number,\n sumSq: number,\n n: number,\n bound: number,\n alpha: number,\n): { low: number; high: number } {\n if (n === 0) return { low: -bound, high: bound }\n const mean = sum / n\n const variance = Math.max(0, sumSq / n - mean * mean)\n // Iterated-log calibration constant. 
The 1.7 exponent matches the\n // recommended choice in Howard et al. for two-sided coverage at level\n // 1 - α with mild log-corrections; tightening further requires a\n // tuned mixture and is out of scope.\n const psi = Math.log(2 / alpha) + 1.7 * Math.log(Math.log(Math.max(Math.E, n)) + 1)\n const radius = Math.sqrt((2 * variance * psi) / n) + (3 * bound * psi) / n\n return { low: mean - radius, high: mean + radius }\n}\n","import type { ReleaseConfidenceScorecard } from './release-confidence'\nimport { summaryTable } from './summary-report'\nimport type { RunRecord } from './run-record'\n\nexport interface RenderReleaseReportOptions {\n title?: string\n runs?: readonly RunRecord[]\n comparator?: string\n traceAnalystFindings?: readonly string[]\n nextActions?: readonly string[]\n}\n\nexport function renderReleaseReport(\n scorecard: ReleaseConfidenceScorecard,\n options: RenderReleaseReportOptions = {},\n): string {\n const title = options.title ?? `Release Report: ${scorecard.target}`\n const lines: string[] = []\n lines.push(`# ${title}`)\n lines.push('')\n lines.push(`Status: **${scorecard.status.toUpperCase()}**`)\n lines.push(`Promote: **${scorecard.promote ? 
'yes' : 'no'}**`)\n if (scorecard.candidateId) lines.push(`Candidate: \\`${scorecard.candidateId}\\``)\n if (scorecard.baselineId) lines.push(`Baseline: \\`${scorecard.baselineId}\\``)\n lines.push('')\n lines.push(scorecard.summary)\n lines.push('')\n\n lines.push('## Metrics')\n lines.push('')\n lines.push('| Metric | Value |')\n lines.push('|---|---:|')\n lines.push(`| Scenarios | ${scorecard.metrics.scenarioCount} |`)\n lines.push(`| Search runs | ${scorecard.metrics.searchRuns} |`)\n lines.push(`| Holdout runs | ${scorecard.metrics.holdoutRuns} |`)\n lines.push(`| Pass rate | ${pct(scorecard.metrics.passRate)} |`)\n lines.push(`| Mean score | ${num(scorecard.metrics.meanScore)} |`)\n lines.push(`| Search mean | ${num(scorecard.metrics.searchMeanScore)} |`)\n lines.push(`| Holdout mean | ${num(scorecard.metrics.holdoutMeanScore)} |`)\n lines.push(`| Overfit gap | ${num(scorecard.metrics.overfitGap)} |`)\n lines.push(`| Mean cost | $${num(scorecard.metrics.meanCostUsd)} |`)\n lines.push(`| p95 wall time | ${Math.round(scorecard.metrics.p95WallMs)} ms |`)\n lines.push('')\n\n if (scorecard.issues.length > 0) {\n lines.push('## Issues')\n lines.push('')\n for (const issue of scorecard.issues) {\n lines.push(`- **${issue.severity}** \\`${issue.code}\\` (${issue.axis}): ${issue.detail}`)\n }\n lines.push('')\n }\n\n const surfaces = entries(scorecard.metrics.responsibleSurfaceCounts)\n if (surfaces.length > 0) {\n lines.push('## Responsible Surfaces')\n lines.push('')\n for (const [surface, count] of surfaces) lines.push(`- ${surface}: ${count}`)\n lines.push('')\n }\n\n const failures = entries(scorecard.metrics.failureModeCounts)\n if (failures.length > 0) {\n lines.push('## Failure Modes')\n lines.push('')\n for (const [mode, count] of failures) lines.push(`- ${mode}: ${count}`)\n lines.push('')\n }\n\n if (options.runs && options.runs.length > 0) {\n lines.push('## Run Summary')\n lines.push('')\n lines.push(summaryTable([...options.runs], {\n comparator: 
options.comparator ?? scorecard.baselineId ?? undefined,\n split: 'holdout',\n }).markdown)\n lines.push('')\n }\n\n if (options.traceAnalystFindings && options.traceAnalystFindings.length > 0) {\n lines.push('## TraceAnalyst Findings')\n lines.push('')\n for (const finding of options.traceAnalystFindings) lines.push(`- ${finding}`)\n lines.push('')\n }\n\n const nextActions = options.nextActions ?? defaultNextActions(scorecard)\n if (nextActions.length > 0) {\n lines.push('## Next Actions')\n lines.push('')\n for (const action of nextActions) lines.push(`- ${action}`)\n lines.push('')\n }\n\n return lines.join('\\n').trimEnd() + '\\n'\n}\n\nfunction defaultNextActions(scorecard: ReleaseConfidenceScorecard): string[] {\n if (scorecard.promote) return ['Promote the candidate and keep canaries enabled.']\n return scorecard.issues\n .filter((issue) => issue.severity === 'critical')\n .map((issue) => `Resolve ${issue.code}: ${issue.detail}`)\n}\n\nfunction entries(values: Record<string, number>): Array<[string, number]> {\n return Object.entries(values)\n .filter(([, count]) => count > 0)\n .sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]))\n}\n\nfunction pct(value: number): string {\n return Number.isFinite(value) ? `${(value * 100).toFixed(1)}%` : 'n/a'\n}\n\nfunction num(value: number): string {\n return Number.isFinite(value) ? value.toFixed(3) : 'n/a'\n}\n","/**\n * Bootstrap-CI promotion gate.\n *\n * In any iterative-improvement loop (GEPA, prompt evolution, dataset\n * curation), the question is \"did this generation actually improve, or are\n * we celebrating noise?\". With small N and noisy outcomes, point-estimate\n * deltas lie. Bootstrap confidence intervals tell the operator whether the\n * delta is real before code or prompts get promoted.\n *\n * This module is pure functions — no I/O, no model calls. 
Easy to unit-test\n * and to compose into any verdict gate.\n *\n * Default gate:\n * - Bootstrap mean baseline vs candidate (1k resamples).\n * - Compute the delta distribution; pass if the lower CI bound > 0.\n * - Tunable confidence (default 95%) and resample count.\n *\n * Verdict semantics intentionally match the existing `experiments.jsonl`\n * vocabulary:\n * - ADVANCE: candidate's CI lower bound > baseline mean (real win)\n * - KEEP: overlap, but candidate point estimate >= baseline (neutral)\n * - REVERT: candidate's CI upper bound < baseline mean (real regression)\n * - INCONCLUSIVE: not enough samples or CI straddles zero with no signal\n */\n\nexport type Verdict = 'ADVANCE' | 'KEEP' | 'REVERT' | 'INCONCLUSIVE'\n\nexport interface BootstrapResult {\n baselineMean: number\n candidateMean: number\n /** candidateMean - baselineMean, point estimate. */\n delta: number\n /** Lower bound of the (1 - alpha) CI on the delta. */\n ciLower: number\n /** Upper bound of the (1 - alpha) CI on the delta. */\n ciUpper: number\n /** Number of bootstrap resamples used. */\n iterations: number\n alpha: number\n verdict: Verdict\n}\n\nexport interface BootstrapOptions {\n /** Confidence level alpha (default 0.05 → 95% CI). */\n alpha?: number\n /** Number of resamples (default 1000). */\n iterations?: number\n /**\n * Minimum total samples (baseline + candidate) below which we always\n * return INCONCLUSIVE — bootstrap with too few samples is meaningless.\n * Default 6 (combined).\n */\n minTotalSamples?: number\n /** RNG seed for reproducibility. Default: Math.random. 
*/\n seed?: number\n}\n\n/**\n * Compute the bootstrap CI on (candidateMean - baselineMean) and a verdict.\n *\n * Uses simple percentile bootstrap on the difference of resampled means.\n * That's the standard non-parametric primitive — no distributional\n * assumptions, robust to skew, easy to reason about.\n */\nexport function bootstrapCi(\n baseline: number[],\n candidate: number[],\n options: BootstrapOptions = {},\n): BootstrapResult {\n const alpha = options.alpha ?? 0.05\n const iterations = options.iterations ?? 1000\n const minTotal = options.minTotalSamples ?? 6\n const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate))\n\n const baselineMean = mean(baseline)\n const candidateMean = mean(candidate)\n const delta = candidateMean - baselineMean\n\n if (baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0) {\n return {\n baselineMean,\n candidateMean,\n delta,\n ciLower: -Infinity,\n ciUpper: Infinity,\n iterations: 0,\n alpha,\n verdict: 'INCONCLUSIVE',\n }\n }\n\n const deltas: number[] = new Array(iterations)\n for (let i = 0; i < iterations; i++) {\n const bResample = resample(baseline, rng)\n const cResample = resample(candidate, rng)\n deltas[i] = mean(cResample) - mean(bResample)\n }\n deltas.sort((a, b) => a - b)\n const lowerIdx = Math.floor((alpha / 2) * iterations)\n const upperIdx = Math.floor((1 - alpha / 2) * iterations) - 1\n const ciLower = deltas[Math.max(0, lowerIdx)]!\n const ciUpper = deltas[Math.min(iterations - 1, upperIdx)]!\n\n let verdict: Verdict\n if (ciLower > 0) verdict = 'ADVANCE'\n else if (ciUpper < 0) verdict = 'REVERT'\n else if (delta >= 0) verdict = 'KEEP'\n else verdict = 'INCONCLUSIVE'\n\n return {\n baselineMean,\n candidateMean,\n delta,\n ciLower,\n ciUpper,\n iterations,\n alpha,\n verdict,\n }\n}\n\nfunction mean(xs: number[]): number {\n if (xs.length === 0) return 0\n let s = 0\n for (const x of xs) s += x\n return s / xs.length\n}\n\nfunction resample(xs: 
number[], rng: () => number): number[] {\n const out = new Array(xs.length)\n for (let i = 0; i < xs.length; i++) out[i] = xs[Math.floor(rng() * xs.length)]\n return out\n}\n\n/** Mulberry32 — fast deterministic PRNG. Stable across runs given the same seed. */\nfunction mulberry32(seed: number): () => number {\n let t = seed >>> 0\n return () => {\n t += 0x6d2b79f5\n let r = t\n r = Math.imul(r ^ (r >>> 15), r | 1)\n r ^= r + Math.imul(r ^ (r >>> 7), r | 61)\n return ((r ^ (r >>> 14)) >>> 0) / 4294967296\n }\n}\n\n/** Stable seed derived from the inputs — same data → same CI bounds. */\nfunction hashSeed(a: number[], b: number[]): number {\n let h = 2166136261\n for (const x of [...a, ...b]) {\n const view = new Float64Array([x])\n const bytes = new Uint8Array(view.buffer)\n for (const byte of bytes) {\n h ^= byte\n h = Math.imul(h, 16777619)\n }\n }\n return h >>> 0\n}\n\n/**\n * Judge-replay promotion gate.\n *\n * The cheap inner-loop judge that drives an evolution run is by definition\n * fast and noisy. When you're about to promote a winning variant to the\n * canonical default, you want a STRONGER judge (a more expensive model, a\n * human grader, a separately-trained reward model) to confirm the win\n * generalises beyond the inner loop.\n *\n * This helper takes raw winner + baseline outputs, scores both through the\n * stronger judge, and applies `bootstrapCi`. ADVANCE means the stronger\n * judge agrees the winner is real with the configured confidence. Doesn't\n * matter what shape your \"output\" is — pass a string, an object, anything\n * the judge can read.\n */\nexport interface JudgeReplayGateArgs<TOutput> {\n baselineOutputs: TOutput[]\n candidateOutputs: TOutput[]\n /** Stronger judge — async to allow LLM calls. Return a 0..N scalar score. */\n judge: (output: TOutput) => Promise<number> | number\n alpha?: number\n iterations?: number\n /** RNG seed for reproducibility. */\n seed?: number\n /** Maximum concurrent judge calls. Default 4. 
*/\n judgeConcurrency?: number\n}\n\nexport async function judgeReplayGate<TOutput>(\n args: JudgeReplayGateArgs<TOutput>,\n): Promise<BootstrapResult & { baselineSamples: number; candidateSamples: number }> {\n const concurrency = args.judgeConcurrency ?? 4\n const baselineScores = await scoreAll(args.baselineOutputs, args.judge, concurrency)\n const candidateScores = await scoreAll(args.candidateOutputs, args.judge, concurrency)\n const ci = bootstrapCi(baselineScores, candidateScores, {\n ...(args.alpha !== undefined ? { alpha: args.alpha } : {}),\n ...(args.iterations !== undefined ? { iterations: args.iterations } : {}),\n ...(args.seed !== undefined ? { seed: args.seed } : {}),\n })\n return {\n ...ci,\n baselineSamples: baselineScores.length,\n candidateSamples: candidateScores.length,\n }\n}\n\nasync function scoreAll<TOutput>(\n outputs: TOutput[],\n judge: (output: TOutput) => Promise<number> | number,\n concurrency: number,\n): Promise<number[]> {\n const results: number[] = new Array(outputs.length)\n let next = 0\n async function worker(): Promise<void> {\n while (true) {\n const i = next++\n if (i >= outputs.length) return\n const v = await judge(outputs[i]!)\n results[i] = Number.isFinite(v) ? 
v : 0\n }\n }\n await Promise.all(Array.from({ length: Math.max(1, concurrency) }, () => worker()))\n return results\n}\n"],"mappings":";;;;;AA0HA,IAAM,qBAA4D;AAAA,EAChE,eAAe;AAAA,EACf,kBAAkB;AAAA,EAClB,eAAe;AAAA,EACf,gBAAgB;AAAA,EAChB,gBAAgB;AAAA,EAChB,aAAa;AAAA,EACb,cAAc;AAAA,EACd,eAAe;AAAA,EACf,gBAAgB,OAAO;AAAA,EACvB,cAAc,OAAO;AAAA,EACrB,uBAAuB;AAAA,EACvB,uBAAuB;AACzB;AAEO,SAAS,wCACd,QACwB;AACxB,SAAO,OAAO,IAAI,CAAC,WAAW;AAAA,IAC5B,YAAY,MAAM;AAAA,IAClB,aAAa,MAAM;AAAA,IACnB,OAAO,MAAM,UAAU,YAAY,YAAY,MAAM,UAAU,QAAQ,QAAQ;AAAA,IAC/E,OAAO,MAAM;AAAA,IACb,IAAI,MAAM;AAAA,IACV,WAAW,MAAM,QAAQ,MAAM,OAAO,KAAK,IAAI,MAAM,MAAM,MAAM,SAAS;AAAA,IAC1E,SAAS,MAAM;AAAA,IACf,YAAY,MAAM;AAAA,IAClB,aAAa,MAAM,QAAQ,kBAAkB;AAAA,IAC7C,KAAK,MAAM;AAAA,IACX,UAAU,MAAM;AAAA,EAClB,EAAE;AACJ;AAEO,SAAS,0BAA0B,OAA2D;AACnG,QAAM,aAAa,EAAE,GAAG,oBAAoB,GAAG,MAAM,WAAW;AAChE,QAAM,cAAc,MAAM,eAAe;AACzC,QAAM,OAAO,gBAAgB,MAAM,QAAQ,CAAC,GAAG,aAAa,MAAM,UAAU;AAC5E,QAAM,SAAS,qBAAqB,MAAM,UAAU,CAAC,GAAG,aAAa,MAAM,UAAU;AACrF,QAAM,YAAY,MAAM,aAAa,CAAC;AACtC,QAAM,gBAAgB,MAAM,SAAS,iBAAiB,UAAU;AAChE,QAAM,cAAc,MAAM,SAAS,eAAe,oBAAoB,SAAS;AAC/E,QAAM,eAAe,UAAU,MAAM,QAAQ;AAC7C,QAAM,gBAAgB,UAAU,MAAM,SAAS;AAC/C,QAAM,YAAY,CAAC,GAAG,cAAc,GAAG,aAAa;AACpD,QAAM,cAAc,OAAO,IAAI,CAAC,MAAM,EAAE,KAAK,EAAE,OAAO,cAAc;AACpE,QAAM,gBAAgB,UAAU,SAAS,IAAI,YAAY;AACzD,QAAM,aAAa,KAAK,OAAO,CAAC,MAAM,EAAE,aAAa,QAAQ,EAAE;AAC/D,QAAM,cAAc,KAAK,OAAO,CAAC,MAAM,EAAE,aAAa,SAAS,EAAE;AACjE,QAAM,kBAAkB,KAAK,YAAY;AACzC,QAAM,mBAAmB,KAAK,aAAa;AAC3C,QAAM,UAAoC;AAAA,IACxC;AAAA,IACA;AAAA,IACA;AAAA,IACA,UAAU,SAAS,MAAM,QAAQ,WAAW,qBAAqB;AAAA,IACjE,WAAW,KAAK,aAAa;AAAA,IAC7B;AAAA,IACA;AAAA,IACA,YAAY,SAAS,iBAAiB,gBAAgB;AAAA,IACtD,aAAa,KAAK,CAAC,GAAG,KAAK,IAAI,CAAC,MAAM,EAAE,OAAO,GAAG,GAAG,OAAO,IAAI,CAAC,MAAM,EAAE,OAAO,EAAE,OAAO,cAAc,CAAC,CAAC;AAAA,IACzG,WAAW,WAAW,CAAC,GAAG,KAAK,IAAI,CAAC,MAAM,EAAE,MAAM,GAAG,GAAG,OAAO,IAAI,CAAC,MAAM,EAAE,UAAU,EAAE,OAAO,cAAc,CAAC,GAAG,IAAI;AAAA,IACrH,YAAY,WAAW,MAAM,QAAQ,WAAW,qBAAqB,EAAE;AAAA,IACvE,iBAAiB,WAAW,MAAM,QAAQ,WAAW,qBAAqB,EAAE,OAAO,CAA
C,QAAQ,IAAI,MAAM,EAAE;AAAA,IACxG,kBAAkB,OAAO,OAAO,CAAC,MAAM,EAAE,cAAc,CAAC,EAAE;AAAA,IAC1D,iBAAiB,OAAO,OAAO,CAAC,OAAO,EAAE,aAAa,KAAK,CAAC,EAAE;AAAA,IAC9D;AAAA,IACA,cAAc,aAAa,SAAS;AAAA,IACpC,mBAAmB,kBAAkB,MAAM,QAAQ,WAAW,qBAAqB;AAAA,IACnF,0BAA0B,yBAAyB,MAAM;AAAA,EAC3D;AAEA,QAAM,SAAmC,CAAC;AAC1C,cAAY,OAAO,YAAY,SAAS,MAAM;AAC9C,eAAa,YAAY,SAAS,MAAM;AACxC,sBAAoB,MAAM,gBAAgB,MAAM,YAAY,SAAS,MAAM;AAC3E,mBAAiB,YAAY,SAAS,MAAM;AAC5C,kBAAgB,YAAY,SAAS,MAAM;AAE3C,QAAM,OAAO,UAAU,SAAS,YAAY,MAAM,gBAAgB,MAAM,MAAM;AAC9E,QAAM,SAAS,OAAO,KAAK,CAAC,MAAM,EAAE,aAAa,UAAU,IAAI,SAC3D,OAAO,SAAS,IAAI,SACpB;AAEJ,SAAO;AAAA,IACL,QAAQ,MAAM;AAAA,IACd;AAAA,IACA,YAAY,MAAM,cAAc;AAAA,IAChC;AAAA,IACA,SAAS,WAAW,WAAW,MAAM,eAAe,MAAM,aAAa,UAAU;AAAA,IACjF;AAAA,IACA;AAAA,IACA;AAAA,IACA,SAAS,MAAM,WAAW;AAAA,IAC1B,cAAc,MAAM,gBAAgB;AAAA,IACpC,SAAS,cAAc,MAAM,QAAQ,QAAQ,SAAS,MAAM;AAAA,EAC9D;AACF;AAEO,SAAS,wBAAwB,OAA2D;AACjG,QAAM,YAAY,0BAA0B,KAAK;AACjD,MAAI,UAAU,WAAW,QAAQ;AAC/B,UAAM,IAAI,MAAM,UAAU,OAAO;AAAA,EACnC;AACA,SAAO;AACT;AAEA,SAAS,gBACP,MACA,aACA,YACa;AACb,MAAI,YAAa,QAAO,KAAK,OAAO,CAAC,MAAM,EAAE,gBAAgB,WAAW;AACxE,MAAI,WAAY,QAAO,KAAK,OAAO,CAAC,MAAM,EAAE,gBAAgB,UAAU;AACtE,SAAO,CAAC,GAAG,IAAI;AACjB;AAEA,SAAS,qBACP,QACA,aACA,YACwB;AACxB,MAAI,YAAa,QAAO,OAAO,OAAO,CAAC,MAAM,EAAE,gBAAgB,UAAa,EAAE,gBAAgB,WAAW;AACzG,MAAI,WAAY,QAAO,OAAO,OAAO,CAAC,MAAM,EAAE,gBAAgB,UAAa,EAAE,gBAAgB,UAAU;AACvG,SAAO,CAAC,GAAG,MAAM;AACnB;AAEA,SAAS,YACP,OACA,YACA,SACA,QACM;AACN,MAAI,WAAW,iBAAiB,CAAC,MAAM,YAAY,MAAM,WAAW,UAAU,OAAO,GAAG;AACtF,WAAO,KAAK,EAAE,MAAM,UAAU,UAAU,YAAY,MAAM,kBAAkB,QAAQ,6CAA6C,CAAC;AAAA,EACpI;AACA,MAAI,QAAQ,gBAAgB,WAAW,kBAAkB;AACvD,WAAO,KAAK,EAAE,MAAM,UAAU,UAAU,YAAY,MAAM,iBAAiB,QAAQ,GAAG,QAAQ,aAAa,sBAAsB,WAAW,gBAAgB,IAAI,CAAC;AAAA,EACnK;AACA,MAAI,WAAW,kBAAkB,QAAQ,YAAY,YAAY,GAAG;AAClE,WAAO,KAAK,EAAE,MAAM,UAAU,UAAU,YAAY,MAAM,yBAAyB,QAAQ,mCAAmC,CAAC;AAAA,EACjI;AACF;AAEA,SAAS,aACP,YACA,SACA,QACM;AACN,MAAI,QAAQ,aAAa,WAAW,eAAe;AACjD,WAAO,KAAK,EAAE,MAAM,WAAW,UAAU,YAAY,MAAM,mBAAmB,QAAQ,GAAG,QAAQ,UAAU,wBAAwB,WAAW,aAAa,IAAI,CAAC;AAAA,EA
ClK;AACA,MAAI,QAAQ,WAAW,WAAW,aAAa;AAC7C,WAAO,KAAK,EAAE,MAAM,WAAW,UAAU,YAAY,MAAM,iBAAiB,QAAQ,YAAY,IAAI,QAAQ,QAAQ,CAAC,MAAM,IAAI,WAAW,WAAW,CAAC,IAAI,CAAC;AAAA,EAC7J;AACA,MAAI,QAAQ,YAAY,WAAW,cAAc;AAC/C,WAAO,KAAK,EAAE,MAAM,WAAW,UAAU,YAAY,MAAM,kBAAkB,QAAQ,aAAa,IAAI,QAAQ,SAAS,CAAC,MAAM,IAAI,WAAW,YAAY,CAAC,IAAI,CAAC;AAAA,EACjK;AACF;AAEA,SAAS,oBACP,cACA,YACA,SACA,QACM;AACN,MAAI,WAAW,kBAAkB,QAAQ,cAAc,WAAW,gBAAgB;AAChF,WAAO,KAAK,EAAE,MAAM,kBAAkB,UAAU,YAAY,MAAM,oBAAoB,QAAQ,GAAG,QAAQ,WAAW,yBAAyB,WAAW,cAAc,IAAI,CAAC;AAAA,EAC7K;AACA,MAAI,OAAO,SAAS,QAAQ,UAAU,KAAK,QAAQ,aAAa,WAAW,eAAe;AACxF,WAAO,KAAK,EAAE,MAAM,kBAAkB,UAAU,YAAY,MAAM,eAAe,QAAQ,sBAAsB,IAAI,QAAQ,UAAU,CAAC,MAAM,IAAI,WAAW,aAAa,CAAC,IAAI,CAAC;AAAA,EAChL;AACA,MAAI,gBAAgB,CAAC,aAAa,SAAS;AACzC,WAAO,KAAK,EAAE,MAAM,kBAAkB,UAAU,YAAY,MAAM,QAAQ,aAAa,iBAAiB,QAAQ,IAAI,QAAQ,aAAa,OAAO,CAAC;AAAA,EACnJ;AACF;AAEA,SAAS,iBACP,YACA,SACA,QACM;AACN,MAAI,CAAC,WAAW,sBAAuB;AACvC,MAAI,QAAQ,aAAa,QAAQ,iBAAiB;AAChD,WAAO,KAAK;AAAA,MACV,MAAM;AAAA,MACN,UAAU;AAAA,MACV,MAAM;AAAA,MACN,QAAQ,GAAG,QAAQ,aAAa,QAAQ,eAAe;AAAA,IACzD,CAAC;AAAA,EACH;AACF;AAEA,SAAS,gBACP,YACA,SACA,QACM;AACN,MAAI,QAAQ,cAAc,WAAW,gBAAgB;AACnD,WAAO,KAAK,EAAE,MAAM,cAAc,UAAU,YAAY,MAAM,eAAe,QAAQ,eAAe,IAAI,QAAQ,WAAW,CAAC,MAAM,IAAI,WAAW,cAAc,CAAC,IAAI,CAAC;AAAA,EACvK;AACA,MAAI,QAAQ,YAAY,WAAW,cAAc;AAC/C,WAAO,KAAK,EAAE,MAAM,cAAc,UAAU,YAAY,MAAM,kBAAkB,QAAQ,aAAa,IAAI,QAAQ,SAAS,CAAC,MAAM,IAAI,WAAW,YAAY,CAAC,IAAI,CAAC;AAAA,EACpK;AACF;AAEA,SAAS,UACP,SACA,YACA,cACA,QACyB;AACzB,SAAO;AAAA,IACL,KAAK,UAAU,QAAQ,QAAQ,QAAQ,gBAAgB,KAAK,IAAI,GAAG,WAAW,gBAAgB,CAAC,GAAG,GAAG,QAAQ,aAAa,uBAAuB,QAAQ,YAAY,OAAO,EAAE;AAAA,IAC9K,KAAK,WAAW,QAAQ,KAAK,IAAI,QAAQ,UAAU,QAAQ,SAAS,GAAG,YAAY,IAAI,QAAQ,QAAQ,CAAC,cAAc,IAAI,QAAQ,SAAS,CAAC,EAAE;AAAA,IAC9I,KAAK,kBAAkB,QAAQ,gBAAgB,CAAC,aAAa,UAAU,IAAI,SAAS,QAAQ,YAAY,WAAW,aAAa,GAAG,eAAe,QAAQ,WAAW,eAAe,IAAI,QAAQ,UAAU,CAAC,EAAE;AAAA,IAC7M,KAAK,eAAe,QAAQ,QAAQ,eAAe,IAAI,IAAI,QAAQ,kBAAkB,QAAQ,YAAY,mBAAmB,QAAQ,eAAe,IAAI,QAAQ,UAAU,EAAE;AAAA,IAC3K,KAAK,cAAc,QAAQ,gBAAgB,SAAS,UAAU,GAAG,eAAe,IA
AI,QAAQ,WAAW,CAAC,cAAc,IAAI,QAAQ,SAAS,CAAC,EAAE;AAAA,EAChJ;AACF;AAEA,SAAS,KACP,MACA,QACA,OACA,QACuB;AACvB,QAAM,MAAM,OAAO,OAAO,CAAC,MAAM,EAAE,SAAS,IAAI;AAChD,QAAM,SAAS,IAAI,KAAK,CAAC,MAAM,EAAE,aAAa,UAAU,IAAI,SACxD,IAAI,SAAS,IAAI,SACjB;AACJ,SAAO,EAAE,MAAM,QAAQ,OAAO,QAAQ,KAAK,GAAG,OAAO;AACvD;AAEA,SAAS,oBAAoB,WAAqE;AAChG,QAAM,SAAuC,EAAE,OAAO,GAAG,KAAK,GAAG,MAAM,GAAG,SAAS,EAAE;AACrF,aAAW,YAAY,UAAW,QAAO,SAAS,SAAS,OAAO;AAClE,SAAO;AACT;AAEA,SAAS,aAAa,WAA+D;AACnF,QAAM,MAA8B,CAAC;AACrC,aAAW,YAAY,WAAW;AAChC,UAAM,SAAS,SAAS,MAAM,UAAU,SAAS,MAAM,YAAY;AACnE,QAAI,MAAM,KAAK,IAAI,MAAM,KAAK,KAAK;AAAA,EACrC;AACA,SAAO;AACT;AAEA,SAAS,kBACP,MACA,QACA,WACwB;AACxB,QAAM,MAA8B,CAAC;AACrC,aAAW,OAAO,MAAM;AACtB,UAAM,QAAQ,IAAI,QAAQ,gBAAgB,IAAI,QAAQ;AACtD,QAAI,IAAI,eAAgB,UAAU,UAAa,QAAQ,WAAY;AACjE,YAAM,OAAO,IAAI,eAAe;AAChC,UAAI,IAAI,KAAK,IAAI,IAAI,KAAK,KAAK;AAAA,IACjC;AAAA,EACF;AACA,aAAW,SAAS,QAAQ;AAC1B,QAAI,MAAM,eAAe,MAAM,OAAO,SAAU,MAAM,UAAU,UAAa,MAAM,QAAQ,WAAY;AACrG,YAAM,OAAO,MAAM,gBAAgB,MAAM,OAAO,QAAQ,WAAW;AACnE,UAAI,IAAI,KAAK,IAAI,IAAI,KAAK,KAAK;AAAA,IACjC;AAAA,EACF;AACA,SAAO;AACT;AAEA,SAAS,yBAAyB,QAAiE;AACjG,QAAM,MAA8B,CAAC;AACrC,aAAW,SAAS,QAAQ;AAC1B,eAAW,OAAO,MAAM,OAAO,CAAC,GAAG;AACjC,YAAM,UAAU,IAAI,sBAAsB;AAC1C,UAAI,OAAO,KAAK,IAAI,OAAO,KAAK,KAAK;AAAA,IACvC;AAAA,EACF;AACA,SAAO;AACT;AAEA,SAAS,WACP,MACA,QACA,WAC4B;AAC5B,QAAM,MAAkC,CAAC;AACzC,aAAW,OAAO,MAAM;AACtB,UAAM,QAAQ,IAAI,QAAQ,gBAAgB,IAAI,QAAQ;AACtD,QAAI,IAAI,eAAgB,UAAU,UAAa,QAAQ,WAAY;AACjE,YAAM,YAAY,IAAI,QAAQ,IAAI;AAClC,UAAI,KAAK,EAAE,QAAQ,OAAO,cAAc,YAAY,YAAY,EAAE,CAAC;AAAA,IACrE;AAAA,EACF;AACA,aAAW,SAAS,QAAQ;AAC1B,QAAI,MAAM,eAAe,MAAM,OAAO,SAAU,MAAM,UAAU,UAAa,MAAM,QAAQ,WAAY;AACrG,UAAI,KAAK,EAAE,SAAS,MAAM,KAAK,UAAU,KAAK,EAAE,CAAC;AAAA,IACnD;AAAA,EACF;AACA,SAAO;AACT;AAEA,SAAS,SACP,MACA,QACA,WACQ;AACR,QAAM,WAAW;AAAA,IACf,GAAG,KAAK,IAAI,CAAC,QAAQ;AACnB,YAAM,QAAQ,IAAI,QAAQ,gBAAgB,IAAI,QAAQ;AACtD,aAAO,CAAC,IAAI,eAAe,UAAU,UAAa,SAAS;AAAA,IAC7D,CAAC;AAAA,IACD,GAAG,OAAO,IAAI,CAAC,UAAU,MAAM,OAAO,UAAU,MAAM,UAAU,UAAa,MAAM,SAAS,UAAU;AAAA,EACxG;AACA,MAAI
,SAAS,WAAW,EAAG,QAAO;AAClC,SAAO,SAAS,OAAO,OAAO,EAAE,SAAS,SAAS;AACpD;AAEA,SAAS,UAAU,MAA4B,OAA8B;AAC3E,SAAO,KACJ,OAAO,CAAC,QAAQ,IAAI,aAAa,KAAK,EACtC,IAAI,CAAC,QAAQ,UAAU,YAAY,IAAI,QAAQ,eAAe,IAAI,QAAQ,WAAW,EACrF,OAAO,cAAc;AAC1B;AAEA,SAAS,KAAK,IAA+B;AAC3C,MAAI,GAAG,WAAW,EAAG,QAAO,OAAO;AACnC,SAAO,GAAG,OAAO,CAAC,KAAK,MAAM,MAAM,GAAG,CAAC,IAAI,GAAG;AAChD;AAEA,SAAS,WAAW,IAAuB,GAAmB;AAC5D,MAAI,GAAG,WAAW,EAAG,QAAO,OAAO;AACnC,QAAM,SAAS,CAAC,GAAG,EAAE,EAAE,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAC3C,SAAO,OAAO,KAAK,IAAI,OAAO,SAAS,GAAG,KAAK,IAAI,GAAG,KAAK,KAAK,IAAI,OAAO,MAAM,IAAI,CAAC,CAAC,CAAC;AAC1F;AAEA,SAAS,eAAe,OAAiC;AACvD,SAAO,OAAO,UAAU,YAAY,OAAO,SAAS,KAAK;AAC3D;AAEA,SAAS,SAAS,GAAW,GAAmB;AAC9C,MAAI,CAAC,OAAO,SAAS,CAAC,KAAK,CAAC,OAAO,SAAS,CAAC,EAAG,QAAO,OAAO;AAC9D,SAAO,IAAI;AACb;AAEA,SAAS,SAAS,KAAa,QAAwB;AACrD,MAAI,CAAC,OAAO,SAAS,GAAG,EAAG,QAAO;AAClC,MAAI,UAAU,EAAG,QAAO,OAAO,IAAI,IAAI;AACvC,SAAO,QAAQ,IAAI,KAAK,IAAI,GAAG,GAAG,IAAI,MAAM;AAC9C;AAEA,SAAS,gBACP,SACA,YACQ;AACR,QAAM,OAAO,OAAO,SAAS,WAAW,cAAc,KAAK,OAAO,SAAS,QAAQ,WAAW,IAC1F,QAAQ,WAAW,iBAAiB,KAAK,IAAI,QAAQ,aAAa,KAAK,CAAC,IACxE;AACJ,QAAM,UAAU,OAAO,SAAS,WAAW,YAAY,KAAK,OAAO,SAAS,QAAQ,SAAS,IACzF,QAAQ,WAAW,eAAe,KAAK,IAAI,QAAQ,WAAW,KAAK,CAAC,IACpE;AACJ,SAAO,KAAK,IAAI,MAAM,OAAO;AAC/B;AAEA,SAAS,QAAQ,GAAmB;AAClC,MAAI,CAAC,OAAO,SAAS,CAAC,EAAG,QAAO;AAChC,SAAO,KAAK,IAAI,GAAG,KAAK,IAAI,GAAG,CAAC,CAAC;AACnC;AAEA,SAAS,cACP,QACA,QACA,SACA,QACQ;AACR,QAAM,SAAS,sBAAsB,MAAM,KAAK,MAAM;AACtD,QAAM,aAAa,aAAa,QAAQ,aAAa,eAAe,QAAQ,UAAU,gBAAgB,QAAQ,WAAW,aAAa,IAAI,QAAQ,QAAQ,CAAC,cAAc,IAAI,QAAQ,SAAS,CAAC;AAC/L,MAAI,OAAO,WAAW,EAAG,QAAO,GAAG,MAAM,KAAK,UAAU;AACxD,SAAO,GAAG,MAAM,KAAK,UAAU,YAAY,OAAO,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,KAAK,GAAG,CAAC;AAChF;AAEA,SAAS,IAAI,GAAmB;AAC9B,MAAI,CAAC,OAAO,SAAS,CAAC,EAAG,QAAO,OAAO,CAAC;AACxC,SAAO,EAAE,QAAQ,CAAC;AACpB;;;ACnZA,eAAsB,yBACpB,OACyC;AACzC,QAAM,aAAa,MAAM,cAAc;AACvC,QAAM,YAAY,MAAM,aAAa;AACrC,QAAM,YAAY,MAAM,sBAAsB;AAC9C,QAAM,MAAM,QAAQ,MAAM,IAAI;AAE9B,QAAM,WAAW,MAAM,MAAM,SAAS,KAAK;AAC3C,QAAM,gBAAgB,oBAAI,IAAiC;AAC3D,aAAW,K
AAK,UAAU;AACxB,UAAM,MAAM,cAAc,IAAI,EAAE,KAAK,KAAK,CAAC;AAC3C,QAAI,KAAK,CAAC;AACV,kBAAc,IAAI,EAAE,OAAO,GAAG;AAAA,EAChC;AAIA,QAAM,kBAAkB,oBAAI,IAAY;AACxC,aAAW,KAAK,MAAM,MAAM;AAC1B,eAAW,KAAK,OAAO,KAAK,EAAE,QAAQ,GAAG,EAAG,iBAAgB,IAAI,CAAC;AAAA,EACnE;AACA,QAAM,UAAU,MAAM,WAAW,CAAC,GAAG,eAAe;AAIpD,QAAM,UAAoB,CAAC;AAC3B,aAAW,KAAK,SAAS;AACvB,eAAW,KAAK,MAAM,gBAAgB;AACpC,cAAQ,KAAK,EAAE,QAAQ,GAAG,SAAS,GAAG,IAAI,CAAC,GAAG,IAAI,CAAC,EAAE,CAAC;AAAA,IACxD;AAAA,EACF;AAEA,MAAI,SAAS;AACb,MAAI,UAAU;AACd,aAAW,OAAO,MAAM,MAAM;AAC5B,UAAM,KAAK,cAAc,IAAI,IAAI,KAAK;AACtC,QAAI,CAAC,MAAM,GAAG,WAAW,GAAG;AAAE;AAAW;AAAA,IAAS;AAClD,QAAI,gBAAgB;AACpB,eAAW,KAAK,SAAS;AACvB,YAAM,IAAI,IAAI,QAAQ,IAAI,CAAC;AAC3B,UAAI,OAAO,MAAM,YAAY,CAAC,OAAO,SAAS,CAAC,EAAG;AAClD,iBAAW,KAAK,MAAM,gBAAgB;AACpC,cAAM,SAAS,GACZ,IAAI,CAAC,QAAQ,IAAI,QAAQ,CAAC,CAAC,EAC3B,OAAO,CAAC,MAAmB,OAAO,MAAM,YAAY,OAAO,SAAS,CAAC,CAAC;AACzE,YAAI,OAAO,WAAW,EAAG;AACzB,cAAM,IAAI,OAAO,QAAQ,IAAI,GAAG,SAAS;AACzC,YAAI,MAAM,KAAM;AAChB,cAAM,SAAS,QAAQ,KAAK,CAAC,MAAM,EAAE,WAAW,KAAK,EAAE,YAAY,CAAC;AACpE,eAAO,GAAG,KAAK,CAAC;AAChB,eAAO,GAAG,KAAK,CAAC;AAChB,wBAAgB;AAAA,MAClB;AAAA,IACF;AACA,QAAI,cAAe;AAAA,EACrB;AAEA,QAAM,QAA6B,CAAC;AACpC,aAAW,KAAK,SAAS;AACvB,QAAI,EAAE,GAAG,SAAS,WAAY;AAC9B,UAAM,UAAU,SAAS,EAAE,IAAI,EAAE,EAAE;AACnC,UAAM,WAAW,SAAS,aAAa,EAAE,EAAE,GAAG,aAAa,EAAE,EAAE,CAAC;AAChE,UAAM,KAAK,YAAY,EAAE,IAAI,EAAE,IAAI,WAAW,GAAG;AACjD,UAAM,UACJ,KAAK,IAAI,QAAQ,KAAK,MAAM,iBAC1B,KAAK,IAAI,QAAQ,KAAK,MAAM,gBAC5B;AACJ,UAAM,KAAK;AAAA,MACT,QAAQ,EAAE;AAAA,MAAQ,SAAS,EAAE;AAAA,MAAS,GAAG,EAAE,GAAG;AAAA,MAC9C;AAAA,MAAS;AAAA,MAAU,MAAM;AAAA,MAAI;AAAA,IAC/B,CAAC;AAAA,EACH;AAEA,QAAM,WAAW,oBAAI,IAAiC;AACtD,aAAW,KAAK,OAAO;AACrB,UAAM,MAAM,SAAS,IAAI,EAAE,MAAM,KAAK,CAAC;AACvC,QAAI,KAAK,CAAC;AACV,aAAS,IAAI,EAAE,QAAQ,GAAG;AAAA,EAC5B;AACA,QAAM,SAA0B,CAAC,GAAG,SAAS,QAAQ,CAAC,EACnD,IAAI,CAAC,CAAC,QAAQ,EAAE,MAAM;AACrB,UAAM,OAAO,GAAG,OAAO,CAAC,GAAG,MAAO,KAAK,IAAI,EAAE,QAAQ,IAAI,KAAK,IAAI,EAAE,QAAQ,IAAI,IAAI,CAAE;AACtF,WAAO;AAAA,MACL;AAAA,MACA,aAAa,KAAK;AAAA,MAClB,UAAU,KAAK;AAAA,MACf,S
AAS,KAAK;AAAA,MACd,GAAG,KAAK;AAAA,MACR,SAAS,KAAK;AAAA,IAChB;AAAA,EACF,CAAC,EACA,KAAK,CAAC,GAAG,MAAM,KAAK,IAAI,EAAE,QAAQ,IAAI,KAAK,IAAI,EAAE,QAAQ,CAAC;AAE7D,QAAM,qBAAqB,QAAQ,OAAO,CAAC,MAAM,CAAC,SAAS,IAAI,CAAC,CAAC;AAEjE,SAAO,EAAE,OAAO,QAAQ,eAAe,QAAQ,aAAa,SAAS,mBAAmB;AAC1F;AAIA,SAAS,OACP,QACA,UACA,QACA,MACe;AACf,MAAI,OAAO,WAAW,EAAG,QAAO;AAChC,MAAI,SAAS,OAAQ,QAAO,OAAO,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,OAAO;AACvE,MAAI,SAAS,MAAO,QAAO,KAAK,IAAI,GAAG,MAAM;AAE7C,QAAM,SAAS,CAAC,GAAG,QAAQ,EACxB,OAAO,CAAC,MAAM,OAAO,EAAE,QAAQ,MAAM,MAAM,QAAQ,EACnD,KAAK,CAAC,GAAG,MAAM,EAAE,aAAa,EAAE,UAAU;AAC7C,SAAO,OAAO,CAAC,GAAG,QAAQ,MAAM,KAAK;AACvC;AAEA,SAAS,SAAS,GAAa,GAAqB;AAClD,MAAI,EAAE,WAAW,EAAE,UAAU,EAAE,SAAS,EAAG,QAAO,OAAO;AACzD,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,MAAIA,OAAM,GAAG,KAAK,GAAG,KAAK;AAC1B,WAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,IAAAA,QAAO,KAAK;AAAI,UAAM,KAAK;AAAI,UAAM,KAAK;AAAA,EAC5C;AACA,MAAI,OAAO,KAAK,OAAO,EAAG,QAAO,OAAO,KAAK,OAAO,IAAI,IAAI;AAC5D,SAAOA,OAAM,KAAK,KAAK,KAAK,EAAE;AAChC;AAEA,SAAS,aAAa,IAAwB;AAC5C,QAAM,UAAU,GAAG,IAAI,CAAC,GAAG,OAAO,EAAE,GAAG,EAAE,EAAE,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,IAAI,EAAE,CAAC;AACrE,QAAM,IAAI,IAAI,MAAc,GAAG,MAAM;AACrC,WAAS,IAAI,GAAG,IAAI,QAAQ,UAAU;AACpC,QAAI,IAAI;AACR,WAAO,IAAI,IAAI,QAAQ,UAAU,QAAQ,IAAI,CAAC,EAAG,MAAM,QAAQ,CAAC,EAAG,EAAG;AACtE,UAAM,OAAO,IAAI,IAAI,KAAK;AAC1B,aAAS,IAAI,GAAG,KAAK,GAAG,IAAK,GAAE,QAAQ,CAAC,EAAG,CAAC,IAAI;AAChD,QAAI,IAAI;AAAA,EACV;AACA,SAAO;AACT;AAEA,SAAS,YACP,IACA,IACA,YACA,KAC+B;AAC/B,QAAM,IAAI,GAAG;AACb,MAAI,IAAI,EAAG,QAAO,EAAE,KAAK,OAAO,KAAK,MAAM,OAAO,IAAI;AACtD,QAAM,UAAoB,CAAC;AAC3B,WAAS,IAAI,GAAG,IAAI,YAAY,KAAK;AACnC,UAAM,KAAK,IAAI,MAAc,CAAC;AAC9B,UAAM,KAAK,IAAI,MAAc,CAAC;AAC9B,aAAS,IAAI,GAAG,IAAI,GAAG,KAAK;AAC1B,YAAM,MAAM,KAAK,MAAM,IAAI,IAAI,CAAC;AAChC,SAAG,CAAC,IAAI,GAAG,GAAG;AACd,SAAG,CAAC,IAAI,GAAG,GAAG;AAAA,IAChB;AACA,UAAM,IAAI,SAAS,IAAI,EAAE;AA
CzB,QAAI,OAAO,SAAS,CAAC,EAAG,SAAQ,KAAK,CAAC;AAAA,EACxC;AACA,UAAQ,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAC5B,MAAI,QAAQ,WAAW,EAAG,QAAO,EAAE,KAAK,OAAO,KAAK,MAAM,OAAO,IAAI;AACrE,SAAO;AAAA,IACL,KAAK,QAAQ,KAAK,MAAM,QAAQ,QAAQ,MAAM,CAAC;AAAA,IAC/C,MAAM,QAAQ,KAAK,IAAI,QAAQ,SAAS,GAAG,KAAK,MAAM,QAAQ,QAAQ,MAAM,CAAC,CAAC;AAAA,EAChF;AACF;AAEA,SAAS,QAAQ,MAA6B;AAC5C,MAAI,SAAS,OAAW,QAAO,KAAK;AACpC,MAAI,IAAI,SAAS;AACjB,SAAO,MAAM;AACX,QAAK,IAAI,eAAgB;AACzB,QAAI,IAAI;AACR,QAAI,KAAK,KAAK,IAAK,MAAM,IAAK,IAAI,CAAC;AACnC,SAAK,IAAI,KAAK,KAAK,IAAK,MAAM,GAAI,IAAI,EAAE;AACxC,aAAS,IAAK,MAAM,QAAS,KAAK;AAAA,EACpC;AACF;;;AC/LO,SAAS,qBACd,QACA,OAA4B,CAAC,GACP;AACtB,QAAM,IAAI,KAAK,SAAS;AACxB,QAAM,QAAQ,KAAK,SAAS;AAC5B,QAAM,gBAAgB,KAAK,uBAAuB;AAClD,QAAM,OAAO,KAAK,QAAQ;AAC1B,MAAI,KAAK,EAAG,OAAM,IAAI,MAAM,yCAAyC;AACrE,MAAI,SAAS,KAAK,SAAS,EAAG,OAAM,IAAI,MAAM,8CAA8C;AAC5F,MAAI,QAAQ,EAAE,OAAO,SAAS,KAAK,GAAG,KAAK,OAAO,SAAS,KAAK,IAAI,KAAK,KAAK,OAAO,KAAK,OAAO;AAC/F,UAAM,IAAI,MAAM,yDAAoD;AAAA,EACtE;AAEA,QAAM,QAA4B,CAAC;AACnC,MAAI,UAAU;AACd,MAAI,SAAS;AACb,MAAI,kBAAiC;AAGrC,MAAI,MAAM;AACV,MAAI,QAAQ;AACZ,MAAI,QAAQ;AAEZ,WAAS,IAAI,GAAG,IAAI,OAAO,QAAQ,KAAK;AACtC,QAAI,IAAI,OAAO,CAAC;AAChB,QAAI,IAAI,CAAC,KAAK,IAAI,GAAG;AACnB,UAAI,KAAK,IAAI,CAAC,GAAG,KAAK,IAAI,GAAG,CAAC,CAAC;AAC/B,gBAAU;AAAA,IACZ;AAKA,UAAM,QAAQ,UAAU,IAAI,IAAI,MAAM;AACtC,UAAM,SAAS,UAAU,IAAI,IAAI,IAAI,KAAK,IAAI,OAAO,QAAQ,QAAQ,QAAQ,KAAK;AAClF,UAAM,IAAI,IAAI;AACd,UAAM,SAAS,gBAAgB,KAAK,IAAI,GAAG,QAAQ,EAAE;AACrD,QAAI,SAAU,SAAS,SAAS,IAAI,KAAM;AAE1C,UAAM,YAAY,OAAO;AACzB,QAAI,SAAS,UAAW,UAAS;AACjC,QAAI,SAAS,CAAC,UAAW,UAAS,CAAC;AAEnC,aAAS,UAAU,IAAI,SAAS;AAChC,QAAI,CAAC,OAAO,SAAS,MAAM,KAAK,SAAS,EAAG,UAAS;AAErD,WAAO;AACP,aAAS,IAAI;AACb,aAAS;AAET,UAAM,SAAS,KAAK,IAAI,GAAG,IAAI,KAAK,IAAI,QAAQ,MAAM,CAAC;AAKvD,UAAM,KAAK,qBAAqB,KAAK,OAAO,OAAO,GAAG,KAAK;AAE3D,QAAI,WAA+B;AACnC,QAAI,QAAQ,GAAG,OAAO,KAAK,OAAO,GAAG,QAAQ,KAAK,KAAM,YAAW;AAAA,aAC1D,UAAU,IAAI,SAAS,QAAQ,EAAG,YAAW;AAAA,aAC7C,UAAU,IAAI,SAAS,QAAQ,EAAG,YAAW;AAAA,aAC7C,QAAQ,GAAG,OAAO,KAAK,IAAK,YAAW;AAEhD,QAAI,aAAa,cAAc,oBAAoB,
KAAM,mBAAkB;AAE3E,UAAM,KAAK,EAAE,GAAG,OAAO,GAAG,QAAQ,QAAQ,OAAO,GAAG,KAAK,QAAQ,GAAG,MAAM,SAAS,CAAC;AAAA,EACtF;AAEA,QAAM,gBAAgB,MAAM,WAAW,IAAI,aAAa,MAAM,MAAM,SAAS,CAAC,EAAG;AACjF,SAAO,EAAE,OAAO,eAAe,iBAAiB,QAAQ;AAC1D;AAsCO,SAAS,iCACd,OAC0B;AAC1B,QAAM,aAAa,MAAM,YAAY,IAAI,CAAC,MAAM;AAC9C,UAAM,MAAM,qBAAqB,EAAE,QAAQ;AAAA,MACzC,OAAO,MAAM;AAAA,MACb,OAAO,MAAM;AAAA,MACb,MAAM,MAAM;AAAA,IACd,CAAC;AACD,UAAM,OAAO,IAAI,MAAM,IAAI,MAAM,SAAS,CAAC;AAC3C,WAAO;AAAA,MACL,aAAa,EAAE;AAAA,MACf,UAAU,IAAI;AAAA,MACd,iBAAiB,IAAI;AAAA,MACrB,aAAa,MAAM,UAAU;AAAA,MAC7B,aAAa,MAAM,UAAU;AAAA,MAC7B,OAAO,IAAI,MAAM;AAAA,MACjB,OAAO,MAAM,SAAS,OAAO;AAAA,MAC7B,QAAQ,MAAM,UAAU,OAAO;AAAA,IACjC;AAAA,EACF,CAAC;AAED,QAAM,UAAU,WAAW,KAAK,CAAC,MAAM,EAAE,aAAa,aAAa;AACnE,MAAI,QAAS,QAAO,EAAE,YAAY,gBAAgB,EAAE,UAAU,eAAe,aAAa,QAAQ,YAAY,EAAE;AAChH,QAAM,OAAO,WAAW,KAAK,CAAC,MAAM,EAAE,aAAa,UAAU;AAC7D,MAAI,KAAM,QAAO,EAAE,YAAY,gBAAgB,EAAE,UAAU,YAAY,aAAa,KAAK,EAAE;AAC3F,QAAM,QAAQ,WAAW,KAAK,CAAC,MAAM,EAAE,aAAa,YAAY;AAChE,MAAI,MAAO,QAAO,EAAE,YAAY,gBAAgB,EAAE,UAAU,cAAc,aAAa,MAAM,YAAY,EAAE;AAC3G,SAAO,EAAE,YAAY,gBAAgB,EAAE,UAAU,cAAc,aAAa,KAAK,EAAE;AACrF;AASA,SAAS,qBACP,KACA,OACA,GACA,OACA,OAC+B;AAC/B,MAAI,MAAM,EAAG,QAAO,EAAE,KAAK,CAAC,OAAO,MAAM,MAAM;AAC/C,QAAMC,QAAO,MAAM;AACnB,QAAM,WAAW,KAAK,IAAI,GAAG,QAAQ,IAAIA,QAAOA,KAAI;AAKpD,QAAM,MAAM,KAAK,IAAI,IAAI,KAAK,IAAI,MAAM,KAAK,IAAI,KAAK,IAAI,KAAK,IAAI,KAAK,GAAG,CAAC,CAAC,IAAI,CAAC;AAClF,QAAM,SAAS,KAAK,KAAM,IAAI,WAAW,MAAO,CAAC,IAAK,IAAI,QAAQ,MAAO;AACzE,SAAO,EAAE,KAAKA,QAAO,QAAQ,MAAMA,QAAO,OAAO;AACnD;;;ACtPO,SAAS,oBACd,WACA,UAAsC,CAAC,GAC/B;AACR,QAAM,QAAQ,QAAQ,SAAS,mBAAmB,UAAU,MAAM;AAClE,QAAM,QAAkB,CAAC;AACzB,QAAM,KAAK,KAAK,KAAK,EAAE;AACvB,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,aAAa,UAAU,OAAO,YAAY,CAAC,IAAI;AAC1D,QAAM,KAAK,cAAc,UAAU,UAAU,QAAQ,IAAI,IAAI;AAC7D,MAAI,UAAU,YAAa,OAAM,KAAK,gBAAgB,UAAU,WAAW,IAAI;AAC/E,MAAI,UAAU,WAAY,OAAM,KAAK,eAAe,UAAU,UAAU,IAAI;AAC5E,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,UAAU,OAAO;AAC5B,QAAM,KAAK,EAAE;AAEb,QAAM,KAAK,YAAY;AACvB,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,oBAAoB;AAC/B,QAAM,KAAK,Y
AAY;AACvB,QAAM,KAAK,iBAAiB,UAAU,QAAQ,aAAa,IAAI;AAC/D,QAAM,KAAK,mBAAmB,UAAU,QAAQ,UAAU,IAAI;AAC9D,QAAM,KAAK,oBAAoB,UAAU,QAAQ,WAAW,IAAI;AAChE,QAAM,KAAK,iBAAiB,IAAI,UAAU,QAAQ,QAAQ,CAAC,IAAI;AAC/D,QAAM,KAAK,kBAAkB,IAAI,UAAU,QAAQ,SAAS,CAAC,IAAI;AACjE,QAAM,KAAK,mBAAmB,IAAI,UAAU,QAAQ,eAAe,CAAC,IAAI;AACxE,QAAM,KAAK,oBAAoB,IAAI,UAAU,QAAQ,gBAAgB,CAAC,IAAI;AAC1E,QAAM,KAAK,mBAAmB,IAAI,UAAU,QAAQ,UAAU,CAAC,IAAI;AACnE,QAAM,KAAK,kBAAkB,IAAI,UAAU,QAAQ,WAAW,CAAC,IAAI;AACnE,QAAM,KAAK,qBAAqB,KAAK,MAAM,UAAU,QAAQ,SAAS,CAAC,OAAO;AAC9E,QAAM,KAAK,EAAE;AAEb,MAAI,UAAU,OAAO,SAAS,GAAG;AAC/B,UAAM,KAAK,WAAW;AACtB,UAAM,KAAK,EAAE;AACb,eAAW,SAAS,UAAU,QAAQ;AACpC,YAAM,KAAK,OAAO,MAAM,QAAQ,QAAQ,MAAM,IAAI,OAAO,MAAM,IAAI,MAAM,MAAM,MAAM,EAAE;AAAA,IACzF;AACA,UAAM,KAAK,EAAE;AAAA,EACf;AAEA,QAAM,WAAW,QAAQ,UAAU,QAAQ,wBAAwB;AACnE,MAAI,SAAS,SAAS,GAAG;AACvB,UAAM,KAAK,yBAAyB;AACpC,UAAM,KAAK,EAAE;AACb,eAAW,CAAC,SAAS,KAAK,KAAK,SAAU,OAAM,KAAK,KAAK,OAAO,KAAK,KAAK,EAAE;AAC5E,UAAM,KAAK,EAAE;AAAA,EACf;AAEA,QAAM,WAAW,QAAQ,UAAU,QAAQ,iBAAiB;AAC5D,MAAI,SAAS,SAAS,GAAG;AACvB,UAAM,KAAK,kBAAkB;AAC7B,UAAM,KAAK,EAAE;AACb,eAAW,CAAC,MAAM,KAAK,KAAK,SAAU,OAAM,KAAK,KAAK,IAAI,KAAK,KAAK,EAAE;AACtE,UAAM,KAAK,EAAE;AAAA,EACf;AAEA,MAAI,QAAQ,QAAQ,QAAQ,KAAK,SAAS,GAAG;AAC3C,UAAM,KAAK,gBAAgB;AAC3B,UAAM,KAAK,EAAE;AACb,UAAM,KAAK,aAAa,CAAC,GAAG,QAAQ,IAAI,GAAG;AAAA,MACzC,YAAY,QAAQ,cAAc,UAAU,cAAc;AAAA,MAC1D,OAAO;AAAA,IACT,CAAC,EAAE,QAAQ;AACX,UAAM,KAAK,EAAE;AAAA,EACf;AAEA,MAAI,QAAQ,wBAAwB,QAAQ,qBAAqB,SAAS,GAAG;AAC3E,UAAM,KAAK,0BAA0B;AACrC,UAAM,KAAK,EAAE;AACb,eAAW,WAAW,QAAQ,qBAAsB,OAAM,KAAK,KAAK,OAAO,EAAE;AAC7E,UAAM,KAAK,EAAE;AAAA,EACf;AAEA,QAAM,cAAc,QAAQ,eAAe,mBAAmB,SAAS;AACvE,MAAI,YAAY,SAAS,GAAG;AAC1B,UAAM,KAAK,iBAAiB;AAC5B,UAAM,KAAK,EAAE;AACb,eAAW,UAAU,YAAa,OAAM,KAAK,KAAK,MAAM,EAAE;AAC1D,UAAM,KAAK,EAAE;AAAA,EACf;AAEA,SAAO,MAAM,KAAK,IAAI,EAAE,QAAQ,IAAI;AACtC;AAEA,SAAS,mBAAmB,WAAiD;AAC3E,MAAI,UAAU,QAAS,QAAO,CAAC,kDAAkD;AACjF,SAAO,UAAU,OACd,OAAO,CAAC,UAAU,MAAM,aAAa,UAAU,EAC/C,IAAI,CAAC,UAAU,WAAW,MAAM,IAAI,KAAK,MAAM,MAAM,EAAE;AAC5D;AAEA,SAAS,QAAQ,QAAyD;AA
CxE,SAAO,OAAO,QAAQ,MAAM,EACzB,OAAO,CAAC,CAAC,EAAE,KAAK,MAAM,QAAQ,CAAC,EAC/B,KAAK,CAAC,GAAG,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,KAAK,EAAE,CAAC,EAAE,cAAc,EAAE,CAAC,CAAC,CAAC;AAC3D;AAEA,SAAS,IAAI,OAAuB;AAClC,SAAO,OAAO,SAAS,KAAK,IAAI,IAAI,QAAQ,KAAK,QAAQ,CAAC,CAAC,MAAM;AACnE;AAEA,SAAS,IAAI,OAAuB;AAClC,SAAO,OAAO,SAAS,KAAK,IAAI,MAAM,QAAQ,CAAC,IAAI;AACrD;;;ACpDO,SAASC,aACd,UACA,WACA,UAA4B,CAAC,GACZ;AACjB,QAAM,QAAQ,QAAQ,SAAS;AAC/B,QAAM,aAAa,QAAQ,cAAc;AACzC,QAAM,WAAW,QAAQ,mBAAmB;AAC5C,QAAM,MAAM,WAAW,QAAQ,QAAQ,SAAS,UAAU,SAAS,CAAC;AAEpE,QAAM,eAAeC,MAAK,QAAQ;AAClC,QAAM,gBAAgBA,MAAK,SAAS;AACpC,QAAM,QAAQ,gBAAgB;AAE9B,MAAI,SAAS,SAAS,UAAU,SAAS,YAAY,SAAS,WAAW,KAAK,UAAU,WAAW,GAAG;AACpG,WAAO;AAAA,MACL;AAAA,MACA;AAAA,MACA;AAAA,MACA,SAAS;AAAA,MACT,SAAS;AAAA,MACT,YAAY;AAAA,MACZ;AAAA,MACA,SAAS;AAAA,IACX;AAAA,EACF;AAEA,QAAM,SAAmB,IAAI,MAAM,UAAU;AAC7C,WAAS,IAAI,GAAG,IAAI,YAAY,KAAK;AACnC,UAAM,YAAY,SAAS,UAAU,GAAG;AACxC,UAAM,YAAY,SAAS,WAAW,GAAG;AACzC,WAAO,CAAC,IAAIA,MAAK,SAAS,IAAIA,MAAK,SAAS;AAAA,EAC9C;AACA,SAAO,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAC3B,QAAM,WAAW,KAAK,MAAO,QAAQ,IAAK,UAAU;AACpD,QAAM,WAAW,KAAK,OAAO,IAAI,QAAQ,KAAK,UAAU,IAAI;AAC5D,QAAM,UAAU,OAAO,KAAK,IAAI,GAAG,QAAQ,CAAC;AAC5C,QAAM,UAAU,OAAO,KAAK,IAAI,aAAa,GAAG,QAAQ,CAAC;AAEzD,MAAI;AACJ,MAAI,UAAU,EAAG,WAAU;AAAA,WAClB,UAAU,EAAG,WAAU;AAAA,WACvB,SAAS,EAAG,WAAU;AAAA,MAC1B,WAAU;AAEf,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACF;AAEA,SAASA,MAAK,IAAsB;AAClC,MAAI,GAAG,WAAW,EAAG,QAAO;AAC5B,MAAI,IAAI;AACR,aAAW,KAAK,GAAI,MAAK;AACzB,SAAO,IAAI,GAAG;AAChB;AAEA,SAAS,SAAS,IAAc,KAA6B;AAC3D,QAAM,MAAM,IAAI,MAAM,GAAG,MAAM;AAC/B,WAAS,IAAI,GAAG,IAAI,GAAG,QAAQ,IAAK,KAAI,CAAC,IAAI,GAAG,KAAK,MAAM,IAAI,IAAI,GAAG,MAAM,CAAC;AAC7E,SAAO;AACT;AAGA,SAAS,WAAW,MAA4B;AAC9C,MAAI,IAAI,SAAS;AACjB,SAAO,MAAM;AACX,SAAK;AACL,QAAI,IAAI;AACR,QAAI,KAAK,KAAK,IAAK,MAAM,IAAK,IAAI,CAAC;AACnC,SAAK,IAAI,KAAK,KAAK,IAAK,MAAM,GAAI,IAAI,EAAE;AACxC,aAAS,IAAK,MAAM,QAAS,KAAK;AAAA,EACpC;AACF;AAGA,SAAS,SAAS,GAAa,GAAqB;AAClD,MAAI,IAAI;AACR,aAAW,KAAK,C
AAC,GAAG,GAAG,GAAG,CAAC,GAAG;AAC5B,UAAM,OAAO,IAAI,aAAa,CAAC,CAAC,CAAC;AACjC,UAAM,QAAQ,IAAI,WAAW,KAAK,MAAM;AACxC,eAAW,QAAQ,OAAO;AACxB,WAAK;AACL,UAAI,KAAK,KAAK,GAAG,QAAQ;AAAA,IAC3B;AAAA,EACF;AACA,SAAO,MAAM;AACf;AA8BA,eAAsB,gBACpB,MACkF;AAClF,QAAM,cAAc,KAAK,oBAAoB;AAC7C,QAAM,iBAAiB,MAAM,SAAS,KAAK,iBAAiB,KAAK,OAAO,WAAW;AACnF,QAAM,kBAAkB,MAAM,SAAS,KAAK,kBAAkB,KAAK,OAAO,WAAW;AACrF,QAAM,KAAKD,aAAY,gBAAgB,iBAAiB;AAAA,IACtD,GAAI,KAAK,UAAU,SAAY,EAAE,OAAO,KAAK,MAAM,IAAI,CAAC;AAAA,IACxD,GAAI,KAAK,eAAe,SAAY,EAAE,YAAY,KAAK,WAAW,IAAI,CAAC;AAAA,IACvE,GAAI,KAAK,SAAS,SAAY,EAAE,MAAM,KAAK,KAAK,IAAI,CAAC;AAAA,EACvD,CAAC;AACD,SAAO;AAAA,IACL,GAAG;AAAA,IACH,iBAAiB,eAAe;AAAA,IAChC,kBAAkB,gBAAgB;AAAA,EACpC;AACF;AAEA,eAAe,SACb,SACA,OACA,aACmB;AACnB,QAAM,UAAoB,IAAI,MAAM,QAAQ,MAAM;AAClD,MAAI,OAAO;AACX,iBAAe,SAAwB;AACrC,WAAO,MAAM;AACX,YAAM,IAAI;AACV,UAAI,KAAK,QAAQ,OAAQ;AACzB,YAAM,IAAI,MAAM,MAAM,QAAQ,CAAC,CAAE;AACjC,cAAQ,CAAC,IAAI,OAAO,SAAS,CAAC,IAAI,IAAI;AAAA,IACxC;AAAA,EACF;AACA,QAAM,QAAQ,IAAI,MAAM,KAAK,EAAE,QAAQ,KAAK,IAAI,GAAG,WAAW,EAAE,GAAG,MAAM,OAAO,CAAC,CAAC;AAClF,SAAO;AACT;","names":["num","mean","bootstrapCi","mean"]}
|
|
@@ -3,8 +3,26 @@ import {
|
|
|
3
3
|
} from "./chunk-YUFXO3TU.js";
|
|
4
4
|
import {
|
|
5
5
|
pairedBootstrap,
|
|
6
|
-
pairedWilcoxon
|
|
7
|
-
|
|
6
|
+
pairedWilcoxon,
|
|
7
|
+
researchReport
|
|
8
|
+
} from "./chunk-IOXMGMHQ.js";
|
|
9
|
+
import {
|
|
10
|
+
RunIntegrityError,
|
|
11
|
+
assertRunCaptured
|
|
12
|
+
} from "./chunk-QUKKGHTZ.js";
|
|
13
|
+
import {
|
|
14
|
+
TraceEmitter
|
|
15
|
+
} from "./chunk-5IIQKMD5.js";
|
|
16
|
+
import {
|
|
17
|
+
canonicalize,
|
|
18
|
+
hashJson
|
|
19
|
+
} from "./chunk-6M774GY6.js";
|
|
20
|
+
import {
|
|
21
|
+
assertLlmRoute
|
|
22
|
+
} from "./chunk-KAO3Q65R.js";
|
|
23
|
+
import {
|
|
24
|
+
FileSystemRawProviderSink
|
|
25
|
+
} from "./chunk-SQQLHODJ.js";
|
|
8
26
|
|
|
9
27
|
// src/feedback-trajectory.ts
|
|
10
28
|
var DEFAULT_SPLIT_POLICY = {
|
|
@@ -253,7 +271,7 @@ function renderPreferenceMemoryMarkdown(entries) {
|
|
|
253
271
|
return lines.join("\n").trim() + "\n";
|
|
254
272
|
}
|
|
255
273
|
/**
 * Serialize feedback trajectories as JSON Lines.
 *
 * The input array is copied before sorting, so the caller's array is left
 * untouched. Rows are ordered by `id`, each row is passed through
 * `canonicalize2` before `JSON.stringify`, and the output always ends with a
 * newline (an empty input therefore yields "\n").
 *
 * NOTE(review): ordering uses the default-locale `localeCompare`, so the row
 * order may depend on the runtime's locale data — confirm this is acceptable
 * for artifacts that are expected to be byte-stable across machines.
 *
 * @param {Array<{id: string}>} trajectories - Trajectories to serialize.
 * @returns {string} One JSON document per line, newline-terminated.
 */
function serializeFeedbackTrajectoriesJsonl(trajectories) {
  const ordered = trajectories.slice();
  ordered.sort((left, right) => left.id.localeCompare(right.id));
  const rows = ordered.map((entry) => {
    const canonical = canonicalize2(entry);
    return JSON.stringify(canonical);
  });
  return rows.join("\n") + "\n";
}
|
|
258
276
|
function parseFeedbackTrajectoriesJsonl(jsonl) {
|
|
259
277
|
const trajectories = [];
|
|
@@ -370,12 +388,12 @@ function stableHash(input) {
|
|
|
370
388
|
}
|
|
371
389
|
return hash >>> 0;
|
|
372
390
|
}
|
|
373
|
-
function
|
|
391
|
+
/**
 * Recursively rebuild a JSON-like value so that every object's keys are
 * inserted in sorted order, giving `JSON.stringify` a stable key order.
 *
 * - Primitives and `null` are returned unchanged.
 * - Values exposing a `toJSON` method (e.g. `Date`) are first replaced by the
 *   result of that method, mirroring what `JSON.stringify` would do. Without
 *   this step such values have no enumerable keys and would collapse to `{}`.
 * - Arrays keep their element order; each element is canonicalized.
 * - Other objects are copied into a fresh plain object, own enumerable string
 *   keys only, in `Array.prototype.sort` default order.
 *
 * @param {*} value - Value to canonicalize.
 * @returns {*} A canonicalized copy (or the original primitive).
 */
function canonicalize2(value) {
  if (value === null || typeof value !== "object") return value;
  if (typeof value.toJSON === "function") {
    const json = value.toJSON();
    // Guard against a toJSON that returns the receiver itself.
    if (json !== value) return canonicalize2(json);
  }
  if (Array.isArray(value)) return value.map((item) => canonicalize2(item));
  const out = {};
  for (const key of Object.keys(value).sort()) {
    out[key] = canonicalize2(value[key]);
  }
  return out;
}
|
|
@@ -867,6 +885,263 @@ function samePopulation(a, b) {
|
|
|
867
885
|
return b.every((id) => setA.has(id));
|
|
868
886
|
}
|
|
869
887
|
|
|
888
|
+
// src/eval-campaign.ts
|
|
889
|
+
/**
 * Baseline integrity requirements. `runEvalCampaign` spreads these first and
 * `opts.integrity` second, so caller-supplied keys override them; the merged
 * object is what gets handed to `assertRunCaptured` for every cell.
 */
var DEFAULT_INTEGRITY = {
  llmSpansMin: 1,
  requireRawCoverageOfLlmSpans: true,
  requireOutcome: true
};

/**
 * Route requirements handed to `assertLlmRoute` when the caller does not
 * provide `opts.routeRequirements`.
 */
var DEFAULT_ROUTE = {
  requireExplicitBaseUrl: true,
  requireAuth: true
};
|
|
898
|
+
/**
 * Run every (variant x scenario x seed) cell of an evaluation campaign and
 * collect run records, integrity reports and failed cells.
 *
 * Flow:
 *  1. Validate the LLM route and the campaign configuration (non-empty,
 *     unique variants/scenarios, known comparator, commit SHA, concurrency).
 *  2. Hash the canonicalized campaign description into `campaignFingerprint`.
 *  3. Execute cells with up to `concurrency` workers pulling from a shared
 *     cursor. Each cell gets its own store, raw sink and trace emitter.
 *  4. Optionally build a research report over the successful runs.
 *
 * Results are gathered per cell index, so `runs`, `integrityReports` and
 * `failedRuns` are always in cell order (variant, then scenario, then seed)
 * regardless of which cell finished first.
 *
 * @param {object} opts - Campaign options. Fields read here: `campaignId`,
 *   `commitSha`, `llmOpts`, `routeRequirements`, `variants` (`id`, `payload`),
 *   `scenarios` (`scenarioId`, `tags`), `seeds` (default `[0, 1, 2]`),
 *   `splitTag` (default `"holdout"`), `concurrency` (default `1`),
 *   `integrity`, `onIntegrityFailure` (default `"mark_failed"`), `now`,
 *   `preregistrationHash`, `rawSinkFactory`, `workDir`, `runId`,
 *   `storeFactory`, `onRunComplete`, `runner`, `report`.
 * @returns {Promise<object>} `{ campaignId, campaignFingerprint,
 *   preregistrationHash, runs, integrityReports, failedRuns, report,
 *   startedAt, endedAt }`.
 * @throws {Error} On invalid configuration, including a `concurrency` that is
 *   not a number.
 * @throws {RunIntegrityError} When a cell fails its integrity check and
 *   `onIntegrityFailure` is `"throw"`.
 * @throws Any non-cell error raised while executing a cell (for example from
 *   `storeFactory`); once that happens no further cells are started.
 */
async function runEvalCampaign(opts) {
  assertLlmRoute(opts.llmOpts, opts.routeRequirements ?? DEFAULT_ROUTE);
  if (opts.variants.length === 0) {
    throw new Error("runEvalCampaign: variants must be non-empty.");
  }
  if (opts.scenarios.length === 0) {
    throw new Error("runEvalCampaign: scenarios must be non-empty.");
  }
  const variantIds = new Set();
  for (const v of opts.variants) {
    if (variantIds.has(v.id)) {
      throw new Error(`runEvalCampaign: duplicate variant id "${v.id}".`);
    }
    variantIds.add(v.id);
  }
  const scenarioIds = new Set();
  for (const s of opts.scenarios) {
    if (scenarioIds.has(s.scenarioId)) {
      throw new Error(`runEvalCampaign: duplicate scenarioId "${s.scenarioId}".`);
    }
    scenarioIds.add(s.scenarioId);
  }
  if (opts.report?.comparator && !variantIds.has(opts.report.comparator)) {
    throw new Error(`runEvalCampaign: report.comparator "${opts.report.comparator}" is not a configured variantId.`);
  }
  if (!opts.commitSha) {
    throw new Error("runEvalCampaign: commitSha is required (every RunRecord needs it).");
  }
  // A NaN concurrency would make Array.from({ length: NaN }) create zero
  // workers, so the campaign would "succeed" with no runs at all.
  const requestedConcurrency = Number(opts.concurrency ?? 1);
  if (Number.isNaN(requestedConcurrency)) {
    throw new Error("runEvalCampaign: concurrency must be a number.");
  }
  const seeds = opts.seeds ?? [0, 1, 2];
  const splitTag = opts.splitTag ?? "holdout";
  const concurrency = Math.max(1, Math.floor(requestedConcurrency));
  const integrity = { ...DEFAULT_INTEGRITY, ...opts.integrity ?? {} };
  const onIntegrityFailure = opts.onIntegrityFailure ?? "mark_failed";
  const now = opts.now ?? (() => Date.now());
  // Trailing slashes are stripped so equivalent base URLs fingerprint alike.
  const baseUrl = (opts.llmOpts.baseUrl ?? "").replace(/\/+$/, "");
  const provider = opts.llmOpts.provider ?? null;
  const preregistrationHash = opts.preregistrationHash ?? null;
  const rawSinkFactory = opts.rawSinkFactory ?? defaultRawSinkFactory(opts.workDir);
  const campaignFingerprint = await hashJson(canonicalize({
    campaignId: opts.campaignId,
    variants: opts.variants.map((v) => v.id).sort(),
    scenarios: opts.scenarios.map((s) => s.scenarioId).sort(),
    seeds: [...seeds].sort((a, b) => a - b),
    splitTag,
    comparator: opts.report?.comparator ?? null,
    baseUrl,
    provider,
    preregistrationHash
  }));
  const cells = [];
  for (const variant of opts.variants) {
    for (const scenario of opts.scenarios) {
      for (const seed of seeds) {
        cells.push({ variant, scenario, seed });
      }
    }
  }
  const startedAt = new Date(now()).toISOString();
  // One slot per cell keeps the collected results in cell order.
  const cellResults = new Array(cells.length);
  let cursor = 0;
  // Set when a worker hits a non-cell error; stops other workers from
  // starting new cells after the campaign has already been rejected.
  let fatal = false;
  async function worker() {
    while (!fatal) {
      const i = cursor++;
      if (i >= cells.length) return;
      const cell = cells[i];
      try {
        const result = await runOneCell(cell);
        cellResults[i] = { ok: true, record: result.record, integrity: result.integrity };
      } catch (err) {
        if (!(err instanceof CellExecutionError)) {
          fatal = true;
          throw err;
        }
        cellResults[i] = { ok: false, failed: err.failed, integrity: err.integrity };
      }
    }
  }
  async function runOneCell(cell) {
    const runId = (opts.runId ?? defaultRunId)({
      campaignId: opts.campaignId,
      runId: "",
      // unused by default generator
      variantId: cell.variant.id,
      scenarioId: cell.scenario.scenarioId,
      seed: cell.seed
    });
    const factoryParams = {
      campaignId: opts.campaignId,
      runId,
      variantId: cell.variant.id,
      scenarioId: cell.scenario.scenarioId,
      seed: cell.seed
    };
    const store = opts.storeFactory(factoryParams);
    const rawSink = rawSinkFactory(factoryParams);
    const emitter = new TraceEmitter(store, {
      runId,
      now: opts.now,
      onRunComplete: opts.onRunComplete
    });
    const llmOpts = {
      ...opts.llmOpts,
      rawSink,
      traceContext: { runId }
    };
    const ctx = {
      runId,
      experimentId: opts.campaignId,
      variant: cell.variant.payload,
      variantId: cell.variant.id,
      scenarioId: cell.scenario.scenarioId,
      scenarioTags: cell.scenario.tags ?? {},
      seed: cell.seed,
      splitTag,
      emitter,
      store,
      rawSink,
      llmOpts
    };
    const wallStart = now();
    let outcome;
    try {
      outcome = await opts.runner(ctx);
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      try {
        await emitter.abortRun(message);
      } catch {
        // Best effort: the runner failure is reported below either way.
      }
      throw new CellExecutionError({
        runId,
        variantId: cell.variant.id,
        scenarioId: cell.scenario.scenarioId,
        seed: cell.seed,
        reason: "runner_threw",
        error: message
      });
    }
    const wallMs = now() - wallStart;
    const integrityReport = await assertRunCaptured(store, runId, { ...integrity, rawSink });
    if (!integrityReport.ok) {
      switch (onIntegrityFailure) {
        case "throw":
          throw new RunIntegrityError(integrityReport);
        case "mark_failed":
          throw new CellExecutionError(
            {
              runId,
              variantId: cell.variant.id,
              scenarioId: cell.scenario.scenarioId,
              seed: cell.seed,
              reason: "integrity_failed",
              error: integrityReport.issues.map((i) => i.code).join(", ")
            },
            integrityReport
          );
        case "log":
          // Keep the run; the failing report is still returned to the caller.
          break;
      }
    }
    const recordOutcome = {
      raw: outcome.raw ?? {}
    };
    // The score lands on the field matching the split being evaluated.
    if (splitTag === "holdout") recordOutcome.holdoutScore = outcome.score;
    else recordOutcome.searchScore = outcome.score;
    const record = {
      runId,
      experimentId: opts.campaignId,
      candidateId: cell.variant.id,
      seed: cell.seed,
      model: outcome.model,
      promptHash: outcome.promptHash,
      configHash: outcome.configHash,
      commitSha: opts.commitSha,
      wallMs,
      costUsd: outcome.costUsd,
      tokenUsage: outcome.tokenUsage,
      judgeMetadata: outcome.judgeMetadata,
      outcome: recordOutcome,
      failureMode: outcome.failureMode,
      splitTag
    };
    return { record, integrity: integrityReport };
  }
  const workers = Array.from({ length: Math.min(concurrency, cells.length) }, () => worker());
  await Promise.all(workers);
  const runs = [];
  const integrityReports = [];
  const failedRuns = [];
  for (const slot of cellResults) {
    if (slot === undefined) continue;
    if (slot.ok) {
      runs.push(slot.record);
      integrityReports.push(slot.integrity);
    } else {
      failedRuns.push(slot.failed);
      if (slot.integrity) integrityReports.push(slot.integrity);
    }
  }
  let report;
  if (opts.report) {
    const reportOpts = {
      ...opts.report,
      comparator: opts.report.comparator,
      split: splitTag === "dev" ? "search" : splitTag,
      generatedAt: new Date(now()).toISOString(),
      preregistrationHash: preregistrationHash ?? void 0
    };
    report = await researchReport(runs, reportOpts);
  }
  const endedAt = new Date(now()).toISOString();
  return {
    campaignId: opts.campaignId,
    campaignFingerprint,
    preregistrationHash,
    runs,
    integrityReports,
    failedRuns,
    report,
    startedAt,
    endedAt
  };
}
|
|
1112
|
+
/**
 * Error describing a single campaign cell that did not produce a run record.
 * `runEvalCampaign` catches it per cell and records `failed` (and `integrity`,
 * when present) instead of aborting the whole campaign.
 */
var CellExecutionError = class extends Error {
  /** Descriptor of the failed cell (variantId, scenarioId, seed, reason, ...). */
  failed;
  /** Integrity report attached to the failure, or undefined when none was given. */
  integrity;
  /**
   * @param {object} failed - Failed-cell descriptor; `variantId`, `scenarioId`,
   *   `seed` and `reason` are used to build the message.
   * @param {object} [integrity] - Optional integrity report for the cell.
   */
  constructor(failed, integrity) {
    const { variantId, scenarioId, seed, reason } = failed;
    super(`cell ${variantId}/${scenarioId}@${seed} failed: ${reason}`);
    Object.assign(this, { failed, integrity });
  }
};
|
|
1121
|
+
/**
 * Build the fallback raw-sink factory used when the caller supplies none.
 *
 * The returned factory creates a `FileSystemRawProviderSink` whose directory
 * is `<workDir>/raw-events/<runId>`. The missing-`workDir` check runs when the
 * factory is invoked, not when it is created.
 *
 * @param {string} [workDir] - Base directory for raw provider events.
 * @returns {(params: {runId: string}) => object} Per-run sink factory.
 * @throws {Error} From the returned factory, when `workDir` is falsy.
 */
function defaultRawSinkFactory(workDir) {
  return (params) => {
    if (workDir) {
      const dir = `${workDir}/raw-events/${params.runId}`;
      return new FileSystemRawProviderSink({ dir });
    }
    throw new Error(
      "runEvalCampaign: rawSinkFactory not supplied and workDir not set. Pass either to enable raw provider capture, or pass `new NoopRawProviderSink()` via rawSinkFactory to opt out explicitly."
    );
  };
}
|
|
1133
|
+
/**
 * Derive a deterministic run id from the cell coordinates.
 *
 * The key `campaignId::variantId::scenarioId::seed` is folded, one UTF-16
 * code unit at a time, through two independent 32-bit multiplicative hashes
 * (the first uses the FNV-1a offset basis and prime). The id is `run-`
 * followed by both hashes as zero-padded 8-digit hex. `params.runId` is not
 * read.
 *
 * NOTE(review): parts are joined with "::" without escaping, so ids that
 * themselves contain "::" can yield the same key — confirm ids never do.
 *
 * @param {{campaignId: *, variantId: *, scenarioId: *, seed: *}} params
 * @returns {string} Id of the form `run-<16 hex digits>`.
 */
function defaultRunId(params) {
  const key = `${params.campaignId}::${params.variantId}::${params.scenarioId}::${params.seed}`;
  const toHex8 = (word) => word.toString(16).padStart(8, "0");
  let first = 2166136261;
  let second = 305419896;
  let pos = 0;
  while (pos < key.length) {
    const unit = key.charCodeAt(pos);
    // `>>> 0` keeps each accumulator an unsigned 32-bit integer.
    first = Math.imul(first ^ unit, 16777619) >>> 0;
    second = Math.imul(second ^ unit, 2654435761) >>> 0;
    pos += 1;
  }
  return `run-${toHex8(first)}${toHex8(second)}`;
}
|
|
1144
|
+
|
|
870
1145
|
// src/multi-shot-optimization.ts
|
|
871
1146
|
async function runMultiShotOptimization(config) {
|
|
872
1147
|
validateConfig(config);
|
|
@@ -1344,6 +1619,7 @@ export {
|
|
|
1344
1619
|
NoopResearcher,
|
|
1345
1620
|
InMemoryTrialCache,
|
|
1346
1621
|
runPromptEvolution,
|
|
1622
|
+
runEvalCampaign,
|
|
1347
1623
|
runMultiShotOptimization,
|
|
1348
1624
|
defaultMultiShotObjectives,
|
|
1349
1625
|
trialTraceFromMultiShotTrial,
|
|
@@ -1351,4 +1627,4 @@ export {
|
|
|
1351
1627
|
buildReflectionPrompt,
|
|
1352
1628
|
parseReflectionResponse
|
|
1353
1629
|
};
|
|
1354
|
-
//# sourceMappingURL=chunk-
|
|
1630
|
+
//# sourceMappingURL=chunk-USHQBPMH.js.map
|