@tangle-network/agent-eval 0.77.0 → 0.80.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +50 -19
- package/dist/adapters/http.d.ts +2 -2
- package/dist/adapters/langchain.d.ts +2 -2
- package/dist/adapters/otel.d.ts +4 -4
- package/dist/{agent-profile-DYRboYWu.d.ts → agent-profile-aSEaJ9Pl.d.ts} +1 -1
- package/dist/analyst/index.d.ts +42 -8
- package/dist/analyst/index.js +32 -2
- package/dist/analyst/index.js.map +1 -1
- package/dist/authenticity/index.d.ts +54 -1
- package/dist/authenticity/index.js +88 -1
- package/dist/authenticity/index.js.map +1 -1
- package/dist/belief-state/index.d.ts +188 -0
- package/dist/belief-state/index.js +486 -0
- package/dist/belief-state/index.js.map +1 -0
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/calibration-Cpr3WaX3.d.ts +101 -0
- package/dist/campaign/index.d.ts +11 -11
- package/dist/campaign/index.js +4 -4
- package/dist/chunk-4DIJWVUT.js +131 -0
- package/dist/chunk-4DIJWVUT.js.map +1 -0
- package/dist/{chunk-7W4SM7FD.js → chunk-5LVWPNS5.js} +91 -91
- package/dist/chunk-5LVWPNS5.js.map +1 -0
- package/dist/{chunk-WYIHD6EB.js → chunk-CF67I6QY.js} +1 -1
- package/dist/chunk-CF67I6QY.js.map +1 -0
- package/dist/{chunk-XPILG2CA.js → chunk-GXHLRXDI.js} +2 -2
- package/dist/{chunk-F3SRAAZO.js → chunk-KWRRMR3J.js} +15 -1
- package/dist/chunk-KWRRMR3J.js.map +1 -0
- package/dist/chunk-NPCTHQIO.js +91 -0
- package/dist/chunk-NPCTHQIO.js.map +1 -0
- package/dist/{chunk-JYE3WOTE.js → chunk-RPLZ4OIB.js} +10 -1
- package/dist/chunk-RPLZ4OIB.js.map +1 -0
- package/dist/{chunk-6EKXFFGQ.js → chunk-RTWFUK6A.js} +2 -2
- package/dist/{chunk-XGNCBAVZ.js → chunk-XQL22JDG.js} +2 -2
- package/dist/{chunk-GJJNJVIR.js → chunk-XXNIODOM.js} +2 -2
- package/dist/contract/index.d.ts +128 -15
- package/dist/contract/index.js +118 -2
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-BgA6BYTm.d.ts → control-CehLtoET.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/control.js +2 -2
- package/dist/governance/index.d.ts +1 -1
- package/dist/hosted/index.d.ts +4 -4
- package/dist/{index-DsnOpCO6.d.ts → index-B1RKber3.d.ts} +1 -1
- package/dist/index.d.ts +127 -26
- package/dist/index.js +32 -7
- package/dist/index.js.map +1 -1
- package/dist/{insight-report-Df3lxYXM.d.ts → insight-report-dlpEzQDi.d.ts} +1 -1
- package/dist/{kind-factory-DW9XWPvM.d.ts → kind-factory-DqV2t1Xk.d.ts} +1 -1
- package/dist/meta-eval/index.d.ts +6 -99
- package/dist/meta-eval/index.js +7 -76
- package/dist/meta-eval/index.js.map +1 -1
- package/dist/off-policy-DiwuKKg7.d.ts +132 -0
- package/dist/openapi.json +1 -1
- package/dist/{outcome-store-D6KWmYvj.d.ts → outcome-store-rnXLEqSn.d.ts} +1 -1
- package/dist/{provenance-B-TFszPW.d.ts → provenance-jG-Gngg8.d.ts} +3 -3
- package/dist/{registry-DuVYiTvw.d.ts → registry-BK0Zee01.d.ts} +1 -1
- package/dist/{release-report-CN8hJlhk.d.ts → release-report-CXXZlR8g.d.ts} +2 -2
- package/dist/reporting.d.ts +5 -5
- package/dist/{researcher-C_KJyIGg.d.ts → researcher-rInLj9De.d.ts} +2 -2
- package/dist/rl.d.ts +10 -140
- package/dist/rl.js +8 -122
- package/dist/rl.js.map +1 -1
- package/dist/{rubric-predictive-validity-D_4BSXGV.d.ts → rubric-predictive-validity-CLPuwiUw.d.ts} +2 -2
- package/dist/{run-improvement-loop-BqYH2vCR.d.ts → run-improvement-loop-BAl_aVOZ.d.ts} +2 -4
- package/dist/{run-record-BgTFzO2r.d.ts → run-record-sItO5ftF.d.ts} +11 -0
- package/dist/{semantic-concept-judge-CV9Wlx4t.d.ts → semantic-concept-judge-qXEUV2w7.d.ts} +3 -3
- package/dist/{summary-report-ByiOUrHj.d.ts → summary-report-BTaXq1TS.d.ts} +1 -1
- package/dist/traces.d.ts +1 -1
- package/dist/traces.js +2 -2
- package/dist/{types-Bba0vl1V.d.ts → types-4mm2msnR.d.ts} +12 -4
- package/dist/{types-CRD68aH7.d.ts → types-DRvV0zRo.d.ts} +10 -1
- package/dist/workflow/index.d.ts +4 -4
- package/dist/workflow/index.js +1 -1
- package/docs/auto-research-loop-end-to-end.md +1 -1
- package/docs/feature-guide.md +4 -4
- package/docs/multi-shot-optimization.md +61 -115
- package/docs/product-eval-adoption.md +1 -1
- package/docs/research/belief-state-agent-eval-roadmap.md +558 -0
- package/docs/research/research-roadmap.md +1 -0
- package/docs/three-package-architecture.md +1 -1
- package/docs/trace-analysis.md +19 -0
- package/package.json +7 -2
- package/dist/chunk-7W4SM7FD.js.map +0 -1
- package/dist/chunk-F3SRAAZO.js.map +0 -1
- package/dist/chunk-JYE3WOTE.js.map +0 -1
- package/dist/chunk-WYIHD6EB.js.map +0 -1
- /package/dist/{chunk-XPILG2CA.js.map → chunk-GXHLRXDI.js.map} +0 -0
- /package/dist/{chunk-6EKXFFGQ.js.map → chunk-RTWFUK6A.js.map} +0 -0
- /package/dist/{chunk-XGNCBAVZ.js.map → chunk-XQL22JDG.js.map} +0 -0
- /package/dist/{chunk-GJJNJVIR.js.map → chunk-XXNIODOM.js.map} +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/agent-profile-cell.ts","../src/run-record.ts"],"sourcesContent":["import { ValidationError } from './errors'\nimport { hashJson } from './pre-registration'\n\nexport type AgentProfileCellSchemaVersion = 'agent-profile-cell/v1'\n\nexport type AgentProfileJson =\n | string\n | number\n | boolean\n | null\n | AgentProfileJson[]\n | { [key: string]: AgentProfileJson }\n\nexport type AgentProfileDimensionValue = string | number | boolean | null\n\nexport interface AgentProfileSource {\n /** Runtime/profile contract being fingerprinted, e.g. `sandbox-agent-profile`. */\n kind: string\n /** sha256 over the canonical source profile object. */\n hash: string\n}\n\nexport interface AgentProfileSourceInput {\n kind: string\n /** Precomputed sha256 for callers that already sign their profile artifact. */\n hash?: string\n /** Full canonical runtime profile; hashed and then discarded from the cell. */\n profile?: AgentProfileJson\n}\n\nexport interface AgentProfileHarness {\n id: string\n version?: string\n hash?: string\n}\n\nexport interface AgentProfileCellInput {\n profileId: string\n sourceProfile: AgentProfileSourceInput\n harness?: AgentProfileHarness\n model?: string\n promptHash?: string\n dimensions?: Record<string, AgentProfileDimensionValue>\n}\n\nexport interface AgentProfileCell {\n schemaVersion: AgentProfileCellSchemaVersion\n cellId: string\n profileId: string\n sourceProfile: AgentProfileSource\n harness?: AgentProfileHarness\n model?: string\n promptHash?: string\n dimensions?: Record<string, AgentProfileDimensionValue>\n}\n\nexport class AgentProfileCellValidationError extends ValidationError {\n readonly path: string\n constructor(message: string, path = '') {\n super(path ? 
`${message} (at ${path})` : message)\n this.path = path\n }\n}\n\nconst SHA256_HEX = /^[0-9a-f]{64}$/\nconst CELL_ID = /^agent-profile-cell:sha256:[0-9a-f]{64}$/\n\nexport async function buildAgentProfileCell(\n input: AgentProfileCellInput,\n): Promise<AgentProfileCell> {\n const material = await normalizeAgentProfileCellInput(input)\n const cellId = `agent-profile-cell:sha256:${await hashJson(material)}`\n return { ...material, cellId }\n}\n\nexport function agentProfileCellHashMaterial(\n cell: AgentProfileCell,\n): Omit<AgentProfileCell, 'cellId'> {\n const { cellId: _cellId, ...material } = cell\n void _cellId\n return normalizeAgentProfileCell(material)\n}\n\nexport async function verifyAgentProfileCell(cell: AgentProfileCell): Promise<boolean> {\n validateAgentProfileCell(cell)\n return (\n cell.cellId ===\n `agent-profile-cell:sha256:${await hashJson(agentProfileCellHashMaterial(cell))}`\n )\n}\n\nexport function validateAgentProfileCell(input: unknown): AgentProfileCell {\n if (input === null || typeof input !== 'object') {\n throw new AgentProfileCellValidationError('expected object')\n }\n const obj = input as Record<string, unknown>\n expectLiteral(obj.schemaVersion, 'agent-profile-cell/v1', 'schemaVersion')\n if (typeof obj.cellId !== 'string' || !CELL_ID.test(obj.cellId)) {\n throw new AgentProfileCellValidationError(\n 'cellId must match agent-profile-cell:sha256:<64 lowercase hex chars>',\n 'cellId',\n )\n }\n expectString(obj.profileId, 'profileId')\n validateSource(obj.sourceProfile, 'sourceProfile')\n if (obj.harness !== undefined) validateHarness(obj.harness, 'harness')\n if (obj.model !== undefined) expectString(obj.model, 'model')\n if (obj.promptHash !== undefined) expectString(obj.promptHash, 'promptHash')\n if (obj.dimensions !== undefined) validateDimensions(obj.dimensions, 'dimensions')\n return input as AgentProfileCell\n}\n\nexport function requireAgentProfileCell(record: {\n runId: string\n agentProfile?: AgentProfileCell\n}): 
AgentProfileCell {\n if (!record.agentProfile) {\n throw new AgentProfileCellValidationError(\n `run \"${record.runId}\" is missing agentProfile; profile-cell grouping requires explicit profile identity`,\n 'agentProfile',\n )\n }\n return validateAgentProfileCell(record.agentProfile)\n}\n\nexport function agentProfileCellKey(record: {\n runId: string\n agentProfile?: AgentProfileCell\n}): string {\n return requireAgentProfileCell(record).cellId\n}\n\nexport async function assertRunAgentProfileCell(record: {\n runId: string\n model: string\n promptHash: string\n agentProfile?: AgentProfileCell\n}): Promise<AgentProfileCell> {\n const profile = requireAgentProfileCell(record)\n if (!(await verifyAgentProfileCell(profile))) {\n throw new AgentProfileCellValidationError(\n `run \"${record.runId}\" has an agentProfile.cellId that does not match its content`,\n 'agentProfile.cellId',\n )\n }\n if (profile.model !== undefined && profile.model !== record.model) {\n throw new AgentProfileCellValidationError(\n `run \"${record.runId}\" agentProfile.model \"${profile.model}\" does not match model \"${record.model}\"`,\n 'agentProfile.model',\n )\n }\n if (profile.promptHash !== undefined && profile.promptHash !== record.promptHash) {\n throw new AgentProfileCellValidationError(\n `run \"${record.runId}\" agentProfile.promptHash \"${profile.promptHash}\" does not match promptHash \"${record.promptHash}\"`,\n 'agentProfile.promptHash',\n )\n }\n return profile\n}\n\nexport function groupRunsByAgentProfileCell<\n T extends { runId: string; agentProfile?: AgentProfileCell },\n>(records: readonly T[]): Map<string, T[]> {\n const groups = new Map<string, T[]>()\n for (const record of records) {\n const key = agentProfileCellKey(record)\n const bucket = groups.get(key)\n if (bucket) bucket.push(record)\n else groups.set(key, [record])\n }\n return groups\n}\n\nasync function normalizeAgentProfileCellInput(\n input: AgentProfileCellInput,\n): Promise<Omit<AgentProfileCell, 
'cellId'>> {\n return normalizeAgentProfileCell({\n schemaVersion: 'agent-profile-cell/v1',\n profileId: input.profileId,\n sourceProfile: await normalizeSourceInput(input.sourceProfile),\n harness: input.harness,\n model: input.model,\n promptHash: input.promptHash,\n dimensions: input.dimensions,\n })\n}\n\nfunction normalizeAgentProfileCell(\n input: Omit<AgentProfileCell, 'cellId'>,\n): Omit<AgentProfileCell, 'cellId'> {\n return compactObject({\n schemaVersion: 'agent-profile-cell/v1' as const,\n profileId: requireNonEmpty(input.profileId, 'profileId'),\n sourceProfile: normalizeSource(input.sourceProfile),\n harness: input.harness ? normalizeHarness(input.harness, 'harness') : undefined,\n model: optionalNonEmpty(input.model, 'model'),\n promptHash: optionalNonEmpty(input.promptHash, 'promptHash'),\n dimensions: input.dimensions\n ? nonEmptyRecord(normalizeDimensions(input.dimensions))\n : undefined,\n })\n}\n\nasync function normalizeSourceInput(input: AgentProfileSourceInput): Promise<AgentProfileSource> {\n const kind = requireNonEmpty(input.kind, 'sourceProfile.kind')\n if (input.hash !== undefined && input.profile !== undefined) {\n throw new AgentProfileCellValidationError(\n 'sourceProfile must provide either hash or profile, not both',\n 'sourceProfile',\n )\n }\n if (input.hash !== undefined) {\n return { kind, hash: requireSha256Hex(input.hash, 'sourceProfile.hash') }\n }\n if (input.profile === undefined) {\n throw new AgentProfileCellValidationError(\n 'sourceProfile must provide hash or profile',\n 'sourceProfile',\n )\n }\n assertJson(input.profile, 'sourceProfile.profile')\n return { kind, hash: await hashJson(input.profile) }\n}\n\nfunction normalizeSource(input: AgentProfileSource): AgentProfileSource {\n return {\n kind: requireNonEmpty(input.kind, 'sourceProfile.kind'),\n hash: requireSha256Hex(input.hash, 'sourceProfile.hash'),\n }\n}\n\nfunction normalizeHarness(input: AgentProfileHarness, path: string): AgentProfileHarness {\n return 
compactObject({\n id: requireNonEmpty(input.id, `${path}.id`),\n version: optionalNonEmpty(input.version, `${path}.version`),\n hash: optionalNonEmpty(input.hash, `${path}.hash`),\n })\n}\n\nfunction normalizeDimensions(\n input: Record<string, AgentProfileDimensionValue>,\n): Record<string, AgentProfileDimensionValue> {\n const out: Record<string, AgentProfileDimensionValue> = {}\n for (const key of Object.keys(input).sort()) {\n const value = input[key]\n requireNonEmpty(key, 'dimensions.<key>')\n if (\n value !== null &&\n typeof value !== 'string' &&\n typeof value !== 'number' &&\n typeof value !== 'boolean'\n ) {\n throw new AgentProfileCellValidationError(\n 'expected primitive dimension value',\n `dimensions.${key}`,\n )\n }\n if (typeof value === 'number' && !Number.isFinite(value)) {\n throw new AgentProfileCellValidationError('expected finite number', `dimensions.${key}`)\n }\n out[key] = value\n }\n return out\n}\n\nfunction compactObject<T extends Record<string, unknown>>(input: T): T {\n const out: Record<string, unknown> = {}\n for (const [key, value] of Object.entries(input)) {\n if (value !== undefined) out[key] = value\n }\n return out as T\n}\n\nfunction nonEmptyRecord<T extends Record<string, unknown>>(input: T): T | undefined {\n return Object.keys(input).length > 0 ? 
input : undefined\n}\n\nfunction validateSource(value: unknown, path: string): void {\n if (value === null || typeof value !== 'object' || Array.isArray(value)) {\n throw new AgentProfileCellValidationError('expected object', path)\n }\n const rec = value as Record<string, unknown>\n expectString(rec.kind, `${path}.kind`)\n requireSha256Hex(rec.hash, `${path}.hash`)\n}\n\nfunction validateHarness(value: unknown, path: string): void {\n if (value === null || typeof value !== 'object' || Array.isArray(value)) {\n throw new AgentProfileCellValidationError('expected object', path)\n }\n const rec = value as Record<string, unknown>\n expectString(rec.id, `${path}.id`)\n if (rec.version !== undefined) expectString(rec.version, `${path}.version`)\n if (rec.hash !== undefined) expectString(rec.hash, `${path}.hash`)\n}\n\nfunction validateDimensions(value: unknown, path: string): void {\n if (value === null || typeof value !== 'object' || Array.isArray(value)) {\n throw new AgentProfileCellValidationError('expected object', path)\n }\n normalizeDimensions(value as Record<string, AgentProfileDimensionValue>)\n}\n\nfunction assertJson(value: AgentProfileJson, path: string): void {\n if (value === null) return\n const type = typeof value\n if (type === 'string' || type === 'boolean') return\n if (type === 'number') {\n if (!Number.isFinite(value)) {\n throw new AgentProfileCellValidationError('expected finite number', path)\n }\n return\n }\n if (Array.isArray(value)) {\n value.forEach((item, index) => {\n assertJson(item, `${path}[${index}]`)\n })\n return\n }\n if (type === 'object') {\n for (const [key, nested] of Object.entries(value)) {\n requireNonEmpty(key, `${path}.<key>`)\n assertJson(nested, `${path}.${key}`)\n }\n return\n }\n throw new AgentProfileCellValidationError('expected JSON-compatible value', path)\n}\n\nfunction expectLiteral(value: unknown, expected: string, path: string): void {\n if (value !== expected) {\n throw new 
AgentProfileCellValidationError(`expected ${expected}`, path)\n }\n}\n\nfunction expectString(value: unknown, path: string): void {\n if (typeof value !== 'string' || value.length === 0) {\n throw new AgentProfileCellValidationError('expected non-empty string', path)\n }\n}\n\nfunction requireNonEmpty(value: string, path: string): string {\n if (typeof value !== 'string' || value.length === 0) {\n throw new AgentProfileCellValidationError('expected non-empty string', path)\n }\n return value\n}\n\nfunction optionalNonEmpty(value: string | undefined, path: string): string | undefined {\n if (value === undefined) return undefined\n return requireNonEmpty(value, path)\n}\n\nfunction requireSha256Hex(value: unknown, path: string): string {\n if (typeof value !== 'string' || !SHA256_HEX.test(value)) {\n throw new AgentProfileCellValidationError('expected 64 lowercase sha256 hex chars', path)\n }\n return value\n}\n\n// ── Consumer helpers ─────────────────────────────────────────────────\n//\n// Two pieces of boilerplate every product consuming `buildAgentProfileCell`\n// has been duplicating (gtm-agent #137, blueprint-agent #1756/#1757):\n//\n// 1. A `JSON.parse(JSON.stringify(value))` helper that canonicalizes an\n// arbitrary sandbox-SDK `AgentProfile` into the recursive\n// `AgentProfileJson` shape, with a fail-loud error when the profile\n// is not JSON-serializable.\n//\n// 2. The magic string `'sandbox-agent-profile'` for `sourceProfile.kind`.\n//\n// Both belong here so the cross-product cell join (same canonical profile\n// hashes to the same `sourceProfile.hash` across products) is enforced by\n// the type system, not by every consumer remembering to do it right.\n// See blueprint-agent issue tangle-network/agent-eval#82.\n\n/** Canonical `sourceProfile.kind` values. Two products fingerprinting the\n * same canonical profile MUST use the same kind for their cells to share\n * `sourceProfile.hash`. 
Extend rather than create new strings — adding a\n * new kind is a deliberate cross-product schema change. */\nexport const AGENT_PROFILE_KINDS = {\n /** A profile declared via `defineAgentProfile(...)` from\n * `@tangle-network/sandbox`. The default kind for sandbox-hosted\n * products (gtm-agent, blueprint-agent, sandbox, evals). */\n SANDBOX_AGENT_PROFILE: 'sandbox-agent-profile',\n} as const\n\nexport type AgentProfileKind = (typeof AGENT_PROFILE_KINDS)[keyof typeof AGENT_PROFILE_KINDS]\n\n/** Canonicalize an arbitrary value into `AgentProfileJson` by JSON\n * round-trip. Throws when the value contains anything not representable\n * as JSON (functions, BigInt, cycles) — non-portable profiles fail loud\n * rather than silently dropping fields. */\nexport function toAgentProfileJson(value: unknown): AgentProfileJson {\n let serialized: string | undefined\n try {\n serialized = JSON.stringify(value)\n } catch (err) {\n throw new AgentProfileCellValidationError(\n `agent profile must be JSON-serializable: ${err instanceof Error ? err.message : String(err)}`,\n 'sourceProfile.profile',\n )\n }\n if (serialized === undefined) {\n throw new AgentProfileCellValidationError(\n 'agent profile must be JSON-serializable (got undefined after JSON.stringify)',\n 'sourceProfile.profile',\n )\n }\n return JSON.parse(serialized) as AgentProfileJson\n}\n\n/** Minimal shape required of any sandbox-SDK `AgentProfile` — anything\n * with a non-empty `name` and `version` plus JSON-serializable contents.\n * Compatible with `defineAgentProfile(...)` output from\n * `@tangle-network/sandbox`; products that have not yet declared a real\n * profile can pass a `{ name, version, ...metadata }` stub. */\nexport interface SandboxAgentProfileLike {\n name: string\n version: string\n [key: string]: unknown\n}\n\n/** Higher-level helper that hard-codes the canonical\n * `sandbox-agent-profile` kind plus the JSON canonicalization. 
Equivalent\n * to calling `buildAgentProfileCell` with `profileId = \\`${name}@${version}\\``\n * and `sourceProfile = { kind: SANDBOX_AGENT_PROFILE, profile: <round-tripped> }`.\n *\n * Use this from any product consuming a sandbox-SDK `AgentProfile`; the\n * manual `buildAgentProfileCell` call is reserved for advanced cases\n * (custom kinds, pre-computed source hashes, alternate profileId\n * conventions). */\nexport async function buildSandboxAgentProfileCell(\n profile: SandboxAgentProfileLike,\n input: Omit<AgentProfileCellInput, 'profileId' | 'sourceProfile'>,\n): Promise<AgentProfileCell> {\n if (!profile || typeof profile !== 'object') {\n throw new AgentProfileCellValidationError('sandbox AgentProfile must be an object', 'profile')\n }\n if (typeof profile.name !== 'string' || profile.name.length === 0) {\n throw new AgentProfileCellValidationError(\n 'sandbox AgentProfile must have a non-empty `name`',\n 'profile.name',\n )\n }\n if (typeof profile.version !== 'string' || profile.version.length === 0) {\n throw new AgentProfileCellValidationError(\n 'sandbox AgentProfile must have a non-empty `version`',\n 'profile.version',\n )\n }\n return buildAgentProfileCell({\n ...input,\n profileId: `${profile.name}@${profile.version}`,\n sourceProfile: {\n kind: AGENT_PROFILE_KINDS.SANDBOX_AGENT_PROFILE,\n profile: toAgentProfileJson(profile),\n },\n })\n}\n","/**\n * Paper-grade RunRecord schema + runtime validator.\n *\n * Every run that participates in a promotion gate, paper table, or\n * researcher loop SHOULD be recorded as a `RunRecord`. The mandatory\n * fields are exactly those the paper \"Two Loops, Three Roles\" requires\n * for reproducibility: who/what/when/cost/seed/hash, plus the search vs\n * holdout split tag and either a `searchScore` or a `holdoutScore`.\n *\n * This is intentionally NOT a replacement for the rich `Run` /\n * `ProposeReviewReport` / `ScenarioResult` types already in the\n * package. 
Those are runtime structures with full provenance. A\n * `RunRecord` is the analysis-time projection — the JSON-friendly\n * row you'd put in a parquet file or paste into a notebook.\n *\n * Validate at the boundary:\n *\n * const rec = validateRunRecord(rawJson) // throws on missing\n * const ok = isRunRecord(rawJson) // boolean check\n * const rec = parseRunRecordSafe(rawJson) // { ok, value | error }\n *\n * The validator runs in pure TS — zod is intentionally NOT a\n * dependency. Round-trip tested in `tests/run-record.test.ts`.\n */\n\nimport type { AgentProfileCell } from './agent-profile-cell'\nimport { validateAgentProfileCell } from './agent-profile-cell'\nimport { ValidationError } from './errors'\nimport type { FailureClass } from './trace/schema'\n\n/** Search/dev/holdout split tag. 'search' is the paper-grade alias for the\n * combined train+test pool that the optimizer is allowed to read. */\nexport type RunSplitTag = 'search' | 'dev' | 'holdout'\n\nexport interface RunTokenUsage {\n input: number\n output: number\n cached?: number\n}\n\nexport interface RunJudgeMetadata {\n model: string\n promptVersion: string\n /** [0,1] confidence the judge declared. Constant judge confidence\n * across many runs is a fallback signal (see `canary.ts`). */\n confidence: number\n /** True if the judge degraded to a fallback path (rules-only,\n * prior-call cache, etc.). The canary uses this to alert. */\n fallback: boolean\n}\n\n/**\n * Per-judge / per-dimension breakdown for runs scored by an ensemble of\n * judges over a multi-dimensional rubric.\n *\n * The collapsed `outcome.searchScore` / `holdoutScore` carries the\n * composite the gate uses. 
The full breakdown belongs here so consumers\n * can answer \"which judge disagreed?\", \"which dimension dragged the\n * composite down?\", and \"did half the panel fail?\" without re-running.\n *\n * `perJudge[judgeId][dim]` is the canonical source; `perDimMean` and\n * `composite` are convenience projections — derivable but precomputed so\n * downstream IRR primitives (`interRaterReliability`,\n * `corpusInterRaterAgreement`) and reporters don't pay the same\n * aggregation twice.\n *\n * Fail-loud discipline: judges that errored out land in `failedJudges`\n * by id. A missing key in `perJudge` is ambiguous (silent zero vs not\n * run); the explicit list makes a partial-failure recorded as such.\n */\nexport interface JudgeScoresRecord {\n /** Per-judge per-dimension scores. `{ \"kimi-k2.6\": { helpfulness: 0.8, clarity: 0.7 }, ... }`. */\n perJudge: Record<string, Record<string, number>>\n /** Per-dim mean across judges. Convenience — derivable from `perJudge`. */\n perDimMean: Record<string, number>\n /** Composite mean across all dims and judges. Mirrors the score\n * the gate sees on `outcome.searchScore` / `holdoutScore`. */\n composite: number\n /** Judges that errored or returned an unparseable verdict. Recorded\n * by id (e.g. `['glm-5.1']`) so a partial-failure case is explicit,\n * not inferred from missing keys in `perJudge`. */\n failedJudges?: string[]\n /** Free-form notes the judges emitted (joined across judges or\n * first-judge only — consumer's choice). */\n notes?: string\n}\n\nexport interface RunOutcome {\n /** Score on the search/optimization split. Optional because a\n * holdout-only evaluation only fills `holdoutScore`. */\n searchScore?: number\n /** Score on the held-out split. Optional because a search-only run\n * only fills `searchScore`. At least one must be present. */\n holdoutScore?: number\n /** Bag of any other metric the run produced — judge dimensions,\n * pass/fail counters, latency stats, etc. 
Numeric only — keeps\n * reporters honest. */\n raw: Record<string, number>\n /** Per-judge / per-dim breakdown. Consumers writing ensemble\n * judgements populate this; substrate primitives like\n * `interRaterReliability` and `corpusInterRaterAgreement` accept\n * these records as input. Optional — single-judge or scalar-only\n * runs leave it unset. */\n judgeScores?: JudgeScoresRecord\n /** Authenticity / realness verdict — did the run build the REAL thing on the\n * intended infra, or fake it (see `./authenticity`)? Optional: only domains\n * with an authenticity config populate it. Carried in the corpus so the\n * flywheel / off-policy learning can optimize for real completion, not gamed\n * pass-rate. `score` is 0-1; `gated` is the anti-Goodhart flag — a gated run\n * must not count as a real success regardless of `score`. */\n realness?: { score: number; gated: boolean; reason?: string }\n}\n\n/**\n * Mandatory paper-grade fields for a single evaluation run. Optional\n * fields are extension points; mandatory fields throw if missing.\n *\n * Hash discipline:\n * - `promptHash` is the sha256 of the EFFECTIVE prompt sent to the\n * model (after any steering bundle merge).\n * - `configHash` is the sha256 of the effective run config (model,\n * temperature, tools, judges, splits). The pair (promptHash,\n * configHash) uniquely identifies an experimental cell.\n *\n * Model snapshot discipline:\n * - `model` MUST encode a snapshot version. Bare aliases like\n * `claude-sonnet-4` or `gpt-4o` are banned — they remap silently.\n * Use `claude-sonnet-4-6@2025-04-15` or `gpt-4o-2024-11-20`.\n */\nexport interface RunRecord {\n /** UUID for the run. */\n runId: string\n /** Logical experiment grouping (a treatment vs a baseline within\n * the same sweep should share `experimentId`). */\n experimentId: string\n /** Stable identifier for the candidate (variant) being run. The\n * promotion gate compares two `candidateId`s on matched items. 
*/\n candidateId: string\n /** RNG seed for the run. Always recorded — silent re-seeding is\n * the most common cause of non-reproducible numbers. */\n seed: number\n /** Model identifier WITH snapshot version. */\n model: string\n /** sha256 of the effective prompt (post-steering). */\n promptHash: string\n /** sha256 of the effective config. */\n configHash: string\n /** Git SHA the harness was run from. */\n commitSha: string\n /** End-to-end wall-clock duration in milliseconds. */\n wallMs: number\n /** Time spent queued before execution started, if known. */\n queueMs?: number\n /** Total USD cost. Mandatory — runs without a cost number are\n * unbounded by definition and must not be admitted into the gate. */\n costUsd: number\n /** Token usage breakdown. */\n tokenUsage: RunTokenUsage\n /** Judge-side metadata, if a judge was used. */\n judgeMetadata?: RunJudgeMetadata\n /** Per-split scores + raw bag. */\n outcome: RunOutcome\n /** Canonical, cross-agent failure class drawn from the shared\n * `FAILURE_CLASSES` taxonomy. This is the aggregation key that makes\n * \"which failure dominates across the whole fleet\" answerable in ONE\n * vocabulary — every agent classifies against the same enum. Producers\n * set it via the substrate classifier; leave unset only when the failure\n * genuinely can't be classified. */\n failureClass?: FailureClass\n /** Free-form domain-specific failure detail, scoped UNDER `failureClass`\n * (e.g. failureClass='tool_recovery_failure', failureMode='forge_build_unsatisfied').\n * The within-agent drill-down; `failureClass` is the cross-agent key. */\n failureMode?: string\n /** Which split this run was drawn from. */\n splitTag: RunSplitTag\n /**\n * Stable scenario identifier the run was scored against. Optional for\n * backwards compatibility, but **strongly recommended**: every primitive\n * that pairs runs by scenario (preferences, paired stats, BT tournament)\n * keys on this. 
The campaign artifact populates it canonically; legacy\n * runs without it fall back to inference from `outcome.raw.scenario_id`\n * or `experimentId`.\n */\n scenarioId?: string\n /**\n * Canonical identity for the agent profile cell that produced this row:\n * profile artifact hash plus optional harness/model/prompt/reporting\n * dimensions. Use `agentProfile.cellId` to group persona sweeps and\n * longitudinal reports by the complete source profile, not by a loose\n * candidate label or opaque config hash.\n */\n agentProfile?: AgentProfileCell\n}\n\n// ── Validation ───────────────────────────────────────────────────────\n\nconst MANDATORY_TOP_LEVEL = [\n 'runId',\n 'experimentId',\n 'candidateId',\n 'seed',\n 'model',\n 'promptHash',\n 'configHash',\n 'commitSha',\n 'wallMs',\n 'costUsd',\n 'tokenUsage',\n 'outcome',\n 'splitTag',\n] as const\n\nconst SPLIT_TAGS: ReadonlyArray<RunSplitTag> = ['search', 'dev', 'holdout']\n\nexport class RunRecordValidationError extends ValidationError {\n readonly path: string\n constructor(message: string, path = '') {\n super(path ? `${message} (at ${path})` : message)\n this.path = path\n }\n}\n\n/**\n * Strict validator. Throws `RunRecordValidationError` on the first\n * missing or wrongly-typed field. 
Returns the input cast to\n * `RunRecord` on success — the validator does not coerce.\n */\nexport function validateRunRecord(input: unknown): RunRecord {\n if (input === null || typeof input !== 'object') {\n throw new RunRecordValidationError('expected object')\n }\n const obj = input as Record<string, unknown>\n\n for (const key of MANDATORY_TOP_LEVEL) {\n if (!(key in obj)) {\n throw new RunRecordValidationError(`missing mandatory field \"${key}\"`)\n }\n }\n\n expectString(obj.runId, 'runId')\n expectString(obj.experimentId, 'experimentId')\n expectString(obj.candidateId, 'candidateId')\n expectFiniteNumber(obj.seed, 'seed')\n expectString(obj.model, 'model')\n expectString(obj.promptHash, 'promptHash')\n expectString(obj.configHash, 'configHash')\n expectString(obj.commitSha, 'commitSha')\n expectFiniteNumber(obj.wallMs, 'wallMs')\n if (obj.queueMs !== undefined) expectFiniteNumber(obj.queueMs, 'queueMs')\n expectFiniteNumber(obj.costUsd, 'costUsd')\n\n // Snapshot discipline: bare model aliases are not paper-grade.\n if (!modelHasSnapshot(obj.model as string)) {\n throw new RunRecordValidationError(\n `model \"${obj.model}\" lacks a snapshot version (use 'name@YYYY-MM-DD' or 'name-YYYYMMDD')`,\n 'model',\n )\n }\n\n // Token usage.\n const tu = obj.tokenUsage\n if (tu === null || typeof tu !== 'object') {\n throw new RunRecordValidationError('tokenUsage must be an object', 'tokenUsage')\n }\n const tuRec = tu as Record<string, unknown>\n expectFiniteNumber(tuRec.input, 'tokenUsage.input')\n expectFiniteNumber(tuRec.output, 'tokenUsage.output')\n if (tuRec.cached !== undefined) expectFiniteNumber(tuRec.cached, 'tokenUsage.cached')\n\n // Judge metadata, optional.\n if (obj.judgeMetadata !== undefined) {\n const jm = obj.judgeMetadata\n if (jm === null || typeof jm !== 'object') {\n throw new RunRecordValidationError('judgeMetadata must be an object', 'judgeMetadata')\n }\n const jmRec = jm as Record<string, unknown>\n expectString(jmRec.model, 
'judgeMetadata.model')\n expectString(jmRec.promptVersion, 'judgeMetadata.promptVersion')\n expectFiniteNumber(jmRec.confidence, 'judgeMetadata.confidence')\n if (typeof jmRec.fallback !== 'boolean') {\n throw new RunRecordValidationError(\n 'judgeMetadata.fallback must be boolean',\n 'judgeMetadata.fallback',\n )\n }\n }\n\n // Outcome.\n const out = obj.outcome\n if (out === null || typeof out !== 'object') {\n throw new RunRecordValidationError('outcome must be an object', 'outcome')\n }\n const outRec = out as Record<string, unknown>\n if (outRec.searchScore !== undefined)\n expectFiniteNumber(outRec.searchScore, 'outcome.searchScore')\n if (outRec.holdoutScore !== undefined)\n expectFiniteNumber(outRec.holdoutScore, 'outcome.holdoutScore')\n if (outRec.searchScore === undefined && outRec.holdoutScore === undefined) {\n throw new RunRecordValidationError(\n 'outcome must define searchScore or holdoutScore (or both)',\n 'outcome',\n )\n }\n const raw = outRec.raw\n if (raw === null || typeof raw !== 'object') {\n throw new RunRecordValidationError('outcome.raw must be an object', 'outcome.raw')\n }\n for (const [k, v] of Object.entries(raw as Record<string, unknown>)) {\n expectFiniteNumber(v, `outcome.raw.${k}`)\n }\n // Realness verdict, optional.\n if (outRec.realness !== undefined) {\n const r = outRec.realness\n if (r === null || typeof r !== 'object') {\n throw new RunRecordValidationError('outcome.realness must be an object', 'outcome.realness')\n }\n const rr = r as Record<string, unknown>\n expectFiniteNumber(rr.score, 'outcome.realness.score')\n if (typeof rr.gated !== 'boolean') {\n throw new RunRecordValidationError(\n 'outcome.realness.gated must be a boolean',\n 'outcome.realness.gated',\n )\n }\n }\n\n // Per-judge / per-dim breakdown, optional.\n if (outRec.judgeScores !== undefined) {\n validateJudgeScores(outRec.judgeScores, 'outcome.judgeScores')\n }\n\n // Failure mode optional.\n if (obj.failureMode !== undefined) 
expectString(obj.failureMode, 'failureMode')\n\n if (obj.agentProfile !== undefined) {\n try {\n const profile = validateAgentProfileCell(obj.agentProfile)\n if (profile.model !== undefined && profile.model !== obj.model) {\n throw new RunRecordValidationError(\n `agentProfile.model \"${profile.model}\" does not match model \"${obj.model}\"`,\n 'agentProfile.model',\n )\n }\n if (profile.promptHash !== undefined && profile.promptHash !== obj.promptHash) {\n throw new RunRecordValidationError(\n `agentProfile.promptHash \"${profile.promptHash}\" does not match promptHash \"${obj.promptHash}\"`,\n 'agentProfile.promptHash',\n )\n }\n } catch (error) {\n if (error instanceof RunRecordValidationError) throw error\n if (error instanceof Error) {\n throw new RunRecordValidationError(error.message, 'agentProfile')\n }\n throw error\n }\n }\n\n // Split tag.\n if (typeof obj.splitTag !== 'string' || !SPLIT_TAGS.includes(obj.splitTag as RunSplitTag)) {\n throw new RunRecordValidationError(\n `splitTag must be one of ${SPLIT_TAGS.join(', ')}, got ${String(obj.splitTag)}`,\n 'splitTag',\n )\n }\n\n return input as RunRecord\n}\n\n/** Boolean validator — convenience for filtering arrays. */\nexport function isRunRecord(input: unknown): input is RunRecord {\n try {\n validateRunRecord(input)\n return true\n } catch {\n return false\n }\n}\n\n/** Non-throwing validator — returns a discriminated union. */\nexport function parseRunRecordSafe(\n input: unknown,\n): { ok: true; value: RunRecord } | { ok: false; error: RunRecordValidationError } {\n try {\n return { ok: true, value: validateRunRecord(input) }\n } catch (e) {\n if (e instanceof RunRecordValidationError) return { ok: false, error: e }\n throw e\n }\n}\n\n/** Round-trip helper — `JSON.parse(JSON.stringify(record))` then validate. 
*/\nexport function roundTripRunRecord(record: RunRecord): RunRecord {\n const json = JSON.stringify(record)\n return validateRunRecord(JSON.parse(json))\n}\n\n// ── Internals ────────────────────────────────────────────────────────\n\nfunction expectString(value: unknown, path: string): void {\n if (typeof value !== 'string' || value.length === 0) {\n throw new RunRecordValidationError(`expected non-empty string`, path)\n }\n}\n\nfunction expectFiniteNumber(value: unknown, path: string): void {\n if (typeof value !== 'number' || !Number.isFinite(value)) {\n throw new RunRecordValidationError(`expected finite number`, path)\n }\n}\n\nfunction validateJudgeScores(value: unknown, path: string): void {\n if (value === null || typeof value !== 'object') {\n throw new RunRecordValidationError('judgeScores must be an object', path)\n }\n const rec = value as Record<string, unknown>\n\n const perJudge = rec.perJudge\n if (perJudge === null || typeof perJudge !== 'object') {\n throw new RunRecordValidationError('perJudge must be an object', `${path}.perJudge`)\n }\n for (const [judgeId, dims] of Object.entries(perJudge as Record<string, unknown>)) {\n if (dims === null || typeof dims !== 'object') {\n throw new RunRecordValidationError(\n 'per-judge entry must be an object of dimension scores',\n `${path}.perJudge.${judgeId}`,\n )\n }\n for (const [dim, score] of Object.entries(dims as Record<string, unknown>)) {\n expectFiniteNumber(score, `${path}.perJudge.${judgeId}.${dim}`)\n }\n }\n\n const perDimMean = rec.perDimMean\n if (perDimMean === null || typeof perDimMean !== 'object') {\n throw new RunRecordValidationError('perDimMean must be an object', `${path}.perDimMean`)\n }\n for (const [dim, mean] of Object.entries(perDimMean as Record<string, unknown>)) {\n expectFiniteNumber(mean, `${path}.perDimMean.${dim}`)\n }\n\n expectFiniteNumber(rec.composite, `${path}.composite`)\n\n if (rec.failedJudges !== undefined) {\n if (!Array.isArray(rec.failedJudges)) {\n throw new 
RunRecordValidationError(\n 'failedJudges must be an array of strings',\n `${path}.failedJudges`,\n )\n }\n for (let i = 0; i < rec.failedJudges.length; i++) {\n const id = rec.failedJudges[i]\n if (typeof id !== 'string' || id.length === 0) {\n throw new RunRecordValidationError(\n 'failedJudges entry must be a non-empty string',\n `${path}.failedJudges[${i}]`,\n )\n }\n }\n }\n\n if (rec.notes !== undefined && typeof rec.notes !== 'string') {\n throw new RunRecordValidationError('notes must be a string', `${path}.notes`)\n }\n}\n\n/**\n * Heuristic snapshot check. Accepts:\n * - `name@YYYY-MM-DD` (Anthropic style: `claude-sonnet-4-6@2025-04-15`)\n * - `name-YYYYMMDD` (OpenAI style: `gpt-4o-2024-11-20`)\n * - `name@<arbitrary-token>` (allow opaque snapshots like `@v3`)\n * - explicit `:date-...` Vertex-style tags\n *\n * Rejects bare aliases like `claude-sonnet-4` or `gpt-4o` that remap\n * silently as providers ship new snapshots.\n */\nfunction modelHasSnapshot(model: string): boolean {\n if (model.includes('@')) return true\n if (/-\\d{8}$/.test(model)) return true\n if (/-\\d{4}-\\d{2}-\\d{2}$/.test(model)) return true\n if (/:date-/.test(model)) return true\n return 
false\n}\n"],"mappings":";;;;;;;;AAwDO,IAAM,kCAAN,cAA8C,gBAAgB;AAAA,EAC1D;AAAA,EACT,YAAY,SAAiB,OAAO,IAAI;AACtC,UAAM,OAAO,GAAG,OAAO,QAAQ,IAAI,MAAM,OAAO;AAChD,SAAK,OAAO;AAAA,EACd;AACF;AAEA,IAAM,aAAa;AACnB,IAAM,UAAU;AAEhB,eAAsB,sBACpB,OAC2B;AAC3B,QAAM,WAAW,MAAM,+BAA+B,KAAK;AAC3D,QAAM,SAAS,6BAA6B,MAAM,SAAS,QAAQ,CAAC;AACpE,SAAO,EAAE,GAAG,UAAU,OAAO;AAC/B;AAEO,SAAS,6BACd,MACkC;AAClC,QAAM,EAAE,QAAQ,SAAS,GAAG,SAAS,IAAI;AACzC,OAAK;AACL,SAAO,0BAA0B,QAAQ;AAC3C;AAEA,eAAsB,uBAAuB,MAA0C;AACrF,2BAAyB,IAAI;AAC7B,SACE,KAAK,WACL,6BAA6B,MAAM,SAAS,6BAA6B,IAAI,CAAC,CAAC;AAEnF;AAEO,SAAS,yBAAyB,OAAkC;AACzE,MAAI,UAAU,QAAQ,OAAO,UAAU,UAAU;AAC/C,UAAM,IAAI,gCAAgC,iBAAiB;AAAA,EAC7D;AACA,QAAM,MAAM;AACZ,gBAAc,IAAI,eAAe,yBAAyB,eAAe;AACzE,MAAI,OAAO,IAAI,WAAW,YAAY,CAAC,QAAQ,KAAK,IAAI,MAAM,GAAG;AAC/D,UAAM,IAAI;AAAA,MACR;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACA,eAAa,IAAI,WAAW,WAAW;AACvC,iBAAe,IAAI,eAAe,eAAe;AACjD,MAAI,IAAI,YAAY,OAAW,iBAAgB,IAAI,SAAS,SAAS;AACrE,MAAI,IAAI,UAAU,OAAW,cAAa,IAAI,OAAO,OAAO;AAC5D,MAAI,IAAI,eAAe,OAAW,cAAa,IAAI,YAAY,YAAY;AAC3E,MAAI,IAAI,eAAe,OAAW,oBAAmB,IAAI,YAAY,YAAY;AACjF,SAAO;AACT;AAEO,SAAS,wBAAwB,QAGnB;AACnB,MAAI,CAAC,OAAO,cAAc;AACxB,UAAM,IAAI;AAAA,MACR,QAAQ,OAAO,KAAK;AAAA,MACpB;AAAA,IACF;AAAA,EACF;AACA,SAAO,yBAAyB,OAAO,YAAY;AACrD;AAEO,SAAS,oBAAoB,QAGzB;AACT,SAAO,wBAAwB,MAAM,EAAE;AACzC;AAEA,eAAsB,0BAA0B,QAKlB;AAC5B,QAAM,UAAU,wBAAwB,MAAM;AAC9C,MAAI,CAAE,MAAM,uBAAuB,OAAO,GAAI;AAC5C,UAAM,IAAI;AAAA,MACR,QAAQ,OAAO,KAAK;AAAA,MACpB;AAAA,IACF;AAAA,EACF;AACA,MAAI,QAAQ,UAAU,UAAa,QAAQ,UAAU,OAAO,OAAO;AACjE,UAAM,IAAI;AAAA,MACR,QAAQ,OAAO,KAAK,yBAAyB,QAAQ,KAAK,2BAA2B,OAAO,KAAK;AAAA,MACjG;AAAA,IACF;AAAA,EACF;AACA,MAAI,QAAQ,eAAe,UAAa,QAAQ,eAAe,OAAO,YAAY;AAChF,UAAM,IAAI;AAAA,MACR,QAAQ,OAAO,KAAK,8BAA8B,QAAQ,UAAU,gCAAgC,OAAO,UAAU;AAAA,MACrH;AAAA,IACF;AAAA,EACF;AACA,SAAO;AACT;AAEO,SAAS,4BAEd,SAAyC;AACzC,QAAM,SAAS,oBAAI,IAAiB;AACpC,aAAW,UAAU,SAAS;AAC5B,UAAM,MAAM,oBAAoB,MAAM;AACtC,UAAM,SAAS,OAAO,IAAI,GAAG;AAC7B,QAAI,OAAQ,QAAO,KAAK,MAAM;AAAA,QACzB,QAAO,IAAI,KAAK,CAAC,MAAM,CAAC;AAAA,EAC/B;AACA,SAAO;AACT;A
AEA,eAAe,+BACb,OAC2C;AAC3C,SAAO,0BAA0B;AAAA,IAC/B,eAAe;AAAA,IACf,WAAW,MAAM;AAAA,IACjB,eAAe,MAAM,qBAAqB,MAAM,aAAa;AAAA,IAC7D,SAAS,MAAM;AAAA,IACf,OAAO,MAAM;AAAA,IACb,YAAY,MAAM;AAAA,IAClB,YAAY,MAAM;AAAA,EACpB,CAAC;AACH;AAEA,SAAS,0BACP,OACkC;AAClC,SAAO,cAAc;AAAA,IACnB,eAAe;AAAA,IACf,WAAW,gBAAgB,MAAM,WAAW,WAAW;AAAA,IACvD,eAAe,gBAAgB,MAAM,aAAa;AAAA,IAClD,SAAS,MAAM,UAAU,iBAAiB,MAAM,SAAS,SAAS,IAAI;AAAA,IACtE,OAAO,iBAAiB,MAAM,OAAO,OAAO;AAAA,IAC5C,YAAY,iBAAiB,MAAM,YAAY,YAAY;AAAA,IAC3D,YAAY,MAAM,aACd,eAAe,oBAAoB,MAAM,UAAU,CAAC,IACpD;AAAA,EACN,CAAC;AACH;AAEA,eAAe,qBAAqB,OAA6D;AAC/F,QAAM,OAAO,gBAAgB,MAAM,MAAM,oBAAoB;AAC7D,MAAI,MAAM,SAAS,UAAa,MAAM,YAAY,QAAW;AAC3D,UAAM,IAAI;AAAA,MACR;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACA,MAAI,MAAM,SAAS,QAAW;AAC5B,WAAO,EAAE,MAAM,MAAM,iBAAiB,MAAM,MAAM,oBAAoB,EAAE;AAAA,EAC1E;AACA,MAAI,MAAM,YAAY,QAAW;AAC/B,UAAM,IAAI;AAAA,MACR;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACA,aAAW,MAAM,SAAS,uBAAuB;AACjD,SAAO,EAAE,MAAM,MAAM,MAAM,SAAS,MAAM,OAAO,EAAE;AACrD;AAEA,SAAS,gBAAgB,OAA+C;AACtE,SAAO;AAAA,IACL,MAAM,gBAAgB,MAAM,MAAM,oBAAoB;AAAA,IACtD,MAAM,iBAAiB,MAAM,MAAM,oBAAoB;AAAA,EACzD;AACF;AAEA,SAAS,iBAAiB,OAA4B,MAAmC;AACvF,SAAO,cAAc;AAAA,IACnB,IAAI,gBAAgB,MAAM,IAAI,GAAG,IAAI,KAAK;AAAA,IAC1C,SAAS,iBAAiB,MAAM,SAAS,GAAG,IAAI,UAAU;AAAA,IAC1D,MAAM,iBAAiB,MAAM,MAAM,GAAG,IAAI,OAAO;AAAA,EACnD,CAAC;AACH;AAEA,SAAS,oBACP,OAC4C;AAC5C,QAAM,MAAkD,CAAC;AACzD,aAAW,OAAO,OAAO,KAAK,KAAK,EAAE,KAAK,GAAG;AAC3C,UAAM,QAAQ,MAAM,GAAG;AACvB,oBAAgB,KAAK,kBAAkB;AACvC,QACE,UAAU,QACV,OAAO,UAAU,YACjB,OAAO,UAAU,YACjB,OAAO,UAAU,WACjB;AACA,YAAM,IAAI;AAAA,QACR;AAAA,QACA,cAAc,GAAG;AAAA,MACnB;AAAA,IACF;AACA,QAAI,OAAO,UAAU,YAAY,CAAC,OAAO,SAAS,KAAK,GAAG;AACxD,YAAM,IAAI,gCAAgC,0BAA0B,cAAc,GAAG,EAAE;AAAA,IACzF;AACA,QAAI,GAAG,IAAI;AAAA,EACb;AACA,SAAO;AACT;AAEA,SAAS,cAAiD,OAAa;AACrE,QAAM,MAA+B,CAAC;AACtC,aAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,KAAK,GAAG;AAChD,QAAI,UAAU,OAAW,KAAI,GAAG,IAAI;AAAA,EACtC;AACA,SAAO;AACT;AAEA,SAAS,eAAkD,OAAyB;AAClF,SAAO,OAAO,KAAK,KAAK,EAAE,SAAS,IAAI,QAAQ;AACjD;AAEA,SAAS,eAAe,OAAgB,MAAoB;AAC1D,MAAI,UA
AU,QAAQ,OAAO,UAAU,YAAY,MAAM,QAAQ,KAAK,GAAG;AACvE,UAAM,IAAI,gCAAgC,mBAAmB,IAAI;AAAA,EACnE;AACA,QAAM,MAAM;AACZ,eAAa,IAAI,MAAM,GAAG,IAAI,OAAO;AACrC,mBAAiB,IAAI,MAAM,GAAG,IAAI,OAAO;AAC3C;AAEA,SAAS,gBAAgB,OAAgB,MAAoB;AAC3D,MAAI,UAAU,QAAQ,OAAO,UAAU,YAAY,MAAM,QAAQ,KAAK,GAAG;AACvE,UAAM,IAAI,gCAAgC,mBAAmB,IAAI;AAAA,EACnE;AACA,QAAM,MAAM;AACZ,eAAa,IAAI,IAAI,GAAG,IAAI,KAAK;AACjC,MAAI,IAAI,YAAY,OAAW,cAAa,IAAI,SAAS,GAAG,IAAI,UAAU;AAC1E,MAAI,IAAI,SAAS,OAAW,cAAa,IAAI,MAAM,GAAG,IAAI,OAAO;AACnE;AAEA,SAAS,mBAAmB,OAAgB,MAAoB;AAC9D,MAAI,UAAU,QAAQ,OAAO,UAAU,YAAY,MAAM,QAAQ,KAAK,GAAG;AACvE,UAAM,IAAI,gCAAgC,mBAAmB,IAAI;AAAA,EACnE;AACA,sBAAoB,KAAmD;AACzE;AAEA,SAAS,WAAW,OAAyB,MAAoB;AAC/D,MAAI,UAAU,KAAM;AACpB,QAAM,OAAO,OAAO;AACpB,MAAI,SAAS,YAAY,SAAS,UAAW;AAC7C,MAAI,SAAS,UAAU;AACrB,QAAI,CAAC,OAAO,SAAS,KAAK,GAAG;AAC3B,YAAM,IAAI,gCAAgC,0BAA0B,IAAI;AAAA,IAC1E;AACA;AAAA,EACF;AACA,MAAI,MAAM,QAAQ,KAAK,GAAG;AACxB,UAAM,QAAQ,CAAC,MAAM,UAAU;AAC7B,iBAAW,MAAM,GAAG,IAAI,IAAI,KAAK,GAAG;AAAA,IACtC,CAAC;AACD;AAAA,EACF;AACA,MAAI,SAAS,UAAU;AACrB,eAAW,CAAC,KAAK,MAAM,KAAK,OAAO,QAAQ,KAAK,GAAG;AACjD,sBAAgB,KAAK,GAAG,IAAI,QAAQ;AACpC,iBAAW,QAAQ,GAAG,IAAI,IAAI,GAAG,EAAE;AAAA,IACrC;AACA;AAAA,EACF;AACA,QAAM,IAAI,gCAAgC,kCAAkC,IAAI;AAClF;AAEA,SAAS,cAAc,OAAgB,UAAkB,MAAoB;AAC3E,MAAI,UAAU,UAAU;AACtB,UAAM,IAAI,gCAAgC,YAAY,QAAQ,IAAI,IAAI;AAAA,EACxE;AACF;AAEA,SAAS,aAAa,OAAgB,MAAoB;AACxD,MAAI,OAAO,UAAU,YAAY,MAAM,WAAW,GAAG;AACnD,UAAM,IAAI,gCAAgC,6BAA6B,IAAI;AAAA,EAC7E;AACF;AAEA,SAAS,gBAAgB,OAAe,MAAsB;AAC5D,MAAI,OAAO,UAAU,YAAY,MAAM,WAAW,GAAG;AACnD,UAAM,IAAI,gCAAgC,6BAA6B,IAAI;AAAA,EAC7E;AACA,SAAO;AACT;AAEA,SAAS,iBAAiB,OAA2B,MAAkC;AACrF,MAAI,UAAU,OAAW,QAAO;AAChC,SAAO,gBAAgB,OAAO,IAAI;AACpC;AAEA,SAAS,iBAAiB,OAAgB,MAAsB;AAC9D,MAAI,OAAO,UAAU,YAAY,CAAC,WAAW,KAAK,KAAK,GAAG;AACxD,UAAM,IAAI,gCAAgC,0CAA0C,IAAI;AAAA,EAC1F;AACA,SAAO;AACT;AAuBO,IAAM,sBAAsB;AAAA;AAAA;AAAA;AAAA,EAIjC,uBAAuB;AACzB;AAQO,SAAS,mBAAmB,OAAkC;AACnE,MAAI;AACJ,MAAI;AACF,iBAAa,KAAK,UAAU,KAAK;AAAA,EACnC,SAAS,KAAK;AACZ,UAAM,IAAI;AAAA,MACR,4CAA4C,eAAe,QAAQ,IAAI,UAAU,OAAO,GA
AG,CAAC;AAAA,MAC5F;AAAA,IACF;AAAA,EACF;AACA,MAAI,eAAe,QAAW;AAC5B,UAAM,IAAI;AAAA,MACR;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACA,SAAO,KAAK,MAAM,UAAU;AAC9B;AAsBA,eAAsB,6BACpB,SACA,OAC2B;AAC3B,MAAI,CAAC,WAAW,OAAO,YAAY,UAAU;AAC3C,UAAM,IAAI,gCAAgC,0CAA0C,SAAS;AAAA,EAC/F;AACA,MAAI,OAAO,QAAQ,SAAS,YAAY,QAAQ,KAAK,WAAW,GAAG;AACjE,UAAM,IAAI;AAAA,MACR;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACA,MAAI,OAAO,QAAQ,YAAY,YAAY,QAAQ,QAAQ,WAAW,GAAG;AACvE,UAAM,IAAI;AAAA,MACR;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACA,SAAO,sBAAsB;AAAA,IAC3B,GAAG;AAAA,IACH,WAAW,GAAG,QAAQ,IAAI,IAAI,QAAQ,OAAO;AAAA,IAC7C,eAAe;AAAA,MACb,MAAM,oBAAoB;AAAA,MAC1B,SAAS,mBAAmB,OAAO;AAAA,IACrC;AAAA,EACF,CAAC;AACH;;;ACxQA,IAAM,sBAAsB;AAAA,EAC1B;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,IAAM,aAAyC,CAAC,UAAU,OAAO,SAAS;AAEnE,IAAM,2BAAN,cAAuC,gBAAgB;AAAA,EACnD;AAAA,EACT,YAAY,SAAiB,OAAO,IAAI;AACtC,UAAM,OAAO,GAAG,OAAO,QAAQ,IAAI,MAAM,OAAO;AAChD,SAAK,OAAO;AAAA,EACd;AACF;AAOO,SAAS,kBAAkB,OAA2B;AAC3D,MAAI,UAAU,QAAQ,OAAO,UAAU,UAAU;AAC/C,UAAM,IAAI,yBAAyB,iBAAiB;AAAA,EACtD;AACA,QAAM,MAAM;AAEZ,aAAW,OAAO,qBAAqB;AACrC,QAAI,EAAE,OAAO,MAAM;AACjB,YAAM,IAAI,yBAAyB,4BAA4B,GAAG,GAAG;AAAA,IACvE;AAAA,EACF;AAEA,EAAAA,cAAa,IAAI,OAAO,OAAO;AAC/B,EAAAA,cAAa,IAAI,cAAc,cAAc;AAC7C,EAAAA,cAAa,IAAI,aAAa,aAAa;AAC3C,qBAAmB,IAAI,MAAM,MAAM;AACnC,EAAAA,cAAa,IAAI,OAAO,OAAO;AAC/B,EAAAA,cAAa,IAAI,YAAY,YAAY;AACzC,EAAAA,cAAa,IAAI,YAAY,YAAY;AACzC,EAAAA,cAAa,IAAI,WAAW,WAAW;AACvC,qBAAmB,IAAI,QAAQ,QAAQ;AACvC,MAAI,IAAI,YAAY,OAAW,oBAAmB,IAAI,SAAS,SAAS;AACxE,qBAAmB,IAAI,SAAS,SAAS;AAGzC,MAAI,CAAC,iBAAiB,IAAI,KAAe,GAAG;AAC1C,UAAM,IAAI;AAAA,MACR,UAAU,IAAI,KAAK;AAAA,MACnB;AAAA,IACF;AAAA,EACF;AAGA,QAAM,KAAK,IAAI;AACf,MAAI,OAAO,QAAQ,OAAO,OAAO,UAAU;AACzC,UAAM,IAAI,yBAAyB,gCAAgC,YAAY;AAAA,EACjF;AACA,QAAM,QAAQ;AACd,qBAAmB,MAAM,OAAO,kBAAkB;AAClD,qBAAmB,MAAM,QAAQ,mBAAmB;AACpD,MAAI,MAAM,WAAW,OAAW,oBAAmB,MAAM,QAAQ,mBAAmB;AAGpF,MAAI,IAAI,kBAAkB,QAAW;AACnC,UAAM,KAAK,IAAI;AACf,QAAI,OAAO,QAAQ,OAAO,OAAO,UAAU;AACzC,YAAM,IAAI,
yBAAyB,mCAAmC,eAAe;AAAA,IACvF;AACA,UAAM,QAAQ;AACd,IAAAA,cAAa,MAAM,OAAO,qBAAqB;AAC/C,IAAAA,cAAa,MAAM,eAAe,6BAA6B;AAC/D,uBAAmB,MAAM,YAAY,0BAA0B;AAC/D,QAAI,OAAO,MAAM,aAAa,WAAW;AACvC,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAGA,QAAM,MAAM,IAAI;AAChB,MAAI,QAAQ,QAAQ,OAAO,QAAQ,UAAU;AAC3C,UAAM,IAAI,yBAAyB,6BAA6B,SAAS;AAAA,EAC3E;AACA,QAAM,SAAS;AACf,MAAI,OAAO,gBAAgB;AACzB,uBAAmB,OAAO,aAAa,qBAAqB;AAC9D,MAAI,OAAO,iBAAiB;AAC1B,uBAAmB,OAAO,cAAc,sBAAsB;AAChE,MAAI,OAAO,gBAAgB,UAAa,OAAO,iBAAiB,QAAW;AACzE,UAAM,IAAI;AAAA,MACR;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACA,QAAM,MAAM,OAAO;AACnB,MAAI,QAAQ,QAAQ,OAAO,QAAQ,UAAU;AAC3C,UAAM,IAAI,yBAAyB,iCAAiC,aAAa;AAAA,EACnF;AACA,aAAW,CAAC,GAAG,CAAC,KAAK,OAAO,QAAQ,GAA8B,GAAG;AACnE,uBAAmB,GAAG,eAAe,CAAC,EAAE;AAAA,EAC1C;AAEA,MAAI,OAAO,aAAa,QAAW;AACjC,UAAM,IAAI,OAAO;AACjB,QAAI,MAAM,QAAQ,OAAO,MAAM,UAAU;AACvC,YAAM,IAAI,yBAAyB,sCAAsC,kBAAkB;AAAA,IAC7F;AACA,UAAM,KAAK;AACX,uBAAmB,GAAG,OAAO,wBAAwB;AACrD,QAAI,OAAO,GAAG,UAAU,WAAW;AACjC,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAGA,MAAI,OAAO,gBAAgB,QAAW;AACpC,wBAAoB,OAAO,aAAa,qBAAqB;AAAA,EAC/D;AAGA,MAAI,IAAI,gBAAgB,OAAW,CAAAA,cAAa,IAAI,aAAa,aAAa;AAE9E,MAAI,IAAI,iBAAiB,QAAW;AAClC,QAAI;AACF,YAAM,UAAU,yBAAyB,IAAI,YAAY;AACzD,UAAI,QAAQ,UAAU,UAAa,QAAQ,UAAU,IAAI,OAAO;AAC9D,cAAM,IAAI;AAAA,UACR,uBAAuB,QAAQ,KAAK,2BAA2B,IAAI,KAAK;AAAA,UACxE;AAAA,QACF;AAAA,MACF;AACA,UAAI,QAAQ,eAAe,UAAa,QAAQ,eAAe,IAAI,YAAY;AAC7E,cAAM,IAAI;AAAA,UACR,4BAA4B,QAAQ,UAAU,gCAAgC,IAAI,UAAU;AAAA,UAC5F;AAAA,QACF;AAAA,MACF;AAAA,IACF,SAAS,OAAO;AACd,UAAI,iBAAiB,yBAA0B,OAAM;AACrD,UAAI,iBAAiB,OAAO;AAC1B,cAAM,IAAI,yBAAyB,MAAM,SAAS,cAAc;AAAA,MAClE;AACA,YAAM;AAAA,IACR;AAAA,EACF;AAGA,MAAI,OAAO,IAAI,aAAa,YAAY,CAAC,WAAW,SAAS,IAAI,QAAuB,GAAG;AACzF,UAAM,IAAI;AAAA,MACR,2BAA2B,WAAW,KAAK,IAAI,CAAC,SAAS,OAAO,IAAI,QAAQ,CAAC;AAAA,MAC7E;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AACT;AAGO,SAAS,YAAY,OAAoC;AAC9D,MAAI;AACF,sBAAkB,KAAK;AACvB,WAAO;AAAA,EACT,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAGO,SAAS,mBACd,OACiF;AACjF,MAAI;AACF,WAAO,EAAE,IAAI,MAAM,OAAO,kBAAkB
,KAAK,EAAE;AAAA,EACrD,SAAS,GAAG;AACV,QAAI,aAAa,yBAA0B,QAAO,EAAE,IAAI,OAAO,OAAO,EAAE;AACxE,UAAM;AAAA,EACR;AACF;AAGO,SAAS,mBAAmB,QAA8B;AAC/D,QAAM,OAAO,KAAK,UAAU,MAAM;AAClC,SAAO,kBAAkB,KAAK,MAAM,IAAI,CAAC;AAC3C;AAIA,SAASA,cAAa,OAAgB,MAAoB;AACxD,MAAI,OAAO,UAAU,YAAY,MAAM,WAAW,GAAG;AACnD,UAAM,IAAI,yBAAyB,6BAA6B,IAAI;AAAA,EACtE;AACF;AAEA,SAAS,mBAAmB,OAAgB,MAAoB;AAC9D,MAAI,OAAO,UAAU,YAAY,CAAC,OAAO,SAAS,KAAK,GAAG;AACxD,UAAM,IAAI,yBAAyB,0BAA0B,IAAI;AAAA,EACnE;AACF;AAEA,SAAS,oBAAoB,OAAgB,MAAoB;AAC/D,MAAI,UAAU,QAAQ,OAAO,UAAU,UAAU;AAC/C,UAAM,IAAI,yBAAyB,iCAAiC,IAAI;AAAA,EAC1E;AACA,QAAM,MAAM;AAEZ,QAAM,WAAW,IAAI;AACrB,MAAI,aAAa,QAAQ,OAAO,aAAa,UAAU;AACrD,UAAM,IAAI,yBAAyB,8BAA8B,GAAG,IAAI,WAAW;AAAA,EACrF;AACA,aAAW,CAAC,SAAS,IAAI,KAAK,OAAO,QAAQ,QAAmC,GAAG;AACjF,QAAI,SAAS,QAAQ,OAAO,SAAS,UAAU;AAC7C,YAAM,IAAI;AAAA,QACR;AAAA,QACA,GAAG,IAAI,aAAa,OAAO;AAAA,MAC7B;AAAA,IACF;AACA,eAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,IAA+B,GAAG;AAC1E,yBAAmB,OAAO,GAAG,IAAI,aAAa,OAAO,IAAI,GAAG,EAAE;AAAA,IAChE;AAAA,EACF;AAEA,QAAM,aAAa,IAAI;AACvB,MAAI,eAAe,QAAQ,OAAO,eAAe,UAAU;AACzD,UAAM,IAAI,yBAAyB,gCAAgC,GAAG,IAAI,aAAa;AAAA,EACzF;AACA,aAAW,CAAC,KAAK,IAAI,KAAK,OAAO,QAAQ,UAAqC,GAAG;AAC/E,uBAAmB,MAAM,GAAG,IAAI,eAAe,GAAG,EAAE;AAAA,EACtD;AAEA,qBAAmB,IAAI,WAAW,GAAG,IAAI,YAAY;AAErD,MAAI,IAAI,iBAAiB,QAAW;AAClC,QAAI,CAAC,MAAM,QAAQ,IAAI,YAAY,GAAG;AACpC,YAAM,IAAI;AAAA,QACR;AAAA,QACA,GAAG,IAAI;AAAA,MACT;AAAA,IACF;AACA,aAAS,IAAI,GAAG,IAAI,IAAI,aAAa,QAAQ,KAAK;AAChD,YAAM,KAAK,IAAI,aAAa,CAAC;AAC7B,UAAI,OAAO,OAAO,YAAY,GAAG,WAAW,GAAG;AAC7C,cAAM,IAAI;AAAA,UACR;AAAA,UACA,GAAG,IAAI,iBAAiB,CAAC;AAAA,QAC3B;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,MAAI,IAAI,UAAU,UAAa,OAAO,IAAI,UAAU,UAAU;AAC5D,UAAM,IAAI,yBAAyB,0BAA0B,GAAG,IAAI,QAAQ;AAAA,EAC9E;AACF;AAYA,SAAS,iBAAiB,OAAwB;AAChD,MAAI,MAAM,SAAS,GAAG,EAAG,QAAO;AAChC,MAAI,UAAU,KAAK,KAAK,EAAG,QAAO;AAClC,MAAI,sBAAsB,KAAK,KAAK,EAAG,QAAO;AAC9C,MAAI,SAAS,KAAK,KAAK,EAAG,QAAO;AACjC,SAAO;AACT;","names":["expectString"]}
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
// src/meta-eval/calibration.ts
|
|
2
|
+
/**
 * Build a calibration report by joining stored runs with their recorded
 * deployment outcomes.
 *
 * For every run that has at least one outcome, the eval score is read via
 * `evalMetric.extract` (or the default extractor for `evalMetric.id`) and
 * paired with `metrics[outcomeMetric]` of the outcome that sorts first by
 * descending `capturedAt`. Runs whose eval score or outcome value is missing
 * or non-finite are skipped.
 *
 * @param traceStore    Store queried with `listRuns()`; also handed to the extractor.
 * @param outcomeStore  Store queried with `list()` for outcome records.
 * @param evalMetric    Metric spec; `id` labels the report, `extract` is optional.
 * @param outcomeMetric Key looked up in each outcome's `metrics` object.
 * @param options       Passed through unchanged to `calibrationFromPairs`.
 * @returns The result of `calibrationFromPairs`, or `null` when fewer than
 *          two usable pairs were found.
 */
async function calibrationCurve(traceStore, outcomeStore, evalMetric, outcomeMetric, options = {}) {
  const runs = await traceStore.listRuns();
  const outcomes = await outcomeStore.list();

  // Group outcome records by the run they belong to.
  const outcomesByRun = /* @__PURE__ */ new Map();
  for (const outcome of outcomes) {
    const bucket = outcomesByRun.get(outcome.runId);
    if (bucket === undefined) {
      outcomesByRun.set(outcome.runId, [outcome]);
    } else {
      bucket.push(outcome);
    }
  }

  const readEvalScore = evalMetric.extract ?? defaultExtract(evalMetric.id);
  const samples = [];
  for (const run of runs) {
    const recorded = outcomesByRun.get(run.runId);
    if (!recorded?.length) continue;

    const evalScore = await readEvalScore(run, traceStore);
    if (evalScore === null || !Number.isFinite(evalScore)) continue;

    // Sort a copy so the store's own array is never reordered.
    const [latest] = [...recorded].sort((a, b) => b.capturedAt - a.capturedAt);
    const outcome = latest.metrics[outcomeMetric];
    if (typeof outcome !== "number" || !Number.isFinite(outcome)) continue;

    samples.push({ evalScore, outcome });
  }

  if (samples.length < 2) return null;
  return calibrationFromPairs(samples, evalMetric.id, outcomeMetric, options);
}
|
|
31
|
+
/**
 * Build a calibration report from already-paired samples.
 *
 * Pairs with a non-finite `evalScore` or `outcome` are discarded first. The
 * remaining pairs are bucketed on `evalScore`, and each bucket contributes
 * `|outcomeMean - evalMean|` to the expected calibration error, weighted by
 * its share of the binned samples.
 *
 * Binning modes (`options.binning`, default "equal-width"):
 * - "equal-width": `bins` equally wide intervals over `[lo, hi]`, where
 *   `lo`/`hi` come from `options.range` or, when absent, from the data. The
 *   last interval is closed on the right so the maximum is always binned.
 *   Empty intervals are omitted from the report.
 * - "equal-frequency": the samples are sorted and split into at most `bins`
 *   contiguous groups whose sizes differ by at most one. `options.range` is
 *   not consulted in this mode.
 *
 * `options.bins` (default 10) is floored and clamped to at least 1; a
 * non-finite value falls back to the default.
 *
 * NOTE(review): in equal-width mode, pairs whose `evalScore` lies outside an
 * explicit `range` are not placed in any bin, while `n` still counts every
 * finite pair. Confirm whether out-of-range values should be clamped into
 * the edge bins instead.
 *
 * @param inputPairs    Array of `{ evalScore, outcome }` samples.
 * @param evalMetric    Label copied into the report.
 * @param outcomeMetric Label copied into the report.
 * @param options       `{ bins?, binning?, range?: { lo, hi } }`.
 * @returns `{ evalMetric, outcomeMetric, n, bins, ece, maxGap }`, or `null`
 *          when fewer than two finite pairs remain or (equal-width only) the
 *          binning range does not have a positive width.
 */
function calibrationFromPairs(inputPairs, evalMetric, outcomeMetric, options = {}) {
  const pairs = inputPairs.filter(
    (pair) => Number.isFinite(pair.evalScore) && Number.isFinite(pair.outcome)
  );
  if (pairs.length < 2) return null;

  const requestedBins = options.bins ?? 10;
  const numBins = Number.isFinite(requestedBins) ? Math.max(1, Math.floor(requestedBins)) : 10;
  const binning = options.binning ?? "equal-width";

  const bins = [];
  if (binning === "equal-frequency") {
    const sorted = [...pairs].sort((a, b) => a.evalScore - b.evalScore);
    // Never ask for more groups than samples, so every group is non-empty.
    const groups = Math.min(numBins, sorted.length);
    for (let i = 0; i < groups; i++) {
      // Proportional cut points keep group sizes within one of each other
      // and guarantee exactly `groups` bins (no undersized tail bin).
      const start = Math.floor(i * sorted.length / groups);
      const end = Math.floor((i + 1) * sorted.length / groups);
      bins.push(toBin(sorted.slice(start, end)));
    }
  } else {
    const [dataLo, dataHi] = minMax(pairs.map((p) => p.evalScore));
    const lo = options.range?.lo ?? dataLo;
    const hi = options.range?.hi ?? dataHi;
    const width = (hi - lo) / numBins;
    // Zero, negative or NaN width: there is nothing meaningful to bin.
    if (!(width > 0)) return null;
    for (let i = 0; i < numBins; i++) {
      const isLast = i === numBins - 1;
      const binLo = lo + i * width;
      const binHi = isLast ? hi + 1e-9 : lo + (i + 1) * width;
      // `hi + 1e-9` rounds back to `hi` for large magnitudes, so the last
      // bin additionally accepts `evalScore <= hi` explicitly.
      const chunk = pairs.filter(
        (p) => p.evalScore >= binLo && (p.evalScore < binHi || (isLast && p.evalScore <= hi))
      );
      if (chunk.length === 0) continue;
      bins.push(toBin(chunk, binLo, binHi));
    }
  }

  const total = bins.reduce((sum, bin) => sum + bin.n, 0);
  const ece = bins.reduce((sum, bin) => sum + bin.n / total * bin.gap, 0);
  const maxGap = bins.reduce((worst, bin) => Math.max(worst, bin.gap), 0);
  return { evalMetric, outcomeMetric, n: pairs.length, bins, ece, maxGap };
}

/**
 * Summarise one bucket of pairs as a calibration bin.
 *
 * @param chunk Pairs assigned to the bin.
 * @param lower Lower edge; defaults to the smallest `evalScore` in `chunk`.
 * @param upper Upper edge; defaults to the largest `evalScore` in `chunk`.
 * @returns `{ lower, upper, n, evalMean, outcomeMean, gap }`.
 */
function toBin(chunk, lower, upper) {
  const xs = chunk.map((c) => c.evalScore);
  const ys = chunk.map((c) => c.outcome);
  const [minX, maxX] = minMax(xs);
  const evalMean = mean(xs);
  const outcomeMean = mean(ys);
  return {
    lower: lower ?? minX,
    upper: upper ?? maxX,
    n: chunk.length,
    evalMean,
    outcomeMean,
    gap: Math.abs(outcomeMean - evalMean)
  };
}

/** Arithmetic mean of `xs`; evaluates to NaN for an empty array (0 / 0). */
function mean(xs) {
  return xs.reduce((a, b) => a + b, 0) / xs.length;
}

/**
 * Smallest and largest value of `values` as `[min, max]`, computed with a
 * loop instead of `Math.min(...values)` so large arrays cannot exceed the
 * engine's argument-count limit. Yields `[Infinity, -Infinity]` when empty.
 */
function minMax(values) {
  let min = Infinity;
  let max = -Infinity;
  for (const value of values) {
    if (value < min) min = value;
    if (value > max) max = value;
  }
  return [min, max];
}
|
|
83
|
+
/**
 * Create the fallback eval-score extractor used when a metric spec does not
 * supply its own `extract`.
 *
 * The returned async function yields `run.outcome.score` when that is neither
 * `null` nor `undefined`. Otherwise, for the metric named "pass" it yields 1
 * when `run.outcome.pass` is exactly `true` and 0 in every other case; for
 * any other metric it yields `null`.
 *
 * @param metric Metric id the extractor is built for.
 * @returns Async `(run) => number | null` extractor.
 */
function defaultExtract(metric) {
  return async (run) => {
    const score = run.outcome?.score;
    if (score !== null && score !== undefined) return score;
    if (metric !== "pass") return null;
    return run.outcome?.pass === true ? 1 : 0;
  };
}
|
|
86
|
+
|
|
87
|
+
export {
|
|
88
|
+
calibrationCurve,
|
|
89
|
+
calibrationFromPairs
|
|
90
|
+
};
|
|
91
|
+
//# sourceMappingURL=chunk-NPCTHQIO.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/meta-eval/calibration.ts"],"sourcesContent":["/**\n * Calibration curve — binned \"if eval says X, what does reality show?\"\n *\n * Companion to correlationStudy. Raw correlation is a single number;\n * the calibration curve shows *where* the eval is well-calibrated vs\n * overconfident / underconfident. Buckets the eval metric, computes\n * mean outcome per bucket, reports expected-calibration-error (ECE).\n */\n\nimport type { Run } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\nimport type { EvalMetricSpec } from './correlation-study'\nimport type { DeploymentOutcome, OutcomeStore } from './outcome-store'\n\nexport interface CalibrationBin {\n lower: number\n upper: number\n n: number\n evalMean: number\n outcomeMean: number\n /** |outcomeMean − evalMean|; contributes to ECE weighted by n/total. */\n gap: number\n}\n\nexport interface CalibrationReport {\n evalMetric: string\n outcomeMetric: string\n n: number\n bins: CalibrationBin[]\n /** Expected Calibration Error — Σ (n_i/N) × |outcomeMean_i − evalMean_i|. */\n ece: number\n /** Max bin gap — upper bound on miscalibration. */\n maxGap: number\n}\n\nexport interface CalibrationOptions {\n bins?: number\n /** Equal-width (fixed bin edges) or equal-frequency (quantile bins). */\n binning?: 'equal-width' | 'equal-frequency'\n /** Clip eval values to [lo, hi] before binning. */\n range?: { lo: number; hi: number }\n}\n\nexport interface CalibrationPair {\n evalScore: number\n outcome: number\n}\n\nexport async function calibrationCurve(\n traceStore: TraceStore,\n outcomeStore: OutcomeStore,\n evalMetric: EvalMetricSpec,\n outcomeMetric: string,\n options: CalibrationOptions = {},\n): Promise<CalibrationReport | null> {\n const runs = await traceStore.listRuns()\n const outcomes = await outcomeStore.list()\n const byRun = new Map<string, DeploymentOutcome[]>()\n for (const o of outcomes) {\n const arr = byRun.get(o.runId) ?? 
[]\n arr.push(o)\n byRun.set(o.runId, arr)\n }\n\n const extract = evalMetric.extract ?? defaultExtract(evalMetric.id)\n const pairs: Array<{ x: number; y: number }> = []\n for (const run of runs) {\n const os = byRun.get(run.runId)\n if (!os?.length) continue\n const x = await extract(run, traceStore)\n if (x === null || !Number.isFinite(x)) continue\n const latest = [...os].sort((a, b) => b.capturedAt - a.capturedAt)[0]!\n const y = latest.metrics[outcomeMetric]\n if (typeof y !== 'number' || !Number.isFinite(y)) continue\n pairs.push({ x, y })\n }\n if (pairs.length < 2) return null\n\n return calibrationFromPairs(\n pairs.map((p) => ({ evalScore: p.x, outcome: p.y })),\n evalMetric.id,\n outcomeMetric,\n options,\n )\n}\n\nexport function calibrationFromPairs(\n inputPairs: CalibrationPair[],\n evalMetric: string,\n outcomeMetric: string,\n options: CalibrationOptions = {},\n): CalibrationReport | null {\n const pairs = inputPairs.filter(\n (pair) => Number.isFinite(pair.evalScore) && Number.isFinite(pair.outcome),\n )\n if (pairs.length < 2) return null\n\n const numBins = options.bins ?? 10\n const binning = options.binning ?? 'equal-width'\n const xs = pairs.map((p) => p.evalScore)\n const lo = options.range?.lo ?? Math.min(...xs)\n const hi = options.range?.hi ?? Math.max(...xs)\n\n const bins: CalibrationBin[] = []\n if (binning === 'equal-frequency') {\n const sorted = [...pairs].sort((a, b) => a.evalScore - b.evalScore)\n const perBin = Math.max(1, Math.floor(sorted.length / numBins))\n for (let i = 0; i < sorted.length; i += perBin) {\n const chunk = sorted.slice(i, i + perBin)\n if (chunk.length === 0) continue\n bins.push(toBin(chunk))\n }\n } else {\n const width = (hi - lo) / numBins\n if (width === 0) return null\n for (let i = 0; i < numBins; i++) {\n const binLo = lo + i * width\n const binHi = i === numBins - 1 ? 
hi + 1e-9 : lo + (i + 1) * width\n const chunk = pairs.filter((p) => p.evalScore >= binLo && p.evalScore < binHi)\n if (chunk.length === 0) continue\n bins.push(toBin(chunk, binLo, binHi))\n }\n }\n\n const total = bins.reduce((a, b) => a + b.n, 0)\n const ece = bins.reduce((a, b) => a + (b.n / total) * b.gap, 0)\n const maxGap = bins.reduce((a, b) => Math.max(a, b.gap), 0)\n\n return { evalMetric, outcomeMetric, n: pairs.length, bins, ece, maxGap }\n}\n\nfunction toBin(chunk: CalibrationPair[], lower?: number, upper?: number): CalibrationBin {\n const xs = chunk.map((c) => c.evalScore)\n const ys = chunk.map((c) => c.outcome)\n const evalMean = mean(xs)\n const outcomeMean = mean(ys)\n return {\n lower: lower ?? Math.min(...xs),\n upper: upper ?? Math.max(...xs),\n n: chunk.length,\n evalMean,\n outcomeMean,\n gap: Math.abs(outcomeMean - evalMean),\n }\n}\n\nfunction mean(xs: number[]): number {\n return xs.reduce((a, b) => a + b, 0) / xs.length\n}\n\nfunction defaultExtract(metric: string): (run: Run, store: TraceStore) => Promise<number | null> {\n return async (run) =>\n run.outcome?.score ?? (metric === 'pass' ? (run.outcome?.pass === true ? 
1 : 0) : null)\n}\n"],"mappings":";AAgDA,eAAsB,iBACpB,YACA,cACA,YACA,eACA,UAA8B,CAAC,GACI;AACnC,QAAM,OAAO,MAAM,WAAW,SAAS;AACvC,QAAM,WAAW,MAAM,aAAa,KAAK;AACzC,QAAM,QAAQ,oBAAI,IAAiC;AACnD,aAAW,KAAK,UAAU;AACxB,UAAM,MAAM,MAAM,IAAI,EAAE,KAAK,KAAK,CAAC;AACnC,QAAI,KAAK,CAAC;AACV,UAAM,IAAI,EAAE,OAAO,GAAG;AAAA,EACxB;AAEA,QAAM,UAAU,WAAW,WAAW,eAAe,WAAW,EAAE;AAClE,QAAM,QAAyC,CAAC;AAChD,aAAW,OAAO,MAAM;AACtB,UAAM,KAAK,MAAM,IAAI,IAAI,KAAK;AAC9B,QAAI,CAAC,IAAI,OAAQ;AACjB,UAAM,IAAI,MAAM,QAAQ,KAAK,UAAU;AACvC,QAAI,MAAM,QAAQ,CAAC,OAAO,SAAS,CAAC,EAAG;AACvC,UAAM,SAAS,CAAC,GAAG,EAAE,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,aAAa,EAAE,UAAU,EAAE,CAAC;AACpE,UAAM,IAAI,OAAO,QAAQ,aAAa;AACtC,QAAI,OAAO,MAAM,YAAY,CAAC,OAAO,SAAS,CAAC,EAAG;AAClD,UAAM,KAAK,EAAE,GAAG,EAAE,CAAC;AAAA,EACrB;AACA,MAAI,MAAM,SAAS,EAAG,QAAO;AAE7B,SAAO;AAAA,IACL,MAAM,IAAI,CAAC,OAAO,EAAE,WAAW,EAAE,GAAG,SAAS,EAAE,EAAE,EAAE;AAAA,IACnD,WAAW;AAAA,IACX;AAAA,IACA;AAAA,EACF;AACF;AAEO,SAAS,qBACd,YACA,YACA,eACA,UAA8B,CAAC,GACL;AAC1B,QAAM,QAAQ,WAAW;AAAA,IACvB,CAAC,SAAS,OAAO,SAAS,KAAK,SAAS,KAAK,OAAO,SAAS,KAAK,OAAO;AAAA,EAC3E;AACA,MAAI,MAAM,SAAS,EAAG,QAAO;AAE7B,QAAM,UAAU,QAAQ,QAAQ;AAChC,QAAM,UAAU,QAAQ,WAAW;AACnC,QAAM,KAAK,MAAM,IAAI,CAAC,MAAM,EAAE,SAAS;AACvC,QAAM,KAAK,QAAQ,OAAO,MAAM,KAAK,IAAI,GAAG,EAAE;AAC9C,QAAM,KAAK,QAAQ,OAAO,MAAM,KAAK,IAAI,GAAG,EAAE;AAE9C,QAAM,OAAyB,CAAC;AAChC,MAAI,YAAY,mBAAmB;AACjC,UAAM,SAAS,CAAC,GAAG,KAAK,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,YAAY,EAAE,SAAS;AAClE,UAAM,SAAS,KAAK,IAAI,GAAG,KAAK,MAAM,OAAO,SAAS,OAAO,CAAC;AAC9D,aAAS,IAAI,GAAG,IAAI,OAAO,QAAQ,KAAK,QAAQ;AAC9C,YAAM,QAAQ,OAAO,MAAM,GAAG,IAAI,MAAM;AACxC,UAAI,MAAM,WAAW,EAAG;AACxB,WAAK,KAAK,MAAM,KAAK,CAAC;AAAA,IACxB;AAAA,EACF,OAAO;AACL,UAAM,SAAS,KAAK,MAAM;AAC1B,QAAI,UAAU,EAAG,QAAO;AACxB,aAAS,IAAI,GAAG,IAAI,SAAS,KAAK;AAChC,YAAM,QAAQ,KAAK,IAAI;AACvB,YAAM,QAAQ,MAAM,UAAU,IAAI,KAAK,OAAO,MAAM,IAAI,KAAK;AAC7D,YAAM,QAAQ,MAAM,OAAO,CAAC,MAAM,EAAE,aAAa,SAAS,EAAE,YAAY,KAAK;AAC7E,UAAI,MAAM,WAAW,EAAG;AACxB,WAAK,KAAK,MAAM,OAAO,OAAO,KAAK,CAAC;AAAA,IACtC;AAAA,EACF;AAEA,QAAM,QAAQ,KAAK,OAAO,CAAC,GAAG,MA
AM,IAAI,EAAE,GAAG,CAAC;AAC9C,QAAM,MAAM,KAAK,OAAO,CAAC,GAAG,MAAM,IAAK,EAAE,IAAI,QAAS,EAAE,KAAK,CAAC;AAC9D,QAAM,SAAS,KAAK,OAAO,CAAC,GAAG,MAAM,KAAK,IAAI,GAAG,EAAE,GAAG,GAAG,CAAC;AAE1D,SAAO,EAAE,YAAY,eAAe,GAAG,MAAM,QAAQ,MAAM,KAAK,OAAO;AACzE;AAEA,SAAS,MAAM,OAA0B,OAAgB,OAAgC;AACvF,QAAM,KAAK,MAAM,IAAI,CAAC,MAAM,EAAE,SAAS;AACvC,QAAM,KAAK,MAAM,IAAI,CAAC,MAAM,EAAE,OAAO;AACrC,QAAM,WAAW,KAAK,EAAE;AACxB,QAAM,cAAc,KAAK,EAAE;AAC3B,SAAO;AAAA,IACL,OAAO,SAAS,KAAK,IAAI,GAAG,EAAE;AAAA,IAC9B,OAAO,SAAS,KAAK,IAAI,GAAG,EAAE;AAAA,IAC9B,GAAG,MAAM;AAAA,IACT;AAAA,IACA;AAAA,IACA,KAAK,KAAK,IAAI,cAAc,QAAQ;AAAA,EACtC;AACF;AAEA,SAAS,KAAK,IAAsB;AAClC,SAAO,GAAG,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,GAAG;AAC5C;AAEA,SAAS,eAAe,QAAyE;AAC/F,SAAO,OAAO,QACZ,IAAI,SAAS,UAAU,WAAW,SAAU,IAAI,SAAS,SAAS,OAAO,IAAI,IAAK;AACtF;","names":[]}
|
|
@@ -1428,6 +1428,15 @@ async function runImprovementLoop(opts) {
|
|
|
1428
1428
|
if (opts.autoOnPromote === "pr" && (!opts.ghOwner || !opts.ghRepo)) {
|
|
1429
1429
|
throw new Error("runImprovementLoop: autoOnPromote='pr' requires ghOwner + ghRepo.");
|
|
1430
1430
|
}
|
|
1431
|
+
const holdoutIds = new Set(opts.holdoutScenarios.map((s) => s.id));
|
|
1432
|
+
const leaked = opts.scenarios.filter((s) => holdoutIds.has(s.id)).map((s) => s.id);
|
|
1433
|
+
if (leaked.length > 0) {
|
|
1434
|
+
throw new Error(
|
|
1435
|
+
`runImprovementLoop: training scenarios and holdoutScenarios must be disjoint (overlap: [${leaked.join(
|
|
1436
|
+
", "
|
|
1437
|
+
)}]) \u2014 a shared scenario leaks the held-out gate axis into the optimization, inflating reported lift.`
|
|
1438
|
+
);
|
|
1439
|
+
}
|
|
1431
1440
|
const dispatchTimeoutMs = opts.dispatchTimeoutMs ?? DEFAULT_DISPATCH_TIMEOUT_MS;
|
|
1432
1441
|
const optimization = await runOptimization({ ...opts, dispatchTimeoutMs });
|
|
1433
1442
|
const winnerIsBaseline = optimization.winnerSurfaceHash === surfaceHash(opts.baselineSurface);
|
|
@@ -1832,4 +1841,4 @@ export {
|
|
|
1832
1841
|
provenanceSpansPath,
|
|
1833
1842
|
emitLoopProvenance
|
|
1834
1843
|
};
|
|
1835
|
-
//# sourceMappingURL=chunk-
|
|
1844
|
+
//# sourceMappingURL=chunk-RPLZ4OIB.js.map
|