npm - @tangle-network/agent-eval - Versions diffs - 0.32.0 → 0.33.1 - Mend

@tangle-network/agent-eval 0.32.0 → 0.33.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58) hide show

package/CHANGELOG.md +30 -0
package/dist/benchmarks/index.d.ts +2 -2
package/dist/chunk-DCZXFOQN.js +489 -0
package/dist/chunk-DCZXFOQN.js.map +1 -0
package/dist/{chunk-B73G44OH.js → chunk-FT3IAMQR.js} +5 -5
package/dist/chunk-FT3IAMQR.js.map +1 -0
package/dist/{chunk-GVQT44CS.js → chunk-KE7TDJUO.js} +2 -2
package/dist/{chunk-4L3WJXQJ.js → chunk-KHZRNY3F.js} +163 -2
package/dist/{chunk-4L3WJXQJ.js.map → chunk-KHZRNY3F.js.map} +1 -1
package/dist/{chunk-WGXZAQLR.js → chunk-LGAPK7NA.js} +2 -2
package/dist/{chunk-DTEJNZYK.js → chunk-SQYRO3BT.js} +47 -4
package/dist/chunk-SQYRO3BT.js.map +1 -0
package/dist/{chunk-CXJOVDJR.js → chunk-TQL7BAOY.js} +5 -175
package/dist/chunk-TQL7BAOY.js.map +1 -0
package/dist/{chunk-M6RZ5LJN.js → chunk-VXNVVBZO.js} +34 -5
package/dist/chunk-VXNVVBZO.js.map +1 -0
package/dist/{chunk-S4Y5VXMS.js → chunk-WRGHMGWT.js} +2 -2
package/dist/{chunk-SMSGXM74.js → chunk-YU3G6I7F.js} +2 -2
package/dist/cli.js +2 -2
package/dist/{control-p2ns7elI.d.ts → control-C3k02SCP.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/control.js +3 -2
package/dist/governance/index.d.ts +2 -1
package/dist/{index-DPILdKbP.d.ts → index-CN2agEaO.d.ts} +2 -142
package/dist/{index-BTqhGHJT.d.ts → index-ClMxVqe_.d.ts} +1 -1
package/dist/index.d.ts +39 -486
package/dist/index.js +75 -68
package/dist/index.js.map +1 -1
package/dist/judge-calibration-DilmB3Ml.d.ts +142 -0
package/dist/meta-eval/index.d.ts +2 -2
package/dist/openapi.json +1 -1
package/dist/optimization.d.ts +3 -3
package/dist/optimization.js +6 -6
package/dist/pipelines/index.js +2 -2
package/dist/release-report-ChfmCmLi.d.ts +713 -0
package/dist/reporting.d.ts +6 -4
package/dist/reporting.js +10 -9
package/dist/{researcher-BRHa5Jxo.d.ts → researcher-CfnL3HEb.d.ts} +34 -3
package/dist/rl.d.ts +5 -5
package/dist/rl.js +6 -6
package/dist/rl.js.map +1 -1
package/dist/{rubric-predictive-validity-CMHypZ_M.d.ts → rubric-predictive-validity-BvaNwfBE.d.ts} +1 -1
package/dist/{run-record-BfX5y68A.d.ts → run-record-YinVdFwu.d.ts} +78 -2
package/dist/{summary-report-D7AQS7eB.d.ts → summary-report-BPJVzIeW.d.ts} +2 -2
package/dist/wire/index.js +2 -2
package/docs/product-eval-adoption.md +18 -0
package/package.json +22 -12
package/dist/chunk-B73G44OH.js.map +0 -1
package/dist/chunk-CXJOVDJR.js.map +0 -1
package/dist/chunk-DTEJNZYK.js.map +0 -1
package/dist/chunk-M6RZ5LJN.js.map +0 -1
package/dist/chunk-ZN2CMQIW.js +0 -208
package/dist/chunk-ZN2CMQIW.js.map +0 -1
package/dist/release-report-DLWbBPtH.d.ts +0 -292
/package/dist/{chunk-GVQT44CS.js.map → chunk-KE7TDJUO.js.map} +0 -0
/package/dist/{chunk-WGXZAQLR.js.map → chunk-LGAPK7NA.js.map} +0 -0
/package/dist/{chunk-S4Y5VXMS.js.map → chunk-WRGHMGWT.js.map} +0 -0
/package/dist/{chunk-SMSGXM74.js.map → chunk-YU3G6I7F.js.map} +0 -0

package/dist/chunk-ZN2CMQIW.js.map DELETED Viewed

@@ -1 +0,0 @@

- {"version":3,"sources":["../src/run-record.ts"],"sourcesContent":["/**\n * Paper-grade RunRecord schema + runtime validator.\n *\n * Every run that participates in a promotion gate, paper table, or\n * researcher loop SHOULD be recorded as a `RunRecord`. The mandatory\n * fields are exactly those the paper \"Two Loops, Three Roles\" requires\n * for reproducibility: who/what/when/cost/seed/hash, plus the search vs\n * holdout split tag and either a `searchScore` or a `holdoutScore`.\n *\n * This is intentionally NOT a replacement for the rich `Run` /\n * `ProposeReviewReport` / `ScenarioResult` types already in the\n * package. Those are runtime structures with full provenance. A\n * `RunRecord` is the analysis-time projection — the JSON-friendly\n * row you'd put in a parquet file or paste into a notebook.\n *\n * Validate at the boundary:\n *\n * const rec = validateRunRecord(rawJson) // throws on missing\n * const ok = isRunRecord(rawJson) // boolean check\n * const rec = parseRunRecordSafe(rawJson) // { ok, value | error }\n *\n * The validator runs in pure TS — zod is intentionally NOT a\n * dependency. Round-trip tested in `tests/run-record.test.ts`.\n */\n\n/** Search/dev/holdout split tag. 'search' is the paper-grade alias for the\n * combined train+test pool that the optimizer is allowed to read. */\nexport type RunSplitTag = 'search' | 'dev' | 'holdout'\n\nexport interface RunTokenUsage {\n input: number\n output: number\n cached?: number\n}\n\nexport interface RunJudgeMetadata {\n model: string\n promptVersion: string\n /** [0,1] confidence the judge declared. Constant judge confidence\n * across many runs is a fallback signal (see `canary.ts`). */\n confidence: number\n /** True if the judge degraded to a fallback path (rules-only,\n * prior-call cache, etc.). The canary uses this to alert. */\n fallback: boolean\n}\n\n/**\n * Per-judge / per-dimension breakdown for runs scored by an ensemble of\n * judges over a multi-dimensional rubric.\n *\n * The collapsed `outcome.searchScore` / `holdoutScore` carries the\n * composite the gate uses. The full breakdown belongs here so consumers\n * can answer \"which judge disagreed?\", \"which dimension dragged the\n * composite down?\", and \"did half the panel fail?\" without re-running.\n *\n * `perJudge[judgeId][dim]` is the canonical source; `perDimMean` and\n * `composite` are convenience projections — derivable but precomputed so\n * downstream IRR primitives (`interRaterReliability`,\n * `corpusInterRaterAgreement`) and reporters don't pay the same\n * aggregation twice.\n *\n * Fail-loud discipline: judges that errored out land in `failedJudges`\n * by id. A missing key in `perJudge` is ambiguous (silent zero vs not\n * run); the explicit list makes a partial-failure recorded as such.\n */\nexport interface JudgeScoresRecord {\n /** Per-judge per-dimension scores. `{ \"kimi-k2.6\": { helpfulness: 0.8, clarity: 0.7 }, ... }`. */\n perJudge: Record<string, Record<string, number>>\n /** Per-dim mean across judges. Convenience — derivable from `perJudge`. */\n perDimMean: Record<string, number>\n /** Composite mean across all dims and judges. Mirrors the score\n * the gate sees on `outcome.searchScore` / `holdoutScore`. */\n composite: number\n /** Judges that errored or returned an unparseable verdict. Recorded\n * by id (e.g. `['glm-5.1']`) so a partial-failure case is explicit,\n * not inferred from missing keys in `perJudge`. */\n failedJudges?: string[]\n /** Free-form notes the judges emitted (joined across judges or\n * first-judge only — consumer's choice). */\n notes?: string\n}\n\nexport interface RunOutcome {\n /** Score on the search/optimization split. Optional because a\n * holdout-only evaluation only fills `holdoutScore`. */\n searchScore?: number\n /** Score on the held-out split. Optional because a search-only run\n * only fills `searchScore`. At least one must be present. */\n holdoutScore?: number\n /** Bag of any other metric the run produced — judge dimensions,\n * pass/fail counters, latency stats, etc. Numeric only — keeps\n * reporters honest. */\n raw: Record<string, number>\n /** Per-judge / per-dim breakdown. Consumers writing ensemble\n * judgements populate this; substrate primitives like\n * `interRaterReliability` and `corpusInterRaterAgreement` accept\n * these records as input. Optional — single-judge or scalar-only\n * runs leave it unset. */\n judgeScores?: JudgeScoresRecord\n}\n\n/**\n * Mandatory paper-grade fields for a single evaluation run. Optional\n * fields are extension points; mandatory fields throw if missing.\n *\n * Hash discipline:\n * - `promptHash` is the sha256 of the EFFECTIVE prompt sent to the\n * model (after any steering bundle merge).\n * - `configHash` is the sha256 of the effective run config (model,\n * temperature, tools, judges, splits). The pair (promptHash,\n * configHash) uniquely identifies an experimental cell.\n *\n * Model snapshot discipline:\n * - `model` MUST encode a snapshot version. Bare aliases like\n * `claude-sonnet-4` or `gpt-4o` are banned — they remap silently.\n * Use `claude-sonnet-4-6@2025-04-15` or `gpt-4o-2024-11-20`.\n */\nexport interface RunRecord {\n /** UUID for the run. */\n runId: string\n /** Logical experiment grouping (a treatment vs a baseline within\n * the same sweep should share `experimentId`). */\n experimentId: string\n /** Stable identifier for the candidate (variant) being run. The\n * promotion gate compares two `candidateId`s on matched items. */\n candidateId: string\n /** RNG seed for the run. Always recorded — silent re-seeding is\n * the most common cause of non-reproducible numbers. */\n seed: number\n /** Model identifier WITH snapshot version. */\n model: string\n /** sha256 of the effective prompt (post-steering). */\n promptHash: string\n /** sha256 of the effective config. */\n configHash: string\n /** Git SHA the harness was run from. */\n commitSha: string\n /** End-to-end wall-clock duration in milliseconds. */\n wallMs: number\n /** Time spent queued before execution started, if known. */\n queueMs?: number\n /** Total USD cost. Mandatory — runs without a cost number are\n * unbounded by definition and must not be admitted into the gate. */\n costUsd: number\n /** Token usage breakdown. */\n tokenUsage: RunTokenUsage\n /** Judge-side metadata, if a judge was used. */\n judgeMetadata?: RunJudgeMetadata\n /** Per-split scores + raw bag. */\n outcome: RunOutcome\n /** Categorical failure tag, when the run failed and the harness\n * classified it. Free-form string; standard tags live in\n * `failure-taxonomy.ts`. */\n failureMode?: string\n /** Which split this run was drawn from. */\n splitTag: RunSplitTag\n /**\n * Stable scenario identifier the run was scored against. Optional for\n * backwards compatibility, but **strongly recommended**: every primitive\n * that pairs runs by scenario (preferences, paired stats, BT tournament)\n * keys on this. The campaign artifact populates it canonically; legacy\n * runs without it fall back to inference from `outcome.raw.scenario_id`\n * or `experimentId`.\n */\n scenarioId?: string\n}\n\n// ── Validation ───────────────────────────────────────────────────────\n\nconst MANDATORY_TOP_LEVEL = [\n 'runId',\n 'experimentId',\n 'candidateId',\n 'seed',\n 'model',\n 'promptHash',\n 'configHash',\n 'commitSha',\n 'wallMs',\n 'costUsd',\n 'tokenUsage',\n 'outcome',\n 'splitTag',\n] as const\n\nimport { ValidationError } from './errors'\n\nconst SPLIT_TAGS: ReadonlyArray<RunSplitTag> = ['search', 'dev', 'holdout']\n\nexport class RunRecordValidationError extends ValidationError {\n readonly path: string\n constructor(message: string, path = '') {\n super(path ? `${message} (at ${path})` : message)\n this.path = path\n }\n}\n\n/**\n * Strict validator. Throws `RunRecordValidationError` on the first\n * missing or wrongly-typed field. Returns the input cast to\n * `RunRecord` on success — the validator does not coerce.\n */\nexport function validateRunRecord(input: unknown): RunRecord {\n if (input === null || typeof input !== 'object') {\n throw new RunRecordValidationError('expected object')\n }\n const obj = input as Record<string, unknown>\n\n for (const key of MANDATORY_TOP_LEVEL) {\n if (!(key in obj)) {\n throw new RunRecordValidationError(`missing mandatory field \"${key}\"`)\n }\n }\n\n expectString(obj.runId, 'runId')\n expectString(obj.experimentId, 'experimentId')\n expectString(obj.candidateId, 'candidateId')\n expectFiniteNumber(obj.seed, 'seed')\n expectString(obj.model, 'model')\n expectString(obj.promptHash, 'promptHash')\n expectString(obj.configHash, 'configHash')\n expectString(obj.commitSha, 'commitSha')\n expectFiniteNumber(obj.wallMs, 'wallMs')\n if (obj.queueMs !== undefined) expectFiniteNumber(obj.queueMs, 'queueMs')\n expectFiniteNumber(obj.costUsd, 'costUsd')\n\n // Snapshot discipline: bare model aliases are not paper-grade.\n if (!modelHasSnapshot(obj.model as string)) {\n throw new RunRecordValidationError(\n `model \"${obj.model}\" lacks a snapshot version (use 'name@YYYY-MM-DD' or 'name-YYYYMMDD')`,\n 'model',\n )\n }\n\n // Token usage.\n const tu = obj.tokenUsage\n if (tu === null || typeof tu !== 'object') {\n throw new RunRecordValidationError('tokenUsage must be an object', 'tokenUsage')\n }\n const tuRec = tu as Record<string, unknown>\n expectFiniteNumber(tuRec.input, 'tokenUsage.input')\n expectFiniteNumber(tuRec.output, 'tokenUsage.output')\n if (tuRec.cached !== undefined) expectFiniteNumber(tuRec.cached, 'tokenUsage.cached')\n\n // Judge metadata, optional.\n if (obj.judgeMetadata !== undefined) {\n const jm = obj.judgeMetadata\n if (jm === null || typeof jm !== 'object') {\n throw new RunRecordValidationError('judgeMetadata must be an object', 'judgeMetadata')\n }\n const jmRec = jm as Record<string, unknown>\n expectString(jmRec.model, 'judgeMetadata.model')\n expectString(jmRec.promptVersion, 'judgeMetadata.promptVersion')\n expectFiniteNumber(jmRec.confidence, 'judgeMetadata.confidence')\n if (typeof jmRec.fallback !== 'boolean') {\n throw new RunRecordValidationError(\n 'judgeMetadata.fallback must be boolean',\n 'judgeMetadata.fallback',\n )\n }\n }\n\n // Outcome.\n const out = obj.outcome\n if (out === null || typeof out !== 'object') {\n throw new RunRecordValidationError('outcome must be an object', 'outcome')\n }\n const outRec = out as Record<string, unknown>\n if (outRec.searchScore !== undefined)\n expectFiniteNumber(outRec.searchScore, 'outcome.searchScore')\n if (outRec.holdoutScore !== undefined)\n expectFiniteNumber(outRec.holdoutScore, 'outcome.holdoutScore')\n if (outRec.searchScore === undefined && outRec.holdoutScore === undefined) {\n throw new RunRecordValidationError(\n 'outcome must define searchScore or holdoutScore (or both)',\n 'outcome',\n )\n }\n const raw = outRec.raw\n if (raw === null || typeof raw !== 'object') {\n throw new RunRecordValidationError('outcome.raw must be an object', 'outcome.raw')\n }\n for (const [k, v] of Object.entries(raw as Record<string, unknown>)) {\n expectFiniteNumber(v, `outcome.raw.${k}`)\n }\n\n // Per-judge / per-dim breakdown, optional.\n if (outRec.judgeScores !== undefined) {\n validateJudgeScores(outRec.judgeScores, 'outcome.judgeScores')\n }\n\n // Failure mode optional.\n if (obj.failureMode !== undefined) expectString(obj.failureMode, 'failureMode')\n\n // Split tag.\n if (typeof obj.splitTag !== 'string' || !SPLIT_TAGS.includes(obj.splitTag as RunSplitTag)) {\n throw new RunRecordValidationError(\n `splitTag must be one of ${SPLIT_TAGS.join(', ')}, got ${String(obj.splitTag)}`,\n 'splitTag',\n )\n }\n\n return input as RunRecord\n}\n\n/** Boolean validator — convenience for filtering arrays. */\nexport function isRunRecord(input: unknown): input is RunRecord {\n try {\n validateRunRecord(input)\n return true\n } catch {\n return false\n }\n}\n\n/** Non-throwing validator — returns a discriminated union. */\nexport function parseRunRecordSafe(\n input: unknown,\n): { ok: true; value: RunRecord } | { ok: false; error: RunRecordValidationError } {\n try {\n return { ok: true, value: validateRunRecord(input) }\n } catch (e) {\n if (e instanceof RunRecordValidationError) return { ok: false, error: e }\n throw e\n }\n}\n\n/** Round-trip helper — `JSON.parse(JSON.stringify(record))` then validate. */\nexport function roundTripRunRecord(record: RunRecord): RunRecord {\n const json = JSON.stringify(record)\n return validateRunRecord(JSON.parse(json))\n}\n\n// ── Internals ────────────────────────────────────────────────────────\n\nfunction expectString(value: unknown, path: string): void {\n if (typeof value !== 'string' || value.length === 0) {\n throw new RunRecordValidationError(`expected non-empty string`, path)\n }\n}\n\nfunction expectFiniteNumber(value: unknown, path: string): void {\n if (typeof value !== 'number' || !Number.isFinite(value)) {\n throw new RunRecordValidationError(`expected finite number`, path)\n }\n}\n\nfunction validateJudgeScores(value: unknown, path: string): void {\n if (value === null || typeof value !== 'object') {\n throw new RunRecordValidationError('judgeScores must be an object', path)\n }\n const rec = value as Record<string, unknown>\n\n const perJudge = rec.perJudge\n if (perJudge === null || typeof perJudge !== 'object') {\n throw new RunRecordValidationError('perJudge must be an object', `${path}.perJudge`)\n }\n for (const [judgeId, dims] of Object.entries(perJudge as Record<string, unknown>)) {\n if (dims === null || typeof dims !== 'object') {\n throw new RunRecordValidationError(\n 'per-judge entry must be an object of dimension scores',\n `${path}.perJudge.${judgeId}`,\n )\n }\n for (const [dim, score] of Object.entries(dims as Record<string, unknown>)) {\n expectFiniteNumber(score, `${path}.perJudge.${judgeId}.${dim}`)\n }\n }\n\n const perDimMean = rec.perDimMean\n if (perDimMean === null || typeof perDimMean !== 'object') {\n throw new RunRecordValidationError('perDimMean must be an object', `${path}.perDimMean`)\n }\n for (const [dim, mean] of Object.entries(perDimMean as Record<string, unknown>)) {\n expectFiniteNumber(mean, `${path}.perDimMean.${dim}`)\n }\n\n expectFiniteNumber(rec.composite, `${path}.composite`)\n\n if (rec.failedJudges !== undefined) {\n if (!Array.isArray(rec.failedJudges)) {\n throw new RunRecordValidationError(\n 'failedJudges must be an array of strings',\n `${path}.failedJudges`,\n )\n }\n for (let i = 0; i < rec.failedJudges.length; i++) {\n const id = rec.failedJudges[i]\n if (typeof id !== 'string' || id.length === 0) {\n throw new RunRecordValidationError(\n 'failedJudges entry must be a non-empty string',\n `${path}.failedJudges[${i}]`,\n )\n }\n }\n }\n\n if (rec.notes !== undefined && typeof rec.notes !== 'string') {\n throw new RunRecordValidationError('notes must be a string', `${path}.notes`)\n }\n}\n\n/**\n * Heuristic snapshot check. Accepts:\n * - `name@YYYY-MM-DD` (Anthropic style: `claude-sonnet-4-6@2025-04-15`)\n * - `name-YYYYMMDD` (OpenAI style: `gpt-4o-2024-11-20`)\n * - `name@<arbitrary-token>` (allow opaque snapshots like `@v3`)\n * - explicit `:date-...` Vertex-style tags\n *\n * Rejects bare aliases like `claude-sonnet-4` or `gpt-4o` that remap\n * silently as providers ship new snapshots.\n */\nfunction modelHasSnapshot(model: string): boolean {\n if (model.includes('@')) return true\n if (/-\\d{8}$/.test(model)) return true\n if (/-\\d{4}-\\d{2}-\\d{2}$/.test(model)) return true\n if (/:date-/.test(model)) return true\n return false\n}\n"],"mappings":";;;;;AAyKA,IAAM,sBAAsB;AAAA,EAC1B;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAIA,IAAM,aAAyC,CAAC,UAAU,OAAO,SAAS;AAEnE,IAAM,2BAAN,cAAuC,gBAAgB;AAAA,EACnD;AAAA,EACT,YAAY,SAAiB,OAAO,IAAI;AACtC,UAAM,OAAO,GAAG,OAAO,QAAQ,IAAI,MAAM,OAAO;AAChD,SAAK,OAAO;AAAA,EACd;AACF;AAOO,SAAS,kBAAkB,OAA2B;AAC3D,MAAI,UAAU,QAAQ,OAAO,UAAU,UAAU;AAC/C,UAAM,IAAI,yBAAyB,iBAAiB;AAAA,EACtD;AACA,QAAM,MAAM;AAEZ,aAAW,OAAO,qBAAqB;AACrC,QAAI,EAAE,OAAO,MAAM;AACjB,YAAM,IAAI,yBAAyB,4BAA4B,GAAG,GAAG;AAAA,IACvE;AAAA,EACF;AAEA,eAAa,IAAI,OAAO,OAAO;AAC/B,eAAa,IAAI,cAAc,cAAc;AAC7C,eAAa,IAAI,aAAa,aAAa;AAC3C,qBAAmB,IAAI,MAAM,MAAM;AACnC,eAAa,IAAI,OAAO,OAAO;AAC/B,eAAa,IAAI,YAAY,YAAY;AACzC,eAAa,IAAI,YAAY,YAAY;AACzC,eAAa,IAAI,WAAW,WAAW;AACvC,qBAAmB,IAAI,QAAQ,QAAQ;AACvC,MAAI,IAAI,YAAY,OAAW,oBAAmB,IAAI,SAAS,SAAS;AACxE,qBAAmB,IAAI,SAAS,SAAS;AAGzC,MAAI,CAAC,iBAAiB,IAAI,KAAe,GAAG;AAC1C,UAAM,IAAI;AAAA,MACR,UAAU,IAAI,KAAK;AAAA,MACnB;AAAA,IACF;AAAA,EACF;AAGA,QAAM,KAAK,IAAI;AACf,MAAI,OAAO,QAAQ,OAAO,OAAO,UAAU;AACzC,UAAM,IAAI,yBAAyB,gCAAgC,YAAY;AAAA,EACjF;AACA,QAAM,QAAQ;AACd,qBAAmB,MAAM,OAAO,kBAAkB;AAClD,qBAAmB,MAAM,QAAQ,mBAAmB;AACpD,MAAI,MAAM,WAAW,OAAW,oBAAmB,MAAM,QAAQ,mBAAmB;AAGpF,MAAI,IAAI,kBAAkB,QAAW;AACnC,UAAM,KAAK,IAAI;AACf,QAAI,OAAO,QAAQ,OAAO,OAAO,UAAU;AACzC,YAAM,IAAI,yBAAyB,mCAAmC,eAAe;AAAA,IACvF;AACA,UAAM,QAAQ;AACd,iBAAa,MAAM,OAAO,qBAAqB;AAC/C,iBAAa,MAAM,eAAe,6BAA6B;AAC/D,uBAAmB,MAAM,YAAY,0BAA0B;AAC/D,QAAI,OAAO,MAAM,aAAa,WAAW;AACvC,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAGA,QAAM,MAAM,IAAI;AAChB,MAAI,QAAQ,QAAQ,OAAO,QAAQ,UAAU;AAC3C,UAAM,IAAI,yBAAyB,6BAA6B,SAAS;AAAA,EAC3E;AACA,QAAM,SAAS;AACf,MAAI,OAAO,gBAAgB;AACzB,uBAAmB,OAAO,aAAa,qBAAqB;AAC9D,MAAI,OAAO,iBAAiB;AAC1B,uBAAmB,OAAO,cAAc,sBAAsB;AAChE,MAAI,OAAO,gBAAgB,UAAa,OAAO,iBAAiB,QAAW;AACzE,UAAM,IAAI;AAAA,MACR;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACA,QAAM,MAAM,OAAO;AACnB,MAAI,QAAQ,QAAQ,OAAO,QAAQ,UAAU;AAC3C,UAAM,IAAI,yBAAyB,iCAAiC,aAAa;AAAA,EACnF;AACA,aAAW,CAAC,GAAG,CAAC,KAAK,OAAO,QAAQ,GAA8B,GAAG;AACnE,uBAAmB,GAAG,eAAe,CAAC,EAAE;AAAA,EAC1C;AAGA,MAAI,OAAO,gBAAgB,QAAW;AACpC,wBAAoB,OAAO,aAAa,qBAAqB;AAAA,EAC/D;AAGA,MAAI,IAAI,gBAAgB,OAAW,cAAa,IAAI,aAAa,aAAa;AAG9E,MAAI,OAAO,IAAI,aAAa,YAAY,CAAC,WAAW,SAAS,IAAI,QAAuB,GAAG;AACzF,UAAM,IAAI;AAAA,MACR,2BAA2B,WAAW,KAAK,IAAI,CAAC,SAAS,OAAO,IAAI,QAAQ,CAAC;AAAA,MAC7E;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AACT;AAGO,SAAS,YAAY,OAAoC;AAC9D,MAAI;AACF,sBAAkB,KAAK;AACvB,WAAO;AAAA,EACT,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAGO,SAAS,mBACd,OACiF;AACjF,MAAI;AACF,WAAO,EAAE,IAAI,MAAM,OAAO,kBAAkB,KAAK,EAAE;AAAA,EACrD,SAAS,GAAG;AACV,QAAI,aAAa,yBAA0B,QAAO,EAAE,IAAI,OAAO,OAAO,EAAE;AACxE,UAAM;AAAA,EACR;AACF;AAGO,SAAS,mBAAmB,QAA8B;AAC/D,QAAM,OAAO,KAAK,UAAU,MAAM;AAClC,SAAO,kBAAkB,KAAK,MAAM,IAAI,CAAC;AAC3C;AAIA,SAAS,aAAa,OAAgB,MAAoB;AACxD,MAAI,OAAO,UAAU,YAAY,MAAM,WAAW,GAAG;AACnD,UAAM,IAAI,yBAAyB,6BAA6B,IAAI;AAAA,EACtE;AACF;AAEA,SAAS,mBAAmB,OAAgB,MAAoB;AAC9D,MAAI,OAAO,UAAU,YAAY,CAAC,OAAO,SAAS,KAAK,GAAG;AACxD,UAAM,IAAI,yBAAyB,0BAA0B,IAAI;AAAA,EACnE;AACF;AAEA,SAAS,oBAAoB,OAAgB,MAAoB;AAC/D,MAAI,UAAU,QAAQ,OAAO,UAAU,UAAU;AAC/C,UAAM,IAAI,yBAAyB,iCAAiC,IAAI;AAAA,EAC1E;AACA,QAAM,MAAM;AAEZ,QAAM,WAAW,IAAI;AACrB,MAAI,aAAa,QAAQ,OAAO,aAAa,UAAU;AACrD,UAAM,IAAI,yBAAyB,8BAA8B,GAAG,IAAI,WAAW;AAAA,EACrF;AACA,aAAW,CAAC,SAAS,IAAI,KAAK,OAAO,QAAQ,QAAmC,GAAG;AACjF,QAAI,SAAS,QAAQ,OAAO,SAAS,UAAU;AAC7C,YAAM,IAAI;AAAA,QACR;AAAA,QACA,GAAG,IAAI,aAAa,OAAO;AAAA,MAC7B;AAAA,IACF;AACA,eAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,IAA+B,GAAG;AAC1E,yBAAmB,OAAO,GAAG,IAAI,aAAa,OAAO,IAAI,GAAG,EAAE;AAAA,IAChE;AAAA,EACF;AAEA,QAAM,aAAa,IAAI;AACvB,MAAI,eAAe,QAAQ,OAAO,eAAe,UAAU;AACzD,UAAM,IAAI,yBAAyB,gCAAgC,GAAG,IAAI,aAAa;AAAA,EACzF;AACA,aAAW,CAAC,KAAK,IAAI,KAAK,OAAO,QAAQ,UAAqC,GAAG;AAC/E,uBAAmB,MAAM,GAAG,IAAI,eAAe,GAAG,EAAE;AAAA,EACtD;AAEA,qBAAmB,IAAI,WAAW,GAAG,IAAI,YAAY;AAErD,MAAI,IAAI,iBAAiB,QAAW;AAClC,QAAI,CAAC,MAAM,QAAQ,IAAI,YAAY,GAAG;AACpC,YAAM,IAAI;AAAA,QACR;AAAA,QACA,GAAG,IAAI;AAAA,MACT;AAAA,IACF;AACA,aAAS,IAAI,GAAG,IAAI,IAAI,aAAa,QAAQ,KAAK;AAChD,YAAM,KAAK,IAAI,aAAa,CAAC;AAC7B,UAAI,OAAO,OAAO,YAAY,GAAG,WAAW,GAAG;AAC7C,cAAM,IAAI;AAAA,UACR;AAAA,UACA,GAAG,IAAI,iBAAiB,CAAC;AAAA,QAC3B;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,MAAI,IAAI,UAAU,UAAa,OAAO,IAAI,UAAU,UAAU;AAC5D,UAAM,IAAI,yBAAyB,0BAA0B,GAAG,IAAI,QAAQ;AAAA,EAC9E;AACF;AAYA,SAAS,iBAAiB,OAAwB;AAChD,MAAI,MAAM,SAAS,GAAG,EAAG,QAAO;AAChC,MAAI,UAAU,KAAK,KAAK,EAAG,QAAO;AAClC,MAAI,sBAAsB,KAAK,KAAK,EAAG,QAAO;AAC9C,MAAI,SAAS,KAAK,KAAK,EAAG,QAAO;AACjC,SAAO;AACT;","names":[]}

package/dist/release-report-DLWbBPtH.d.ts DELETED Viewed

@@ -1,292 +0,0 @@
-import { D as DatasetSplit, b as DatasetManifest, a as DatasetScenario } from './dataset-ueRVTUoY.js';
-import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-D7AQS7eB.js';
-import { R as RunRecord, a as RunSplitTag } from './run-record-BfX5y68A.js';
-/**
- * Release confidence gate.
- *
- * This is the production-facing composition layer over the lower-level
- * primitives:
- *   - Dataset manifests prove corpus/version coverage.
- *   - RunRecord rows prove reproducible search/holdout outcomes.
- *   - Multi-shot trace evidence carries turn counts and ASI diagnostics.
- *   - HeldOutGate decisions remain the paired promotion authority.
- *
- * The gate is intentionally pure and conservative. Missing declared evidence
- * fails closed instead of being treated as a neutral zero.
- */
-type ReleaseConfidenceStatus = 'pass' | 'warn' | 'fail';
-type ReleaseConfidenceAxisName = 'corpus' | 'quality' | 'generalization' | 'diagnostics' | 'efficiency';
-interface ReleaseTraceEvidence {
-    scenarioId: string;
-    candidateId?: string;
-    split?: RunSplitTag;
-    score?: number;
-    ok?: boolean;
-    turnCount?: number;
-    costUsd?: number;
-    durationMs?: number;
-    failureMode?: string;
-    asi?: ActionableSideInfo[];
-    metadata?: Record<string, unknown>;
-}
-interface ReleaseConfidenceThresholds {
-    /** Require a Dataset manifest or explicit scenarios. Default true. */
-    requireCorpus?: boolean;
-    minScenarioCount?: number;
-    minSearchRuns?: number;
-    minHoldoutRuns?: number;
-    /** Require at least one holdout scenario/run. Default true. */
-    requireHoldout?: boolean;
-    minPassRate?: number;
-    minMeanScore?: number;
-    /** Search mean may exceed holdout mean by at most this much. */
-    maxOverfitGap?: number;
-    maxMeanCostUsd?: number;
-    maxP95WallMs?: number;
-    /** Low-score/failed rows must carry ASI. Default true. */
-    requireAsiForFailures?: boolean;
-    /** Score below this is considered a failure for ASI coverage. Default 0.5. */
-    failureScoreThreshold?: number;
-}
-interface ReleaseConfidenceInput {
-    target: string;
-    candidateId?: string;
-    baselineId?: string;
-    dataset?: DatasetManifest;
-    scenarios?: readonly DatasetScenario[];
-    runs?: readonly RunRecord[];
-    traces?: readonly ReleaseTraceEvidence[];
-    gateDecision?: GateDecision | null;
-    thresholds?: ReleaseConfidenceThresholds;
-}
-interface ReleaseConfidenceAxis {
-    name: ReleaseConfidenceAxisName;
-    status: ReleaseConfidenceStatus;
-    score: number;
-    detail: string;
-}
-interface ReleaseConfidenceIssue {
-    axis: ReleaseConfidenceAxisName;
-    severity: 'critical' | 'warning';
-    code: string;
-    detail: string;
-}
-interface ReleaseConfidenceMetrics {
-    scenarioCount: number;
-    searchRuns: number;
-    holdoutRuns: number;
-    passRate: number;
-    meanScore: number;
-    searchMeanScore: number;
-    holdoutMeanScore: number;
-    overfitGap: number;
-    meanCostUsd: number;
-    p95WallMs: number;
-    failedRows: number;
-    failuresWithAsi: number;
-    singleShotTraces: number;
-    multiShotTraces: number;
-    splitCounts: Record<DatasetSplit, number>;
-    domainCounts: Record<string, number>;
-    failureModeCounts: Record<string, number>;
-    responsibleSurfaceCounts: Record<string, number>;
-}
-interface ReleaseConfidenceScorecard {
-    target: string;
-    candidateId: string | null;
-    baselineId: string | null;
-    status: ReleaseConfidenceStatus;
-    promote: boolean;
-    axes: ReleaseConfidenceAxis[];
-    issues: ReleaseConfidenceIssue[];
-    metrics: ReleaseConfidenceMetrics;
-    dataset: DatasetManifest | null;
-    gateDecision: GateDecision | null;
-    summary: string;
-}
-declare function releaseTraceEvidenceFromMultiShotTrials(trials: readonly MultiShotTrialResult[]): ReleaseTraceEvidence[];
-declare function evaluateReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
-declare function assertReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
-/**
- * Paper-grade paired statistics for held-out promotion gates.
- *
- * The promotion gate (`HeldOutGate`) needs three things:
- *
- *   1. A bootstrap confidence interval on the per-item paired delta
- *      (`pairedBootstrap`). Median delta is the headline number; the
- *      CI lower bound is what the gate checks against `pairedDeltaThreshold`.
- *   2. A non-parametric significance test on the paired deltas
- *      (`pairedWilcoxon` — re-export of `wilcoxonSignedRank` under the
- *      paper-style name).
- *   3. False-discovery-rate correction across simultaneously-tested
- *      candidate variants (`bhAdjust` — re-export of `benjaminiHochberg`).
- *
- * Why a separate file: every existing primitive lives in `statistics.ts`
- * (general) or `power-analysis.ts` (correction). Paired-bootstrap is
- * paired-only, paper-grade, and load-bearing for the promotion gate.
- * Putting it next to `statistics.ts` would require editing that file;
- * the brief forbids that. New file, new exports, no surface change.
- */
-interface PairedBootstrapResult {
-    /** Number of paired observations (after dropping unequal lengths is rejected). */
-    n: number;
-    /** Median of paired deltas (after − before). */
-    median: number;
-    /** Mean of paired deltas. */
-    mean: number;
-    /** Lower bound of the bootstrap CI on the median delta. */
-    low: number;
-    /** Upper bound of the bootstrap CI on the median delta. */
-    high: number;
-    /** Confidence level used (e.g. 0.95). */
-    confidence: number;
-    /** Number of bootstrap resamples used. */
-    resamples: number;
-}
-interface PairedBootstrapOptions {
-    /** Confidence level. Default 0.95. */
-    confidence?: number;
-    /** Bootstrap resample count. Default 2000. */
-    resamples?: number;
-    /** Statistic to bootstrap. Default 'median'. */
-    statistic?: 'median' | 'mean';
-    /** Deterministic seed. If omitted, uses Math.random(). */
-    seed?: number;
-}
-/**
- * Paired bootstrap on (after - before) deltas. Returns a CI on the
- * chosen statistic (median by default). Pairs are resampled with
- * replacement. The lower bound is what the promotion gate checks: if
- * `low > pairedDeltaThreshold`, the gain is real at the chosen
- * confidence level.
- *
- * Throws on unequal sample sizes — caller must align pairs upstream.
- */
-declare function pairedBootstrap(before: number[], after: number[], opts?: PairedBootstrapOptions): PairedBootstrapResult;
-/**
- * Paper-style alias for `wilcoxonSignedRank`. The signed-rank test on
- * paired deltas is the standard non-parametric significance test for
- * "candidate beats baseline on matched items." Use alongside the
- * bootstrap CI: bootstrap gives effect size, Wilcoxon gives p.
- */
-declare function pairedWilcoxon(before: number[], after: number[]): {
-    w: number;
-    p: number;
-};
-/**
- * Paper-style alias for `benjaminiHochberg`. Use to correct p-values
- * across multiple candidate-vs-baseline comparisons run in the same
- * promotion sweep. Returns BH-adjusted q-values and significance at
- * the requested FDR (default 0.05).
- */
-declare function bhAdjust(pValues: number[], fdr?: number): {
-    qValues: number[];
-    significant: boolean[];
-};
-/**
- * Bootstrap-CI promotion gate.
- *
- * In any iterative-improvement loop (GEPA, prompt evolution, dataset
- * curation), the question is "did this generation actually improve, or are
- * we celebrating noise?". With small N and noisy outcomes, point-estimate
- * deltas lie. Bootstrap confidence intervals tell the operator whether the
- * delta is real before code or prompts get promoted.
- *
- * This module is pure functions — no I/O, no model calls. Easy to unit-test
- * and to compose into any verdict gate.
- *
- * Default gate:
- *   - Bootstrap mean baseline vs candidate (1k resamples).
- *   - Compute the delta distribution; pass if the lower CI bound > 0.
- *   - Tunable confidence (default 95%) and resample count.
- *
- * Verdict semantics intentionally match the existing `experiments.jsonl`
- * vocabulary:
- *   - ADVANCE: candidate's CI lower bound > baseline mean (real win)
- *   - KEEP:    overlap, but candidate point estimate >= baseline (neutral)
- *   - REVERT:  candidate's CI upper bound < baseline mean (real regression)
- *   - INCONCLUSIVE: not enough samples or CI straddles zero with no signal
- */
-type Verdict = 'ADVANCE' | 'KEEP' | 'REVERT' | 'INCONCLUSIVE';
-interface BootstrapResult {
-    baselineMean: number;
-    candidateMean: number;
-    /** candidateMean - baselineMean, point estimate. */
-    delta: number;
-    /** Lower bound of the (1 - alpha) CI on the delta. */
-    ciLower: number;
-    /** Upper bound of the (1 - alpha) CI on the delta. */
-    ciUpper: number;
-    /** Number of bootstrap resamples used. */
-    iterations: number;
-    alpha: number;
-    verdict: Verdict;
-}
-interface BootstrapOptions {
-    /** Confidence level alpha (default 0.05 → 95% CI). */
-    alpha?: number;
-    /** Number of resamples (default 1000). */
-    iterations?: number;
-    /**
-     * Minimum total samples (baseline + candidate) below which we always
-     * return INCONCLUSIVE — bootstrap with too few samples is meaningless.
-     * Default 6 (combined).
-     */
-    minTotalSamples?: number;
-    /** RNG seed for reproducibility. Default: Math.random. */
-    seed?: number;
-}
-/**
- * Compute the bootstrap CI on (candidateMean - baselineMean) and a verdict.
- *
- * Uses simple percentile bootstrap on the difference of resampled means.
- * That's the standard non-parametric primitive — no distributional
- * assumptions, robust to skew, easy to reason about.
- */
-declare function bootstrapCi(baseline: number[], candidate: number[], options?: BootstrapOptions): BootstrapResult;
-/**
- * Judge-replay promotion gate.
- *
- * The cheap inner-loop judge that drives an evolution run is by definition
- * fast and noisy. When you're about to promote a winning variant to the
- * canonical default, you want a STRONGER judge (a more expensive model, a
- * human grader, a separately-trained reward model) to confirm the win
- * generalises beyond the inner loop.
- *
- * This helper takes raw winner + baseline outputs, scores both through the
- * stronger judge, and applies `bootstrapCi`. ADVANCE means the stronger
- * judge agrees the winner is real with the configured confidence. Doesn't
- * matter what shape your "output" is — pass a string, an object, anything
- * the judge can read.
- */
-interface JudgeReplayGateArgs<TOutput> {
-    baselineOutputs: TOutput[];
-    candidateOutputs: TOutput[];
-    /** Stronger judge — async to allow LLM calls. Return a 0..N scalar score. */
-    judge: (output: TOutput) => Promise<number> | number;
-    alpha?: number;
-    iterations?: number;
-    /** RNG seed for reproducibility. */
-    seed?: number;
-    /** Maximum concurrent judge calls. Default 4. */
-    judgeConcurrency?: number;
-}
-declare function judgeReplayGate<TOutput>(args: JudgeReplayGateArgs<TOutput>): Promise<BootstrapResult & {
-    baselineSamples: number;
-    candidateSamples: number;
-}>;
-interface RenderReleaseReportOptions {
-    title?: string;
-    runs?: readonly RunRecord[];
-    comparator?: string;
-    traceAnalystFindings?: readonly string[];
-    nextActions?: readonly string[];
-}
-declare function renderReleaseReport(scorecard: ReleaseConfidenceScorecard, options?: RenderReleaseReportOptions): string;
-export { type BootstrapOptions as B, type JudgeReplayGateArgs as J, type PairedBootstrapOptions as P, type ReleaseConfidenceAxis as R, type Verdict as V, type BootstrapResult as a, type PairedBootstrapResult as b, type ReleaseConfidenceAxisName as c, type ReleaseConfidenceInput as d, type ReleaseConfidenceIssue as e, type ReleaseConfidenceMetrics as f, type ReleaseConfidenceScorecard as g, type ReleaseConfidenceStatus as h, type ReleaseConfidenceThresholds as i, type ReleaseTraceEvidence as j, type RenderReleaseReportOptions as k, assertReleaseConfidence as l, bhAdjust as m, bootstrapCi as n, evaluateReleaseConfidence as o, judgeReplayGate as p, pairedBootstrap as q, pairedWilcoxon as r, releaseTraceEvidenceFromMultiShotTrials as s, renderReleaseReport as t };

/package/dist/{chunk-GVQT44CS.js.map → chunk-KE7TDJUO.js.map} RENAMED Viewed

File without changes

/package/dist/{chunk-WGXZAQLR.js.map → chunk-LGAPK7NA.js.map} RENAMED Viewed

File without changes

/package/dist/{chunk-S4Y5VXMS.js.map → chunk-WRGHMGWT.js.map} RENAMED Viewed

File without changes

/package/dist/{chunk-SMSGXM74.js.map → chunk-YU3G6I7F.js.map} RENAMED Viewed

File without changes