@tangle-network/agent-eval 0.47.0 → 0.49.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. package/README.md +7 -0
  2. package/dist/adapters/otel.d.ts +103 -0
  3. package/dist/adapters/otel.js +110 -0
  4. package/dist/adapters/otel.js.map +1 -0
  5. package/dist/campaign/index.d.ts +2 -2
  6. package/dist/campaign/index.js +1 -1
  7. package/dist/{chunk-ZQABFCVJ.js → chunk-OYI6RZJK.js} +9 -14
  8. package/dist/chunk-OYI6RZJK.js.map +1 -0
  9. package/dist/{chunk-HRKOCLQA.js → chunk-XAP6DJZE.js} +1 -1
  10. package/dist/chunk-XAP6DJZE.js.map +1 -0
  11. package/dist/contract/index.d.ts +3 -3
  12. package/dist/contract/index.js +4 -4
  13. package/dist/contract/index.js.map +1 -1
  14. package/dist/hosted/index.js +1 -1
  15. package/dist/index.d.ts +6 -5
  16. package/dist/index.js +30 -3
  17. package/dist/index.js.map +1 -1
  18. package/dist/matrix/index.d.ts +2 -2
  19. package/dist/multishot/index.d.ts +2 -2
  20. package/dist/openapi.json +1 -1
  21. package/dist/{release-report-BtpgWRI0.d.ts → release-report-DBB8lB1P.d.ts} +1 -1
  22. package/dist/reporting.d.ts +2 -2
  23. package/dist/{researcher-CoJMs2Iz.d.ts → researcher-CHMO56K0.d.ts} +1 -1
  24. package/dist/rl.d.ts +3 -3
  25. package/dist/rl.js +3 -1
  26. package/dist/rl.js.map +1 -1
  27. package/dist/{run-improvement-loop-Bfam3MT1.d.ts → run-improvement-loop-B-L8GgpW.d.ts} +1 -1
  28. package/dist/{sequential-DdV5ShjT.d.ts → sequential-CbFH___X.d.ts} +23 -1
  29. package/dist/{types-DHqkLwEU.d.ts → types-CqPax19X.d.ts} +1 -1
  30. package/dist/verdict-CeEgtjyI.d.ts +32 -0
  31. package/docs/adapters-observability.md +15 -0
  32. package/docs/design/substrate-gaps.md +118 -0
  33. package/package.json +17 -31
  34. package/dist/chunk-HRKOCLQA.js.map +0 -1
  35. package/dist/chunk-ZQABFCVJ.js.map +0 -1
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/contract/self-improve.ts"],"sourcesContent":["/**\n * # `selfImprove()` — the LAND-tier one-shot.\n *\n * The cheapest possible call site to run a real closed-loop self-\n * improvement over your agent. Wraps `runImprovementLoop` with smart\n * defaults and a budget-shaped options API; every escape hatch the\n * substrate exposes is reachable from here without losing the\n * one-function feel.\n *\n * Defaults picked to match the LAND-tier story:\n * - In-memory storage (no filesystem touch).\n * - `gepaDriver` reflective mutation with copywriting-flavored primitives\n * (override `driver` or `mutationPrimitives` for any domain).\n * - `defaultProductionGate` with `deltaThreshold: 0.05`.\n * - Held-out split = 25% of scenarios, deterministic by id hash.\n * - 3 generations × population 2 (raise via `budget` for more search).\n * - `autoOnPromote: 'none'` (we don't open PRs unless you ask).\n *\n * Want one-click? Provide `agent` + `scenarios` + `judge`. Done.\n * Want distributed? Pass `cellPlacement` + an `httpDispatch`-backed\n * agent. Want a code-tier surface? Pass a `MutableSurface` + your own\n * `driver`. Same function.\n */\n\nimport { gepaDriver } from '../campaign/drivers/gepa'\nimport { defaultProductionGate } from '../campaign/gates/default-production-gate'\nimport {\n type RunImprovementLoopResult,\n runImprovementLoop,\n} from '../campaign/presets/run-improvement-loop'\nimport { type CampaignStorage, inMemoryCampaignStorage } from '../campaign/storage'\nimport type {\n DispatchContext,\n Gate,\n ImprovementDriver,\n JudgeConfig,\n MutableSurface,\n Scenario,\n} from '../campaign/types'\nimport { createHostedClient, type HostedTenant } from '../hosted/client'\nimport type {\n EvalRunCellScore,\n EvalRunEvent,\n EvalRunGenerationSnapshot,\n} from '../hosted/types'\n\nexport interface SelfImproveBudget {\n /** Hard $ ceiling across all cells in baseline + every generation. Cells\n * beyond the ceiling are skipped (cost-aware, not aborted). */\n dollars?: number\n /** How many improvement generations to explore. Default 3. Set 0 to\n * skip improvement entirely (selfImprove becomes a baseline-only run). */\n generations?: number\n /** Candidates the driver proposes per generation. Default 2. */\n populationSize?: number\n /** Max concurrent cells across the loop. Default 2. */\n maxConcurrency?: number\n /** Fraction of `scenarios` held out from training, used for the gate.\n * Default 0.25. Ignored when `holdoutScenarios` is set explicitly. */\n holdoutFraction?: number\n /** Explicit held-out scenarios; overrides `holdoutFraction`. */\n holdoutScenarios?: Scenario[]\n}\n\nexport interface SelfImproveLlm {\n /** Endpoint base URL. Default Tangle Router. */\n baseUrl?: string\n /** Bearer token. Default `process.env.OPENAI_API_KEY`. */\n apiKey?: string\n /** Model id used by `gepaDriver` reflection. Default\n * `anthropic/claude-sonnet-4.6`. */\n model?: string\n}\n\nexport type SelfImproveProgressEvent =\n | { kind: 'baseline.started'; scenarios: number }\n | { kind: 'baseline.completed'; compositeMean: number; durationMs: number }\n | { kind: 'generation.started'; index: number; populationSize: number }\n | { kind: 'generation.completed'; index: number; bestComposite: number; durationMs: number }\n | { kind: 'gate.decided'; decision: string; lift: number }\n\nexport interface SelfImproveOptions<TScenario extends Scenario, TArtifact> {\n /**\n * Your agent — a function that takes the current `MutableSurface`\n * (typically a system prompt the loop is optimizing) plus the\n * scenario + cell ctx, and returns the artifact your judge scores.\n *\n * Same shape as `RunOptimizationOptions.dispatchWithSurface`. Wrap a\n * plain `Dispatch` if you don't have a surface seam:\n *\n * agent: (_surface, scenario, ctx) => yourPlainDispatch(scenario, ctx)\n *\n * That mode evaluates without mutating any surface — useful as a\n * baseline-only run (set `budget.generations = 0`).\n */\n agent: (surface: MutableSurface, scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>\n\n /** Scenarios to evaluate against. Train/holdout split is computed from\n * these unless `budget.holdoutScenarios` is set explicitly. */\n scenarios: TScenario[]\n\n /** Judge that scores artifacts. Bring your own; use `langchainJudge`\n * from `/adapters/langchain` for a Runnable-shaped one. */\n judge: JudgeConfig<TArtifact, TScenario>\n\n /** Starting surface — system prompt, JSON config, anything `MutableSurface`\n * accepts. The driver mutates this each generation. */\n baselineSurface: MutableSurface\n\n /** Budget + loop shape. All fields optional; defaults pick the LAND-tier\n * story. */\n budget?: SelfImproveBudget\n\n /** Custom driver. Default is `gepaDriver` configured from `llm` +\n * `mutationPrimitives`. */\n driver?: ImprovementDriver\n\n /** Default-driver overrides — used when `driver` is unset. */\n mutationPrimitives?: string[]\n driverTarget?: string\n\n /** Custom gate. Default is `defaultProductionGate` with\n * `deltaThreshold: 0.05` on the held-out split. */\n gate?: Gate<TArtifact, TScenario>\n\n /** LLM config consumed by the default `gepaDriver`. Ignored if you pass\n * your own `driver`. */\n llm?: SelfImproveLlm\n\n /** Storage backend. Default `inMemoryCampaignStorage()` — nothing\n * persists past the call. Pass `fsCampaignStorage()` to write to disk. */\n storage?: CampaignStorage\n\n /** Run directory (logical for in-memory storage, real path for fs).\n * Default `mem://selfImprove-<timestamp>`. */\n runDir?: string\n\n /** Distributed-driver seam — same as `RunCampaignOptions.cellPlacement`.\n * Returns an opaque placement key the substrate forwards to your agent\n * as `ctx.placement`. Combined with `httpDispatch` from\n * `/adapters/http`, fans cells across regions. */\n cellPlacement?: (input: {\n scenario: TScenario\n rep: number\n generation?: number\n }) => string | undefined\n\n /** Streaming hook — fires on baseline + each generation + gate decision.\n * Consumer routes events wherever (UI, dashboard, logs). */\n onProgress?: (event: SelfImproveProgressEvent) => void\n\n /** Auto-promotion behavior on a ship decision. Default `'none'` — we\n * return the winner; you ship it however you ship. `'pr'` opens a\n * GitHub PR via `openAutoPr`; requires `ghOwner` + `ghRepo`. */\n autoOnPromote?: 'pr' | 'none'\n ghOwner?: string\n ghRepo?: string\n\n /**\n * Opt-in: ship eval-run events to a hosted orchestrator (ours, your\n * self-hosted one, or any compatible implementation of the\n * `docs/hosted-ingest-spec.md` wire format). When set, the substrate\n * POSTs the final `EvalRunEvent` to `${endpoint}/v1/ingest/eval-runs`\n * after the loop completes. Failures are logged but do not fail the\n * loop — local result is always returned.\n *\n * For our orchestrator: `{ endpoint: 'https://orchestrator.tangle.tools/v1', apiKey, tenantId }`.\n *\n * For your self-hosted: any URL serving the wire format. See\n * `examples/hosted-ingest-server/` for the reference receiver.\n */\n hostedTenant?: HostedTenant\n\n /** Free-form labels attached to the hosted event (env, branch, model id,\n * etc.). Ignored when `hostedTenant` is unset. */\n hostedLabels?: Record<string, string>\n}\n\nexport interface SelfImproveResult<TScenario extends Scenario, TArtifact> {\n /** Composite mean across all scenarios, baseline run. */\n baseline: {\n compositeMean: number\n perScenario: Record<string, number>\n }\n /** Composite mean on the held-out set, winner run. */\n winner: {\n compositeMean: number\n perScenario: Record<string, number>\n surface: MutableSurface\n }\n /** `winner.compositeMean - baselineOnHoldout.compositeMean`. Positive\n * means the gate observed improvement. */\n lift: number\n /** `defaultProductionGate.decide()` result. */\n gateDecision: 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling'\n /** Number of generations actually explored (may be less than the\n * budget if the driver gave up early). */\n generationsExplored: number\n /** Wall-clock total. */\n durationMs: number\n /** Total cost across baseline + every generation. */\n totalCostUsd: number\n /**\n * Raw substrate result for advanced inspection — full per-generation\n * candidates, full campaign artifacts, all judge scores. Useful for\n * debugging or reporting beyond the summary.\n */\n raw: RunImprovementLoopResult<TArtifact, TScenario>\n}\n\n/**\n * Deterministic train/holdout split by a stable hash of `scenario.id`,\n * so the same scenario set always splits the same way across runs.\n */\nfunction splitTrainHoldout<TScenario extends Scenario>(\n scenarios: TScenario[],\n fraction: number,\n): { train: TScenario[]; holdout: TScenario[] } {\n // Stable fnv-1a-ish hash of the id for ordering.\n function hash(s: string): number {\n let h = 2166136261 >>> 0\n for (let i = 0; i < s.length; i++) {\n h ^= s.charCodeAt(i)\n h = Math.imul(h, 16777619) >>> 0\n }\n return h\n }\n const sorted = [...scenarios].sort((a, b) => hash(a.id) - hash(b.id))\n const nHoldout = Math.max(1, Math.min(sorted.length - 1, Math.round(sorted.length * fraction)))\n return {\n holdout: sorted.slice(0, nHoldout),\n train: sorted.slice(nHoldout),\n }\n}\n\nfunction meanComposite(byScenario: Record<string, { meanComposite: number }>): {\n compositeMean: number\n perScenario: Record<string, number>\n} {\n const perScenario: Record<string, number> = {}\n const values: number[] = []\n for (const [id, agg] of Object.entries(byScenario)) {\n perScenario[id] = agg.meanComposite\n values.push(agg.meanComposite)\n }\n return {\n compositeMean: values.length === 0 ? 0 : values.reduce((s, v) => s + v, 0) / values.length,\n perScenario,\n }\n}\n\nconst DEFAULT_MUTATION_PRIMITIVES = [\n 'Tighten the hook: lead with the specific user outcome.',\n 'Replace generic adjectives with specific verbs or proof numbers.',\n \"Anchor every claim in something the scenario's brief literally supports.\",\n 'Honor the surface-shape constraint (length, register, audience vocabulary).',\n]\n\n/**\n * One-shot self-improvement loop. See module docstring for defaults +\n * extension points.\n *\n * @example Minimum (LAND tier):\n *\n * const result = await selfImprove({\n * agent: (surface, scenario, ctx) => myAgent(surface, scenario, ctx.signal),\n * scenarios,\n * judge,\n * baselineSurface: DEFAULT_PROMPT,\n * })\n * console.log(`lift: ${result.lift.toFixed(3)} (${result.gateDecision})`)\n *\n * @example Distributed (workers in three regions):\n *\n * await selfImprove({\n * agent: httpDispatch({ resolveUrl: ({ placement }) => REGION_URLS[placement!] }),\n * scenarios,\n * judge,\n * baselineSurface: DEFAULT_PROMPT,\n * cellPlacement: ({ scenario }) => scenario.region,\n * budget: { maxConcurrency: 12 },\n * })\n */\nexport async function selfImprove<TScenario extends Scenario, TArtifact>(\n opts: SelfImproveOptions<TScenario, TArtifact>,\n): Promise<SelfImproveResult<TScenario, TArtifact>> {\n const startedAt = Date.now()\n\n const budget = opts.budget ?? {}\n const generations = budget.generations ?? 3\n const populationSize = budget.populationSize ?? 2\n const maxConcurrency = budget.maxConcurrency ?? 2\n const holdoutFraction = budget.holdoutFraction ?? 0.25\n const costCeiling = budget.dollars\n\n const explicitHoldout = budget.holdoutScenarios\n const { train, holdout } = explicitHoldout\n ? {\n train: opts.scenarios.filter((s) => !explicitHoldout.some((h) => h.id === s.id)),\n holdout: explicitHoldout as TScenario[],\n }\n : splitTrainHoldout(opts.scenarios, holdoutFraction)\n\n if (train.length === 0) {\n throw new Error(\n 'selfImprove: train split is empty. Reduce holdoutFraction or pass more scenarios.',\n )\n }\n if (holdout.length === 0) {\n throw new Error('selfImprove: holdout split is empty. Pass more scenarios.')\n }\n\n const driver: ImprovementDriver =\n opts.driver ??\n gepaDriver({\n llm: {\n baseUrl: opts.llm?.baseUrl ?? 'https://router.tangle.tools/v1',\n apiKey: opts.llm?.apiKey ?? process.env.OPENAI_API_KEY ?? '',\n },\n model: opts.llm?.model ?? 'anthropic/claude-sonnet-4.6',\n target:\n opts.driverTarget ??\n 'agent surface (system prompt or config) being optimized by selfImprove',\n mutationPrimitives: opts.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES,\n })\n\n const gate: Gate<TArtifact, TScenario> =\n opts.gate ??\n defaultProductionGate<TArtifact, TScenario>({\n holdoutScenarios: holdout,\n deltaThreshold: 0.05,\n })\n\n const storage = opts.storage ?? inMemoryCampaignStorage()\n const runDir = opts.runDir ?? `mem://selfImprove-${startedAt}`\n\n if (opts.onProgress) {\n opts.onProgress({ kind: 'baseline.started', scenarios: opts.scenarios.length })\n }\n\n const result = await runImprovementLoop<TScenario, TArtifact>({\n scenarios: train,\n baselineSurface: opts.baselineSurface,\n dispatchWithSurface: opts.agent,\n driver,\n judges: [opts.judge],\n populationSize,\n maxGenerations: generations,\n holdoutScenarios: holdout,\n gate,\n autoOnPromote: opts.autoOnPromote ?? 'none',\n ghOwner: opts.ghOwner,\n ghRepo: opts.ghRepo,\n storage,\n runDir,\n maxConcurrency,\n cellPlacement: opts.cellPlacement,\n costCeiling,\n })\n\n const baseline = meanComposite(result.baselineOnHoldout.aggregates.byScenario)\n const winnerStats = meanComposite(result.winnerOnHoldout.aggregates.byScenario)\n\n if (opts.onProgress) {\n opts.onProgress({\n kind: 'baseline.completed',\n compositeMean: baseline.compositeMean,\n durationMs: Date.now() - startedAt,\n })\n opts.onProgress({\n kind: 'gate.decided',\n decision: result.gateResult.decision,\n lift: winnerStats.compositeMean - baseline.compositeMean,\n })\n }\n\n const totalCost =\n result.baselineCampaign.aggregates.totalCostUsd +\n result.generations.reduce(\n (sum, gen) =>\n sum + gen.surfaces.reduce((s, sf) => s + sf.campaign.aggregates.totalCostUsd, 0),\n 0,\n )\n\n const summary: SelfImproveResult<TScenario, TArtifact> = {\n baseline,\n winner: {\n ...winnerStats,\n surface: result.winnerSurface,\n },\n lift: winnerStats.compositeMean - baseline.compositeMean,\n gateDecision: result.gateResult.decision,\n generationsExplored: result.generations.length,\n durationMs: Date.now() - startedAt,\n totalCostUsd: totalCost,\n raw: result,\n }\n\n // Opt-in hosted ingest. Failures logged but never fail the loop — the\n // local result is always returned. This matches the wedge-doc invariant\n // that LAND-tier never blocks on EXPAND-tier infra.\n if (opts.hostedTenant) {\n try {\n await shipEvalRunToHosted(opts.hostedTenant, opts, summary, result, runDir)\n } catch (err) {\n const msg = err instanceof Error ? err.message : String(err)\n // eslint-disable-next-line no-console -- intentional: hosted-ingest is best-effort\n console.warn(`[agent-eval] hosted ingest failed (continuing): ${msg}`)\n }\n }\n\n return summary\n}\n\nasync function shipEvalRunToHosted<TScenario extends Scenario, TArtifact>(\n tenant: HostedTenant,\n opts: SelfImproveOptions<TScenario, TArtifact>,\n summary: SelfImproveResult<TScenario, TArtifact>,\n raw: RunImprovementLoopResult<TArtifact, TScenario>,\n runDir: string,\n): Promise<void> {\n const client = createHostedClient(tenant)\n\n function snapshotFromCampaign(\n index: number,\n surface: MutableSurface | undefined,\n campaign: RunImprovementLoopResult<TArtifact, TScenario>['baselineCampaign'],\n durationMs: number,\n ): EvalRunGenerationSnapshot {\n const cells: EvalRunCellScore[] = campaign.cells.map((cell) => {\n const judgeScores = Object.values(cell.judgeScores)\n const composite =\n judgeScores.length === 0\n ? 0\n : judgeScores.reduce((s, j) => s + j.composite, 0) / judgeScores.length\n return {\n scenarioId: cell.scenarioId,\n rep: cell.rep,\n compositeMean: composite,\n dimensions: Object.fromEntries(\n Object.entries(cell.judgeScores).map(([name, score]) => [name, score.dimensions]),\n ),\n errorMessage: cell.error ?? undefined,\n }\n })\n const compositeMean =\n cells.length === 0 ? 0 : cells.reduce((s, c) => s + c.compositeMean, 0) / cells.length\n return {\n index,\n surfaceHash: typeof surface === 'string' ? hashString(surface) : hashString(JSON.stringify(surface ?? '')),\n surface,\n cells,\n compositeMean,\n costUsd: campaign.aggregates.totalCostUsd,\n durationMs,\n }\n }\n\n const generations: EvalRunGenerationSnapshot[] = []\n // Baseline as generation 0.\n generations.push(snapshotFromCampaign(0, opts.baselineSurface, raw.baselineCampaign, 0))\n // Improvement generations as 1..N. Substrate stores per-surface campaigns\n // per generation — we summarize the WINNING surface per generation here.\n for (const gen of raw.generations) {\n const winner = gen.surfaces.reduce((best, s) =>\n s.campaign.aggregates.cellsExecuted > 0 &&\n (best === undefined || averageComposite(s.campaign) > averageComposite(best.campaign))\n ? s\n : best,\n gen.surfaces[0],\n )\n if (!winner) continue\n generations.push(\n snapshotFromCampaign(gen.record.generationIndex + 1, winner.surface, winner.campaign, 0),\n )\n }\n\n const event: EvalRunEvent = {\n runId: `${runDir}#${Date.now()}`,\n runDir,\n timestamp: new Date().toISOString(),\n status: 'finished',\n labels: opts.hostedLabels ?? {},\n baseline: generations[0],\n generations,\n gateDecision: summary.gateDecision,\n holdoutLift: summary.lift,\n totalCostUsd: summary.totalCostUsd,\n totalDurationMs: summary.durationMs,\n }\n\n await client.ingestEvalRun(event)\n}\n\nfunction averageComposite(\n campaign: RunImprovementLoopResult<unknown, Scenario>['baselineCampaign'],\n): number {\n const aggs = Object.values(campaign.aggregates.byScenario)\n return aggs.length === 0 ? 0 : aggs.reduce((s, a) => s + a.meanComposite, 0) / aggs.length\n}\n\nfunction hashString(s: string): string {\n let h = 2166136261 >>> 0\n for (let i = 0; i < s.length; i++) {\n h ^= s.charCodeAt(i)\n h = Math.imul(h, 16777619) >>> 0\n }\n return h.toString(16).padStart(8, '0')\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAsNA,SAAS,kBACP,WACA,UAC8C;AAE9C,WAAS,KAAK,GAAmB;AAC/B,QAAI,IAAI,eAAe;AACvB,aAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,WAAK,EAAE,WAAW,CAAC;AACnB,UAAI,KAAK,KAAK,GAAG,QAAQ,MAAM;AAAA,IACjC;AACA,WAAO;AAAA,EACT;AACA,QAAM,SAAS,CAAC,GAAG,SAAS,EAAE,KAAK,CAAC,GAAG,MAAM,KAAK,EAAE,EAAE,IAAI,KAAK,EAAE,EAAE,CAAC;AACpE,QAAM,WAAW,KAAK,IAAI,GAAG,KAAK,IAAI,OAAO,SAAS,GAAG,KAAK,MAAM,OAAO,SAAS,QAAQ,CAAC,CAAC;AAC9F,SAAO;AAAA,IACL,SAAS,OAAO,MAAM,GAAG,QAAQ;AAAA,IACjC,OAAO,OAAO,MAAM,QAAQ;AAAA,EAC9B;AACF;AAEA,SAAS,cAAc,YAGrB;AACA,QAAM,cAAsC,CAAC;AAC7C,QAAM,SAAmB,CAAC;AAC1B,aAAW,CAAC,IAAI,GAAG,KAAK,OAAO,QAAQ,UAAU,GAAG;AAClD,gBAAY,EAAE,IAAI,IAAI;AACtB,WAAO,KAAK,IAAI,aAAa;AAAA,EAC/B;AACA,SAAO;AAAA,IACL,eAAe,OAAO,WAAW,IAAI,IAAI,OAAO,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,OAAO;AAAA,IACpF;AAAA,EACF;AACF;AAEA,IAAM,8BAA8B;AAAA,EAClC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AA2BA,eAAsB,YACpB,MACkD;AAClD,QAAM,YAAY,KAAK,IAAI;AAE3B,QAAM,SAAS,KAAK,UAAU,CAAC;AAC/B,QAAM,cAAc,OAAO,eAAe;AAC1C,QAAM,iBAAiB,OAAO,kBAAkB;AAChD,QAAM,iBAAiB,OAAO,kBAAkB;AAChD,QAAM,kBAAkB,OAAO,mBAAmB;AAClD,QAAM,cAAc,OAAO;AAE3B,QAAM,kBAAkB,OAAO;AAC/B,QAAM,EAAE,OAAO,QAAQ,IAAI,kBACvB;AAAA,IACE,OAAO,KAAK,UAAU,OAAO,CAAC,MAAM,CAAC,gBAAgB,KAAK,CAAC,MAAM,EAAE,OAAO,EAAE,EAAE,CAAC;AAAA,IAC/E,SAAS;AAAA,EACX,IACA,kBAAkB,KAAK,WAAW,eAAe;AAErD,MAAI,MAAM,WAAW,GAAG;AACtB,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,MAAI,QAAQ,WAAW,GAAG;AACxB,UAAM,IAAI,MAAM,2DAA2D;AAAA,EAC7E;AAEA,QAAM,SACJ,KAAK,UACL,WAAW;AAAA,IACT,KAAK;AAAA,MACH,SAAS,KAAK,KAAK,WAAW;AAAA,MAC9B,QAAQ,KAAK,KAAK,UAAU,QAAQ,IAAI,kBAAkB;AAAA,IAC5D;AAAA,IACA,OAAO,KAAK,KAAK,SAAS;AAAA,IAC1B,QACE,KAAK,gBACL;AAAA,IACF,oBAAoB,KAAK,sBAAsB;AAAA,EACjD,CAAC;AAEH,QAAM,OACJ,KAAK,QACL,sBAA4C;AAAA,IAC1C,kBAAkB;AAAA,IAClB,gBAAgB;AAAA,EAClB,CAAC;AAEH,QAAM,UAAU,KAAK,WAAW,wBAAwB;AACxD,QAAM,SAAS,KAAK,UAAU,qBAAqB,SAAS;AAE5D,MAAI,KAAK,YAAY;AACnB,SAAK,WAAW,EAAE,MAAM,oBAAoB,WAAW,KAAK,UAAU,OAAO,CAAC;AAAA,EAChF;AAEA,QAAM,SAAS,MAAM,mBAAyC;AAAA,IAC5D,WAAW;AAAA,IACX,iBAAiB,KAAK;AAAA,IACtB,qBAAqB,KAAK;AAAA,IAC1B;AAAA,IACA,QAAQ,CAAC,KAAK,KAAK;AAAA,IACnB;AAAA,IACA,gBAAgB;AAAA,IAChB,kBAAkB;AAAA,IAClB;AAAA,IACA,eAAe,KAAK,iBAAiB;AAAA,IACrC,SAAS,KAAK;AAAA,IACd,QAAQ,KAAK;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,IACA,eAAe,KAAK;AAAA,IACpB;AAAA,EACF,CAAC;AAED,QAAM,WAAW,cAAc,OAAO,kBAAkB,WAAW,UAAU;AAC7E,QAAM,cAAc,cAAc,OAAO,gBAAgB,WAAW,UAAU;AAE9E,MAAI,KAAK,YAAY;AACnB,SAAK,WAAW;AAAA,MACd,MAAM;AAAA,MACN,eAAe,SAAS;AAAA,MACxB,YAAY,KAAK,IAAI,IAAI;AAAA,IAC3B,CAAC;AACD,SAAK,WAAW;AAAA,MACd,MAAM;AAAA,MACN,UAAU,OAAO,WAAW;AAAA,MAC5B,MAAM,YAAY,gBAAgB,SAAS;AAAA,IAC7C,CAAC;AAAA,EACH;AAEA,QAAM,YACJ,OAAO,iBAAiB,WAAW,eACnC,OAAO,YAAY;AAAA,IACjB,CAAC,KAAK,QACJ,MAAM,IAAI,SAAS,OAAO,CAAC,GAAG,OAAO,IAAI,GAAG,SAAS,WAAW,cAAc,CAAC;AAAA,IACjF;AAAA,EACF;AAEF,QAAM,UAAmD;AAAA,IACvD;AAAA,IACA,QAAQ;AAAA,MACN,GAAG;AAAA,MACH,SAAS,OAAO;AAAA,IAClB;AAAA,IACA,MAAM,YAAY,gBAAgB,SAAS;AAAA,IAC3C,cAAc,OAAO,WAAW;AAAA,IAChC,qBAAqB,OAAO,YAAY;AAAA,IACxC,YAAY,KAAK,IAAI,IAAI;AAAA,IACzB,cAAc;AAAA,IACd,KAAK;AAAA,EACP;AAKA,MAAI,KAAK,cAAc;AACrB,QAAI;AACF,YAAM,oBAAoB,KAAK,cAAc,MAAM,SAAS,QAAQ,MAAM;AAAA,IAC5E,SAAS,KAAK;AACZ,YAAM,MAAM,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAE3D,cAAQ,KAAK,mDAAmD,GAAG,EAAE;AAAA,IACvE;AAAA,EACF;AAEA,SAAO;AACT;AAEA,eAAe,oBACb,QACA,MACA,SACA,KACA,QACe;AACf,QAAM,SAAS,mBAAmB,MAAM;AAExC,WAAS,qBACP,OACA,SACA,UACA,YAC2B;AAC3B,UAAM,QAA4B,SAAS,MAAM,IAAI,CAAC,SAAS;AAC7D,YAAM,cAAc,OAAO,OAAO,KAAK,WAAW;AAClD,YAAM,YACJ,YAAY,WAAW,IACnB,IACA,YAAY,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,YAAY;AACrE,aAAO;AAAA,QACL,YAAY,KAAK;AAAA,QACjB,KAAK,KAAK;AAAA,QACV,eAAe;AAAA,QACf,YAAY,OAAO;AAAA,UACjB,OAAO,QAAQ,KAAK,WAAW,EAAE,IAAI,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC,MAAM,MAAM,UAAU,CAAC;AAAA,QAClF;AAAA,QACA,cAAc,KAAK,SAAS;AAAA,MAC9B;AAAA,IACF,CAAC;AACD,UAAM,gBACJ,MAAM,WAAW,IAAI,IAAI,MAAM,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,eAAe,CAAC,IAAI,MAAM;AAClF,WAAO;AAAA,MACL;AAAA,MACA,aAAa,OAAO,YAAY,WAAW,WAAW,OAAO,IAAI,WAAW,KAAK,UAAU,WAAW,EAAE,CAAC;AAAA,MACzG;AAAA,MACA;AAAA,MACA;AAAA,MACA,SAAS,SAAS,WAAW;AAAA,MAC7B;AAAA,IACF;AAAA,EACF;AAEA,QAAM,cAA2C,CAAC;AAElD,cAAY,KAAK,qBAAqB,GAAG,KAAK,iBAAiB,IAAI,kBAAkB,CAAC,CAAC;AAGvF,aAAW,OAAO,IAAI,aAAa;AACjC,UAAM,SAAS,IAAI,SAAS;AAAA,MAAO,CAAC,MAAM,MACxC,EAAE,SAAS,WAAW,gBAAgB,MACrC,SAAS,UAAa,iBAAiB,EAAE,QAAQ,IAAI,iBAAiB,KAAK,QAAQ,KAChF,IACA;AAAA,MACJ,IAAI,SAAS,CAAC;AAAA,IAChB;AACA,QAAI,CAAC,OAAQ;AACb,gBAAY;AAAA,MACV,qBAAqB,IAAI,OAAO,kBAAkB,GAAG,OAAO,SAAS,OAAO,UAAU,CAAC;AAAA,IACzF;AAAA,EACF;AAEA,QAAM,QAAsB;AAAA,IAC1B,OAAO,GAAG,MAAM,IAAI,KAAK,IAAI,CAAC;AAAA,IAC9B;AAAA,IACA,YAAW,oBAAI,KAAK,GAAE,YAAY;AAAA,IAClC,QAAQ;AAAA,IACR,QAAQ,KAAK,gBAAgB,CAAC;AAAA,IAC9B,UAAU,YAAY,CAAC;AAAA,IACvB;AAAA,IACA,cAAc,QAAQ;AAAA,IACtB,aAAa,QAAQ;AAAA,IACrB,cAAc,QAAQ;AAAA,IACtB,iBAAiB,QAAQ;AAAA,EAC3B;AAEA,QAAM,OAAO,cAAc,KAAK;AAClC;AAEA,SAAS,iBACP,UACQ;AACR,QAAM,OAAO,OAAO,OAAO,SAAS,WAAW,UAAU;AACzD,SAAO,KAAK,WAAW,IAAI,IAAI,KAAK,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,eAAe,CAAC,IAAI,KAAK;AACtF;AAEA,SAAS,WAAW,GAAmB;AACrC,MAAI,IAAI,eAAe;AACvB,WAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,SAAK,EAAE,WAAW,CAAC;AACnB,QAAI,KAAK,KAAK,GAAG,QAAQ,MAAM;AAAA,EACjC;AACA,SAAO,EAAE,SAAS,EAAE,EAAE,SAAS,GAAG,GAAG;AACvC;","names":[]}
1
+ {"version":3,"sources":["../../src/contract/self-improve.ts"],"sourcesContent":["/**\n * # `selfImprove()` — the LAND-tier one-shot.\n *\n * The cheapest possible call site to run a real closed-loop self-\n * improvement over your agent. Wraps `runImprovementLoop` with smart\n * defaults and a budget-shaped options API; every escape hatch the\n * substrate exposes is reachable from here without losing the\n * one-function feel.\n *\n * Defaults picked to match the LAND-tier story:\n * - In-memory storage (no filesystem touch).\n * - `gepaDriver` reflective mutation with copywriting-flavored primitives\n * (override `driver` or `mutationPrimitives` for any domain).\n * - `defaultProductionGate` with `deltaThreshold: 0.05`.\n * - Held-out split = 25% of scenarios, deterministic by id hash.\n * - 3 generations × population 2 (raise via `budget` for more search).\n * - `autoOnPromote: 'none'` (we don't open PRs unless you ask).\n *\n * Want one-click? Provide `agent` + `scenarios` + `judge`. Done.\n * Want distributed? Pass `cellPlacement` + an `httpDispatch`-backed\n * agent. Want a code-tier surface? Pass a `MutableSurface` + your own\n * `driver`. Same function.\n */\n\nimport { gepaDriver } from '../campaign/drivers/gepa'\nimport { defaultProductionGate } from '../campaign/gates/default-production-gate'\nimport {\n type RunImprovementLoopResult,\n runImprovementLoop,\n} from '../campaign/presets/run-improvement-loop'\nimport { type CampaignStorage, inMemoryCampaignStorage } from '../campaign/storage'\nimport type {\n DispatchContext,\n Gate,\n ImprovementDriver,\n JudgeConfig,\n MutableSurface,\n Scenario,\n} from '../campaign/types'\nimport { createHostedClient, type HostedTenant } from '../hosted/client'\nimport type { EvalRunCellScore, EvalRunEvent, EvalRunGenerationSnapshot } from '../hosted/types'\n\nexport interface SelfImproveBudget {\n /** Hard $ ceiling across all cells in baseline + every generation. Cells\n * beyond the ceiling are skipped (cost-aware, not aborted). */\n dollars?: number\n /** How many improvement generations to explore. Default 3. Set 0 to\n * skip improvement entirely (selfImprove becomes a baseline-only run). */\n generations?: number\n /** Candidates the driver proposes per generation. Default 2. */\n populationSize?: number\n /** Max concurrent cells across the loop. Default 2. */\n maxConcurrency?: number\n /** Fraction of `scenarios` held out from training, used for the gate.\n * Default 0.25. Ignored when `holdoutScenarios` is set explicitly. */\n holdoutFraction?: number\n /** Explicit held-out scenarios; overrides `holdoutFraction`. */\n holdoutScenarios?: Scenario[]\n}\n\nexport interface SelfImproveLlm {\n /** Endpoint base URL. Default Tangle Router. */\n baseUrl?: string\n /** Bearer token. Default `process.env.OPENAI_API_KEY`. */\n apiKey?: string\n /** Model id used by `gepaDriver` reflection. Default\n * `anthropic/claude-sonnet-4.6`. */\n model?: string\n}\n\nexport type SelfImproveProgressEvent =\n | { kind: 'baseline.started'; scenarios: number }\n | { kind: 'baseline.completed'; compositeMean: number; durationMs: number }\n | { kind: 'generation.started'; index: number; populationSize: number }\n | { kind: 'generation.completed'; index: number; bestComposite: number; durationMs: number }\n | { kind: 'gate.decided'; decision: string; lift: number }\n\nexport interface SelfImproveOptions<TScenario extends Scenario, TArtifact> {\n /**\n * Your agent — a function that takes the current `MutableSurface`\n * (typically a system prompt the loop is optimizing) plus the\n * scenario + cell ctx, and returns the artifact your judge scores.\n *\n * Same shape as `RunOptimizationOptions.dispatchWithSurface`. Wrap a\n * plain `Dispatch` if you don't have a surface seam:\n *\n * agent: (_surface, scenario, ctx) => yourPlainDispatch(scenario, ctx)\n *\n * That mode evaluates without mutating any surface — useful as a\n * baseline-only run (set `budget.generations = 0`).\n */\n agent: (surface: MutableSurface, scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>\n\n /** Scenarios to evaluate against. Train/holdout split is computed from\n * these unless `budget.holdoutScenarios` is set explicitly. */\n scenarios: TScenario[]\n\n /** Judge that scores artifacts. Bring your own; use `langchainJudge`\n * from `/adapters/langchain` for a Runnable-shaped one. */\n judge: JudgeConfig<TArtifact, TScenario>\n\n /** Starting surface — system prompt, JSON config, anything `MutableSurface`\n * accepts. The driver mutates this each generation. */\n baselineSurface: MutableSurface\n\n /** Budget + loop shape. All fields optional; defaults pick the LAND-tier\n * story. */\n budget?: SelfImproveBudget\n\n /** Custom driver. Default is `gepaDriver` configured from `llm` +\n * `mutationPrimitives`. */\n driver?: ImprovementDriver\n\n /** Default-driver overrides — used when `driver` is unset. */\n mutationPrimitives?: string[]\n driverTarget?: string\n\n /** Custom gate. Default is `defaultProductionGate` with\n * `deltaThreshold: 0.05` on the held-out split. */\n gate?: Gate<TArtifact, TScenario>\n\n /** LLM config consumed by the default `gepaDriver`. Ignored if you pass\n * your own `driver`. */\n llm?: SelfImproveLlm\n\n /** Storage backend. Default `inMemoryCampaignStorage()` — nothing\n * persists past the call. Pass `fsCampaignStorage()` to write to disk. */\n storage?: CampaignStorage\n\n /** Run directory (logical for in-memory storage, real path for fs).\n * Default `mem://selfImprove-<timestamp>`. */\n runDir?: string\n\n /** Distributed-driver seam — same as `RunCampaignOptions.cellPlacement`.\n * Returns an opaque placement key the substrate forwards to your agent\n * as `ctx.placement`. Combined with `httpDispatch` from\n * `/adapters/http`, fans cells across regions. */\n cellPlacement?: (input: {\n scenario: TScenario\n rep: number\n generation?: number\n }) => string | undefined\n\n /** Streaming hook — fires on baseline + each generation + gate decision.\n * Consumer routes events wherever (UI, dashboard, logs). */\n onProgress?: (event: SelfImproveProgressEvent) => void\n\n /** Auto-promotion behavior on a ship decision. Default `'none'` — we\n * return the winner; you ship it however you ship. `'pr'` opens a\n * GitHub PR via `openAutoPr`; requires `ghOwner` + `ghRepo`. */\n autoOnPromote?: 'pr' | 'none'\n ghOwner?: string\n ghRepo?: string\n\n /**\n * Opt-in: ship eval-run events to a hosted orchestrator (ours, your\n * self-hosted one, or any compatible implementation of the\n * `docs/hosted-ingest-spec.md` wire format). When set, the substrate\n * POSTs the final `EvalRunEvent` to `${endpoint}/v1/ingest/eval-runs`\n * after the loop completes. Failures are logged but do not fail the\n * loop — local result is always returned.\n *\n * For our orchestrator: `{ endpoint: 'https://orchestrator.tangle.tools/v1', apiKey, tenantId }`.\n *\n * For your self-hosted: any URL serving the wire format. See\n * `examples/hosted-ingest-server/` for the reference receiver.\n */\n hostedTenant?: HostedTenant\n\n /** Free-form labels attached to the hosted event (env, branch, model id,\n * etc.). Ignored when `hostedTenant` is unset. */\n hostedLabels?: Record<string, string>\n}\n\nexport interface SelfImproveResult<TScenario extends Scenario, TArtifact> {\n /** Composite mean across all scenarios, baseline run. */\n baseline: {\n compositeMean: number\n perScenario: Record<string, number>\n }\n /** Composite mean on the held-out set, winner run. */\n winner: {\n compositeMean: number\n perScenario: Record<string, number>\n surface: MutableSurface\n }\n /** `winner.compositeMean - baselineOnHoldout.compositeMean`. Positive\n * means the gate observed improvement. */\n lift: number\n /** `defaultProductionGate.decide()` result. */\n gateDecision: 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling'\n /** Number of generations actually explored (may be less than the\n * budget if the driver gave up early). */\n generationsExplored: number\n /** Wall-clock total. */\n durationMs: number\n /** Total cost across baseline + every generation. */\n totalCostUsd: number\n /**\n * Raw substrate result for advanced inspection — full per-generation\n * candidates, full campaign artifacts, all judge scores. Useful for\n * debugging or reporting beyond the summary.\n */\n raw: RunImprovementLoopResult<TArtifact, TScenario>\n}\n\n/**\n * Deterministic train/holdout split by a stable hash of `scenario.id`,\n * so the same scenario set always splits the same way across runs.\n */\nfunction splitTrainHoldout<TScenario extends Scenario>(\n scenarios: TScenario[],\n fraction: number,\n): { train: TScenario[]; holdout: TScenario[] } {\n // Stable fnv-1a-ish hash of the id for ordering.\n function hash(s: string): number {\n let h = 2166136261 >>> 0\n for (let i = 0; i < s.length; i++) {\n h ^= s.charCodeAt(i)\n h = Math.imul(h, 16777619) >>> 0\n }\n return h\n }\n const sorted = [...scenarios].sort((a, b) => hash(a.id) - hash(b.id))\n const nHoldout = Math.max(1, Math.min(sorted.length - 1, Math.round(sorted.length * fraction)))\n return {\n holdout: sorted.slice(0, nHoldout),\n train: sorted.slice(nHoldout),\n }\n}\n\nfunction meanComposite(byScenario: Record<string, { meanComposite: number }>): {\n compositeMean: number\n perScenario: Record<string, number>\n} {\n const perScenario: Record<string, number> = {}\n const values: number[] = []\n for (const [id, agg] of Object.entries(byScenario)) {\n perScenario[id] = agg.meanComposite\n values.push(agg.meanComposite)\n }\n return {\n compositeMean: values.length === 0 ? 0 : values.reduce((s, v) => s + v, 0) / values.length,\n perScenario,\n }\n}\n\nconst DEFAULT_MUTATION_PRIMITIVES = [\n 'Tighten the hook: lead with the specific user outcome.',\n 'Replace generic adjectives with specific verbs or proof numbers.',\n \"Anchor every claim in something the scenario's brief literally supports.\",\n 'Honor the surface-shape constraint (length, register, audience vocabulary).',\n]\n\n/**\n * One-shot self-improvement loop. See module docstring for defaults +\n * extension points.\n *\n * @example Minimum (LAND tier):\n *\n * const result = await selfImprove({\n * agent: (surface, scenario, ctx) => myAgent(surface, scenario, ctx.signal),\n * scenarios,\n * judge,\n * baselineSurface: DEFAULT_PROMPT,\n * })\n * console.log(`lift: ${result.lift.toFixed(3)} (${result.gateDecision})`)\n *\n * @example Distributed (workers in three regions):\n *\n * await selfImprove({\n * agent: httpDispatch({ resolveUrl: ({ placement }) => REGION_URLS[placement!] }),\n * scenarios,\n * judge,\n * baselineSurface: DEFAULT_PROMPT,\n * cellPlacement: ({ scenario }) => scenario.region,\n * budget: { maxConcurrency: 12 },\n * })\n */\nexport async function selfImprove<TScenario extends Scenario, TArtifact>(\n opts: SelfImproveOptions<TScenario, TArtifact>,\n): Promise<SelfImproveResult<TScenario, TArtifact>> {\n const startedAt = Date.now()\n\n const budget = opts.budget ?? {}\n const generations = budget.generations ?? 3\n const populationSize = budget.populationSize ?? 2\n const maxConcurrency = budget.maxConcurrency ?? 2\n const holdoutFraction = budget.holdoutFraction ?? 0.25\n const costCeiling = budget.dollars\n\n const explicitHoldout = budget.holdoutScenarios\n const { train, holdout } = explicitHoldout\n ? {\n train: opts.scenarios.filter((s) => !explicitHoldout.some((h) => h.id === s.id)),\n holdout: explicitHoldout as TScenario[],\n }\n : splitTrainHoldout(opts.scenarios, holdoutFraction)\n\n if (train.length === 0) {\n throw new Error(\n 'selfImprove: train split is empty. Reduce holdoutFraction or pass more scenarios.',\n )\n }\n if (holdout.length === 0) {\n throw new Error('selfImprove: holdout split is empty. Pass more scenarios.')\n }\n\n const driver: ImprovementDriver =\n opts.driver ??\n gepaDriver({\n llm: {\n baseUrl: opts.llm?.baseUrl ?? 'https://router.tangle.tools/v1',\n apiKey: opts.llm?.apiKey ?? process.env.OPENAI_API_KEY ?? '',\n },\n model: opts.llm?.model ?? 'anthropic/claude-sonnet-4.6',\n target:\n opts.driverTarget ??\n 'agent surface (system prompt or config) being optimized by selfImprove',\n mutationPrimitives: opts.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES,\n })\n\n const gate: Gate<TArtifact, TScenario> =\n opts.gate ??\n defaultProductionGate<TArtifact, TScenario>({\n holdoutScenarios: holdout,\n deltaThreshold: 0.05,\n })\n\n const storage = opts.storage ?? inMemoryCampaignStorage()\n const runDir = opts.runDir ?? `mem://selfImprove-${startedAt}`\n\n if (opts.onProgress) {\n opts.onProgress({ kind: 'baseline.started', scenarios: opts.scenarios.length })\n }\n\n const result = await runImprovementLoop<TScenario, TArtifact>({\n scenarios: train,\n baselineSurface: opts.baselineSurface,\n dispatchWithSurface: opts.agent,\n driver,\n judges: [opts.judge],\n populationSize,\n maxGenerations: generations,\n holdoutScenarios: holdout,\n gate,\n autoOnPromote: opts.autoOnPromote ?? 'none',\n ghOwner: opts.ghOwner,\n ghRepo: opts.ghRepo,\n storage,\n runDir,\n maxConcurrency,\n cellPlacement: opts.cellPlacement,\n costCeiling,\n })\n\n const baseline = meanComposite(result.baselineOnHoldout.aggregates.byScenario)\n const winnerStats = meanComposite(result.winnerOnHoldout.aggregates.byScenario)\n\n if (opts.onProgress) {\n opts.onProgress({\n kind: 'baseline.completed',\n compositeMean: baseline.compositeMean,\n durationMs: Date.now() - startedAt,\n })\n opts.onProgress({\n kind: 'gate.decided',\n decision: result.gateResult.decision,\n lift: winnerStats.compositeMean - baseline.compositeMean,\n })\n }\n\n const totalCost =\n result.baselineCampaign.aggregates.totalCostUsd +\n result.generations.reduce(\n (sum, gen) =>\n sum + gen.surfaces.reduce((s, sf) => s + sf.campaign.aggregates.totalCostUsd, 0),\n 0,\n )\n\n const summary: SelfImproveResult<TScenario, TArtifact> = {\n baseline,\n winner: {\n ...winnerStats,\n surface: result.winnerSurface,\n },\n lift: winnerStats.compositeMean - baseline.compositeMean,\n gateDecision: result.gateResult.decision,\n generationsExplored: result.generations.length,\n durationMs: Date.now() - startedAt,\n totalCostUsd: totalCost,\n raw: result,\n }\n\n // Opt-in hosted ingest. Failures logged but never fail the loop — the\n // local result is always returned. This matches the wedge-doc invariant\n // that LAND-tier never blocks on EXPAND-tier infra.\n if (opts.hostedTenant) {\n try {\n await shipEvalRunToHosted(opts.hostedTenant, opts, summary, result, runDir)\n } catch (err) {\n const msg = err instanceof Error ? err.message : String(err)\n // eslint-disable-next-line no-console -- intentional: hosted-ingest is best-effort\n console.warn(`[agent-eval] hosted ingest failed (continuing): ${msg}`)\n }\n }\n\n return summary\n}\n\nasync function shipEvalRunToHosted<TScenario extends Scenario, TArtifact>(\n tenant: HostedTenant,\n opts: SelfImproveOptions<TScenario, TArtifact>,\n summary: SelfImproveResult<TScenario, TArtifact>,\n raw: RunImprovementLoopResult<TArtifact, TScenario>,\n runDir: string,\n): Promise<void> {\n const client = createHostedClient(tenant)\n\n function snapshotFromCampaign(\n index: number,\n surface: MutableSurface | undefined,\n campaign: RunImprovementLoopResult<TArtifact, TScenario>['baselineCampaign'],\n durationMs: number,\n ): EvalRunGenerationSnapshot {\n const cells: EvalRunCellScore[] = campaign.cells.map((cell) => {\n const judgeScores = Object.values(cell.judgeScores)\n const composite =\n judgeScores.length === 0\n ? 0\n : judgeScores.reduce((s, j) => s + j.composite, 0) / judgeScores.length\n return {\n scenarioId: cell.scenarioId,\n rep: cell.rep,\n compositeMean: composite,\n dimensions: Object.fromEntries(\n Object.entries(cell.judgeScores).map(([name, score]) => [name, score.dimensions]),\n ),\n errorMessage: cell.error ?? undefined,\n }\n })\n const compositeMean =\n cells.length === 0 ? 0 : cells.reduce((s, c) => s + c.compositeMean, 0) / cells.length\n return {\n index,\n surfaceHash:\n typeof surface === 'string'\n ? hashString(surface)\n : hashString(JSON.stringify(surface ?? '')),\n surface,\n cells,\n compositeMean,\n costUsd: campaign.aggregates.totalCostUsd,\n durationMs,\n }\n }\n\n const generations: EvalRunGenerationSnapshot[] = []\n // Baseline as generation 0.\n generations.push(snapshotFromCampaign(0, opts.baselineSurface, raw.baselineCampaign, 0))\n // Improvement generations as 1..N. Substrate stores per-surface campaigns\n // per generation — we summarize the WINNING surface per generation here.\n for (const gen of raw.generations) {\n const winner = gen.surfaces.reduce(\n (best, s) =>\n s.campaign.aggregates.cellsExecuted > 0 &&\n (best === undefined || averageComposite(s.campaign) > averageComposite(best.campaign))\n ? s\n : best,\n gen.surfaces[0],\n )\n if (!winner) continue\n generations.push(\n snapshotFromCampaign(gen.record.generationIndex + 1, winner.surface, winner.campaign, 0),\n )\n }\n\n const event: EvalRunEvent = {\n runId: `${runDir}#${Date.now()}`,\n runDir,\n timestamp: new Date().toISOString(),\n status: 'finished',\n labels: opts.hostedLabels ?? {},\n baseline: generations[0],\n generations,\n gateDecision: summary.gateDecision,\n holdoutLift: summary.lift,\n totalCostUsd: summary.totalCostUsd,\n totalDurationMs: summary.durationMs,\n }\n\n await client.ingestEvalRun(event)\n}\n\nfunction averageComposite(\n campaign: RunImprovementLoopResult<unknown, Scenario>['baselineCampaign'],\n): number {\n const aggs = Object.values(campaign.aggregates.byScenario)\n return aggs.length === 0 ? 0 : aggs.reduce((s, a) => s + a.meanComposite, 0) / aggs.length\n}\n\nfunction hashString(s: string): string {\n let h = 2166136261 >>> 0\n for (let i = 0; i < s.length; i++) {\n h ^= s.charCodeAt(i)\n h = Math.imul(h, 16777619) >>> 0\n }\n return h.toString(16).padStart(8, '0')\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAkNA,SAAS,kBACP,WACA,UAC8C;AAE9C,WAAS,KAAK,GAAmB;AAC/B,QAAI,IAAI,eAAe;AACvB,aAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,WAAK,EAAE,WAAW,CAAC;AACnB,UAAI,KAAK,KAAK,GAAG,QAAQ,MAAM;AAAA,IACjC;AACA,WAAO;AAAA,EACT;AACA,QAAM,SAAS,CAAC,GAAG,SAAS,EAAE,KAAK,CAAC,GAAG,MAAM,KAAK,EAAE,EAAE,IAAI,KAAK,EAAE,EAAE,CAAC;AACpE,QAAM,WAAW,KAAK,IAAI,GAAG,KAAK,IAAI,OAAO,SAAS,GAAG,KAAK,MAAM,OAAO,SAAS,QAAQ,CAAC,CAAC;AAC9F,SAAO;AAAA,IACL,SAAS,OAAO,MAAM,GAAG,QAAQ;AAAA,IACjC,OAAO,OAAO,MAAM,QAAQ;AAAA,EAC9B;AACF;AAEA,SAAS,cAAc,YAGrB;AACA,QAAM,cAAsC,CAAC;AAC7C,QAAM,SAAmB,CAAC;AAC1B,aAAW,CAAC,IAAI,GAAG,KAAK,OAAO,QAAQ,UAAU,GAAG;AAClD,gBAAY,EAAE,IAAI,IAAI;AACtB,WAAO,KAAK,IAAI,aAAa;AAAA,EAC/B;AACA,SAAO;AAAA,IACL,eAAe,OAAO,WAAW,IAAI,IAAI,OAAO,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,OAAO;AAAA,IACpF;AAAA,EACF;AACF;AAEA,IAAM,8BAA8B;AAAA,EAClC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AA2BA,eAAsB,YACpB,MACkD;AAClD,QAAM,YAAY,KAAK,IAAI;AAE3B,QAAM,SAAS,KAAK,UAAU,CAAC;AAC/B,QAAM,cAAc,OAAO,eAAe;AAC1C,QAAM,iBAAiB,OAAO,kBAAkB;AAChD,QAAM,iBAAiB,OAAO,kBAAkB;AAChD,QAAM,kBAAkB,OAAO,mBAAmB;AAClD,QAAM,cAAc,OAAO;AAE3B,QAAM,kBAAkB,OAAO;AAC/B,QAAM,EAAE,OAAO,QAAQ,IAAI,kBACvB;AAAA,IACE,OAAO,KAAK,UAAU,OAAO,CAAC,MAAM,CAAC,gBAAgB,KAAK,CAAC,MAAM,EAAE,OAAO,EAAE,EAAE,CAAC;AAAA,IAC/E,SAAS;AAAA,EACX,IACA,kBAAkB,KAAK,WAAW,eAAe;AAErD,MAAI,MAAM,WAAW,GAAG;AACtB,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,MAAI,QAAQ,WAAW,GAAG;AACxB,UAAM,IAAI,MAAM,2DAA2D;AAAA,EAC7E;AAEA,QAAM,SACJ,KAAK,UACL,WAAW;AAAA,IACT,KAAK;AAAA,MACH,SAAS,KAAK,KAAK,WAAW;AAAA,MAC9B,QAAQ,KAAK,KAAK,UAAU,QAAQ,IAAI,kBAAkB;AAAA,IAC5D;AAAA,IACA,OAAO,KAAK,KAAK,SAAS;AAAA,IAC1B,QACE,KAAK,gBACL;AAAA,IACF,oBAAoB,KAAK,sBAAsB;AAAA,EACjD,CAAC;AAEH,QAAM,OACJ,KAAK,QACL,sBAA4C;AAAA,IAC1C,kBAAkB;AAAA,IAClB,gBAAgB;AAAA,EAClB,CAAC;AAEH,QAAM,UAAU,KAAK,WAAW,wBAAwB;AACxD,QAAM,SAAS,KAAK,UAAU,qBAAqB,SAAS;AAE5D,MAAI,KAAK,YAAY;AACnB,SAAK,WAAW,EAAE,MAAM,oBAAoB,WAAW,KAAK,UAAU,OAAO,CAAC;AAAA,EAChF;AAEA,QAAM,SAAS,MAAM,mBAAyC;AAAA,IAC5D,WAAW;AAAA,IACX,iBAAiB,KAAK;AAAA,IACtB,qBAAqB,KAAK;AAAA,IAC1B;AAAA,IACA,QAAQ,CAAC,KAAK,KAAK;AAAA,IACnB;AAAA,IACA,gBAAgB;AAAA,IAChB,kBAAkB;AAAA,IAClB;AAAA,IACA,eAAe,KAAK,iBAAiB;AAAA,IACrC,SAAS,KAAK;AAAA,IACd,QAAQ,KAAK;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,IACA,eAAe,KAAK;AAAA,IACpB;AAAA,EACF,CAAC;AAED,QAAM,WAAW,cAAc,OAAO,kBAAkB,WAAW,UAAU;AAC7E,QAAM,cAAc,cAAc,OAAO,gBAAgB,WAAW,UAAU;AAE9E,MAAI,KAAK,YAAY;AACnB,SAAK,WAAW;AAAA,MACd,MAAM;AAAA,MACN,eAAe,SAAS;AAAA,MACxB,YAAY,KAAK,IAAI,IAAI;AAAA,IAC3B,CAAC;AACD,SAAK,WAAW;AAAA,MACd,MAAM;AAAA,MACN,UAAU,OAAO,WAAW;AAAA,MAC5B,MAAM,YAAY,gBAAgB,SAAS;AAAA,IAC7C,CAAC;AAAA,EACH;AAEA,QAAM,YACJ,OAAO,iBAAiB,WAAW,eACnC,OAAO,YAAY;AAAA,IACjB,CAAC,KAAK,QACJ,MAAM,IAAI,SAAS,OAAO,CAAC,GAAG,OAAO,IAAI,GAAG,SAAS,WAAW,cAAc,CAAC;AAAA,IACjF;AAAA,EACF;AAEF,QAAM,UAAmD;AAAA,IACvD;AAAA,IACA,QAAQ;AAAA,MACN,GAAG;AAAA,MACH,SAAS,OAAO;AAAA,IAClB;AAAA,IACA,MAAM,YAAY,gBAAgB,SAAS;AAAA,IAC3C,cAAc,OAAO,WAAW;AAAA,IAChC,qBAAqB,OAAO,YAAY;AAAA,IACxC,YAAY,KAAK,IAAI,IAAI;AAAA,IACzB,cAAc;AAAA,IACd,KAAK;AAAA,EACP;AAKA,MAAI,KAAK,cAAc;AACrB,QAAI;AACF,YAAM,oBAAoB,KAAK,cAAc,MAAM,SAAS,QAAQ,MAAM;AAAA,IAC5E,SAAS,KAAK;AACZ,YAAM,MAAM,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAE3D,cAAQ,KAAK,mDAAmD,GAAG,EAAE;AAAA,IACvE;AAAA,EACF;AAEA,SAAO;AACT;AAEA,eAAe,oBACb,QACA,MACA,SACA,KACA,QACe;AACf,QAAM,SAAS,mBAAmB,MAAM;AAExC,WAAS,qBACP,OACA,SACA,UACA,YAC2B;AAC3B,UAAM,QAA4B,SAAS,MAAM,IAAI,CAAC,SAAS;AAC7D,YAAM,cAAc,OAAO,OAAO,KAAK,WAAW;AAClD,YAAM,YACJ,YAAY,WAAW,IACnB,IACA,YAAY,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,YAAY;AACrE,aAAO;AAAA,QACL,YAAY,KAAK;AAAA,QACjB,KAAK,KAAK;AAAA,QACV,eAAe;AAAA,QACf,YAAY,OAAO;AAAA,UACjB,OAAO,QAAQ,KAAK,WAAW,EAAE,IAAI,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC,MAAM,MAAM,UAAU,CAAC;AAAA,QAClF;AAAA,QACA,cAAc,KAAK,SAAS;AAAA,MAC9B;AAAA,IACF,CAAC;AACD,UAAM,gBACJ,MAAM,WAAW,IAAI,IAAI,MAAM,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,eAAe,CAAC,IAAI,MAAM;AAClF,WAAO;AAAA,MACL;AAAA,MACA,aACE,OAAO,YAAY,WACf,WAAW,OAAO,IAClB,WAAW,KAAK,UAAU,WAAW,EAAE,CAAC;AAAA,MAC9C;AAAA,MACA;AAAA,MACA;AAAA,MACA,SAAS,SAAS,WAAW;AAAA,MAC7B;AAAA,IACF;AAAA,EACF;AAEA,QAAM,cAA2C,CAAC;AAElD,cAAY,KAAK,qBAAqB,GAAG,KAAK,iBAAiB,IAAI,kBAAkB,CAAC,CAAC;AAGvF,aAAW,OAAO,IAAI,aAAa;AACjC,UAAM,SAAS,IAAI,SAAS;AAAA,MAC1B,CAAC,MAAM,MACL,EAAE,SAAS,WAAW,gBAAgB,MACrC,SAAS,UAAa,iBAAiB,EAAE,QAAQ,IAAI,iBAAiB,KAAK,QAAQ,KAChF,IACA;AAAA,MACN,IAAI,SAAS,CAAC;AAAA,IAChB;AACA,QAAI,CAAC,OAAQ;AACb,gBAAY;AAAA,MACV,qBAAqB,IAAI,OAAO,kBAAkB,GAAG,OAAO,SAAS,OAAO,UAAU,CAAC;AAAA,IACzF;AAAA,EACF;AAEA,QAAM,QAAsB;AAAA,IAC1B,OAAO,GAAG,MAAM,IAAI,KAAK,IAAI,CAAC;AAAA,IAC9B;AAAA,IACA,YAAW,oBAAI,KAAK,GAAE,YAAY;AAAA,IAClC,QAAQ;AAAA,IACR,QAAQ,KAAK,gBAAgB,CAAC;AAAA,IAC9B,UAAU,YAAY,CAAC;AAAA,IACvB;AAAA,IACA,cAAc,QAAQ;AAAA,IACtB,aAAa,QAAQ;AAAA,IACrB,cAAc,QAAQ;AAAA,IACtB,iBAAiB,QAAQ;AAAA,EAC3B;AAEA,QAAM,OAAO,cAAc,KAAK;AAClC;AAEA,SAAS,iBACP,UACQ;AACR,QAAM,OAAO,OAAO,OAAO,SAAS,WAAW,UAAU;AACzD,SAAO,KAAK,WAAW,IAAI,IAAI,KAAK,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,eAAe,CAAC,IAAI,KAAK;AACtF;AAEA,SAAS,WAAW,GAAmB;AACrC,MAAI,IAAI,eAAe;AACvB,WAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,SAAK,EAAE,WAAW,CAAC;AACnB,QAAI,KAAK,KAAK,GAAG,QAAQ,MAAM;AAAA,EACjC;AACA,SAAO,EAAE,SAAS,EAAE,EAAE,SAAS,GAAG,GAAG;AACvC;","names":[]}
@@ -1,7 +1,7 @@
1
1
  import {
2
2
  HOSTED_WIRE_VERSION,
3
3
  createHostedClient
4
- } from "../chunk-ZQABFCVJ.js";
4
+ } from "../chunk-OYI6RZJK.js";
5
5
  import "../chunk-NSBPE2FW.js";
6
6
  export {
7
7
  HOSTED_WIRE_VERSION,
package/dist/index.d.ts CHANGED
@@ -2,16 +2,16 @@ export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunT
2
2
  import { R as RunRecord } from './run-record-BGY6bHRh.js';
3
3
  export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, a as RunSplitTag, b as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-BGY6bHRh.js';
4
4
  import { AxAIService, AxFunction } from '@ax-llm/ax';
5
- import { d as Severity, M as MultiLayerVerifier, e as VerifyOptions, L as Layer, f as LayerResult, g as VerifyContext } from './researcher-CoJMs2Iz.js';
6
- export { C as CallbackResearcher, h as CallbackResearcherOptions, i as CampaignFactoryParams, j as CampaignIntegrityPolicy, k as CampaignRunContext, l as CampaignRunOutcome, m as CampaignRunner, n as CampaignScenario, o as CampaignVariant, c as EvalCampaignOptions, b as EvalCampaignResult, E as ExperimentPlan, a as ExperimentResult, p as FailedRun, F as FailureMode, q as Finding, s as LayerStatus, N as NoopResearcher, R as Researcher, S as SteeringChange, V as VerificationReport, t as gradeSemanticStatus, r as runEvalCampaign } from './researcher-CoJMs2Iz.js';
5
+ import { d as Severity, M as MultiLayerVerifier, e as VerifyOptions, L as Layer, f as LayerResult, g as VerifyContext } from './researcher-CHMO56K0.js';
6
+ export { C as CallbackResearcher, h as CallbackResearcherOptions, i as CampaignFactoryParams, j as CampaignIntegrityPolicy, k as CampaignRunContext, l as CampaignRunOutcome, m as CampaignRunner, n as CampaignScenario, o as CampaignVariant, c as EvalCampaignOptions, b as EvalCampaignResult, E as ExperimentPlan, a as ExperimentResult, p as FailedRun, F as FailureMode, q as Finding, s as LayerStatus, N as NoopResearcher, R as Researcher, S as SteeringChange, V as VerificationReport, t as gradeSemanticStatus, r as runEvalCampaign } from './researcher-CHMO56K0.js';
7
7
  import { R as Run$1, S as Span, b as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, T as TraceStore, g as BudgetSpec, h as RunFilter, L as LlmSpan } from './store-Db2Bv8Cf.js';
8
8
  export { i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, M as Message, e as RetrievalSpan, m as RunLayer, n as RunStatus, f as SandboxSpan, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
9
9
  import { L as LlmClientOptions, b as LlmCallRequest, c as LlmCallResult } from './llm-client-BXVRUZyX.js';
10
10
  export { d as LlmCallError, e as LlmClient, f as LlmMessage, g as LlmRouteAssertionError, a as LlmRouteRequirements, h as LlmUsage, i as assertLlmRoute, j as backoffMs, k as callLlm, l as callLlmJson, m as isTransientLlmError, p as probeLlm, s as stripFencedJson } from './llm-client-BXVRUZyX.js';
11
11
  import { TraceAnalysisStore, AnalyzeTracesOptions, OtelExporter, OtelExportConfig, AnalyzeTracesInput, AnalyzeTracesResult } from './traces.js';
12
12
  export { AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, DatasetOverview, ExportableSpan, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpResourceSpans, OtlpSpan, QueryTracesPage, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SearchSpanResult, SearchTraceResult, SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TraceAnalystByteBudgets, TraceAnalystFilters, TraceAnalystHookOptions, TraceAnalystSpan, TraceAnalystSpanKind, TraceAnalystSpanStatus, TraceAnalystTraceSummary, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, ViewSpansResult, ViewTraceOversized, ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
13
- import { s as JudgeInput, t as JudgeFn, u as BenchmarkRunnerConfig, S as Scenario, v as BenchmarkReport, x as ProductClientConfig, C as CheckResult, T as TestResult, y as PersonaConfig, D as DriverResult, z as DriverState, A as CollectedArtifacts, E as ScenarioResult, i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard, F as TurnMetrics, G as ScenarioFile, H as CompletionCriterion } from './release-report-BtpgWRI0.js';
14
- export { I as ActionableSideInfo, K as ArtifactCheck, L as ArtifactResult, M as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, N as CorpusAgreementOptions, O as CorpusAgreementPerDimension, Q as CorpusAgreementReport, U as CorpusScoreRecord, W as EvalResult, X as FeedbackPattern, Y as JudgeConfig, J as JudgeReplayGateArgs, Z as JudgeRubric, _ as JudgeScore, P as PairedBootstrapOptions, b as PairedBootstrapResult, $ as PersonaRigor, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, a0 as RouteMap, a1 as RubricDimension, a2 as Turn, a3 as TurnResult, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, a4 as bonferroni, n as bootstrapCi, a5 as cohensD, a6 as confidenceInterval, a7 as corpusInterRaterAgreement, a8 as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, a9 as interRaterReliability, p as judgeReplayGate, aa as mannWhitneyU, ab as normalizeScores, q as pairedBootstrap, ac as pairedMde, ad as pairedTTest, ae as partialCredit, r as renderReleaseReport, af as requiredSampleSize, ag as weightedMean, w as wilcoxonSignedRank } from './release-report-BtpgWRI0.js';
13
+ import { s as JudgeInput, t as JudgeFn, u as BenchmarkRunnerConfig, S as Scenario, v as BenchmarkReport, x as ProductClientConfig, C as CheckResult, T as TestResult, y as PersonaConfig, D as DriverResult, z as DriverState, A as CollectedArtifacts, E as ScenarioResult, i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard, F as TurnMetrics, G as ScenarioFile, H as CompletionCriterion } from './release-report-DBB8lB1P.js';
14
+ export { I as ActionableSideInfo, K as ArtifactCheck, L as ArtifactResult, M as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, N as CorpusAgreementOptions, O as CorpusAgreementPerDimension, Q as CorpusAgreementReport, U as CorpusScoreRecord, W as EvalResult, X as FeedbackPattern, Y as JudgeConfig, J as JudgeReplayGateArgs, Z as JudgeRubric, _ as JudgeScore, P as PairedBootstrapOptions, b as PairedBootstrapResult, $ as PersonaRigor, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, a0 as RouteMap, a1 as RubricDimension, a2 as Turn, a3 as TurnResult, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, a4 as bonferroni, n as bootstrapCi, a5 as cohensD, a6 as confidenceInterval, a7 as corpusInterRaterAgreement, a8 as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, a9 as interRaterReliability, p as judgeReplayGate, aa as mannWhitneyU, ab as normalizeScores, q as pairedBootstrap, ac as pairedMde, ad as pairedTTest, ae as partialCredit, r as renderReleaseReport, af as requiredSampleSize, ag as weightedMean, w as wilcoxonSignedRank } from './release-report-DBB8lB1P.js';
15
15
  import { TCloud } from '@tangle-network/tcloud';
16
16
  import { z } from 'zod';
17
17
  export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BZ_lVLYW.js';
@@ -32,6 +32,7 @@ import { a as BaselineReport } from './baseline-4R5deP0N.js';
32
32
  export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as ToolStats, d as ToolUseMetrics, e as ToolUseOptions, f as compareToBaseline, c as computeToolUseMetrics, i as iqr, w as welchsTTest } from './baseline-4R5deP0N.js';
33
33
  import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
34
34
  export { b as buildTrajectory } from './trajectory-CnoBo-JY.js';
35
+ export { D as DefaultVerdict } from './verdict-CeEgtjyI.js';
35
36
  import { a as DatasetScenario, b as Dataset } from './dataset-BlwAtYYf.js';
36
37
  export { d as DatasetDifficulty, c as DatasetManifest, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-BlwAtYYf.js';
37
38
  export { b as CalibrationResult, c as CandidateScore, a as ContinuousAgreement, C as ContinuousAgreementOptions, d as ContinuousCalibrationResult, G as GoldenItem, P as PositionalBiasResult, S as SelfPreferenceResult, V as VerbosityBiasResult, e as calibrateJudge, f as calibrateJudgeContinuous, g as continuousAgreement, p as positionalBias, s as selfPreference, v as verbosityBias } from './judge-calibration-DilmB3Ml.js';
@@ -39,7 +40,7 @@ export { D as DEFAULT_RED_TEAM_CORPUS, R as RedTeamCase, a as RedTeamCategory, b
39
40
  import { a as PrmGrader } from './rubric-D5tjHNJQ.js';
40
41
  export { EuRiskClass, GovernanceContext, GovernanceFinding, GovernanceReport, UseCaseSignals, classifyEuAiRisk, euAiActReport, nistAiRmfReport, renderMarkdown, soc2Report, summarize } from './governance/index.js';
41
42
  export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-0pu_fBwZ.js';
42
- export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, w as GateDecision, x as GateEvidence, H as HeldOutGate, y as HeldOutGateConfig, z as HeldOutGateRejectionCode, I as InterimReleaseConfidence, c as InterimReleaseConfidenceInput, P as PairedEvalueOptions, d as PairedEvalueSequence, e as PairedEvalueStep, f as ParetoFigureSpec, g as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, h as ResearchReport, i as ResearchReportCandidate, j as ResearchReportDecision, k as ResearchReportMethodology, l as ResearchReportOptions, m as ResearchReportRecommendation, S as SequentialDecision, n as SummaryTable, o as SummaryTableOptions, p as SummaryTableRow, q as evaluateInterimReleaseConfidence, r as gainHistogram, s as pairedEvalueSequence, t as paretoChart, u as researchReport, v as summaryTable } from './sequential-DdV5ShjT.js';
43
+ export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, w as GateDecision, x as GateEvidence, H as HeldOutGate, y as HeldOutGateConfig, z as HeldOutGateRejectionCode, I as InterimReleaseConfidence, c as InterimReleaseConfidenceInput, P as PairedEvalueOptions, d as PairedEvalueSequence, e as PairedEvalueStep, f as ParetoFigureSpec, g as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, h as ResearchReport, i as ResearchReportCandidate, j as ResearchReportDecision, k as ResearchReportMethodology, l as ResearchReportOptions, m as ResearchReportRecommendation, S as SequentialDecision, n as SummaryTable, o as SummaryTableOptions, p as SummaryTableRow, q as evaluateInterimReleaseConfidence, r as gainHistogram, s as pairedEvalueSequence, t as paretoChart, u as researchReport, v as summaryTable } from './sequential-CbFH___X.js';
43
44
  import './outcome-store-BxJ3DQKJ.js';
44
45
 
45
46
  interface RunScore {
package/dist/index.js CHANGED
@@ -9612,6 +9612,7 @@ var HeldOutGate = class {
9612
9612
  confidence;
9613
9613
  resamples;
9614
9614
  seed;
9615
+ costPerTaskCeiling;
9615
9616
  constructor(config) {
9616
9617
  if (!config.baselineKey) {
9617
9618
  throw new Error("HeldOutGate: baselineKey is required");
@@ -9623,6 +9624,10 @@ var HeldOutGate = class {
9623
9624
  this.confidence = config.confidence ?? 0.95;
9624
9625
  this.resamples = config.bootstrapResamples ?? 2e3;
9625
9626
  this.seed = config.seed;
9627
+ if (config.costPerTaskCeiling !== void 0 && !(Number.isFinite(config.costPerTaskCeiling) && config.costPerTaskCeiling > 0)) {
9628
+ throw new Error("HeldOutGate: costPerTaskCeiling must be a positive finite number");
9629
+ }
9630
+ this.costPerTaskCeiling = config.costPerTaskCeiling;
9626
9631
  }
9627
9632
  /** Decide whether `candidate` should replace `baseline`. Pairing
9628
9633
  * is by (experimentId, seed) — identical experiment + seed pairs
@@ -9650,6 +9655,8 @@ var HeldOutGate = class {
9650
9655
  const baselineHoldoutMean = mean4(scores(baseline, "holdoutScore", "holdout"));
9651
9656
  const overfitGap = safeDiff(candidateSearchMean, candidateHoldoutMean);
9652
9657
  const baselineOverfitGap = safeDiff(baselineSearchMean, baselineHoldoutMean);
9658
+ const medianCandidateCost = medianFinite(candidate.map((r) => r.costUsd));
9659
+ const medianBaselineCost = medianFinite(baseline.map((r) => r.costUsd));
9653
9660
  if (productiveRuns < this.minProductiveRuns) {
9654
9661
  return {
9655
9662
  promote: false,
@@ -9663,7 +9670,9 @@ var HeldOutGate = class {
9663
9670
  searchScore: candidateSearchMean,
9664
9671
  holdoutScore: candidateHoldoutMean,
9665
9672
  overfitGap,
9666
- baselineOverfitGap
9673
+ baselineOverfitGap,
9674
+ medianCandidateCost,
9675
+ medianBaselineCost
9667
9676
  },
9668
9677
  reason: `few_runs: ${productiveRuns} paired holdout observation(s) < min ${this.minProductiveRuns}`,
9669
9678
  rejectionCode: "few_runs"
@@ -9684,7 +9693,9 @@ var HeldOutGate = class {
9684
9693
  searchScore: candidateSearchMean,
9685
9694
  holdoutScore: candidateHoldoutMean,
9686
9695
  overfitGap,
9687
- baselineOverfitGap
9696
+ baselineOverfitGap,
9697
+ medianCandidateCost,
9698
+ medianBaselineCost
9688
9699
  };
9689
9700
  if (!(ci.low > this.pairedDeltaThreshold)) {
9690
9701
  return {
@@ -9706,12 +9717,22 @@ var HeldOutGate = class {
9706
9717
  rejectionCode: "overfit_gap"
9707
9718
  };
9708
9719
  }
9720
+ if (this.costPerTaskCeiling !== void 0 && Number.isFinite(medianCandidateCost) && medianCandidateCost > this.costPerTaskCeiling) {
9721
+ return {
9722
+ promote: false,
9723
+ candidateId,
9724
+ baselineId,
9725
+ evidence,
9726
+ reason: `cost_ceiling: candidate median cost $${fmt(medianCandidateCost)} exceeds ceiling $${fmt(this.costPerTaskCeiling)} (baseline $${fmt(medianBaselineCost)})`,
9727
+ rejectionCode: "cost_ceiling"
9728
+ };
9729
+ }
9709
9730
  return {
9710
9731
  promote: true,
9711
9732
  candidateId,
9712
9733
  baselineId,
9713
9734
  evidence,
9714
- reason: `promote: paired holdout median \u0394=${fmt(ci.median)} CI=[${fmt(ci.low)}, ${fmt(ci.high)}] over ${productiveRuns} pairs; overfit gap candidate=${fmt(overfitGap)} vs baseline=${fmt(baselineOverfitGap)}`,
9735
+ reason: `promote: paired holdout median \u0394=${fmt(ci.median)} CI=[${fmt(ci.low)}, ${fmt(ci.high)}] over ${productiveRuns} pairs; overfit gap candidate=${fmt(overfitGap)} vs baseline=${fmt(baselineOverfitGap)}; median cost candidate=$${fmt(medianCandidateCost)} vs baseline=$${fmt(medianBaselineCost)}`,
9715
9736
  rejectionCode: null
9716
9737
  };
9717
9738
  }
@@ -9757,6 +9778,12 @@ function medianDelta(before, after) {
9757
9778
  const mid = Math.floor(ds.length / 2);
9758
9779
  return ds.length % 2 === 0 ? (ds[mid - 1] + ds[mid]) / 2 : ds[mid];
9759
9780
  }
9781
+ function medianFinite(xs) {
9782
+ const ys = xs.filter((x) => Number.isFinite(x)).sort((x, y) => x - y);
9783
+ if (ys.length === 0) return Number.NaN;
9784
+ const mid = Math.floor(ys.length / 2);
9785
+ return ys.length % 2 === 0 ? (ys[mid - 1] + ys[mid]) / 2 : ys[mid];
9786
+ }
9760
9787
  function fmt(x) {
9761
9788
  if (!Number.isFinite(x)) return String(x);
9762
9789
  return x.toFixed(4);