npm - @tangle-network/agent-eval - Versions diffs - 0.61.0 → 0.63.0 - Mend

@tangle-network/agent-eval 0.61.0 → 0.63.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

package/CHANGELOG.md +48 -8
package/dist/adapters/http.d.ts +4 -1
package/dist/adapters/langchain.d.ts +4 -1
package/dist/adapters/otel.d.ts +4 -4
package/dist/{agent-profile-9J9hxdm2.d.ts → agent-profile-DzcPHR1Z.d.ts} +1 -1
package/dist/benchmarks/index.d.ts +2 -2
package/dist/campaign/index.d.ts +388 -11
package/dist/campaign/index.js +597 -12
package/dist/campaign/index.js.map +1 -1
package/dist/{chunk-GMXHLSLL.js → chunk-4ODZXQV2.js} +81 -98
package/dist/chunk-4ODZXQV2.js.map +1 -0
package/dist/{chunk-OLULBECP.js → chunk-7TPYV2ER.js} +27 -1
package/dist/chunk-7TPYV2ER.js.map +1 -0
package/dist/chunk-E22YUOAL.js +111 -0
package/dist/chunk-E22YUOAL.js.map +1 -0
package/dist/{chunk-SUGME4OT.js → chunk-Z7ZU7IYZ.js} +209 -85
package/dist/chunk-Z7ZU7IYZ.js.map +1 -0
package/dist/contract/index.d.ts +9 -9
package/dist/contract/index.js +4 -3
package/dist/contract/index.js.map +1 -1
package/dist/{control-Bf8owbuG.d.ts → control-DxvZeV5X.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/hosted/index.d.ts +4 -4
package/dist/{index-Bvk35ils.d.ts → index-DsnOpCO6.d.ts} +1 -1
package/dist/{index-D9dwa00f.d.ts → index-GISRh500.d.ts} +2 -2
package/dist/index.d.ts +98 -14
package/dist/index.js +331 -128
package/dist/index.js.map +1 -1
package/dist/meta-eval/index.d.ts +2 -2
package/dist/multishot/index.js.map +1 -1
package/dist/openapi.json +1 -1
package/dist/{provenance-D0WeCXt1.d.ts → provenance-cUnovpWV.d.ts} +42 -11
package/dist/{registry-qmbYT3Eo.d.ts → registry-DPly4_hZ.d.ts} +1 -1
package/dist/{release-report-DszkgvJ3.d.ts → release-report-DGoeObZT.d.ts} +2 -2
package/dist/reporting.d.ts +4 -4
package/dist/{researcher-BaVsy0sW.d.ts → researcher-WJvIpX3L.d.ts} +2 -2
package/dist/rl.d.ts +6 -6
package/dist/{rubric-predictive-validity-DgBHWsh7.d.ts → rubric-predictive-validity-D_4BSXGV.d.ts} +1 -1
package/dist/{run-campaign-HXPJAUZ3.js → run-campaign-5J3ED2UJ.js} +3 -2
package/dist/{run-record-DgUVo5pw.d.ts → run-record-BgTFzO2r.d.ts} +1 -1
package/dist/{summary-report-BQvXpvaR.d.ts → summary-report-ByiOUrHj.d.ts} +1 -1
package/dist/{types-Beb6KPqZ.d.ts → types-c2R2kfmv.d.ts} +45 -12
package/package.json +1 -1
package/dist/chunk-GMXHLSLL.js.map +0 -1
package/dist/chunk-OLULBECP.js.map +0 -1
package/dist/chunk-SUGME4OT.js.map +0 -1
/package/dist/{run-campaign-HXPJAUZ3.js.map → run-campaign-5J3ED2UJ.js.map} +0 -0

package/dist/meta-eval/index.d.ts CHANGED Viewed

@@ -2,8 +2,8 @@ import { T as TraceStore } from '../store-CKUAgsJz.js';
 import { R as Run } from '../schema-m0gsnbt3.js';
 import { a as OutcomeFilter, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
 export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-D6KWmYvj.js';
-export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-DgBHWsh7.js';
-import '../run-record-DgUVo5pw.js';
+export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-D_4BSXGV.js';
+import '../run-record-BgTFzO2r.js';
 import '../errors-Dwqw-T_m.js';
 /**

package/dist/multishot/index.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"sources":["../../src/multishot/router.ts","../../src/multishot/default-tools.ts","../../src/multishot/judges.ts","../../src/multishot/matrix.ts","../../src/multishot/types.ts","../../src/multishot/multishot.ts"],"sourcesContent":["// Router fetch helper — single source of truth for OpenAI-compat calls\n// against the Tangle router. Used by the driver, agent, judges, and the\n// default tool executors.\n\nimport type { MultishotToolDefinition } from './types'\n\nexport interface RouterCompletionRequest {\n apiKey: string\n baseUrl: string\n model: string\n messages: Array<Record<string, unknown>>\n tools?: MultishotToolDefinition[]\n temperature?: number\n maxTokens?: number\n signal?: AbortSignal\n}\n\nexport interface RouterToolCall {\n id: string\n type: 'function'\n function: { name: string; arguments: string }\n}\n\nexport interface RouterCompletionResponse {\n message: { content?: string \| null; tool_calls?: RouterToolCall[] }\n usage?: { prompt_tokens?: number; completion_tokens?: number }\n}\n\nexport async function routerCompletion(\n req: RouterCompletionRequest,\n): Promise<RouterCompletionResponse> {\n const body: Record<string, unknown> = {\n model: req.model,\n messages: req.messages,\n temperature: req.temperature ?? 0.7,\n max_tokens: req.maxTokens ?? 2000,\n }\n if (req.tools?.length) body.tools = req.tools\n const url = `${req.baseUrl.replace(/\\/+$/, '')}/chat/completions`\n const res = await fetch(url, {\n method: 'POST',\n headers: { Authorization: `Bearer ${req.apiKey}`, 'Content-Type': 'application/json' },\n body: JSON.stringify(body),\n signal: req.signal,\n })\n if (!res.ok) {\n const text = await res.text()\n throw new Error(`router ${res.status}: ${text.slice(0, 300)}`)\n }\n const json = (await res.json()) as {\n choices: Array<{ message: { content?: string \| null; tool_calls?: RouterToolCall[] } }>\n usage?: { prompt_tokens?: number; completion_tokens?: number }\n }\n const choice = json.choices[0]\n if (!choice) throw new Error(`router returned no choices: ${JSON.stringify(json).slice(0, 200)}`)\n return { message: choice.message, usage: json.usage }\n}\n\n// Rough per-model cost estimator. Used for cost-ceiling enforcement.\n// Underestimates Anthropic, overestimates oss models — fine for ceilings.\nexport function estimateRouterCost(\n model: string,\n usage?: { prompt_tokens?: number; completion_tokens?: number },\n): number {\n if (!usage) return 0\n const inputTok = usage.prompt_tokens ?? 0\n const outputTok = usage.completion_tokens ?? 0\n let inPer1k = 0.003\n let outPer1k = 0.015\n if (model.includes('gpt-4o-mini')) {\n inPer1k = 0.00015\n outPer1k = 0.0006\n } else if (model.includes('gpt-5.4') \|\| model.includes('claude-sonnet')) {\n inPer1k = 0.003\n outPer1k = 0.015\n } else if (model.includes('kimi') \|\| model.includes('glm') \|\| model.includes('deepseek')) {\n inPer1k = 0.0005\n outPer1k = 0.002\n }\n return (inputTok * inPer1k + outputTok * outPer1k) / 1000\n}\n\nexport function defaultRouterBaseUrl(): string {\n return (process.env.TANGLE_ROUTER_BASE_URL ?? 'https://router.tangle.tools/v1').replace(\n /\\/+$/,\n '',\n )\n}\n\nexport function requireRouterApiKey(): string {\n const key = process.env.TANGLE_API_KEY\n if (!key) throw new Error('multishot requires TANGLE_API_KEY (router-scoped sk-tan-* key)')\n return key\n}\n","// Default delegate_research + delegate_code tools and their inline executors.\n//\n// Consumers can override either by passing their own tools + executors to\n// runMultishot. The defaults are sufficient for most domains — point the\n// researcher system prompt at your domain's citation style and the coder\n// at your preferred language.\n\nimport { estimateRouterCost, routerCompletion } from './router'\nimport type { MultishotToolDefinition, MultishotToolExecutor } from './types'\n\nexport const DEFAULT_RESEARCHER_MODEL = 'openai/gpt-4o-mini'\nexport const DEFAULT_CODER_MODEL = 'openai/gpt-4o-mini'\n\nexport interface DefaultResearcherConfig {\n /** Replace the system prompt to bias the researcher toward a domain's\n * citation style. Defaults to a generic \"cite sources by name\" prompt. /\n systemPrompt?: string\n model?: string\n}\n\nexport interface DefaultCoderConfig {\n /* Replace the system prompt to bias the coder toward a language /\n * framework / artifact style. /\n systemPrompt?: string\n model?: string\n}\n\nconst GENERIC_RESEARCHER_SYSTEM =\n 'You are a research specialist. Return a markdown brief with 3-5 findings. Each finding cites a specific source by name. Add a confidence level (high/medium/low) per finding. No fluff, no preamble.'\n\nconst GENERIC_CODER_SYSTEM =\n 'You are an expert engineer. Output ONE fenced code block containing the complete solution. Inline-comment non-obvious decisions. No explanation outside the block.'\n\nexport const DEFAULT_DELEGATE_RESEARCH_TOOL: MultishotToolDefinition = {\n type: 'function',\n function: {\n name: 'delegate_research',\n description:\n 'Research a topic deeply via specialist. Returns evidence-bearing items with citations. Use for audience research, competitive intel, regulatory landscape, market data, citation-grounded analysis.',\n parameters: {\n type: 'object',\n properties: {\n question: { type: 'string', description: 'Specific question to research' },\n scope: {\n type: 'string',\n description: 'Optional scope: time window, geography, jurisdiction, segment',\n },\n },\n required: ['question'],\n },\n },\n}\n\nexport const DEFAULT_DELEGATE_CODE_TOOL: MultishotToolDefinition = {\n type: 'function',\n function: {\n name: 'delegate_code',\n description:\n 'Generate a runnable script, template, pipeline, or tool via specialist. Returns complete working code or structured markdown. Use for content pipelines, calc snippets, dashboards, compliance checklists, deadline trackers.',\n parameters: {\n type: 'object',\n properties: {\n goal: { type: 'string', description: 'What the code must accomplish' },\n language: {\n type: 'string',\n description: 'Optional language preference (default: TypeScript)',\n },\n },\n required: ['goal'],\n },\n },\n}\n\nexport function createResearchExecutor(\n config: DefaultResearcherConfig = {},\n): MultishotToolExecutor {\n const systemPrompt = config.systemPrompt ?? GENERIC_RESEARCHER_SYSTEM\n const model = config.model ?? DEFAULT_RESEARCHER_MODEL\n return async (args, ctx) => {\n const question = String(args.question ?? '')\n const scope = args.scope ? String(args.scope) : undefined\n const { message, usage } = await routerCompletion({\n apiKey: ctx.apiKey,\n baseUrl: ctx.baseUrl,\n model,\n temperature: 0.3,\n maxTokens: 1800,\n messages: [\n { role: 'system', content: systemPrompt },\n { role: 'user', content: `Research: ${question}${scope ? `\\nScope: ${scope}` : ''}` },\n ],\n signal: ctx.signal,\n })\n return { content: message.content ?? '', costUsd: estimateRouterCost(model, usage) }\n }\n}\n\nexport function createCodeExecutor(config: DefaultCoderConfig = {}): MultishotToolExecutor {\n const systemPrompt = config.systemPrompt ?? GENERIC_CODER_SYSTEM\n const model = config.model ?? DEFAULT_CODER_MODEL\n return async (args, ctx) => {\n const goal = String(args.goal ?? '')\n const language = args.language ? String(args.language) : 'TypeScript'\n const { message, usage } = await routerCompletion({\n apiKey: ctx.apiKey,\n baseUrl: ctx.baseUrl,\n model,\n temperature: 0.2,\n maxTokens: 2000,\n messages: [\n { role: 'system', content: `${systemPrompt}\\n\\nLanguage: ${language}` },\n { role: 'user', content: `Produce: ${goal}` },\n ],\n signal: ctx.signal,\n })\n return { content: message.content ?? '', costUsd: estimateRouterCost(model, usage) }\n }\n}\n\nexport interface DefaultToolsConfig {\n research?: DefaultResearcherConfig\n code?: DefaultCoderConfig\n /* When true (default), each tool result is recorded as a typed artifact:\n * research → type='research', code → type='code'. /\n recordArtifacts?: boolean\n}\n\nexport interface DefaultToolsBundle {\n tools: MultishotToolDefinition[]\n executors: Record<string, MultishotToolExecutor>\n artifactTypeFor: (toolName: string) => string \| undefined\n}\n\nexport function defaultDelegationTools(config: DefaultToolsConfig = {}): DefaultToolsBundle {\n return {\n tools: [DEFAULT_DELEGATE_RESEARCH_TOOL, DEFAULT_DELEGATE_CODE_TOOL],\n executors: {\n delegate_research: createResearchExecutor(config.research),\n delegate_code: createCodeExecutor(config.code),\n },\n artifactTypeFor: (name) =>\n name === 'delegate_research' ? 'research' : name === 'delegate_code' ? 'code' : undefined,\n }\n}\n\nexport { defaultRouterBaseUrl } from './router'\n","// Generic judge runner — domain consumers configure dimensions + prompts.\n//\n// Three judge slots are conventional for multishot eval:\n// - conversation (scores the full transcript)\n// - codeReview (scores each code artifact)\n// - contentQuality (scores each non-code artifact)\n//\n// But the runJudge primitive is fully generic — any T → JudgeScore mapping.\n\nimport { defaultRouterBaseUrl, requireRouterApiKey, routerCompletion } from './router'\n\nexport const DEFAULT_JUDGE_MODEL = 'openai/gpt-4o-mini'\n\nexport interface JudgeDimension {\n /* JSON field name + score key. /\n key: string\n /* Description shown in the judge's user prompt. /\n description: string\n}\n\nexport interface JudgeConfig<TInput> {\n /* Display name (for trace + log). /\n name: string\n /* Model used for this judge. /\n model?: string\n /* 0-10 scored dimensions. /\n dimensions: JudgeDimension[]\n /* Judge system prompt — sets persona + JSON-only constraint. /\n systemPrompt: string\n /* Build the user prompt from the typed input. Must include \"Respond with\n * ONLY this JSON: { ... }\" listing each dimension key. /\n buildPrompt: (input: TInput) => string\n /* Optional model + api overrides. /\n apiKey?: string\n baseUrl?: string\n}\n\nexport interface JudgeScore {\n /* Per-dimension 0-10 score. Missing dims default to 0. /\n dimensions: Record<string, number>\n /* Mean across dimensions. /\n composite: number\n /* Free-form 1-2 sentence critique from the judge (when provided). /\n notes: string\n}\n\nconst ZERO_SCORE: JudgeScore = { dimensions: {}, composite: 0, notes: 'parse failed' }\n\nexport async function runJudge<TInput>(\n judge: JudgeConfig<TInput>,\n input: TInput,\n): Promise<JudgeScore> {\n const apiKey = judge.apiKey ?? requireRouterApiKey()\n const baseUrl = judge.baseUrl ?? defaultRouterBaseUrl()\n const model = judge.model ?? process.env.JUDGE_MODEL ?? DEFAULT_JUDGE_MODEL\n const prompt = judge.buildPrompt(input)\n let raw = ''\n try {\n const { message } = await routerCompletion({\n apiKey,\n baseUrl,\n model,\n temperature: 0,\n maxTokens: 1500,\n messages: [\n { role: 'system', content: judge.systemPrompt },\n { role: 'user', content: prompt },\n ],\n })\n raw = (message.content ?? '').trim()\n } catch (err) {\n return {\n ...ZERO_SCORE,\n notes: `judge ${judge.name} call failed: ${err instanceof Error ? err.message : String(err)}`,\n }\n }\n\n let parsed: Record<string, unknown> \| null = null\n try {\n const cleaned = raw\n .replace(/^```json\\s/i, '')\n .replace(/```\\s$/, '')\n .trim()\n parsed = JSON.parse(cleaned) as Record<string, unknown>\n } catch {\n return { ...ZERO_SCORE, notes: `judge ${judge.name} returned non-JSON: ${raw.slice(0, 200)}` }\n }\n\n const dimensions: Record<string, number> = {}\n let sum = 0\n for (const dim of judge.dimensions) {\n const v = Number(parsed[dim.key] ?? 0)\n const clamped = Number.isFinite(v) ? Math.max(0, Math.min(10, v)) : 0\n dimensions[dim.key] = clamped\n sum += clamped\n }\n return {\n dimensions,\n composite: judge.dimensions.length === 0 ? 0 : sum / judge.dimensions.length,\n notes: typeof parsed.notes === 'string' ? parsed.notes : '',\n }\n}\n\n/* Convenience: stringified dimension list for inclusion in a judge prompt.\n * Returns lines like `- audience_fit: Does this match what the audience cares about? (0-10)`. /\nexport function renderDimensions(dims: readonly JudgeDimension[]): string {\n return dims.map((d) => `- ${d.key}: ${d.description}`).join('\\n')\n}\n\n/* Convenience: build the \"Respond with ONLY this JSON\" footer for a judge prompt. /\nexport function renderJsonFooter(dims: readonly JudgeDimension[]): string {\n const fields = dims.map((d) => `\"${d.key}\":N`).join(',')\n return `Respond with ONLY this JSON (no markdown, no preamble):\\n{${fields},\"notes\":\"1-2 sentence critique\"}`\n}\n","// Multishot matrix wrapper — sweeps profiles × personas × reps, runs\n// the driver-agent loop per cell, applies up to three configured judges,\n// persists per-cell artifacts, and aggregates by axis.\n//\n// Uses runAgentMatrix from @tangle-network/agent-eval/matrix under the\n// hood so cell scheduling + concurrency + cost ceiling are unified with\n// other matrix consumers.\n\nimport { mkdirSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type { AgentProfile } from '@tangle-network/sandbox'\nimport type { MatrixResult } from '../matrix'\nimport { runAgentMatrix } from '../matrix'\nimport { type JudgeConfig, type JudgeScore, runJudge } from './judges'\nimport { runMultishot } from './multishot'\nimport type {\n MultishotArtifact,\n MultishotMessage,\n MultishotPersona,\n MultishotShape,\n MultishotToolDefinition,\n MultishotToolExecutor,\n} from './types'\n\nexport interface ConversationJudgeInput<TPersona extends MultishotPersona> {\n transcript: MultishotMessage[]\n persona: TPersona\n}\n\nexport interface ArtifactJudgeInput<TPersona extends MultishotPersona> {\n artifact: MultishotArtifact\n persona: TPersona\n}\n\nexport interface MultishotJudges<TPersona extends MultishotPersona> {\n /* Scores the full transcript end-to-end (always runs). /\n conversation: JudgeConfig<ConversationJudgeInput<TPersona>>\n /* Scores each code-type artifact. Optional — omit when domain has no code artifacts. /\n codeReview?: JudgeConfig<ArtifactJudgeInput<TPersona>>\n /* Scores each non-code (research/content/template) artifact. Optional. /\n contentQuality?: JudgeConfig<ArtifactJudgeInput<TPersona>>\n /* Which artifact types route to codeReview. Defaults to ['code']. /\n codeArtifactTypes?: string[]\n /* Which artifact types route to contentQuality. Defaults to ['research']. /\n contentArtifactTypes?: string[]\n}\n\nexport interface CellCompositeScore {\n composite: number\n conversation: JudgeScore\n codeReview?: {\n perArtifact: Array<JudgeScore & { turn: number; type: string }>\n composite: number\n }\n contentQuality?: {\n perArtifact: Array<JudgeScore & { turn: number; type: string }>\n composite: number\n }\n}\n\nexport interface RunMultishotMatrixOptions<TPersona extends MultishotPersona> {\n /* AgentProfile axis (matrix primary). /\n profiles: Array<{ id: string; value: AgentProfile }>\n /* Persona axis. /\n personas: TPersona[]\n /* Persona-shaping callbacks. /\n shape: MultishotShape<TPersona>\n /* Judge configurations. /\n judges: MultishotJudges<TPersona>\n /* Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. /\n tools?: MultishotToolDefinition[]\n /* Map from tool name → inline executor. Must align with `tools`. /\n toolExecutors?: Record<string, MultishotToolExecutor>\n /* Tool name → artifact type label. Defaults to research/code mapping. /\n artifactTypeFor?: (toolName: string) => string \| undefined\n /* Where per-cell artifacts land. Cells write to `<runDir>/<profileId>/<personaId>/rep-N/`. /\n runDir: string\n /* Replicates per (profile, persona) cell. /\n reps?: number\n /* Max conversation turns per cell. /\n maxTurns?: number\n /* Max concurrent cells. /\n maxConcurrency?: number\n /* Total $ ceiling across the matrix; cells aborted past this. /\n costCeiling?: number\n /* Agent model. /\n agentModel?: string\n /* Driver model. /\n driverModel?: string\n /* Pass-thru fields. /\n apiKey?: string\n baseUrl?: string\n}\n\ninterface CellOutput {\n turns: number\n toolCalls: number\n artifactCount: number\n}\n\nexport interface RunMultishotMatrixResult {\n matrix: MatrixResult<CellOutput>\n}\n\nexport async function runMultishotMatrix<TPersona extends MultishotPersona>(\n opts: RunMultishotMatrixOptions<TPersona>,\n): Promise<RunMultishotMatrixResult> {\n const codeTypes = new Set(opts.judges.codeArtifactTypes ?? ['code'])\n const contentTypes = new Set(opts.judges.contentArtifactTypes ?? ['research'])\n mkdirSync(opts.runDir, { recursive: true })\n\n const matrix = await runAgentMatrix<CellOutput>({\n axes: [\n { name: 'profile', values: opts.profiles },\n { name: 'persona', values: opts.personas.map((p) => ({ id: p.id, value: p })) },\n ],\n reps: opts.reps ?? 1,\n maxConcurrency: opts.maxConcurrency ?? 2,\n costCeiling: opts.costCeiling,\n async runCell(cell) {\n const profile = cell.axes.profile?.value as AgentProfile\n const persona = cell.axes.persona?.value as TPersona\n const profileId = String(cell.axes.profile?.id ?? 'unknown')\n const personaId = String(cell.axes.persona?.id ?? 'unknown')\n\n const sim = await runMultishot({\n profile,\n persona,\n shape: opts.shape,\n tools: opts.tools,\n toolExecutors: opts.toolExecutors,\n artifactTypeFor: opts.artifactTypeFor,\n maxTurns: opts.maxTurns,\n agentModel: opts.agentModel,\n driverModel: opts.driverModel,\n apiKey: opts.apiKey,\n baseUrl: opts.baseUrl,\n })\n\n const codeArtifacts = sim.artifacts.filter((a) => codeTypes.has(a.type))\n const contentArtifacts = sim.artifacts.filter((a) => contentTypes.has(a.type))\n\n const [conversation, codeReviews, contentReviews] = await Promise.all([\n runJudge(opts.judges.conversation, { transcript: sim.transcript, persona }),\n opts.judges.codeReview\n ? Promise.all(\n codeArtifacts.map((artifact) =>\n runJudge(opts.judges.codeReview!, { artifact, persona }).then((s) => ({\n ...s,\n turn: artifact.turn,\n type: artifact.type,\n })),\n ),\n )\n : Promise.resolve([] as Array<JudgeScore & { turn: number; type: string }>),\n opts.judges.contentQuality\n ? Promise.all(\n contentArtifacts.map((artifact) =>\n runJudge(opts.judges.contentQuality!, { artifact, persona }).then((s) => ({\n ...s,\n turn: artifact.turn,\n type: artifact.type,\n })),\n ),\n )\n : Promise.resolve([] as Array<JudgeScore & { turn: number; type: string }>),\n ])\n\n const codeComposite =\n codeReviews.length === 0\n ? 0\n : codeReviews.reduce((s, r) => s + r.composite, 0) / codeReviews.length\n const contentComposite =\n contentReviews.length === 0\n ? 0\n : contentReviews.reduce((s, r) => s + r.composite, 0) / contentReviews.length\n\n // Composite = mean of (conversation, code, content) — empty judges count 0.\n const judgeCount = 1 + (opts.judges.codeReview ? 1 : 0) + (opts.judges.contentQuality ? 1 : 0)\n const composite = (conversation.composite + codeComposite + contentComposite) / judgeCount\n\n const cellScore: CellCompositeScore = { composite, conversation }\n if (opts.judges.codeReview)\n cellScore.codeReview = { perArtifact: codeReviews, composite: codeComposite }\n if (opts.judges.contentQuality)\n cellScore.contentQuality = { perArtifact: contentReviews, composite: contentComposite }\n\n const cellDir = join(opts.runDir, profileId, personaId, `rep-${cell.rep}`)\n mkdirSync(cellDir, { recursive: true })\n writeFileSync(join(cellDir, 'transcript.json'), JSON.stringify(sim.transcript, null, 2))\n writeFileSync(join(cellDir, 'artifacts.json'), JSON.stringify(sim.artifacts, null, 2))\n writeFileSync(join(cellDir, 'scores.json'), JSON.stringify(cellScore, null, 2))\n\n const notes = [`convo=${conversation.composite.toFixed(1)}`]\n if (opts.judges.codeReview) notes.push(`code=${codeComposite.toFixed(1)}`)\n if (opts.judges.contentQuality) notes.push(`content=${contentComposite.toFixed(1)}`)\n\n return {\n output: {\n turns: sim.transcript.length,\n toolCalls: sim.toolCalls,\n artifactCount: sim.artifacts.length,\n },\n verdict: { valid: composite >= 5, score: composite, notes: notes.join(' ') },\n costUsd: sim.costUsd,\n durationMs: sim.durationMs,\n }\n },\n })\n\n // Persist top-level summary.\n const summary = {\n cells: matrix.summary.totalCells,\n passRate: matrix.summary.overallPassRate,\n meanScore: matrix.summary.overallMeanScore,\n totalCostUsd: matrix.summary.totalCostUsd,\n durationMs: matrix.summary.durationMs,\n runsExecuted: matrix.summary.runsExecuted,\n cellsSkipped: matrix.summary.cellsSkipped,\n byProfile: matrix.byAxis.profile,\n byPersona: matrix.byAxis.persona,\n }\n writeFileSync(join(opts.runDir, 'summary.json'), JSON.stringify(summary, null, 2))\n\n const md: string[] = [\n `# Multishot matrix`,\n ``,\n `Cells: ${matrix.summary.totalCells} \| Pass rate: ${(matrix.summary.overallPassRate 100).toFixed(0)}% \| Mean: ${matrix.summary.overallMeanScore.toFixed(2)} \| Cost: $${matrix.summary.totalCostUsd.toFixed(2)} \| Duration: ${(matrix.summary.durationMs / 1000).toFixed(0)}s`,\n ``,\n `## By profile`,\n ``,\n '\| profile \| pass \| mean \| cost \|',\n '\|---\|---\|---\|---\|',\n ...Object.entries(matrix.byAxis.profile ?? {}).map(\n ([id, s]) =>\n `\| ${id} \| ${(s.passRate * 100).toFixed(0)}% \| ${s.meanScore.toFixed(2)} \| $${s.totalCostUsd.toFixed(2)} \|`,\n ),\n ``,\n `## By persona`,\n ``,\n '\| persona \| pass \| mean \| cost \|',\n '\|---\|---\|---\|---\|',\n ...Object.entries(matrix.byAxis.persona ?? {}).map(\n ([id, s]) =>\n `\| ${id} \| ${(s.passRate * 100).toFixed(0)}% \| ${s.meanScore.toFixed(2)} \| $${s.totalCostUsd.toFixed(2)} \|`,\n ),\n ``,\n ]\n writeFileSync(join(opts.runDir, 'summary.md'), md.join('\\n'))\n\n return { matrix }\n}\n","// Public types for the multishot substrate.\n\nexport interface MultishotMessage {\n role: 'user' \| 'assistant' \| 'tool'\n content: string\n toolCallId?: string\n toolCalls?: Array<{ id: string; name: string; args: Record<string, unknown> }>\n}\n\nexport interface MultishotArtifact {\n type: string\n turn: number\n invocation: { name: string; args: Record<string, unknown> }\n content: string\n}\n\nexport interface MultishotResult {\n transcript: MultishotMessage[]\n artifacts: MultishotArtifact[]\n toolCalls: number\n durationMs: number\n costUsd: number\n}\n\nexport interface MultishotToolDefinition {\n type: 'function'\n function: {\n name: string\n description: string\n parameters: Record<string, unknown>\n }\n}\n\nexport type MultishotToolExecutor = (\n args: Record<string, unknown>,\n ctx: { apiKey: string; baseUrl: string; signal?: AbortSignal },\n) => Promise<{ content: string; costUsd: number }>\n\nexport interface MultishotPersona {\n /** Stable identifier — used for per-cell artifact paths + matrix axis keys. /\n id: string\n /* Per-domain payload (income/profile/voice/etc.) shaped by the consumer. /\n [k: string]: unknown\n}\n\nexport interface MultishotShape<TPersona extends MultishotPersona> {\n /* Opening user message (turn 0) — the persona's first ask. /\n buildOpener: (persona: TPersona) => string\n /* System prompt the driver LLM uses to roleplay the persona. Should set\n * voice, goals, constraints, time-pressure, and the \"never go silent\" rule. /\n buildDriverSystemPrompt: (persona: TPersona) => string\n}\n\nexport class MultishotDriverEmptyError extends Error {\n constructor(public readonly turn: number) {\n super(`multishot: driver returned empty content twice at turn ${turn} — failing loud`)\n this.name = 'MultishotDriverEmptyError'\n }\n}\n","// Multi-turn driver-agent simulation with inline tool execution.\n//\n// The driver = LLM acting as the persona (reactive, non-deterministic).\n// The agent = the product agent under test (router call with profile's\n// systemPrompt + the configured tools).\n// Tool calls execute inline via the configured executors and feed back\n// into the agent's message log so the agent integrates the result.\n\nimport type { AgentProfile } from '@tangle-network/sandbox'\nimport { defaultDelegationTools } from './default-tools'\nimport {\n defaultRouterBaseUrl,\n estimateRouterCost,\n requireRouterApiKey,\n routerCompletion,\n} from './router'\nimport {\n type MultishotArtifact,\n MultishotDriverEmptyError,\n type MultishotMessage,\n type MultishotPersona,\n type MultishotResult,\n type MultishotShape,\n type MultishotToolDefinition,\n type MultishotToolExecutor,\n} from './types'\n\nexport interface RunMultishotOptions<TPersona extends MultishotPersona> {\n profile: AgentProfile\n persona: TPersona\n shape: MultishotShape<TPersona>\n /* Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. /\n tools?: MultishotToolDefinition[]\n /* Map from tool name → executor invoked inline when the agent emits a tool_call. /\n toolExecutors?: Record<string, MultishotToolExecutor>\n /* Map from tool name → artifact type label written into MultishotArtifact.type.\n * Tools without a mapping still execute, but their results aren't surfaced as\n * typed artifacts (only as tool messages in the transcript). */\n artifactTypeFor?: (toolName: string) => string \| undefined\n maxTurns?: number\n agentModel?: string\n driverModel?: string\n apiKey?: string\n baseUrl?: string\n signal?: AbortSignal\n}\n\nexport async function runMultishot<TPersona extends MultishotPersona>(\n opts: RunMultishotOptions<TPersona>,\n): Promise<MultishotResult> {\n const apiKey = opts.apiKey ?? requireRouterApiKey()\n const baseUrl = opts.baseUrl ?? defaultRouterBaseUrl()\n const maxTurns = opts.maxTurns ?? 10\n const agentModel = opts.agentModel ?? 'openai/gpt-5.4'\n const driverModel = opts.driverModel ?? 'openai/gpt-4o-mini'\n\n const bundle =\n opts.tools && opts.toolExecutors\n ? {\n tools: opts.tools,\n executors: opts.toolExecutors,\n artifactTypeFor: opts.artifactTypeFor ?? (() => undefined),\n }\n : defaultDelegationTools()\n const tools = opts.tools ?? bundle.tools\n const executors = opts.toolExecutors ?? bundle.executors\n const artifactTypeFor = opts.artifactTypeFor ?? bundle.artifactTypeFor\n\n const start = Date.now()\n const transcript: MultishotMessage[] = []\n const artifacts: MultishotArtifact[] = []\n let toolCalls = 0\n let totalCostUsd = 0\n\n const opener = opts.shape.buildOpener(opts.persona)\n transcript.push({ role: 'user', content: opener })\n\n const systemPrompt = opts.profile.prompt?.systemPrompt ?? ''\n const agentMessages: Array<Record<string, unknown>> = [\n { role: 'system', content: systemPrompt },\n { role: 'user', content: opener },\n ]\n\n for (let turn = 0; turn < maxTurns; turn++) {\n if (opts.signal?.aborted) throw new Error('multishot aborted')\n\n const { message: agentMsg, usage: agentUsage } = await routerCompletion({\n apiKey,\n baseUrl,\n model: agentModel,\n messages: agentMessages,\n tools,\n temperature: 0.7,\n maxTokens: 2500,\n signal: opts.signal,\n })\n totalCostUsd += estimateRouterCost(agentModel, agentUsage)\n\n const agentText = (agentMsg.content ?? '').trim()\n const agentToolCalls = (agentMsg.tool_calls ?? []).map((tc) => ({\n id: tc.id,\n name: tc.function.name,\n args: (() => {\n try {\n return JSON.parse(tc.function.arguments) as Record<string, unknown>\n } catch {\n return {} as Record<string, unknown>\n }\n })(),\n }))\n\n agentMessages.push({\n role: 'assistant',\n content: agentText \|\| null,\n ...(agentMsg.tool_calls?.length ? { tool_calls: agentMsg.tool_calls } : {}),\n })\n transcript.push({\n role: 'assistant',\n content: agentText,\n toolCalls: agentToolCalls.length > 0 ? agentToolCalls : undefined,\n })\n\n for (const tc of agentToolCalls) {\n toolCalls++\n let toolResult = ''\n try {\n const executor = executors[tc.name]\n if (!executor) {\n toolResult = JSON.stringify({ error: `unknown tool ${tc.name}` })\n } else {\n const r = await executor(tc.args, { apiKey, baseUrl, signal: opts.signal })\n toolResult = r.content\n totalCostUsd += r.costUsd\n const artifactType = artifactTypeFor(tc.name)\n if (artifactType) {\n artifacts.push({\n type: artifactType,\n turn,\n invocation: { name: tc.name, args: tc.args },\n content: toolResult,\n })\n }\n }\n } catch (err) {\n toolResult = JSON.stringify({ error: err instanceof Error ? err.message : String(err) })\n }\n agentMessages.push({ role: 'tool', tool_call_id: tc.id, content: toolResult \|\| 'done' })\n transcript.push({ role: 'tool', content: toolResult \|\| 'done', toolCallId: tc.id })\n }\n\n // If the agent emitted tool_calls, give it a follow-up turn to integrate the results.\n if (agentToolCalls.length > 0) {\n const followUp = await routerCompletion({\n apiKey,\n baseUrl,\n model: agentModel,\n messages: agentMessages,\n temperature: 0.7,\n maxTokens: 2000,\n signal: opts.signal,\n })\n totalCostUsd += estimateRouterCost(agentModel, followUp.usage)\n const followUpText = (followUp.message.content ?? '').trim()\n agentMessages.push({ role: 'assistant', content: followUpText })\n transcript.push({ role: 'assistant', content: followUpText })\n }\n\n if (turn < maxTurns - 1) {\n const driver = await driverTurn({\n apiKey,\n baseUrl,\n persona: opts.persona,\n shape: opts.shape,\n transcript,\n turn,\n model: driverModel,\n signal: opts.signal,\n })\n totalCostUsd += driver.costUsd\n agentMessages.push({ role: 'user', content: driver.content })\n transcript.push({ role: 'user', content: driver.content })\n }\n }\n\n return { transcript, artifacts, toolCalls, durationMs: Date.now() - start, costUsd: totalCostUsd }\n}\n\nasync function driverTurn<TPersona extends MultishotPersona>(opts: {\n apiKey: string\n baseUrl: string\n persona: TPersona\n shape: MultishotShape<TPersona>\n transcript: MultishotMessage[]\n turn: number\n model: string\n signal?: AbortSignal\n}): Promise<{ content: string; costUsd: number }> {\n const driverSystem = opts.shape.buildDriverSystemPrompt(opts.persona)\n\n // Translate transcript to driver POV: agent's `assistant` messages become\n // `user` (the agent talking TO the driver); the driver's prior `user`\n // messages become `assistant` (the driver's prior responses).\n const driverMessages: Array<Record<string, unknown>> = [{ role: 'system', content: driverSystem }]\n for (const msg of opts.transcript) {\n if (msg.role === 'tool') continue\n if (msg.role === 'assistant') driverMessages.push({ role: 'user', content: msg.content })\n else if (msg.role === 'user') driverMessages.push({ role: 'assistant', content: msg.content })\n }\n\n // Driver must never go silent. Retry once on empty content; then fail loud.\n for (let attempt = 0; attempt < 2; attempt++) {\n const { message, usage } = await routerCompletion({\n apiKey: opts.apiKey,\n baseUrl: opts.baseUrl,\n model: opts.model,\n messages: driverMessages,\n temperature: 0.9,\n maxTokens: 600,\n signal: opts.signal,\n })\n const content = (message.content ?? '').trim()\n if (content.length > 0) return { content, costUsd: estimateRouterCost(opts.model, usage) }\n }\n throw new MultishotDriverEmptyError(opts.turn)\n}\n"],"mappings":";;;;;;AA4BA,eAAsB,iBACpB,KACmC;AACnC,QAAM,OAAgC;AAAA,IACpC,OAAO,IAAI;AAAA,IACX,UAAU,IAAI;AAAA,IACd,aAAa,IAAI,eAAe;AAAA,IAChC,YAAY,IAAI,aAAa;AAAA,EAC/B;AACA,MAAI,IAAI,OAAO,OAAQ,MAAK,QAAQ,IAAI;AACxC,QAAM,MAAM,GAAG,IAAI,QAAQ,QAAQ,QAAQ,EAAE,CAAC;AAC9C,QAAM,MAAM,MAAM,MAAM,KAAK;AAAA,IAC3B,QAAQ;AAAA,IACR,SAAS,EAAE,eAAe,UAAU,IAAI,MAAM,IAAI,gBAAgB,mBAAmB;AAAA,IACrF,MAAM,KAAK,UAAU,IAAI;AAAA,IACzB,QAAQ,IAAI;AAAA,EACd,CAAC;AACD,MAAI,CAAC,IAAI,IAAI;AACX,UAAM,OAAO,MAAM,IAAI,KAAK;AAC5B,UAAM,IAAI,MAAM,UAAU,IAAI,MAAM,KAAK,KAAK,MAAM,GAAG,GAAG,CAAC,EAAE;AAAA,EAC/D;AACA,QAAM,OAAQ,MAAM,IAAI,KAAK;AAI7B,QAAM,SAAS,KAAK,QAAQ,CAAC;AAC7B,MAAI,CAAC,OAAQ,OAAM,IAAI,MAAM,+BAA+B,KAAK,UAAU,IAAI,EAAE,MAAM,GAAG,GAAG,CAAC,EAAE;AAChG,SAAO,EAAE,SAAS,OAAO,SAAS,OAAO,KAAK,MAAM;AACtD;AAIO,SAAS,mBACd,OACA,OACQ;AACR,MAAI,CAAC,MAAO,QAAO;AACnB,QAAM,WAAW,MAAM,iBAAiB;AACxC,QAAM,YAAY,MAAM,qBAAqB;AAC7C,MAAI,UAAU;AACd,MAAI,WAAW;AACf,MAAI,MAAM,SAAS,aAAa,GAAG;AACjC,cAAU;AACV,eAAW;AAAA,EACb,WAAW,MAAM,SAAS,SAAS,KAAK,MAAM,SAAS,eAAe,GAAG;AACvE,cAAU;AACV,eAAW;AAAA,EACb,WAAW,MAAM,SAAS,MAAM,KAAK,MAAM,SAAS,KAAK,KAAK,MAAM,SAAS,UAAU,GAAG;AACxF,cAAU;AACV,eAAW;AAAA,EACb;AACA,UAAQ,WAAW,UAAU,YAAY,YAAY;AACvD;AAEO,SAAS,uBAA+B;AAC7C,UAAQ,QAAQ,IAAI,0BAA0B,kCAAkC;AAAA,IAC9E;AAAA,IACA;AAAA,EACF;AACF;AAEO,SAAS,sBAA8B;AAC5C,QAAM,MAAM,QAAQ,IAAI;AACxB,MAAI,CAAC,IAAK,OAAM,IAAI,MAAM,gEAAgE;AAC1F,SAAO;AACT;;;ACnFO,IAAM,2BAA2B;AACjC,IAAM,sBAAsB;AAgBnC,IAAM,4BACJ;AAEF,IAAM,uBACJ;AAEK,IAAM,iCAA0D;AAAA,EACrE,MAAM;AAAA,EACN,UAAU;AAAA,IACR,MAAM;AAAA,IACN,aACE;AAAA,IACF,YAAY;AAAA,MACV,MAAM;AAAA,MACN,YAAY;AAAA,QACV,UAAU,EAAE,MAAM,UAAU,aAAa,gCAAgC;AAAA,QACzE,OAAO;AAAA,UACL,MAAM;AAAA,UACN,aAAa;AAAA,QACf;AAAA,MACF;AAAA,MACA,UAAU,CAAC,UAAU;AAAA,IACvB;AAAA,EACF;AACF;AAEO,IAAM,6BAAsD;AAAA,EACjE,MAAM;AAAA,EACN,UAAU;AAAA,IACR,MAAM;AAAA,IACN,aACE;AAAA,IACF,YAAY;AAAA,MACV,MAAM;AAAA,MACN,YAAY;AAAA,QACV,MAAM,EAAE,MAAM,UAAU,aAAa,gCAAgC;AAAA,QACrE,UAAU;AAAA,UACR,MAAM;AAAA,UACN,aAAa;AAAA,QACf;AAAA,MACF;AAAA,MACA,UAAU,CAAC,MAAM;AAAA,IACnB;AAAA,EACF;AACF;AAEO,SAAS,uBACd,SAAkC,CAAC,GACZ;AACvB,QAAM,eAAe,OAAO,gBAAgB;AAC5C,QAAM,QAAQ,OAAO,SAAS;AAC9B,SAAO,OAAO,MAAM,QAAQ;AAC1B,UAAM,WAAW,OAAO,KAAK,YAAY,EAAE;AAC3C,UAAM,QAAQ,KAAK,QAAQ,OAAO,KAAK,KAAK,IAAI;AAChD,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI;AAAA,MACb;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,aAAa;AAAA,QACxC,EAAE,MAAM,QAAQ,SAAS,aAAa,QAAQ,GAAG,QAAQ;AAAA,SAAY,KAAK,KAAK,EAAE,GAAG;AAAA,MACtF;AAAA,MACA,QAAQ,IAAI;AAAA,IACd,CAAC;AACD,WAAO,EAAE,SAAS,QAAQ,WAAW,IAAI,SAAS,mBAAmB,OAAO,KAAK,EAAE;AAAA,EACrF;AACF;AAEO,SAAS,mBAAmB,SAA6B,CAAC,GAA0B;AACzF,QAAM,eAAe,OAAO,gBAAgB;AAC5C,QAAM,QAAQ,OAAO,SAAS;AAC9B,SAAO,OAAO,MAAM,QAAQ;AAC1B,UAAM,OAAO,OAAO,KAAK,QAAQ,EAAE;AACnC,UAAM,WAAW,KAAK,WAAW,OAAO,KAAK,QAAQ,IAAI;AACzD,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI;AAAA,MACb;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,GAAG,YAAY;AAAA;AAAA,YAAiB,QAAQ,GAAG;AAAA,QACtE,EAAE,MAAM,QAAQ,SAAS,YAAY,IAAI,GAAG;AAAA,MAC9C;AAAA,MACA,QAAQ,IAAI;AAAA,IACd,CAAC;AACD,WAAO,EAAE,SAAS,QAAQ,WAAW,IAAI,SAAS,mBAAmB,OAAO,KAAK,EAAE;AAAA,EACrF;AACF;AAgBO,SAAS,uBAAuB,SAA6B,CAAC,GAAuB;AAC1F,SAAO;AAAA,IACL,OAAO,CAAC,gCAAgC,0BAA0B;AAAA,IAClE,WAAW;AAAA,MACT,mBAAmB,uBAAuB,OAAO,QAAQ;AAAA,MACzD,eAAe,mBAAmB,OAAO,IAAI;AAAA,IAC/C;AAAA,IACA,iBAAiB,CAAC,SAChB,SAAS,sBAAsB,aAAa,SAAS,kBAAkB,SAAS;AAAA,EACpF;AACF;;;ACpIO,IAAM,sBAAsB;AAmCnC,IAAM,aAAyB,EAAE,YAAY,CAAC,GAAG,WAAW,GAAG,OAAO,eAAe;AAErF,eAAsB,SACpB,OACA,OACqB;AACrB,QAAM,SAAS,MAAM,UAAU,oBAAoB;AACnD,QAAM,UAAU,MAAM,WAAW,qBAAqB;AACtD,QAAM,QAAQ,MAAM,SAAS,QAAQ,IAAI,eAAe;AACxD,QAAM,SAAS,MAAM,YAAY,KAAK;AACtC,MAAI,MAAM;AACV,MAAI;AACF,UAAM,EAAE,QAAQ,IAAI,MAAM,iBAAiB;AAAA,MACzC;AAAA,MACA;AAAA,MACA;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,MAAM,aAAa;AAAA,QAC9C,EAAE,MAAM,QAAQ,SAAS,OAAO;AAAA,MAClC;AAAA,IACF,CAAC;AACD,WAAO,QAAQ,WAAW,IAAI,KAAK;AAAA,EACrC,SAAS,KAAK;AACZ,WAAO;AAAA,MACL,GAAG;AAAA,MACH,OAAO,SAAS,MAAM,IAAI,iBAAiB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,IAC7F;AAAA,EACF;AAEA,MAAI,SAAyC;AAC7C,MAAI;AACF,UAAM,UAAU,IACb,QAAQ,gBAAgB,EAAE,EAC1B,QAAQ,WAAW,EAAE,EACrB,KAAK;AACR,aAAS,KAAK,MAAM,OAAO;AAAA,EAC7B,QAAQ;AACN,WAAO,EAAE,GAAG,YAAY,OAAO,SAAS,MAAM,IAAI,uBAAuB,IAAI,MAAM,GAAG,GAAG,CAAC,GAAG;AAAA,EAC/F;AAEA,QAAM,aAAqC,CAAC;AAC5C,MAAI,MAAM;AACV,aAAW,OAAO,MAAM,YAAY;AAClC,UAAM,IAAI,OAAO,OAAO,IAAI,GAAG,KAAK,CAAC;AACrC,UAAM,UAAU,OAAO,SAAS,CAAC,IAAI,KAAK,IAAI,GAAG,KAAK,IAAI,IAAI,CAAC,CAAC,IAAI;AACpE,eAAW,IAAI,GAAG,IAAI;AACtB,WAAO;AAAA,EACT;AACA,SAAO;AAAA,IACL;AAAA,IACA,WAAW,MAAM,WAAW,WAAW,IAAI,IAAI,MAAM,MAAM,WAAW;AAAA,IACtE,OAAO,OAAO,OAAO,UAAU,WAAW,OAAO,QAAQ;AAAA,EAC3D;AACF;AAIO,SAAS,iBAAiB,MAAyC;AACxE,SAAO,KAAK,IAAI,CAAC,MAAM,KAAK,EAAE,GAAG,KAAK,EAAE,WAAW,EAAE,EAAE,KAAK,IAAI;AAClE;AAGO,SAAS,iBAAiB,MAAyC;AACxE,QAAM,SAAS,KAAK,IAAI,CAAC,MAAM,IAAI,EAAE,GAAG,KAAK,EAAE,KAAK,GAAG;AACvD,SAAO;AAAA,GAA6D,MAAM;AAC5E;;;ACzGA,SAAS,WAAW,qBAAqB;AACzC,SAAS,YAAY;;;AC4Cd,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YAA4B,MAAc;AACxC,UAAM,0DAA0D,IAAI,sBAAiB;AAD3D;AAE1B,SAAK,OAAO;AAAA,EACd;AAAA,EAH4B;AAI9B;;;ACXA,eAAsB,aACpB,MAC0B;AAC1B,QAAM,SAAS,KAAK,UAAU,oBAAoB;AAClD,QAAM,UAAU,KAAK,WAAW,qBAAqB;AACrD,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,aAAa,KAAK,cAAc;AACtC,QAAM,cAAc,KAAK,eAAe;AAExC,QAAM,SACJ,KAAK,SAAS,KAAK,gBACf;AAAA,IACE,OAAO,KAAK;AAAA,IACZ,WAAW,KAAK;AAAA,IAChB,iBAAiB,KAAK,oBAAoB,MAAM;AAAA,EAClD,IACA,uBAAuB;AAC7B,QAAM,QAAQ,KAAK,SAAS,OAAO;AACnC,QAAM,YAAY,KAAK,iBAAiB,OAAO;AAC/C,QAAM,kBAAkB,KAAK,mBAAmB,OAAO;AAEvD,QAAM,QAAQ,KAAK,IAAI;AACvB,QAAM,aAAiC,CAAC;AACxC,QAAM,YAAiC,CAAC;AACxC,MAAI,YAAY;AAChB,MAAI,eAAe;AAEnB,QAAM,SAAS,KAAK,MAAM,YAAY,KAAK,OAAO;AAClD,aAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,CAAC;AAEjD,QAAM,eAAe,KAAK,QAAQ,QAAQ,gBAAgB;AAC1D,QAAM,gBAAgD;AAAA,IACpD,EAAE,MAAM,UAAU,SAAS,aAAa;AAAA,IACxC,EAAE,MAAM,QAAQ,SAAS,OAAO;AAAA,EAClC;AAEA,WAAS,OAAO,GAAG,OAAO,UAAU,QAAQ;AAC1C,QAAI,KAAK,QAAQ,QAAS,OAAM,IAAI,MAAM,mBAAmB;AAE7D,UAAM,EAAE,SAAS,UAAU,OAAO,WAAW,IAAI,MAAM,iBAAiB;AAAA,MACtE;AAAA,MACA;AAAA,MACA,OAAO;AAAA,MACP,UAAU;AAAA,MACV;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,QAAQ,KAAK;AAAA,IACf,CAAC;AACD,oBAAgB,mBAAmB,YAAY,UAAU;AAEzD,UAAM,aAAa,SAAS,WAAW,IAAI,KAAK;AAChD,UAAM,kBAAkB,SAAS,cAAc,CAAC,GAAG,IAAI,CAAC,QAAQ;AAAA,MAC9D,IAAI,GAAG;AAAA,MACP,MAAM,GAAG,SAAS;AAAA,MAClB,OAAO,MAAM;AACX,YAAI;AACF,iBAAO,KAAK,MAAM,GAAG,SAAS,SAAS;AAAA,QACzC,QAAQ;AACN,iBAAO,CAAC;AAAA,QACV;AAAA,MACF,GAAG;AAAA,IACL,EAAE;AAEF,kBAAc,KAAK;AAAA,MACjB,MAAM;AAAA,MACN,SAAS,aAAa;AAAA,MACtB,GAAI,SAAS,YAAY,SAAS,EAAE,YAAY,SAAS,WAAW,IAAI,CAAC;AAAA,IAC3E,CAAC;AACD,eAAW,KAAK;AAAA,MACd,MAAM;AAAA,MACN,SAAS;AAAA,MACT,WAAW,eAAe,SAAS,IAAI,iBAAiB;AAAA,IAC1D,CAAC;AAED,eAAW,MAAM,gBAAgB;AAC/B;AACA,UAAI,aAAa;AACjB,UAAI;AACF,cAAM,WAAW,UAAU,GAAG,IAAI;AAClC,YAAI,CAAC,UAAU;AACb,uBAAa,KAAK,UAAU,EAAE,OAAO,gBAAgB,GAAG,IAAI,GAAG,CAAC;AAAA,QAClE,OAAO;AACL,gBAAM,IAAI,MAAM,SAAS,GAAG,MAAM,EAAE,QAAQ,SAAS,QAAQ,KAAK,OAAO,CAAC;AAC1E,uBAAa,EAAE;AACf,0BAAgB,EAAE;AAClB,gBAAM,eAAe,gBAAgB,GAAG,IAAI;AAC5C,cAAI,cAAc;AAChB,sBAAU,KAAK;AAAA,cACb,MAAM;AAAA,cACN;AAAA,cACA,YAAY,EAAE,MAAM,GAAG,MAAM,MAAM,GAAG,KAAK;AAAA,cAC3C,SAAS;AAAA,YACX,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF,SAAS,KAAK;AACZ,qBAAa,KAAK,UAAU,EAAE,OAAO,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,EAAE,CAAC;AAAA,MACzF;AACA,oBAAc,KAAK,EAAE,MAAM,QAAQ,cAAc,GAAG,IAAI,SAAS,cAAc,OAAO,CAAC;AACvF,iBAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,cAAc,QAAQ,YAAY,GAAG,GAAG,CAAC;AAAA,IACpF;AAGA,QAAI,eAAe,SAAS,GAAG;AAC7B,YAAM,WAAW,MAAM,iBAAiB;AAAA,QACtC;AAAA,QACA;AAAA,QACA,OAAO;AAAA,QACP,UAAU;AAAA,QACV,aAAa;AAAA,QACb,WAAW;AAAA,QACX,QAAQ,KAAK;AAAA,MACf,CAAC;AACD,sBAAgB,mBAAmB,YAAY,SAAS,KAAK;AAC7D,YAAM,gBAAgB,SAAS,QAAQ,WAAW,IAAI,KAAK;AAC3D,oBAAc,KAAK,EAAE,MAAM,aAAa,SAAS,aAAa,CAAC;AAC/D,iBAAW,KAAK,EAAE,MAAM,aAAa,SAAS,aAAa,CAAC;AAAA,IAC9D;AAEA,QAAI,OAAO,WAAW,GAAG;AACvB,YAAM,SAAS,MAAM,WAAW;AAAA,QAC9B;AAAA,QACA;AAAA,QACA,SAAS,KAAK;AAAA,QACd,OAAO,KAAK;AAAA,QACZ;AAAA,QACA;AAAA,QACA,OAAO;AAAA,QACP,QAAQ,KAAK;AAAA,MACf,CAAC;AACD,sBAAgB,OAAO;AACvB,oBAAc,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,QAAQ,CAAC;AAC5D,iBAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,QAAQ,CAAC;AAAA,IAC3D;AAAA,EACF;AAEA,SAAO,EAAE,YAAY,WAAW,WAAW,YAAY,KAAK,IAAI,IAAI,OAAO,SAAS,aAAa;AACnG;AAEA,eAAe,WAA8C,MASX;AAChD,QAAM,eAAe,KAAK,MAAM,wBAAwB,KAAK,OAAO;AAKpE,QAAM,iBAAiD,CAAC,EAAE,MAAM,UAAU,SAAS,aAAa,CAAC;AACjG,aAAW,OAAO,KAAK,YAAY;AACjC,QAAI,IAAI,SAAS,OAAQ;AACzB,QAAI,IAAI,SAAS,YAAa,gBAAe,KAAK,EAAE,MAAM,QAAQ,SAAS,IAAI,QAAQ,CAAC;AAAA,aAC/E,IAAI,SAAS,OAAQ,gBAAe,KAAK,EAAE,MAAM,aAAa,SAAS,IAAI,QAAQ,CAAC;AAAA,EAC/F;AAGA,WAAS,UAAU,GAAG,UAAU,GAAG,WAAW;AAC5C,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,KAAK;AAAA,MACb,SAAS,KAAK;AAAA,MACd,OAAO,KAAK;AAAA,MACZ,UAAU;AAAA,MACV,aAAa;AAAA,MACb,WAAW;AAAA,MACX,QAAQ,KAAK;AAAA,IACf,CAAC;AACD,UAAM,WAAW,QAAQ,WAAW,IAAI,KAAK;AAC7C,QAAI,QAAQ,SAAS,EAAG,QAAO,EAAE,SAAS,SAAS,mBAAmB,KAAK,OAAO,KAAK,EAAE;AAAA,EAC3F;AACA,QAAM,IAAI,0BAA0B,KAAK,IAAI;AAC/C;;;AFxHA,eAAsB,mBACpB,MACmC;AACnC,QAAM,YAAY,IAAI,IAAI,KAAK,OAAO,qBAAqB,CAAC,MAAM,CAAC;AACnE,QAAM,eAAe,IAAI,IAAI,KAAK,OAAO,wBAAwB,CAAC,UAAU,CAAC;AAC7E,YAAU,KAAK,QAAQ,EAAE,WAAW,KAAK,CAAC;AAE1C,QAAM,SAAS,MAAM,eAA2B;AAAA,IAC9C,MAAM;AAAA,MACJ,EAAE,MAAM,WAAW,QAAQ,KAAK,SAAS;AAAA,MACzC,EAAE,MAAM,WAAW,QAAQ,KAAK,SAAS,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,OAAO,EAAE,EAAE,EAAE;AAAA,IAChF;AAAA,IACA,MAAM,KAAK,QAAQ;AAAA,IACnB,gBAAgB,KAAK,kBAAkB;AAAA,IACvC,aAAa,KAAK;AAAA,IAClB,MAAM,QAAQ,MAAM;AAClB,YAAM,UAAU,KAAK,KAAK,SAAS;AACnC,YAAM,UAAU,KAAK,KAAK,SAAS;AACnC,YAAM,YAAY,OAAO,KAAK,KAAK,SAAS,MAAM,SAAS;AAC3D,YAAM,YAAY,OAAO,KAAK,KAAK,SAAS,MAAM,SAAS;AAE3D,YAAM,MAAM,MAAM,aAAa;AAAA,QAC7B;AAAA,QACA;AAAA,QACA,OAAO,KAAK;AAAA,QACZ,OAAO,KAAK;AAAA,QACZ,eAAe,KAAK;AAAA,QACpB,iBAAiB,KAAK;AAAA,QACtB,UAAU,KAAK;AAAA,QACf,YAAY,KAAK;AAAA,QACjB,aAAa,KAAK;AAAA,QAClB,QAAQ,KAAK;AAAA,QACb,SAAS,KAAK;AAAA,MAChB,CAAC;AAED,YAAM,gBAAgB,IAAI,UAAU,OAAO,CAAC,MAAM,UAAU,IAAI,EAAE,IAAI,CAAC;AACvE,YAAM,mBAAmB,IAAI,UAAU,OAAO,CAAC,MAAM,aAAa,IAAI,EAAE,IAAI,CAAC;AAE7E,YAAM,CAAC,cAAc,aAAa,cAAc,IAAI,MAAM,QAAQ,IAAI;AAAA,QACpE,SAAS,KAAK,OAAO,cAAc,EAAE,YAAY,IAAI,YAAY,QAAQ,CAAC;AAAA,QAC1E,KAAK,OAAO,aACR,QAAQ;AAAA,UACN,cAAc;AAAA,YAAI,CAAC,aACjB,SAAS,KAAK,OAAO,YAAa,EAAE,UAAU,QAAQ,CAAC,EAAE,KAAK,CAAC,OAAO;AAAA,cACpE,GAAG;AAAA,cACH,MAAM,SAAS;AAAA,cACf,MAAM,SAAS;AAAA,YACjB,EAAE;AAAA,UACJ;AAAA,QACF,IACA,QAAQ,QAAQ,CAAC,CAAuD;AAAA,QAC5E,KAAK,OAAO,iBACR,QAAQ;AAAA,UACN,iBAAiB;AAAA,YAAI,CAAC,aACpB,SAAS,KAAK,OAAO,gBAAiB,EAAE,UAAU,QAAQ,CAAC,EAAE,KAAK,CAAC,OAAO;AAAA,cACxE,GAAG;AAAA,cACH,MAAM,SAAS;AAAA,cACf,MAAM,SAAS;AAAA,YACjB,EAAE;AAAA,UACJ;AAAA,QACF,IACA,QAAQ,QAAQ,CAAC,CAAuD;AAAA,MAC9E,CAAC;AAED,YAAM,gBACJ,YAAY,WAAW,IACnB,IACA,YAAY,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,YAAY;AACrE,YAAM,mBACJ,eAAe,WAAW,IACtB,IACA,eAAe,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,eAAe;AAG3E,YAAM,aAAa,KAAK,KAAK,OAAO,aAAa,IAAI,MAAM,KAAK,OAAO,iBAAiB,IAAI;AAC5F,YAAM,aAAa,aAAa,YAAY,gBAAgB,oBAAoB;AAEhF,YAAM,YAAgC,EAAE,WAAW,aAAa;AAChE,UAAI,KAAK,OAAO;AACd,kBAAU,aAAa,EAAE,aAAa,aAAa,WAAW,cAAc;AAC9E,UAAI,KAAK,OAAO;AACd,kBAAU,iBAAiB,EAAE,aAAa,gBAAgB,WAAW,iBAAiB;AAExF,YAAM,UAAU,KAAK,KAAK,QAAQ,WAAW,WAAW,OAAO,KAAK,GAAG,EAAE;AACzE,gBAAU,SAAS,EAAE,WAAW,KAAK,CAAC;AACtC,oBAAc,KAAK,SAAS,iBAAiB,GAAG,KAAK,UAAU,IAAI,YAAY,MAAM,CAAC,CAAC;AACvF,oBAAc,KAAK,SAAS,gBAAgB,GAAG,KAAK,UAAU,IAAI,WAAW,MAAM,CAAC,CAAC;AACrF,oBAAc,KAAK,SAAS,aAAa,GAAG,KAAK,UAAU,WAAW,MAAM,CAAC,CAAC;AAE9E,YAAM,QAAQ,CAAC,SAAS,aAAa,UAAU,QAAQ,CAAC,CAAC,EAAE;AAC3D,UAAI,KAAK,OAAO,WAAY,OAAM,KAAK,QAAQ,cAAc,QAAQ,CAAC,CAAC,EAAE;AACzE,UAAI,KAAK,OAAO,eAAgB,OAAM,KAAK,WAAW,iBAAiB,QAAQ,CAAC,CAAC,EAAE;AAEnF,aAAO;AAAA,QACL,QAAQ;AAAA,UACN,OAAO,IAAI,WAAW;AAAA,UACtB,WAAW,IAAI;AAAA,UACf,eAAe,IAAI,UAAU;AAAA,QAC/B;AAAA,QACA,SAAS,EAAE,OAAO,aAAa,GAAG,OAAO,WAAW,OAAO,MAAM,KAAK,GAAG,EAAE;AAAA,QAC3E,SAAS,IAAI;AAAA,QACb,YAAY,IAAI;AAAA,MAClB;AAAA,IACF;AAAA,EACF,CAAC;AAGD,QAAM,UAAU;AAAA,IACd,OAAO,OAAO,QAAQ;AAAA,IACtB,UAAU,OAAO,QAAQ;AAAA,IACzB,WAAW,OAAO,QAAQ;AAAA,IAC1B,cAAc,OAAO,QAAQ;AAAA,IAC7B,YAAY,OAAO,QAAQ;AAAA,IAC3B,cAAc,OAAO,QAAQ;AAAA,IAC7B,cAAc,OAAO,QAAQ;AAAA,IAC7B,WAAW,OAAO,OAAO;AAAA,IACzB,WAAW,OAAO,OAAO;AAAA,EAC3B;AACA,gBAAc,KAAK,KAAK,QAAQ,cAAc,GAAG,KAAK,UAAU,SAAS,MAAM,CAAC,CAAC;AAEjF,QAAM,KAAe;AAAA,IACnB;AAAA,IACA;AAAA,IACA,cAAc,OAAO,QAAQ,UAAU,sBAAsB,OAAO,QAAQ,kBAAkB,KAAK,QAAQ,CAAC,CAAC,iBAAiB,OAAO,QAAQ,iBAAiB,QAAQ,CAAC,CAAC,iBAAiB,OAAO,QAAQ,aAAa,QAAQ,CAAC,CAAC,qBAAqB,OAAO,QAAQ,aAAa,KAAM,QAAQ,CAAC,CAAC;AAAA,IAChS;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,OAAO,QAAQ,OAAO,OAAO,WAAW,CAAC,CAAC,EAAE;AAAA,MAC7C,CAAC,CAAC,IAAI,CAAC,MACL,KAAK,EAAE,OAAO,EAAE,WAAW,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,UAAU,QAAQ,CAAC,CAAC,OAAO,EAAE,aAAa,QAAQ,CAAC,CAAC;AAAA,IAC3G;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,OAAO,QAAQ,OAAO,OAAO,WAAW,CAAC,CAAC,EAAE;AAAA,MAC7C,CAAC,CAAC,IAAI,CAAC,MACL,KAAK,EAAE,OAAO,EAAE,WAAW,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,UAAU,QAAQ,CAAC,CAAC,OAAO,EAAE,aAAa,QAAQ,CAAC,CAAC;AAAA,IAC3G;AAAA,IACA;AAAA,EACF;AACA,gBAAc,KAAK,KAAK,QAAQ,YAAY,GAAG,GAAG,KAAK,IAAI,CAAC;AAE5D,SAAO,EAAE,OAAO;AAClB;","names":[]}
1	+ {"version":3,"sources":["../../src/multishot/router.ts","../../src/multishot/default-tools.ts","../../src/multishot/judges.ts","../../src/multishot/matrix.ts","../../src/multishot/types.ts","../../src/multishot/multishot.ts"],"sourcesContent":["// Router fetch helper — single source of truth for OpenAI-compat calls\n// against the Tangle router. Used by the driver, agent, judges, and the\n// default tool executors.\n\nimport type { MultishotToolDefinition } from './types'\n\nexport interface RouterCompletionRequest {\n apiKey: string\n baseUrl: string\n model: string\n messages: Array<Record<string, unknown>>\n tools?: MultishotToolDefinition[]\n temperature?: number\n maxTokens?: number\n signal?: AbortSignal\n}\n\nexport interface RouterToolCall {\n id: string\n type: 'function'\n function: { name: string; arguments: string }\n}\n\nexport interface RouterCompletionResponse {\n message: { content?: string \| null; tool_calls?: RouterToolCall[] }\n usage?: { prompt_tokens?: number; completion_tokens?: number }\n}\n\nexport async function routerCompletion(\n req: RouterCompletionRequest,\n): Promise<RouterCompletionResponse> {\n const body: Record<string, unknown> = {\n model: req.model,\n messages: req.messages,\n temperature: req.temperature ?? 0.7,\n max_tokens: req.maxTokens ?? 2000,\n }\n if (req.tools?.length) body.tools = req.tools\n const url = `${req.baseUrl.replace(/\\/+$/, '')}/chat/completions`\n const res = await fetch(url, {\n method: 'POST',\n headers: { Authorization: `Bearer ${req.apiKey}`, 'Content-Type': 'application/json' },\n body: JSON.stringify(body),\n signal: req.signal,\n })\n if (!res.ok) {\n const text = await res.text()\n throw new Error(`router ${res.status}: ${text.slice(0, 300)}`)\n }\n const json = (await res.json()) as {\n choices: Array<{ message: { content?: string \| null; tool_calls?: RouterToolCall[] } }>\n usage?: { prompt_tokens?: number; completion_tokens?: number }\n }\n const choice = json.choices[0]\n if (!choice) throw new Error(`router returned no choices: ${JSON.stringify(json).slice(0, 200)}`)\n return { message: choice.message, usage: json.usage }\n}\n\n// Rough per-model cost estimator. Used for cost-ceiling enforcement.\n// Underestimates Anthropic, overestimates oss models — fine for ceilings.\nexport function estimateRouterCost(\n model: string,\n usage?: { prompt_tokens?: number; completion_tokens?: number },\n): number {\n if (!usage) return 0\n const inputTok = usage.prompt_tokens ?? 0\n const outputTok = usage.completion_tokens ?? 0\n let inPer1k = 0.003\n let outPer1k = 0.015\n if (model.includes('gpt-4o-mini')) {\n inPer1k = 0.00015\n outPer1k = 0.0006\n } else if (model.includes('gpt-5.4') \|\| model.includes('claude-sonnet')) {\n inPer1k = 0.003\n outPer1k = 0.015\n } else if (model.includes('kimi') \|\| model.includes('glm') \|\| model.includes('deepseek')) {\n inPer1k = 0.0005\n outPer1k = 0.002\n }\n return (inputTok * inPer1k + outputTok * outPer1k) / 1000\n}\n\nexport function defaultRouterBaseUrl(): string {\n return (process.env.TANGLE_ROUTER_BASE_URL ?? 'https://router.tangle.tools/v1').replace(\n /\\/+$/,\n '',\n )\n}\n\nexport function requireRouterApiKey(): string {\n const key = process.env.TANGLE_API_KEY\n if (!key) throw new Error('multishot requires TANGLE_API_KEY (router-scoped sk-tan-* key)')\n return key\n}\n","// Default delegate_research + delegate_code tools and their inline executors.\n//\n// Consumers can override either by passing their own tools + executors to\n// runMultishot. The defaults are sufficient for most domains — point the\n// researcher system prompt at your domain's citation style and the coder\n// at your preferred language.\n\nimport { estimateRouterCost, routerCompletion } from './router'\nimport type { MultishotToolDefinition, MultishotToolExecutor } from './types'\n\nexport const DEFAULT_RESEARCHER_MODEL = 'openai/gpt-4o-mini'\nexport const DEFAULT_CODER_MODEL = 'openai/gpt-4o-mini'\n\nexport interface DefaultResearcherConfig {\n /** Replace the system prompt to bias the researcher toward a domain's\n * citation style. Defaults to a generic \"cite sources by name\" prompt. /\n systemPrompt?: string\n model?: string\n}\n\nexport interface DefaultCoderConfig {\n /* Replace the system prompt to bias the coder toward a language /\n * framework / artifact style. /\n systemPrompt?: string\n model?: string\n}\n\nconst GENERIC_RESEARCHER_SYSTEM =\n 'You are a research specialist. Return a markdown brief with 3-5 findings. Each finding cites a specific source by name. Add a confidence level (high/medium/low) per finding. No fluff, no preamble.'\n\nconst GENERIC_CODER_SYSTEM =\n 'You are an expert engineer. Output ONE fenced code block containing the complete solution. Inline-comment non-obvious decisions. No explanation outside the block.'\n\nexport const DEFAULT_DELEGATE_RESEARCH_TOOL: MultishotToolDefinition = {\n type: 'function',\n function: {\n name: 'delegate_research',\n description:\n 'Research a topic deeply via specialist. Returns evidence-bearing items with citations. Use for audience research, competitive intel, regulatory landscape, market data, citation-grounded analysis.',\n parameters: {\n type: 'object',\n properties: {\n question: { type: 'string', description: 'Specific question to research' },\n scope: {\n type: 'string',\n description: 'Optional scope: time window, geography, jurisdiction, segment',\n },\n },\n required: ['question'],\n },\n },\n}\n\nexport const DEFAULT_DELEGATE_CODE_TOOL: MultishotToolDefinition = {\n type: 'function',\n function: {\n name: 'delegate_code',\n description:\n 'Generate a runnable script, template, pipeline, or tool via specialist. Returns complete working code or structured markdown. Use for content pipelines, calc snippets, dashboards, compliance checklists, deadline trackers.',\n parameters: {\n type: 'object',\n properties: {\n goal: { type: 'string', description: 'What the code must accomplish' },\n language: {\n type: 'string',\n description: 'Optional language preference (default: TypeScript)',\n },\n },\n required: ['goal'],\n },\n },\n}\n\nexport function createResearchExecutor(\n config: DefaultResearcherConfig = {},\n): MultishotToolExecutor {\n const systemPrompt = config.systemPrompt ?? GENERIC_RESEARCHER_SYSTEM\n const model = config.model ?? DEFAULT_RESEARCHER_MODEL\n return async (args, ctx) => {\n const question = String(args.question ?? '')\n const scope = args.scope ? String(args.scope) : undefined\n const { message, usage } = await routerCompletion({\n apiKey: ctx.apiKey,\n baseUrl: ctx.baseUrl,\n model,\n temperature: 0.3,\n maxTokens: 1800,\n messages: [\n { role: 'system', content: systemPrompt },\n { role: 'user', content: `Research: ${question}${scope ? `\\nScope: ${scope}` : ''}` },\n ],\n signal: ctx.signal,\n })\n return { content: message.content ?? '', costUsd: estimateRouterCost(model, usage) }\n }\n}\n\nexport function createCodeExecutor(config: DefaultCoderConfig = {}): MultishotToolExecutor {\n const systemPrompt = config.systemPrompt ?? GENERIC_CODER_SYSTEM\n const model = config.model ?? DEFAULT_CODER_MODEL\n return async (args, ctx) => {\n const goal = String(args.goal ?? '')\n const language = args.language ? String(args.language) : 'TypeScript'\n const { message, usage } = await routerCompletion({\n apiKey: ctx.apiKey,\n baseUrl: ctx.baseUrl,\n model,\n temperature: 0.2,\n maxTokens: 2000,\n messages: [\n { role: 'system', content: `${systemPrompt}\\n\\nLanguage: ${language}` },\n { role: 'user', content: `Produce: ${goal}` },\n ],\n signal: ctx.signal,\n })\n return { content: message.content ?? '', costUsd: estimateRouterCost(model, usage) }\n }\n}\n\nexport interface DefaultToolsConfig {\n research?: DefaultResearcherConfig\n code?: DefaultCoderConfig\n /* When true (default), each tool result is recorded as a typed artifact:\n * research → type='research', code → type='code'. /\n recordArtifacts?: boolean\n}\n\nexport interface DefaultToolsBundle {\n tools: MultishotToolDefinition[]\n executors: Record<string, MultishotToolExecutor>\n artifactTypeFor: (toolName: string) => string \| undefined\n}\n\nexport function defaultDelegationTools(config: DefaultToolsConfig = {}): DefaultToolsBundle {\n return {\n tools: [DEFAULT_DELEGATE_RESEARCH_TOOL, DEFAULT_DELEGATE_CODE_TOOL],\n executors: {\n delegate_research: createResearchExecutor(config.research),\n delegate_code: createCodeExecutor(config.code),\n },\n artifactTypeFor: (name) =>\n name === 'delegate_research' ? 'research' : name === 'delegate_code' ? 'code' : undefined,\n }\n}\n\nexport { defaultRouterBaseUrl } from './router'\n","// Generic judge runner — domain consumers configure dimensions + prompts.\n//\n// Three judge slots are conventional for multishot eval:\n// - conversation (scores the full transcript)\n// - codeReview (scores each code artifact)\n// - contentQuality (scores each non-code artifact)\n//\n// But the runJudge primitive is fully generic — any T → JudgeScore mapping.\n\nimport { defaultRouterBaseUrl, requireRouterApiKey, routerCompletion } from './router'\n\nexport const DEFAULT_JUDGE_MODEL = 'openai/gpt-4o-mini'\n\nexport interface JudgeDimension {\n /* JSON field name + score key. /\n key: string\n /* Description shown in the judge's user prompt. /\n description: string\n}\n\nexport interface JudgeConfig<TInput> {\n /* Display name (for trace + log). /\n name: string\n /* Model used for this judge. /\n model?: string\n /* 0-10 scored dimensions. /\n dimensions: JudgeDimension[]\n /* Judge system prompt — sets persona + JSON-only constraint. /\n systemPrompt: string\n /* Build the user prompt from the typed input. Must include \"Respond with\n * ONLY this JSON: { ... }\" listing each dimension key. /\n buildPrompt: (input: TInput) => string\n /* Optional model + api overrides. /\n apiKey?: string\n baseUrl?: string\n}\n\nexport interface JudgeScore {\n /* Per-dimension 0-10 score. Missing dims default to 0. /\n dimensions: Record<string, number>\n /* Mean across dimensions. /\n composite: number\n /* Free-form 1-2 sentence critique from the judge (when provided). /\n notes: string\n}\n\nconst ZERO_SCORE: JudgeScore = { dimensions: {}, composite: 0, notes: 'parse failed' }\n\nexport async function runJudge<TInput>(\n judge: JudgeConfig<TInput>,\n input: TInput,\n): Promise<JudgeScore> {\n const apiKey = judge.apiKey ?? requireRouterApiKey()\n const baseUrl = judge.baseUrl ?? defaultRouterBaseUrl()\n const model = judge.model ?? process.env.JUDGE_MODEL ?? DEFAULT_JUDGE_MODEL\n const prompt = judge.buildPrompt(input)\n let raw = ''\n try {\n const { message } = await routerCompletion({\n apiKey,\n baseUrl,\n model,\n temperature: 0,\n maxTokens: 1500,\n messages: [\n { role: 'system', content: judge.systemPrompt },\n { role: 'user', content: prompt },\n ],\n })\n raw = (message.content ?? '').trim()\n } catch (err) {\n return {\n ...ZERO_SCORE,\n notes: `judge ${judge.name} call failed: ${err instanceof Error ? err.message : String(err)}`,\n }\n }\n\n let parsed: Record<string, unknown> \| null = null\n try {\n const cleaned = raw\n .replace(/^```json\\s/i, '')\n .replace(/```\\s$/, '')\n .trim()\n parsed = JSON.parse(cleaned) as Record<string, unknown>\n } catch {\n return { ...ZERO_SCORE, notes: `judge ${judge.name} returned non-JSON: ${raw.slice(0, 200)}` }\n }\n\n const dimensions: Record<string, number> = {}\n let sum = 0\n for (const dim of judge.dimensions) {\n const v = Number(parsed[dim.key] ?? 0)\n const clamped = Number.isFinite(v) ? Math.max(0, Math.min(10, v)) : 0\n dimensions[dim.key] = clamped\n sum += clamped\n }\n return {\n dimensions,\n composite: judge.dimensions.length === 0 ? 0 : sum / judge.dimensions.length,\n notes: typeof parsed.notes === 'string' ? parsed.notes : '',\n }\n}\n\n/* Convenience: stringified dimension list for inclusion in a judge prompt.\n * Returns lines like `- audience_fit: Does this match what the audience cares about? (0-10)`. /\nexport function renderDimensions(dims: readonly JudgeDimension[]): string {\n return dims.map((d) => `- ${d.key}: ${d.description}`).join('\\n')\n}\n\n/* Convenience: build the \"Respond with ONLY this JSON\" footer for a judge prompt. /\nexport function renderJsonFooter(dims: readonly JudgeDimension[]): string {\n const fields = dims.map((d) => `\"${d.key}\":N`).join(',')\n return `Respond with ONLY this JSON (no markdown, no preamble):\\n{${fields},\"notes\":\"1-2 sentence critique\"}`\n}\n","// Multishot matrix wrapper — sweeps profiles × personas × reps, runs\n// the driver-agent loop per cell, applies up to three configured judges,\n// persists per-cell artifacts, and aggregates by axis.\n//\n// Uses runAgentMatrix from @tangle-network/agent-eval/matrix under the\n// hood so cell scheduling + concurrency + cost ceiling are unified with\n// other matrix consumers.\n\nimport { mkdirSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type { AgentProfile as SandboxAgentProfile } from '@tangle-network/sandbox'\nimport type { MatrixResult } from '../matrix'\nimport { runAgentMatrix } from '../matrix'\nimport { type JudgeConfig, type JudgeScore, runJudge } from './judges'\nimport { runMultishot } from './multishot'\nimport type {\n MultishotArtifact,\n MultishotMessage,\n MultishotPersona,\n MultishotShape,\n MultishotToolDefinition,\n MultishotToolExecutor,\n} from './types'\n\nexport interface ConversationJudgeInput<TPersona extends MultishotPersona> {\n transcript: MultishotMessage[]\n persona: TPersona\n}\n\nexport interface ArtifactJudgeInput<TPersona extends MultishotPersona> {\n artifact: MultishotArtifact\n persona: TPersona\n}\n\nexport interface MultishotJudges<TPersona extends MultishotPersona> {\n /* Scores the full transcript end-to-end (always runs). /\n conversation: JudgeConfig<ConversationJudgeInput<TPersona>>\n /* Scores each code-type artifact. Optional — omit when domain has no code artifacts. /\n codeReview?: JudgeConfig<ArtifactJudgeInput<TPersona>>\n /* Scores each non-code (research/content/template) artifact. Optional. /\n contentQuality?: JudgeConfig<ArtifactJudgeInput<TPersona>>\n /* Which artifact types route to codeReview. Defaults to ['code']. /\n codeArtifactTypes?: string[]\n /* Which artifact types route to contentQuality. Defaults to ['research']. /\n contentArtifactTypes?: string[]\n}\n\nexport interface CellCompositeScore {\n composite: number\n conversation: JudgeScore\n codeReview?: {\n perArtifact: Array<JudgeScore & { turn: number; type: string }>\n composite: number\n }\n contentQuality?: {\n perArtifact: Array<JudgeScore & { turn: number; type: string }>\n composite: number\n }\n}\n\nexport interface RunMultishotMatrixOptions<TPersona extends MultishotPersona> {\n /* AgentProfile axis (matrix primary). /\n profiles: Array<{ id: string; value: SandboxAgentProfile }>\n /* Persona axis. /\n personas: TPersona[]\n /* Persona-shaping callbacks. /\n shape: MultishotShape<TPersona>\n /* Judge configurations. /\n judges: MultishotJudges<TPersona>\n /* Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. /\n tools?: MultishotToolDefinition[]\n /* Map from tool name → inline executor. Must align with `tools`. /\n toolExecutors?: Record<string, MultishotToolExecutor>\n /* Tool name → artifact type label. Defaults to research/code mapping. /\n artifactTypeFor?: (toolName: string) => string \| undefined\n /* Where per-cell artifacts land. Cells write to `<runDir>/<profileId>/<personaId>/rep-N/`. /\n runDir: string\n /* Replicates per (profile, persona) cell. /\n reps?: number\n /* Max conversation turns per cell. /\n maxTurns?: number\n /* Max concurrent cells. /\n maxConcurrency?: number\n /* Total $ ceiling across the matrix; cells aborted past this. /\n costCeiling?: number\n /* Agent model. /\n agentModel?: string\n /* Driver model. /\n driverModel?: string\n /* Pass-thru fields. /\n apiKey?: string\n baseUrl?: string\n}\n\ninterface CellOutput {\n turns: number\n toolCalls: number\n artifactCount: number\n}\n\nexport interface RunMultishotMatrixResult {\n matrix: MatrixResult<CellOutput>\n}\n\nexport async function runMultishotMatrix<TPersona extends MultishotPersona>(\n opts: RunMultishotMatrixOptions<TPersona>,\n): Promise<RunMultishotMatrixResult> {\n const codeTypes = new Set(opts.judges.codeArtifactTypes ?? ['code'])\n const contentTypes = new Set(opts.judges.contentArtifactTypes ?? ['research'])\n mkdirSync(opts.runDir, { recursive: true })\n\n const matrix = await runAgentMatrix<CellOutput>({\n axes: [\n { name: 'profile', values: opts.profiles },\n { name: 'persona', values: opts.personas.map((p) => ({ id: p.id, value: p })) },\n ],\n reps: opts.reps ?? 1,\n maxConcurrency: opts.maxConcurrency ?? 2,\n costCeiling: opts.costCeiling,\n async runCell(cell) {\n const profile = cell.axes.profile?.value as SandboxAgentProfile\n const persona = cell.axes.persona?.value as TPersona\n const profileId = String(cell.axes.profile?.id ?? 'unknown')\n const personaId = String(cell.axes.persona?.id ?? 'unknown')\n\n const sim = await runMultishot({\n profile,\n persona,\n shape: opts.shape,\n tools: opts.tools,\n toolExecutors: opts.toolExecutors,\n artifactTypeFor: opts.artifactTypeFor,\n maxTurns: opts.maxTurns,\n agentModel: opts.agentModel,\n driverModel: opts.driverModel,\n apiKey: opts.apiKey,\n baseUrl: opts.baseUrl,\n })\n\n const codeArtifacts = sim.artifacts.filter((a) => codeTypes.has(a.type))\n const contentArtifacts = sim.artifacts.filter((a) => contentTypes.has(a.type))\n\n const [conversation, codeReviews, contentReviews] = await Promise.all([\n runJudge(opts.judges.conversation, { transcript: sim.transcript, persona }),\n opts.judges.codeReview\n ? Promise.all(\n codeArtifacts.map((artifact) =>\n runJudge(opts.judges.codeReview!, { artifact, persona }).then((s) => ({\n ...s,\n turn: artifact.turn,\n type: artifact.type,\n })),\n ),\n )\n : Promise.resolve([] as Array<JudgeScore & { turn: number; type: string }>),\n opts.judges.contentQuality\n ? Promise.all(\n contentArtifacts.map((artifact) =>\n runJudge(opts.judges.contentQuality!, { artifact, persona }).then((s) => ({\n ...s,\n turn: artifact.turn,\n type: artifact.type,\n })),\n ),\n )\n : Promise.resolve([] as Array<JudgeScore & { turn: number; type: string }>),\n ])\n\n const codeComposite =\n codeReviews.length === 0\n ? 0\n : codeReviews.reduce((s, r) => s + r.composite, 0) / codeReviews.length\n const contentComposite =\n contentReviews.length === 0\n ? 0\n : contentReviews.reduce((s, r) => s + r.composite, 0) / contentReviews.length\n\n // Composite = mean of (conversation, code, content) — empty judges count 0.\n const judgeCount = 1 + (opts.judges.codeReview ? 1 : 0) + (opts.judges.contentQuality ? 1 : 0)\n const composite = (conversation.composite + codeComposite + contentComposite) / judgeCount\n\n const cellScore: CellCompositeScore = { composite, conversation }\n if (opts.judges.codeReview)\n cellScore.codeReview = { perArtifact: codeReviews, composite: codeComposite }\n if (opts.judges.contentQuality)\n cellScore.contentQuality = { perArtifact: contentReviews, composite: contentComposite }\n\n const cellDir = join(opts.runDir, profileId, personaId, `rep-${cell.rep}`)\n mkdirSync(cellDir, { recursive: true })\n writeFileSync(join(cellDir, 'transcript.json'), JSON.stringify(sim.transcript, null, 2))\n writeFileSync(join(cellDir, 'artifacts.json'), JSON.stringify(sim.artifacts, null, 2))\n writeFileSync(join(cellDir, 'scores.json'), JSON.stringify(cellScore, null, 2))\n\n const notes = [`convo=${conversation.composite.toFixed(1)}`]\n if (opts.judges.codeReview) notes.push(`code=${codeComposite.toFixed(1)}`)\n if (opts.judges.contentQuality) notes.push(`content=${contentComposite.toFixed(1)}`)\n\n return {\n output: {\n turns: sim.transcript.length,\n toolCalls: sim.toolCalls,\n artifactCount: sim.artifacts.length,\n },\n verdict: { valid: composite >= 5, score: composite, notes: notes.join(' ') },\n costUsd: sim.costUsd,\n durationMs: sim.durationMs,\n }\n },\n })\n\n // Persist top-level summary.\n const summary = {\n cells: matrix.summary.totalCells,\n passRate: matrix.summary.overallPassRate,\n meanScore: matrix.summary.overallMeanScore,\n totalCostUsd: matrix.summary.totalCostUsd,\n durationMs: matrix.summary.durationMs,\n runsExecuted: matrix.summary.runsExecuted,\n cellsSkipped: matrix.summary.cellsSkipped,\n byProfile: matrix.byAxis.profile,\n byPersona: matrix.byAxis.persona,\n }\n writeFileSync(join(opts.runDir, 'summary.json'), JSON.stringify(summary, null, 2))\n\n const md: string[] = [\n `# Multishot matrix`,\n ``,\n `Cells: ${matrix.summary.totalCells} \| Pass rate: ${(matrix.summary.overallPassRate 100).toFixed(0)}% \| Mean: ${matrix.summary.overallMeanScore.toFixed(2)} \| Cost: $${matrix.summary.totalCostUsd.toFixed(2)} \| Duration: ${(matrix.summary.durationMs / 1000).toFixed(0)}s`,\n ``,\n `## By profile`,\n ``,\n '\| profile \| pass \| mean \| cost \|',\n '\|---\|---\|---\|---\|',\n ...Object.entries(matrix.byAxis.profile ?? {}).map(\n ([id, s]) =>\n `\| ${id} \| ${(s.passRate * 100).toFixed(0)}% \| ${s.meanScore.toFixed(2)} \| $${s.totalCostUsd.toFixed(2)} \|`,\n ),\n ``,\n `## By persona`,\n ``,\n '\| persona \| pass \| mean \| cost \|',\n '\|---\|---\|---\|---\|',\n ...Object.entries(matrix.byAxis.persona ?? {}).map(\n ([id, s]) =>\n `\| ${id} \| ${(s.passRate * 100).toFixed(0)}% \| ${s.meanScore.toFixed(2)} \| $${s.totalCostUsd.toFixed(2)} \|`,\n ),\n ``,\n ]\n writeFileSync(join(opts.runDir, 'summary.md'), md.join('\\n'))\n\n return { matrix }\n}\n","// Public types for the multishot substrate.\n\nexport interface MultishotMessage {\n role: 'user' \| 'assistant' \| 'tool'\n content: string\n toolCallId?: string\n toolCalls?: Array<{ id: string; name: string; args: Record<string, unknown> }>\n}\n\nexport interface MultishotArtifact {\n type: string\n turn: number\n invocation: { name: string; args: Record<string, unknown> }\n content: string\n}\n\nexport interface MultishotResult {\n transcript: MultishotMessage[]\n artifacts: MultishotArtifact[]\n toolCalls: number\n durationMs: number\n costUsd: number\n}\n\nexport interface MultishotToolDefinition {\n type: 'function'\n function: {\n name: string\n description: string\n parameters: Record<string, unknown>\n }\n}\n\nexport type MultishotToolExecutor = (\n args: Record<string, unknown>,\n ctx: { apiKey: string; baseUrl: string; signal?: AbortSignal },\n) => Promise<{ content: string; costUsd: number }>\n\nexport interface MultishotPersona {\n /** Stable identifier — used for per-cell artifact paths + matrix axis keys. /\n id: string\n /* Per-domain payload (income/profile/voice/etc.) shaped by the consumer. /\n [k: string]: unknown\n}\n\nexport interface MultishotShape<TPersona extends MultishotPersona> {\n /* Opening user message (turn 0) — the persona's first ask. /\n buildOpener: (persona: TPersona) => string\n /* System prompt the driver LLM uses to roleplay the persona. Should set\n * voice, goals, constraints, time-pressure, and the \"never go silent\" rule. /\n buildDriverSystemPrompt: (persona: TPersona) => string\n}\n\nexport class MultishotDriverEmptyError extends Error {\n constructor(public readonly turn: number) {\n super(`multishot: driver returned empty content twice at turn ${turn} — failing loud`)\n this.name = 'MultishotDriverEmptyError'\n }\n}\n","// Multi-turn driver-agent simulation with inline tool execution.\n//\n// The driver = LLM acting as the persona (reactive, non-deterministic).\n// The agent = the product agent under test (router call with profile's\n// systemPrompt + the configured tools).\n// Tool calls execute inline via the configured executors and feed back\n// into the agent's message log so the agent integrates the result.\n\nimport type { AgentProfile as SandboxAgentProfile } from '@tangle-network/sandbox'\nimport { defaultDelegationTools } from './default-tools'\nimport {\n defaultRouterBaseUrl,\n estimateRouterCost,\n requireRouterApiKey,\n routerCompletion,\n} from './router'\nimport {\n type MultishotArtifact,\n MultishotDriverEmptyError,\n type MultishotMessage,\n type MultishotPersona,\n type MultishotResult,\n type MultishotShape,\n type MultishotToolDefinition,\n type MultishotToolExecutor,\n} from './types'\n\nexport interface RunMultishotOptions<TPersona extends MultishotPersona> {\n profile: SandboxAgentProfile\n persona: TPersona\n shape: MultishotShape<TPersona>\n /* Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. /\n tools?: MultishotToolDefinition[]\n /* Map from tool name → executor invoked inline when the agent emits a tool_call. /\n toolExecutors?: Record<string, MultishotToolExecutor>\n /* Map from tool name → artifact type label written into MultishotArtifact.type.\n * Tools without a mapping still execute, but their results aren't surfaced as\n * typed artifacts (only as tool messages in the transcript). */\n artifactTypeFor?: (toolName: string) => string \| undefined\n maxTurns?: number\n agentModel?: string\n driverModel?: string\n apiKey?: string\n baseUrl?: string\n signal?: AbortSignal\n}\n\nexport async function runMultishot<TPersona extends MultishotPersona>(\n opts: RunMultishotOptions<TPersona>,\n): Promise<MultishotResult> {\n const apiKey = opts.apiKey ?? requireRouterApiKey()\n const baseUrl = opts.baseUrl ?? defaultRouterBaseUrl()\n const maxTurns = opts.maxTurns ?? 10\n const agentModel = opts.agentModel ?? 'openai/gpt-5.4'\n const driverModel = opts.driverModel ?? 'openai/gpt-4o-mini'\n\n const bundle =\n opts.tools && opts.toolExecutors\n ? {\n tools: opts.tools,\n executors: opts.toolExecutors,\n artifactTypeFor: opts.artifactTypeFor ?? (() => undefined),\n }\n : defaultDelegationTools()\n const tools = opts.tools ?? bundle.tools\n const executors = opts.toolExecutors ?? bundle.executors\n const artifactTypeFor = opts.artifactTypeFor ?? bundle.artifactTypeFor\n\n const start = Date.now()\n const transcript: MultishotMessage[] = []\n const artifacts: MultishotArtifact[] = []\n let toolCalls = 0\n let totalCostUsd = 0\n\n const opener = opts.shape.buildOpener(opts.persona)\n transcript.push({ role: 'user', content: opener })\n\n const systemPrompt = opts.profile.prompt?.systemPrompt ?? ''\n const agentMessages: Array<Record<string, unknown>> = [\n { role: 'system', content: systemPrompt },\n { role: 'user', content: opener },\n ]\n\n for (let turn = 0; turn < maxTurns; turn++) {\n if (opts.signal?.aborted) throw new Error('multishot aborted')\n\n const { message: agentMsg, usage: agentUsage } = await routerCompletion({\n apiKey,\n baseUrl,\n model: agentModel,\n messages: agentMessages,\n tools,\n temperature: 0.7,\n maxTokens: 2500,\n signal: opts.signal,\n })\n totalCostUsd += estimateRouterCost(agentModel, agentUsage)\n\n const agentText = (agentMsg.content ?? '').trim()\n const agentToolCalls = (agentMsg.tool_calls ?? []).map((tc) => ({\n id: tc.id,\n name: tc.function.name,\n args: (() => {\n try {\n return JSON.parse(tc.function.arguments) as Record<string, unknown>\n } catch {\n return {} as Record<string, unknown>\n }\n })(),\n }))\n\n agentMessages.push({\n role: 'assistant',\n content: agentText \|\| null,\n ...(agentMsg.tool_calls?.length ? { tool_calls: agentMsg.tool_calls } : {}),\n })\n transcript.push({\n role: 'assistant',\n content: agentText,\n toolCalls: agentToolCalls.length > 0 ? agentToolCalls : undefined,\n })\n\n for (const tc of agentToolCalls) {\n toolCalls++\n let toolResult = ''\n try {\n const executor = executors[tc.name]\n if (!executor) {\n toolResult = JSON.stringify({ error: `unknown tool ${tc.name}` })\n } else {\n const r = await executor(tc.args, { apiKey, baseUrl, signal: opts.signal })\n toolResult = r.content\n totalCostUsd += r.costUsd\n const artifactType = artifactTypeFor(tc.name)\n if (artifactType) {\n artifacts.push({\n type: artifactType,\n turn,\n invocation: { name: tc.name, args: tc.args },\n content: toolResult,\n })\n }\n }\n } catch (err) {\n toolResult = JSON.stringify({ error: err instanceof Error ? err.message : String(err) })\n }\n agentMessages.push({ role: 'tool', tool_call_id: tc.id, content: toolResult \|\| 'done' })\n transcript.push({ role: 'tool', content: toolResult \|\| 'done', toolCallId: tc.id })\n }\n\n // If the agent emitted tool_calls, give it a follow-up turn to integrate the results.\n if (agentToolCalls.length > 0) {\n const followUp = await routerCompletion({\n apiKey,\n baseUrl,\n model: agentModel,\n messages: agentMessages,\n temperature: 0.7,\n maxTokens: 2000,\n signal: opts.signal,\n })\n totalCostUsd += estimateRouterCost(agentModel, followUp.usage)\n const followUpText = (followUp.message.content ?? '').trim()\n agentMessages.push({ role: 'assistant', content: followUpText })\n transcript.push({ role: 'assistant', content: followUpText })\n }\n\n if (turn < maxTurns - 1) {\n const driver = await driverTurn({\n apiKey,\n baseUrl,\n persona: opts.persona,\n shape: opts.shape,\n transcript,\n turn,\n model: driverModel,\n signal: opts.signal,\n })\n totalCostUsd += driver.costUsd\n agentMessages.push({ role: 'user', content: driver.content })\n transcript.push({ role: 'user', content: driver.content })\n }\n }\n\n return { transcript, artifacts, toolCalls, durationMs: Date.now() - start, costUsd: totalCostUsd }\n}\n\nasync function driverTurn<TPersona extends MultishotPersona>(opts: {\n apiKey: string\n baseUrl: string\n persona: TPersona\n shape: MultishotShape<TPersona>\n transcript: MultishotMessage[]\n turn: number\n model: string\n signal?: AbortSignal\n}): Promise<{ content: string; costUsd: number }> {\n const driverSystem = opts.shape.buildDriverSystemPrompt(opts.persona)\n\n // Translate transcript to driver POV: agent's `assistant` messages become\n // `user` (the agent talking TO the driver); the driver's prior `user`\n // messages become `assistant` (the driver's prior responses).\n const driverMessages: Array<Record<string, unknown>> = [{ role: 'system', content: driverSystem }]\n for (const msg of opts.transcript) {\n if (msg.role === 'tool') continue\n if (msg.role === 'assistant') driverMessages.push({ role: 'user', content: msg.content })\n else if (msg.role === 'user') driverMessages.push({ role: 'assistant', content: msg.content })\n }\n\n // Driver must never go silent. Retry once on empty content; then fail loud.\n for (let attempt = 0; attempt < 2; attempt++) {\n const { message, usage } = await routerCompletion({\n apiKey: opts.apiKey,\n baseUrl: opts.baseUrl,\n model: opts.model,\n messages: driverMessages,\n temperature: 0.9,\n maxTokens: 600,\n signal: opts.signal,\n })\n const content = (message.content ?? '').trim()\n if (content.length > 0) return { content, costUsd: estimateRouterCost(opts.model, usage) }\n }\n throw new MultishotDriverEmptyError(opts.turn)\n}\n"],"mappings":";;;;;;AA4BA,eAAsB,iBACpB,KACmC;AACnC,QAAM,OAAgC;AAAA,IACpC,OAAO,IAAI;AAAA,IACX,UAAU,IAAI;AAAA,IACd,aAAa,IAAI,eAAe;AAAA,IAChC,YAAY,IAAI,aAAa;AAAA,EAC/B;AACA,MAAI,IAAI,OAAO,OAAQ,MAAK,QAAQ,IAAI;AACxC,QAAM,MAAM,GAAG,IAAI,QAAQ,QAAQ,QAAQ,EAAE,CAAC;AAC9C,QAAM,MAAM,MAAM,MAAM,KAAK;AAAA,IAC3B,QAAQ;AAAA,IACR,SAAS,EAAE,eAAe,UAAU,IAAI,MAAM,IAAI,gBAAgB,mBAAmB;AAAA,IACrF,MAAM,KAAK,UAAU,IAAI;AAAA,IACzB,QAAQ,IAAI;AAAA,EACd,CAAC;AACD,MAAI,CAAC,IAAI,IAAI;AACX,UAAM,OAAO,MAAM,IAAI,KAAK;AAC5B,UAAM,IAAI,MAAM,UAAU,IAAI,MAAM,KAAK,KAAK,MAAM,GAAG,GAAG,CAAC,EAAE;AAAA,EAC/D;AACA,QAAM,OAAQ,MAAM,IAAI,KAAK;AAI7B,QAAM,SAAS,KAAK,QAAQ,CAAC;AAC7B,MAAI,CAAC,OAAQ,OAAM,IAAI,MAAM,+BAA+B,KAAK,UAAU,IAAI,EAAE,MAAM,GAAG,GAAG,CAAC,EAAE;AAChG,SAAO,EAAE,SAAS,OAAO,SAAS,OAAO,KAAK,MAAM;AACtD;AAIO,SAAS,mBACd,OACA,OACQ;AACR,MAAI,CAAC,MAAO,QAAO;AACnB,QAAM,WAAW,MAAM,iBAAiB;AACxC,QAAM,YAAY,MAAM,qBAAqB;AAC7C,MAAI,UAAU;AACd,MAAI,WAAW;AACf,MAAI,MAAM,SAAS,aAAa,GAAG;AACjC,cAAU;AACV,eAAW;AAAA,EACb,WAAW,MAAM,SAAS,SAAS,KAAK,MAAM,SAAS,eAAe,GAAG;AACvE,cAAU;AACV,eAAW;AAAA,EACb,WAAW,MAAM,SAAS,MAAM,KAAK,MAAM,SAAS,KAAK,KAAK,MAAM,SAAS,UAAU,GAAG;AACxF,cAAU;AACV,eAAW;AAAA,EACb;AACA,UAAQ,WAAW,UAAU,YAAY,YAAY;AACvD;AAEO,SAAS,uBAA+B;AAC7C,UAAQ,QAAQ,IAAI,0BAA0B,kCAAkC;AAAA,IAC9E;AAAA,IACA;AAAA,EACF;AACF;AAEO,SAAS,sBAA8B;AAC5C,QAAM,MAAM,QAAQ,IAAI;AACxB,MAAI,CAAC,IAAK,OAAM,IAAI,MAAM,gEAAgE;AAC1F,SAAO;AACT;;;ACnFO,IAAM,2BAA2B;AACjC,IAAM,sBAAsB;AAgBnC,IAAM,4BACJ;AAEF,IAAM,uBACJ;AAEK,IAAM,iCAA0D;AAAA,EACrE,MAAM;AAAA,EACN,UAAU;AAAA,IACR,MAAM;AAAA,IACN,aACE;AAAA,IACF,YAAY;AAAA,MACV,MAAM;AAAA,MACN,YAAY;AAAA,QACV,UAAU,EAAE,MAAM,UAAU,aAAa,gCAAgC;AAAA,QACzE,OAAO;AAAA,UACL,MAAM;AAAA,UACN,aAAa;AAAA,QACf;AAAA,MACF;AAAA,MACA,UAAU,CAAC,UAAU;AAAA,IACvB;AAAA,EACF;AACF;AAEO,IAAM,6BAAsD;AAAA,EACjE,MAAM;AAAA,EACN,UAAU;AAAA,IACR,MAAM;AAAA,IACN,aACE;AAAA,IACF,YAAY;AAAA,MACV,MAAM;AAAA,MACN,YAAY;AAAA,QACV,MAAM,EAAE,MAAM,UAAU,aAAa,gCAAgC;AAAA,QACrE,UAAU;AAAA,UACR,MAAM;AAAA,UACN,aAAa;AAAA,QACf;AAAA,MACF;AAAA,MACA,UAAU,CAAC,MAAM;AAAA,IACnB;AAAA,EACF;AACF;AAEO,SAAS,uBACd,SAAkC,CAAC,GACZ;AACvB,QAAM,eAAe,OAAO,gBAAgB;AAC5C,QAAM,QAAQ,OAAO,SAAS;AAC9B,SAAO,OAAO,MAAM,QAAQ;AAC1B,UAAM,WAAW,OAAO,KAAK,YAAY,EAAE;AAC3C,UAAM,QAAQ,KAAK,QAAQ,OAAO,KAAK,KAAK,IAAI;AAChD,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI;AAAA,MACb;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,aAAa;AAAA,QACxC,EAAE,MAAM,QAAQ,SAAS,aAAa,QAAQ,GAAG,QAAQ;AAAA,SAAY,KAAK,KAAK,EAAE,GAAG;AAAA,MACtF;AAAA,MACA,QAAQ,IAAI;AAAA,IACd,CAAC;AACD,WAAO,EAAE,SAAS,QAAQ,WAAW,IAAI,SAAS,mBAAmB,OAAO,KAAK,EAAE;AAAA,EACrF;AACF;AAEO,SAAS,mBAAmB,SAA6B,CAAC,GAA0B;AACzF,QAAM,eAAe,OAAO,gBAAgB;AAC5C,QAAM,QAAQ,OAAO,SAAS;AAC9B,SAAO,OAAO,MAAM,QAAQ;AAC1B,UAAM,OAAO,OAAO,KAAK,QAAQ,EAAE;AACnC,UAAM,WAAW,KAAK,WAAW,OAAO,KAAK,QAAQ,IAAI;AACzD,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI;AAAA,MACb;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,GAAG,YAAY;AAAA;AAAA,YAAiB,QAAQ,GAAG;AAAA,QACtE,EAAE,MAAM,QAAQ,SAAS,YAAY,IAAI,GAAG;AAAA,MAC9C;AAAA,MACA,QAAQ,IAAI;AAAA,IACd,CAAC;AACD,WAAO,EAAE,SAAS,QAAQ,WAAW,IAAI,SAAS,mBAAmB,OAAO,KAAK,EAAE;AAAA,EACrF;AACF;AAgBO,SAAS,uBAAuB,SAA6B,CAAC,GAAuB;AAC1F,SAAO;AAAA,IACL,OAAO,CAAC,gCAAgC,0BAA0B;AAAA,IAClE,WAAW;AAAA,MACT,mBAAmB,uBAAuB,OAAO,QAAQ;AAAA,MACzD,eAAe,mBAAmB,OAAO,IAAI;AAAA,IAC/C;AAAA,IACA,iBAAiB,CAAC,SAChB,SAAS,sBAAsB,aAAa,SAAS,kBAAkB,SAAS;AAAA,EACpF;AACF;;;ACpIO,IAAM,sBAAsB;AAmCnC,IAAM,aAAyB,EAAE,YAAY,CAAC,GAAG,WAAW,GAAG,OAAO,eAAe;AAErF,eAAsB,SACpB,OACA,OACqB;AACrB,QAAM,SAAS,MAAM,UAAU,oBAAoB;AACnD,QAAM,UAAU,MAAM,WAAW,qBAAqB;AACtD,QAAM,QAAQ,MAAM,SAAS,QAAQ,IAAI,eAAe;AACxD,QAAM,SAAS,MAAM,YAAY,KAAK;AACtC,MAAI,MAAM;AACV,MAAI;AACF,UAAM,EAAE,QAAQ,IAAI,MAAM,iBAAiB;AAAA,MACzC;AAAA,MACA;AAAA,MACA;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,MAAM,aAAa;AAAA,QAC9C,EAAE,MAAM,QAAQ,SAAS,OAAO;AAAA,MAClC;AAAA,IACF,CAAC;AACD,WAAO,QAAQ,WAAW,IAAI,KAAK;AAAA,EACrC,SAAS,KAAK;AACZ,WAAO;AAAA,MACL,GAAG;AAAA,MACH,OAAO,SAAS,MAAM,IAAI,iBAAiB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,IAC7F;AAAA,EACF;AAEA,MAAI,SAAyC;AAC7C,MAAI;AACF,UAAM,UAAU,IACb,QAAQ,gBAAgB,EAAE,EAC1B,QAAQ,WAAW,EAAE,EACrB,KAAK;AACR,aAAS,KAAK,MAAM,OAAO;AAAA,EAC7B,QAAQ;AACN,WAAO,EAAE,GAAG,YAAY,OAAO,SAAS,MAAM,IAAI,uBAAuB,IAAI,MAAM,GAAG,GAAG,CAAC,GAAG;AAAA,EAC/F;AAEA,QAAM,aAAqC,CAAC;AAC5C,MAAI,MAAM;AACV,aAAW,OAAO,MAAM,YAAY;AAClC,UAAM,IAAI,OAAO,OAAO,IAAI,GAAG,KAAK,CAAC;AACrC,UAAM,UAAU,OAAO,SAAS,CAAC,IAAI,KAAK,IAAI,GAAG,KAAK,IAAI,IAAI,CAAC,CAAC,IAAI;AACpE,eAAW,IAAI,GAAG,IAAI;AACtB,WAAO;AAAA,EACT;AACA,SAAO;AAAA,IACL;AAAA,IACA,WAAW,MAAM,WAAW,WAAW,IAAI,IAAI,MAAM,MAAM,WAAW;AAAA,IACtE,OAAO,OAAO,OAAO,UAAU,WAAW,OAAO,QAAQ;AAAA,EAC3D;AACF;AAIO,SAAS,iBAAiB,MAAyC;AACxE,SAAO,KAAK,IAAI,CAAC,MAAM,KAAK,EAAE,GAAG,KAAK,EAAE,WAAW,EAAE,EAAE,KAAK,IAAI;AAClE;AAGO,SAAS,iBAAiB,MAAyC;AACxE,QAAM,SAAS,KAAK,IAAI,CAAC,MAAM,IAAI,EAAE,GAAG,KAAK,EAAE,KAAK,GAAG;AACvD,SAAO;AAAA,GAA6D,MAAM;AAC5E;;;ACzGA,SAAS,WAAW,qBAAqB;AACzC,SAAS,YAAY;;;AC4Cd,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YAA4B,MAAc;AACxC,UAAM,0DAA0D,IAAI,sBAAiB;AAD3D;AAE1B,SAAK,OAAO;AAAA,EACd;AAAA,EAH4B;AAI9B;;;ACXA,eAAsB,aACpB,MAC0B;AAC1B,QAAM,SAAS,KAAK,UAAU,oBAAoB;AAClD,QAAM,UAAU,KAAK,WAAW,qBAAqB;AACrD,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,aAAa,KAAK,cAAc;AACtC,QAAM,cAAc,KAAK,eAAe;AAExC,QAAM,SACJ,KAAK,SAAS,KAAK,gBACf;AAAA,IACE,OAAO,KAAK;AAAA,IACZ,WAAW,KAAK;AAAA,IAChB,iBAAiB,KAAK,oBAAoB,MAAM;AAAA,EAClD,IACA,uBAAuB;AAC7B,QAAM,QAAQ,KAAK,SAAS,OAAO;AACnC,QAAM,YAAY,KAAK,iBAAiB,OAAO;AAC/C,QAAM,kBAAkB,KAAK,mBAAmB,OAAO;AAEvD,QAAM,QAAQ,KAAK,IAAI;AACvB,QAAM,aAAiC,CAAC;AACxC,QAAM,YAAiC,CAAC;AACxC,MAAI,YAAY;AAChB,MAAI,eAAe;AAEnB,QAAM,SAAS,KAAK,MAAM,YAAY,KAAK,OAAO;AAClD,aAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,CAAC;AAEjD,QAAM,eAAe,KAAK,QAAQ,QAAQ,gBAAgB;AAC1D,QAAM,gBAAgD;AAAA,IACpD,EAAE,MAAM,UAAU,SAAS,aAAa;AAAA,IACxC,EAAE,MAAM,QAAQ,SAAS,OAAO;AAAA,EAClC;AAEA,WAAS,OAAO,GAAG,OAAO,UAAU,QAAQ;AAC1C,QAAI,KAAK,QAAQ,QAAS,OAAM,IAAI,MAAM,mBAAmB;AAE7D,UAAM,EAAE,SAAS,UAAU,OAAO,WAAW,IAAI,MAAM,iBAAiB;AAAA,MACtE;AAAA,MACA;AAAA,MACA,OAAO;AAAA,MACP,UAAU;AAAA,MACV;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,QAAQ,KAAK;AAAA,IACf,CAAC;AACD,oBAAgB,mBAAmB,YAAY,UAAU;AAEzD,UAAM,aAAa,SAAS,WAAW,IAAI,KAAK;AAChD,UAAM,kBAAkB,SAAS,cAAc,CAAC,GAAG,IAAI,CAAC,QAAQ;AAAA,MAC9D,IAAI,GAAG;AAAA,MACP,MAAM,GAAG,SAAS;AAAA,MAClB,OAAO,MAAM;AACX,YAAI;AACF,iBAAO,KAAK,MAAM,GAAG,SAAS,SAAS;AAAA,QACzC,QAAQ;AACN,iBAAO,CAAC;AAAA,QACV;AAAA,MACF,GAAG;AAAA,IACL,EAAE;AAEF,kBAAc,KAAK;AAAA,MACjB,MAAM;AAAA,MACN,SAAS,aAAa;AAAA,MACtB,GAAI,SAAS,YAAY,SAAS,EAAE,YAAY,SAAS,WAAW,IAAI,CAAC;AAAA,IAC3E,CAAC;AACD,eAAW,KAAK;AAAA,MACd,MAAM;AAAA,MACN,SAAS;AAAA,MACT,WAAW,eAAe,SAAS,IAAI,iBAAiB;AAAA,IAC1D,CAAC;AAED,eAAW,MAAM,gBAAgB;AAC/B;AACA,UAAI,aAAa;AACjB,UAAI;AACF,cAAM,WAAW,UAAU,GAAG,IAAI;AAClC,YAAI,CAAC,UAAU;AACb,uBAAa,KAAK,UAAU,EAAE,OAAO,gBAAgB,GAAG,IAAI,GAAG,CAAC;AAAA,QAClE,OAAO;AACL,gBAAM,IAAI,MAAM,SAAS,GAAG,MAAM,EAAE,QAAQ,SAAS,QAAQ,KAAK,OAAO,CAAC;AAC1E,uBAAa,EAAE;AACf,0BAAgB,EAAE;AAClB,gBAAM,eAAe,gBAAgB,GAAG,IAAI;AAC5C,cAAI,cAAc;AAChB,sBAAU,KAAK;AAAA,cACb,MAAM;AAAA,cACN;AAAA,cACA,YAAY,EAAE,MAAM,GAAG,MAAM,MAAM,GAAG,KAAK;AAAA,cAC3C,SAAS;AAAA,YACX,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF,SAAS,KAAK;AACZ,qBAAa,KAAK,UAAU,EAAE,OAAO,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,EAAE,CAAC;AAAA,MACzF;AACA,oBAAc,KAAK,EAAE,MAAM,QAAQ,cAAc,GAAG,IAAI,SAAS,cAAc,OAAO,CAAC;AACvF,iBAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,cAAc,QAAQ,YAAY,GAAG,GAAG,CAAC;AAAA,IACpF;AAGA,QAAI,eAAe,SAAS,GAAG;AAC7B,YAAM,WAAW,MAAM,iBAAiB;AAAA,QACtC;AAAA,QACA;AAAA,QACA,OAAO;AAAA,QACP,UAAU;AAAA,QACV,aAAa;AAAA,QACb,WAAW;AAAA,QACX,QAAQ,KAAK;AAAA,MACf,CAAC;AACD,sBAAgB,mBAAmB,YAAY,SAAS,KAAK;AAC7D,YAAM,gBAAgB,SAAS,QAAQ,WAAW,IAAI,KAAK;AAC3D,oBAAc,KAAK,EAAE,MAAM,aAAa,SAAS,aAAa,CAAC;AAC/D,iBAAW,KAAK,EAAE,MAAM,aAAa,SAAS,aAAa,CAAC;AAAA,IAC9D;AAEA,QAAI,OAAO,WAAW,GAAG;AACvB,YAAM,SAAS,MAAM,WAAW;AAAA,QAC9B;AAAA,QACA;AAAA,QACA,SAAS,KAAK;AAAA,QACd,OAAO,KAAK;AAAA,QACZ;AAAA,QACA;AAAA,QACA,OAAO;AAAA,QACP,QAAQ,KAAK;AAAA,MACf,CAAC;AACD,sBAAgB,OAAO;AACvB,oBAAc,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,QAAQ,CAAC;AAC5D,iBAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,QAAQ,CAAC;AAAA,IAC3D;AAAA,EACF;AAEA,SAAO,EAAE,YAAY,WAAW,WAAW,YAAY,KAAK,IAAI,IAAI,OAAO,SAAS,aAAa;AACnG;AAEA,eAAe,WAA8C,MASX;AAChD,QAAM,eAAe,KAAK,MAAM,wBAAwB,KAAK,OAAO;AAKpE,QAAM,iBAAiD,CAAC,EAAE,MAAM,UAAU,SAAS,aAAa,CAAC;AACjG,aAAW,OAAO,KAAK,YAAY;AACjC,QAAI,IAAI,SAAS,OAAQ;AACzB,QAAI,IAAI,SAAS,YAAa,gBAAe,KAAK,EAAE,MAAM,QAAQ,SAAS,IAAI,QAAQ,CAAC;AAAA,aAC/E,IAAI,SAAS,OAAQ,gBAAe,KAAK,EAAE,MAAM,aAAa,SAAS,IAAI,QAAQ,CAAC;AAAA,EAC/F;AAGA,WAAS,UAAU,GAAG,UAAU,GAAG,WAAW;AAC5C,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,KAAK;AAAA,MACb,SAAS,KAAK;AAAA,MACd,OAAO,KAAK;AAAA,MACZ,UAAU;AAAA,MACV,aAAa;AAAA,MACb,WAAW;AAAA,MACX,QAAQ,KAAK;AAAA,IACf,CAAC;AACD,UAAM,WAAW,QAAQ,WAAW,IAAI,KAAK;AAC7C,QAAI,QAAQ,SAAS,EAAG,QAAO,EAAE,SAAS,SAAS,mBAAmB,KAAK,OAAO,KAAK,EAAE;AAAA,EAC3F;AACA,QAAM,IAAI,0BAA0B,KAAK,IAAI;AAC/C;;;AFxHA,eAAsB,mBACpB,MACmC;AACnC,QAAM,YAAY,IAAI,IAAI,KAAK,OAAO,qBAAqB,CAAC,MAAM,CAAC;AACnE,QAAM,eAAe,IAAI,IAAI,KAAK,OAAO,wBAAwB,CAAC,UAAU,CAAC;AAC7E,YAAU,KAAK,QAAQ,EAAE,WAAW,KAAK,CAAC;AAE1C,QAAM,SAAS,MAAM,eAA2B;AAAA,IAC9C,MAAM;AAAA,MACJ,EAAE,MAAM,WAAW,QAAQ,KAAK,SAAS;AAAA,MACzC,EAAE,MAAM,WAAW,QAAQ,KAAK,SAAS,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,OAAO,EAAE,EAAE,EAAE;AAAA,IAChF;AAAA,IACA,MAAM,KAAK,QAAQ;AAAA,IACnB,gBAAgB,KAAK,kBAAkB;AAAA,IACvC,aAAa,KAAK;AAAA,IAClB,MAAM,QAAQ,MAAM;AAClB,YAAM,UAAU,KAAK,KAAK,SAAS;AACnC,YAAM,UAAU,KAAK,KAAK,SAAS;AACnC,YAAM,YAAY,OAAO,KAAK,KAAK,SAAS,MAAM,SAAS;AAC3D,YAAM,YAAY,OAAO,KAAK,KAAK,SAAS,MAAM,SAAS;AAE3D,YAAM,MAAM,MAAM,aAAa;AAAA,QAC7B;AAAA,QACA;AAAA,QACA,OAAO,KAAK;AAAA,QACZ,OAAO,KAAK;AAAA,QACZ,eAAe,KAAK;AAAA,QACpB,iBAAiB,KAAK;AAAA,QACtB,UAAU,KAAK;AAAA,QACf,YAAY,KAAK;AAAA,QACjB,aAAa,KAAK;AAAA,QAClB,QAAQ,KAAK;AAAA,QACb,SAAS,KAAK;AAAA,MAChB,CAAC;AAED,YAAM,gBAAgB,IAAI,UAAU,OAAO,CAAC,MAAM,UAAU,IAAI,EAAE,IAAI,CAAC;AACvE,YAAM,mBAAmB,IAAI,UAAU,OAAO,CAAC,MAAM,aAAa,IAAI,EAAE,IAAI,CAAC;AAE7E,YAAM,CAAC,cAAc,aAAa,cAAc,IAAI,MAAM,QAAQ,IAAI;AAAA,QACpE,SAAS,KAAK,OAAO,cAAc,EAAE,YAAY,IAAI,YAAY,QAAQ,CAAC;AAAA,QAC1E,KAAK,OAAO,aACR,QAAQ;AAAA,UACN,cAAc;AAAA,YAAI,CAAC,aACjB,SAAS,KAAK,OAAO,YAAa,EAAE,UAAU,QAAQ,CAAC,EAAE,KAAK,CAAC,OAAO;AAAA,cACpE,GAAG;AAAA,cACH,MAAM,SAAS;AAAA,cACf,MAAM,SAAS;AAAA,YACjB,EAAE;AAAA,UACJ;AAAA,QACF,IACA,QAAQ,QAAQ,CAAC,CAAuD;AAAA,QAC5E,KAAK,OAAO,iBACR,QAAQ;AAAA,UACN,iBAAiB;AAAA,YAAI,CAAC,aACpB,SAAS,KAAK,OAAO,gBAAiB,EAAE,UAAU,QAAQ,CAAC,EAAE,KAAK,CAAC,OAAO;AAAA,cACxE,GAAG;AAAA,cACH,MAAM,SAAS;AAAA,cACf,MAAM,SAAS;AAAA,YACjB,EAAE;AAAA,UACJ;AAAA,QACF,IACA,QAAQ,QAAQ,CAAC,CAAuD;AAAA,MAC9E,CAAC;AAED,YAAM,gBACJ,YAAY,WAAW,IACnB,IACA,YAAY,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,YAAY;AACrE,YAAM,mBACJ,eAAe,WAAW,IACtB,IACA,eAAe,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,eAAe;AAG3E,YAAM,aAAa,KAAK,KAAK,OAAO,aAAa,IAAI,MAAM,KAAK,OAAO,iBAAiB,IAAI;AAC5F,YAAM,aAAa,aAAa,YAAY,gBAAgB,oBAAoB;AAEhF,YAAM,YAAgC,EAAE,WAAW,aAAa;AAChE,UAAI,KAAK,OAAO;AACd,kBAAU,aAAa,EAAE,aAAa,aAAa,WAAW,cAAc;AAC9E,UAAI,KAAK,OAAO;AACd,kBAAU,iBAAiB,EAAE,aAAa,gBAAgB,WAAW,iBAAiB;AAExF,YAAM,UAAU,KAAK,KAAK,QAAQ,WAAW,WAAW,OAAO,KAAK,GAAG,EAAE;AACzE,gBAAU,SAAS,EAAE,WAAW,KAAK,CAAC;AACtC,oBAAc,KAAK,SAAS,iBAAiB,GAAG,KAAK,UAAU,IAAI,YAAY,MAAM,CAAC,CAAC;AACvF,oBAAc,KAAK,SAAS,gBAAgB,GAAG,KAAK,UAAU,IAAI,WAAW,MAAM,CAAC,CAAC;AACrF,oBAAc,KAAK,SAAS,aAAa,GAAG,KAAK,UAAU,WAAW,MAAM,CAAC,CAAC;AAE9E,YAAM,QAAQ,CAAC,SAAS,aAAa,UAAU,QAAQ,CAAC,CAAC,EAAE;AAC3D,UAAI,KAAK,OAAO,WAAY,OAAM,KAAK,QAAQ,cAAc,QAAQ,CAAC,CAAC,EAAE;AACzE,UAAI,KAAK,OAAO,eAAgB,OAAM,KAAK,WAAW,iBAAiB,QAAQ,CAAC,CAAC,EAAE;AAEnF,aAAO;AAAA,QACL,QAAQ;AAAA,UACN,OAAO,IAAI,WAAW;AAAA,UACtB,WAAW,IAAI;AAAA,UACf,eAAe,IAAI,UAAU;AAAA,QAC/B;AAAA,QACA,SAAS,EAAE,OAAO,aAAa,GAAG,OAAO,WAAW,OAAO,MAAM,KAAK,GAAG,EAAE;AAAA,QAC3E,SAAS,IAAI;AAAA,QACb,YAAY,IAAI;AAAA,MAClB;AAAA,IACF;AAAA,EACF,CAAC;AAGD,QAAM,UAAU;AAAA,IACd,OAAO,OAAO,QAAQ;AAAA,IACtB,UAAU,OAAO,QAAQ;AAAA,IACzB,WAAW,OAAO,QAAQ;AAAA,IAC1B,cAAc,OAAO,QAAQ;AAAA,IAC7B,YAAY,OAAO,QAAQ;AAAA,IAC3B,cAAc,OAAO,QAAQ;AAAA,IAC7B,cAAc,OAAO,QAAQ;AAAA,IAC7B,WAAW,OAAO,OAAO;AAAA,IACzB,WAAW,OAAO,OAAO;AAAA,EAC3B;AACA,gBAAc,KAAK,KAAK,QAAQ,cAAc,GAAG,KAAK,UAAU,SAAS,MAAM,CAAC,CAAC;AAEjF,QAAM,KAAe;AAAA,IACnB;AAAA,IACA;AAAA,IACA,cAAc,OAAO,QAAQ,UAAU,sBAAsB,OAAO,QAAQ,kBAAkB,KAAK,QAAQ,CAAC,CAAC,iBAAiB,OAAO,QAAQ,iBAAiB,QAAQ,CAAC,CAAC,iBAAiB,OAAO,QAAQ,aAAa,QAAQ,CAAC,CAAC,qBAAqB,OAAO,QAAQ,aAAa,KAAM,QAAQ,CAAC,CAAC;AAAA,IAChS;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,OAAO,QAAQ,OAAO,OAAO,WAAW,CAAC,CAAC,EAAE;AAAA,MAC7C,CAAC,CAAC,IAAI,CAAC,MACL,KAAK,EAAE,OAAO,EAAE,WAAW,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,UAAU,QAAQ,CAAC,CAAC,OAAO,EAAE,aAAa,QAAQ,CAAC,CAAC;AAAA,IAC3G;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,OAAO,QAAQ,OAAO,OAAO,WAAW,CAAC,CAAC,EAAE;AAAA,MAC7C,CAAC,CAAC,IAAI,CAAC,MACL,KAAK,EAAE,OAAO,EAAE,WAAW,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,UAAU,QAAQ,CAAC,CAAC,OAAO,EAAE,aAAa,QAAQ,CAAC,CAAC;AAAA,IAC3G;AAAA,IACA;AAAA,EACF;AACA,gBAAc,KAAK,KAAK,QAAQ,YAAY,GAAG,GAAG,KAAK,IAAI,CAAC;AAE5D,SAAO,EAAE,OAAO;AAClB;","names":[]}

package/dist/openapi.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "openapi": "3.1.0",
   "info": {
     "title": "@tangle-network/agent-eval — wire protocol",
-    "version": "0.61.0",
+    "version": "0.63.0",
     "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
     "contact": {
       "name": "Tangle Network",

package/dist/{provenance-D0WeCXt1.d.ts → provenance-cUnovpWV.d.ts} RENAMED Viewed

@@ -1,8 +1,8 @@
-import { S as Scenario, C as CampaignResult, q as GateResult, v as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, n as CampaignTraceWriter, M as MutableSurface, s as GenerationRecord, p as GateDecision } from './types-Beb6KPqZ.js';
+import { S as Scenario, f as CampaignResult, k as GateResult, o as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, g as CampaignTraceWriter, M as MutableSurface, m as GenerationRecord, P as ParetoParent, j as GateDecision } from './types-c2R2kfmv.js';
 import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
 import { R as RedTeamCase } from './red-team-DW9Ca_tj.js';
-import { R as RunRecord } from './run-record-DgUVo5pw.js';
-import { H as HostedClient, T as TraceSpanEvent } from './index-D9dwa00f.js';
+import { R as RunRecord } from './run-record-BgTFzO2r.js';
+import { H as HostedClient, T as TraceSpanEvent } from './index-GISRh500.js';
 /**
  * @experimental
@@ -83,13 +83,17 @@ declare function evolutionaryDriver<TFindings = unknown>(opts: EvolutionaryDrive
  * scores + weakest dimensions, asks an LLM to propose targeted rewrites of
  * the current surface, and returns them as the next population.
  *
- * Honest scope vs the GEPA paper (Agrawal et al., arXiv:2507.19457):
- * this driver implements the *reflection* primitive — it does NOT implement
- * GEPA's Pareto frontier of candidates, multi-objective non-dominated
- * tracking, or the combine-complementary-lessons step. We use "best by
- * composite" as the parent each generation; the paper retains a Pareto set
- * and combines lessons across non-dominated candidates. Tracked as #101 in
- * the substrate roadmap. See `docs/specs/driver-honest-spec.md`.
+ * Maps onto the GEPA paper (Agrawal et al., arXiv:2507.19457):
+ *   - *Reflection*: each generation reflects on the best parent's weakest
+ *     dimensions + per-scenario top/bottom scores to propose targeted rewrites.
+ *   - *Pareto frontier*: `runOptimization` maintains the non-dominated set of
+ *     surfaces across generations (per-scenario objective vectors) and supplies
+ *     it as `ctx.paretoParents`. A surface uniquely best on one hard scenario
+ *     survives even when its mean composite is lower.
+ *   - *Combine complementary lessons*: when the frontier has >1 member, the
+ *     first population slot is a merge of those parents' strengths (one LLM
+ *     call citing each parent's winning scenarios). Toggle via `combineParents`.
+ * Dominance is computed by the package-canonical `paretoFrontier` (`pareto.ts`).
  *
  * Optional `constraints` move structured-doc guards into the driver
  * (preserve H2 section headings, cap sentence-level edits) — useful when
@@ -140,6 +144,16 @@ interface GepaDriverOptions {
     /** Structured-doc constraints. Candidates violating any are rejected
      *  post-parse and dropped from the returned population. */
     constraints?: GepaDriverConstraints;
+    /** GEPA combine-complementary-lessons: when the loop supplies a Pareto
+     *  frontier of >1 non-dominated parents (`ctx.paretoParents`), spend one
+     *  slot of the population on a merge of their strengths. Default `true` —
+     *  this is the GEPA-faithful behavior; the merge only fires once the
+     *  frontier has more than one member (generation ≥ 1). Set `false` for
+     *  pure single-parent reflection. */
+    combineParents?: boolean;
+    /** Cap on how many frontier parents feed one combine prompt (highest
+     *  composite first), to bound prompt size. Default 4. */
+    combineMaxParents?: number;
 }
 declare function gepaDriver(opts: GepaDriverOptions): ImprovementDriver;
 /** Extract H2 headings (`## Foo`) from a markdown surface. Exported for
@@ -295,6 +309,17 @@ interface RunCampaignOptions<TScenario extends Scenario, TArtifact> {
      *  at `<runDir>/traces/`. `'off'` disables capture entirely — substrate
      *  refuses this when the caller wires `autoOnPromote !== 'none'`. */
     tracing?: 'on' | 'off';
+    /**
+     * Per-cell usage expectation — the early, fine-grained sibling of the
+     * batch `assertRealBackend` guard. A cell that produced an artifact (no
+     * error) but reported `costUsd === 0` AND zero tokens is a stub: the
+     * dispatch never reported LLM activity via `ctx.cost`. Modes:
+     *   - `'warn'` (default) — log the offending cell loudly, keep going.
+     *   - `'assert'` — throw `BackendIntegrityError` on the first such cell
+     *     (fail-fast; recommended for CI campaigns expecting real LLM calls).
+     *   - `'off'` — no check (replay / deterministic-only / offline analysis).
+     */
+    expectUsage?: 'assert' | 'warn' | 'off';
     /** Test seam — override the wall clock for deterministic tests. */
     now?: () => Date;
     /** Test seam — override per-cell trace writer factory. */
@@ -399,6 +424,12 @@ interface RunOptimizationResult<TArtifact, TScenario extends Scenario> {
      *  emitted provenance record. Absent when the winner is the baseline. */
     winnerRationale?: string;
     baselineCampaign: CampaignResult<TArtifact, TScenario>;
+    /** The GEPA Pareto frontier across every scored surface (baseline + all
+     *  generations) by per-scenario objective vector — the non-dominated set.
+     *  Each generation's `propose()` received the frontier-so-far as
+     *  `ctx.paretoParents`; this is the final frontier. A surface here that is
+     *  NOT the winner is uniquely best on some scenario the winner loses on. */
+    paretoFrontier: ParetoParent[];
 }
 declare function runOptimization<TScenario extends Scenario, TArtifact>(opts: RunOptimizationOptions<TScenario, TArtifact>): Promise<RunOptimizationResult<TArtifact, TScenario>>;
 declare function surfaceHash(surface: MutableSurface): string;
@@ -649,4 +680,4 @@ interface EmitLoopProvenanceArgs<TArtifact, TScenario extends Scenario> extends
  */
 declare function emitLoopProvenance<TArtifact, TScenario extends Scenario>(args: EmitLoopProvenanceArgs<TArtifact, TScenario>): Promise<EmitLoopProvenanceResult>;
-export { provenanceSpansPath as A, type BuildLoopProvenanceArgs as B, type CampaignStorage as C, type DefaultProductionGateOptions as D, type EmitLoopProvenanceArgs as E, runCampaign as F, type GepaDriverConstraints as G, type HeldOutGateOptions as H, runEval as I, runImprovementLoop as J, runOptimization as K, type LoopProvenanceBackend as L, surfaceContentHash as M, surfaceHash as N, type OpenAutoPrOptions as O, type RunCampaignOptions as R, type EmitLoopProvenanceResult as a, type EvolutionaryDriverOptions as b, type GepaDriverOptions as c, type LoopProvenanceCandidate as d, type LoopProvenanceRecord as e, type OpenAutoPrResult as f, type RunEvalOptions as g, type RunImprovementLoopOptions as h, type RunImprovementLoopResult as i, type RunOptimizationOptions as j, type RunOptimizationResult as k, buildLoopProvenanceRecord as l, composeGate as m, countSentenceEdits as n, defaultProductionGate as o, defaultRenderDiff as p, emitLoopProvenance as q, evolutionaryDriver as r, extractH2Sections as s, fsCampaignStorage as t, gepaDriver as u, heldOutGate as v, inMemoryCampaignStorage as w, loopProvenanceSpans as x, openAutoPr as y, provenanceRecordPath as z };
+export { loopProvenanceSpans as A, type BuildLoopProvenanceArgs as B, type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, openAutoPr as F, type GepaDriverOptions as G, type HeldOutGateOptions as H, provenanceRecordPath as I, provenanceSpansPath as J, runOptimization as K, type LoopProvenanceRecord as L, surfaceContentHash as M, surfaceHash as N, type OpenAutoPrOptions as O, type RunImprovementLoopResult as R, type RunCampaignOptions as a, type RunEvalOptions as b, type RunImprovementLoopOptions as c, composeGate as d, defaultProductionGate as e, evolutionaryDriver as f, fsCampaignStorage as g, gepaDriver as h, heldOutGate as i, inMemoryCampaignStorage as j, runEval as k, runImprovementLoop as l, type EmitLoopProvenanceArgs as m, type EmitLoopProvenanceResult as n, type GepaDriverConstraints as o, type LoopProvenanceBackend as p, type LoopProvenanceCandidate as q, runCampaign as r, type OpenAutoPrResult as s, type RunOptimizationOptions as t, type RunOptimizationResult as u, buildLoopProvenanceRecord as v, countSentenceEdits as w, defaultRenderDiff as x, emitLoopProvenance as y, extractH2Sections as z };

package/dist/{registry-qmbYT3Eo.d.ts → registry-DPly4_hZ.d.ts} RENAMED Viewed

@@ -1,5 +1,5 @@
 import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-DbjLfz-K.js';
-import { R as RunRecord } from './run-record-DgUVo5pw.js';
+import { R as RunRecord } from './run-record-BgTFzO2r.js';
 import { T as TraceAnalysisStore } from './store-jzKpMl16.js';
 import { J as JudgeInput } from './types-DhqpAi_z.js';

package/dist/{release-report-DszkgvJ3.d.ts → release-report-DGoeObZT.d.ts} RENAMED Viewed

@@ -1,8 +1,8 @@
 import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
 import { a as JudgeScore } from './types-DhqpAi_z.js';
 import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-B2kL-fSM.js';
-import { m as GateDecision } from './summary-report-BQvXpvaR.js';
-import { R as RunRecord, a as RunSplitTag } from './run-record-DgUVo5pw.js';
+import { m as GateDecision } from './summary-report-ByiOUrHj.js';
+import { R as RunRecord, b as RunSplitTag } from './run-record-BgTFzO2r.js';
 /**
  * Release confidence gate.

package/dist/reporting.d.ts CHANGED Viewed

@@ -1,8 +1,8 @@
-export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-DgBHWsh7.js';
-export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-DszkgvJ3.js';
+export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-D_4BSXGV.js';
+export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-DGoeObZT.js';
 export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
-export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-BQvXpvaR.js';
-import './run-record-DgUVo5pw.js';
+export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-ByiOUrHj.js';
+import './run-record-BgTFzO2r.js';
 import './errors-Dwqw-T_m.js';
 import './schema-m0gsnbt3.js';
 import './outcome-store-D6KWmYvj.js';

package/dist/{researcher-BaVsy0sW.d.ts → researcher-WJvIpX3L.d.ts} RENAMED Viewed

@@ -1,6 +1,6 @@
-import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-DgUVo5pw.js';
+import { b as RunSplitTag, a as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-BgTFzO2r.js';
 import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-DbjLfz-K.js';
-import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-BQvXpvaR.js';
+import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-ByiOUrHj.js';
 import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DEZwY14K.js';
 import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CJzrpUua.js';
 import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';

package/dist/rl.d.ts CHANGED Viewed

@@ -1,17 +1,17 @@
-import { R as RunRecord, a as RunSplitTag } from './run-record-DgUVo5pw.js';
-import { C as CampaignResult } from './types-Beb6KPqZ.js';
-import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-BaVsy0sW.js';
-export { r as runEvalCampaign } from './researcher-BaVsy0sW.js';
+import { R as RunRecord, b as RunSplitTag } from './run-record-BgTFzO2r.js';
+import { f as CampaignResult } from './types-c2R2kfmv.js';
+import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-WJvIpX3L.js';
+export { r as runEvalCampaign } from './researcher-WJvIpX3L.js';
 import { S as Span } from './schema-m0gsnbt3.js';
 import { T as TraceStore } from './store-CKUAgsJz.js';
 import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
 export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-D6KWmYvj.js';
-import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-DgBHWsh7.js';
+import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-D_4BSXGV.js';
 import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
 import './errors-Dwqw-T_m.js';
 import './llm-client-DbjLfz-K.js';
 import './raw-provider-sink-C46HDghv.js';
-import './summary-report-BQvXpvaR.js';
+import './summary-report-ByiOUrHj.js';
 import './failure-cluster-CL7IVgkJ.js';
 import './emitter-DEZwY14K.js';
 import './integrity-CJzrpUua.js';

package/dist/{rubric-predictive-validity-DgBHWsh7.d.ts → rubric-predictive-validity-D_4BSXGV.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { R as RunRecord } from './run-record-DgUVo5pw.js';
+import { R as RunRecord } from './run-record-BgTFzO2r.js';
 import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
 /**

package/dist/{run-campaign-HXPJAUZ3.js → run-campaign-5J3ED2UJ.js} RENAMED Viewed

@@ -1,10 +1,11 @@
 import {
   runCampaign
-} from "./chunk-OLULBECP.js";
+} from "./chunk-7TPYV2ER.js";
+import "./chunk-E22YUOAL.js";
 import "./chunk-ITBRCT73.js";
 import "./chunk-3BFEG2F6.js";
 import "./chunk-PZ5AY32C.js";
 export {
   runCampaign
 };
-//# sourceMappingURL=run-campaign-HXPJAUZ3.js.map
+//# sourceMappingURL=run-campaign-5J3ED2UJ.js.map

package/dist/{run-record-DgUVo5pw.d.ts → run-record-BgTFzO2r.d.ts} RENAMED Viewed

@@ -304,4 +304,4 @@ declare function parseRunRecordSafe(input: unknown): {
 /** Round-trip helper — `JSON.parse(JSON.stringify(record))` then validate. */
 declare function roundTripRunRecord(record: RunRecord): RunRecord;
-export { type AgentProfileCell as A, validateAgentProfileCell as B, validateRunRecord as C, verifyAgentProfileCell as D, type JudgeScoresRecord as J, type RunRecord as R, type SandboxAgentProfileLike as S, type RunSplitTag as a, type RunTokenUsage as b, type RunJudgeMetadata as c, type AgentProfileCellInput as d, AGENT_PROFILE_KINDS as e, type AgentProfileCellSchemaVersion as f, AgentProfileCellValidationError as g, type AgentProfileDimensionValue as h, type AgentProfileHarness as i, type AgentProfileJson as j, type AgentProfileKind as k, type AgentProfileSource as l, type AgentProfileSourceInput as m, type RunOutcome as n, RunRecordValidationError as o, agentProfileCellHashMaterial as p, agentProfileCellKey as q, assertRunAgentProfileCell as r, buildAgentProfileCell as s, buildSandboxAgentProfileCell as t, groupRunsByAgentProfileCell as u, isRunRecord as v, parseRunRecordSafe as w, requireAgentProfileCell as x, roundTripRunRecord as y, toAgentProfileJson as z };
+export { type AgentProfileCell as A, validateAgentProfileCell as B, validateRunRecord as C, verifyAgentProfileCell as D, type JudgeScoresRecord as J, type RunRecord as R, type SandboxAgentProfileLike as S, type RunTokenUsage as a, type RunSplitTag as b, type RunJudgeMetadata as c, type AgentProfileCellInput as d, AGENT_PROFILE_KINDS as e, type AgentProfileCellSchemaVersion as f, AgentProfileCellValidationError as g, type AgentProfileDimensionValue as h, type AgentProfileHarness as i, type AgentProfileJson as j, type AgentProfileKind as k, type AgentProfileSource as l, type AgentProfileSourceInput as m, type RunOutcome as n, RunRecordValidationError as o, agentProfileCellHashMaterial as p, agentProfileCellKey as q, assertRunAgentProfileCell as r, buildAgentProfileCell as s, buildSandboxAgentProfileCell as t, groupRunsByAgentProfileCell as u, isRunRecord as v, parseRunRecordSafe as w, requireAgentProfileCell as x, roundTripRunRecord as y, toAgentProfileJson as z };

package/dist/{summary-report-BQvXpvaR.d.ts → summary-report-ByiOUrHj.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { R as RunRecord } from './run-record-DgUVo5pw.js';
+import { R as RunRecord } from './run-record-BgTFzO2r.js';
 import { F as FailureClusterReport } from './failure-cluster-CL7IVgkJ.js';
 /**

package/dist/{types-Beb6KPqZ.d.ts → types-c2R2kfmv.d.ts} RENAMED Viewed

@@ -1,3 +1,5 @@
+import { a as RunTokenUsage } from './run-record-BgTFzO2r.js';
 /**
  * @experimental
  *
@@ -17,6 +19,7 @@
  * No new architecture vs 0.38 — Pass A formalizes the shapes so consumers
  * can build dashboards / CI gates / regression diffs against a stable schema.
  */
 /** @experimental Stable identifier + kind tag for any scenario. Consumers
  *  extend with their per-domain payload (persona, task, requirement, ...). */
 interface Scenario {
@@ -136,6 +139,28 @@ interface ProposedCandidate {
 /** @experimental Type guard: a proposal carrying its rationale vs a bare
  *  surface. The loop branches on this to populate `GenerationCandidate`. */
 declare function isProposedCandidate(value: MutableSurface | ProposedCandidate): value is ProposedCandidate;
+/** @experimental A non-dominated parent on the GEPA Pareto frontier — a
+ *  surface that, across the per-scenario objective vectors, no other tried
+ *  surface beats on every scenario. A candidate worse on the mean composite
+ *  but uniquely best on one hard scenario is non-dominated and survives here;
+ *  the composite-best ranking would discard the lesson it carries. The loop
+ *  computes the frontier across ALL generations and hands it to the driver so
+ *  a reflective driver can combine complementary lessons (GEPA, Agrawal et
+ *  al., arXiv:2507.19457). See `pareto.ts` (`paretoFrontier`). */
+interface ParetoParent {
+    surface: MutableSurface;
+    surfaceHash: string;
+    /** The objective vector: per-scenario composite (higher is better). The
+     *  axes the frontier is computed over. */
+    objectives: Record<string, number>;
+    /** Mean composite across the objective scenarios — the scalar summary used
+     *  for ordering + display, NOT for dominance. */
+    composite: number;
+    /** Generation that produced this surface (`-1` for the baseline). */
+    generation: number;
+    label?: string;
+    rationale?: string;
+}
 /** @experimental Stateless surface mutation — given findings + current
  *  surface, return N candidate surfaces. Pure transform, no generation
  *  awareness. Reflective-mutation, `runMultiShotOptimization`, `AxGEPA`
@@ -174,6 +199,13 @@ interface ProposeContext<TFindings = unknown> {
      *  1 = single-shot; >1 = it may iterate on its own change before handing it
      *  back to be measured. */
     maxImprovementShots?: number;
+    /** GEPA Pareto frontier across ALL generations so far — the non-dominated
+     *  surfaces by per-scenario objective vector. Empty/absent on generation 0
+     *  (only the baseline is scored). A reflective driver combines the
+     *  complementary lessons of these parents (each excels on different
+     *  scenarios) into a merged candidate. Drivers doing pure single-parent
+     *  reflection may ignore it. See {@link ParetoParent}. */
+    paretoParents?: ParetoParent[];
 }
 /** @experimental A surface-improvement strategy — the DRIVER of the
  *  improvement loop. Given the current best surface, the history of what's
@@ -257,17 +289,18 @@ interface CampaignArtifactWriter {
     write(path: string, content: string | Uint8Array): Promise<string>;
     writeJson(path: string, value: unknown): Promise<string>;
 }
-/** Token usage accumulated for a cell. Structurally mirrors `RunTokenUsage`
- *  (run-record.ts) so a cell maps cleanly onto a `RunRecord` for the
- *  backend-integrity guard without coupling the campaign module to it. */
-interface CampaignTokenUsage {
-    input: number;
-    output: number;
-    cached?: number;
-}
-/** @experimental Cell-scoped cost meter. Substrate auto-tracks LLM costs
- *  via the cost-ledger backend hooks; consumers can record additional
- *  spend (sandbox time, tool costs) via `observe`. */
+/** Token usage accumulated for a cell. Aliased to the canonical `RunTokenUsage`
+ *  (run-record.ts, same package) so a cell maps onto a `RunRecord` for the
+ *  backend-integrity guard with ONE source of truth — a field added to
+ *  `RunTokenUsage` is a compile error here, not a silent drift. */
+type CampaignTokenUsage = RunTokenUsage;
+/** @experimental Cell-scoped cost meter. NOTHING is captured automatically —
+ *  the substrate does not intercept the LLM call, so it cannot see cost or
+ *  tokens unless the dispatch reports them. Every LLM cost MUST be reported via
+ *  `observe` and every token count via `observeTokens`; a dispatch that reports
+ *  neither yields a `{cost:0, tokens:0}` cell, which the backend-integrity
+ *  guard (`assertRealBackend`) correctly reads as a stub. Also use `observe`
+ *  for non-LLM spend (sandbox time, tool costs). */
 interface CampaignCostMeter {
     observe(amountUsd: number, source: string): void;
     /** Record LLM token usage for this cell; accumulates across calls. A cell
@@ -450,4 +483,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
     scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
 }
-export { labelTrustRank as A, type CampaignResult as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ProposeContext as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type LabeledScenarioWrite as c, type LabeledScenarioSampleArgs as d, type LabeledScenarioRecord as e, type LabelTrust as f, type LabeledScenarioSource as g, type CodeSurface as h, type CampaignAggregates as i, type CampaignArtifactWriter as j, type CampaignCellResult as k, type CampaignCostMeter as l, type CampaignTokenUsage as m, type CampaignTraceWriter as n, type GateContext as o, type GateDecision as p, type GateResult as q, type GenerationCandidate as r, type GenerationRecord as s, type JudgeAggregate as t, type JudgeDimension as u, type Mutator as v, type ProposedCandidate as w, type ScenarioAggregate as x, type SessionScript as y, isProposedCandidate as z };
+export { isProposedCandidate as A, labelTrustRank as B, type CampaignAggregates as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ParetoParent as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type CampaignArtifactWriter as c, type CampaignCellResult as d, type CampaignCostMeter as e, type CampaignResult as f, type CampaignTraceWriter as g, type CodeSurface as h, type GateContext as i, type GateDecision as j, type GateResult as k, type GenerationCandidate as l, type GenerationRecord as m, type JudgeDimension as n, type Mutator as o, type SessionScript as p, type LabeledScenarioWrite as q, type LabeledScenarioSampleArgs as r, type LabeledScenarioRecord as s, type LabelTrust as t, type LabeledScenarioSource as u, type CampaignTokenUsage as v, type JudgeAggregate as w, type ProposeContext as x, type ProposedCandidate as y, type ScenarioAggregate as z };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.61.0",
+  "version": "0.63.0",
   "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {