@tangle-network/agent-eval 0.61.0 → 0.63.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/CHANGELOG.md +48 -8
  2. package/dist/adapters/http.d.ts +4 -1
  3. package/dist/adapters/langchain.d.ts +4 -1
  4. package/dist/adapters/otel.d.ts +4 -4
  5. package/dist/{agent-profile-9J9hxdm2.d.ts → agent-profile-DzcPHR1Z.d.ts} +1 -1
  6. package/dist/benchmarks/index.d.ts +2 -2
  7. package/dist/campaign/index.d.ts +388 -11
  8. package/dist/campaign/index.js +597 -12
  9. package/dist/campaign/index.js.map +1 -1
  10. package/dist/{chunk-GMXHLSLL.js → chunk-4ODZXQV2.js} +81 -98
  11. package/dist/chunk-4ODZXQV2.js.map +1 -0
  12. package/dist/{chunk-OLULBECP.js → chunk-7TPYV2ER.js} +27 -1
  13. package/dist/chunk-7TPYV2ER.js.map +1 -0
  14. package/dist/chunk-E22YUOAL.js +111 -0
  15. package/dist/chunk-E22YUOAL.js.map +1 -0
  16. package/dist/{chunk-SUGME4OT.js → chunk-Z7ZU7IYZ.js} +209 -85
  17. package/dist/chunk-Z7ZU7IYZ.js.map +1 -0
  18. package/dist/contract/index.d.ts +9 -9
  19. package/dist/contract/index.js +4 -3
  20. package/dist/contract/index.js.map +1 -1
  21. package/dist/{control-Bf8owbuG.d.ts → control-DxvZeV5X.d.ts} +1 -1
  22. package/dist/control.d.ts +2 -2
  23. package/dist/hosted/index.d.ts +4 -4
  24. package/dist/{index-Bvk35ils.d.ts → index-DsnOpCO6.d.ts} +1 -1
  25. package/dist/{index-D9dwa00f.d.ts → index-GISRh500.d.ts} +2 -2
  26. package/dist/index.d.ts +98 -14
  27. package/dist/index.js +331 -128
  28. package/dist/index.js.map +1 -1
  29. package/dist/meta-eval/index.d.ts +2 -2
  30. package/dist/multishot/index.js.map +1 -1
  31. package/dist/openapi.json +1 -1
  32. package/dist/{provenance-D0WeCXt1.d.ts → provenance-cUnovpWV.d.ts} +42 -11
  33. package/dist/{registry-qmbYT3Eo.d.ts → registry-DPly4_hZ.d.ts} +1 -1
  34. package/dist/{release-report-DszkgvJ3.d.ts → release-report-DGoeObZT.d.ts} +2 -2
  35. package/dist/reporting.d.ts +4 -4
  36. package/dist/{researcher-BaVsy0sW.d.ts → researcher-WJvIpX3L.d.ts} +2 -2
  37. package/dist/rl.d.ts +6 -6
  38. package/dist/{rubric-predictive-validity-DgBHWsh7.d.ts → rubric-predictive-validity-D_4BSXGV.d.ts} +1 -1
  39. package/dist/{run-campaign-HXPJAUZ3.js → run-campaign-5J3ED2UJ.js} +3 -2
  40. package/dist/{run-record-DgUVo5pw.d.ts → run-record-BgTFzO2r.d.ts} +1 -1
  41. package/dist/{summary-report-BQvXpvaR.d.ts → summary-report-ByiOUrHj.d.ts} +1 -1
  42. package/dist/{types-Beb6KPqZ.d.ts → types-c2R2kfmv.d.ts} +45 -12
  43. package/package.json +1 -1
  44. package/dist/chunk-GMXHLSLL.js.map +0 -1
  45. package/dist/chunk-OLULBECP.js.map +0 -1
  46. package/dist/chunk-SUGME4OT.js.map +0 -1
  47. package/dist/{run-campaign-HXPJAUZ3.js.map → run-campaign-5J3ED2UJ.js.map} +0 -0
@@ -2,8 +2,8 @@ import { T as TraceStore } from '../store-CKUAgsJz.js';
2
2
  import { R as Run } from '../schema-m0gsnbt3.js';
3
3
  import { a as OutcomeFilter, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
4
4
  export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-D6KWmYvj.js';
5
- export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-DgBHWsh7.js';
6
- import '../run-record-DgUVo5pw.js';
5
+ export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-D_4BSXGV.js';
6
+ import '../run-record-BgTFzO2r.js';
7
7
  import '../errors-Dwqw-T_m.js';
8
8
 
9
9
  /**
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/multishot/router.ts","../../src/multishot/default-tools.ts","../../src/multishot/judges.ts","../../src/multishot/matrix.ts","../../src/multishot/types.ts","../../src/multishot/multishot.ts"],"sourcesContent":["// Router fetch helper — single source of truth for OpenAI-compat calls\n// against the Tangle router. Used by the driver, agent, judges, and the\n// default tool executors.\n\nimport type { MultishotToolDefinition } from './types'\n\nexport interface RouterCompletionRequest {\n apiKey: string\n baseUrl: string\n model: string\n messages: Array<Record<string, unknown>>\n tools?: MultishotToolDefinition[]\n temperature?: number\n maxTokens?: number\n signal?: AbortSignal\n}\n\nexport interface RouterToolCall {\n id: string\n type: 'function'\n function: { name: string; arguments: string }\n}\n\nexport interface RouterCompletionResponse {\n message: { content?: string | null; tool_calls?: RouterToolCall[] }\n usage?: { prompt_tokens?: number; completion_tokens?: number }\n}\n\nexport async function routerCompletion(\n req: RouterCompletionRequest,\n): Promise<RouterCompletionResponse> {\n const body: Record<string, unknown> = {\n model: req.model,\n messages: req.messages,\n temperature: req.temperature ?? 0.7,\n max_tokens: req.maxTokens ?? 
2000,\n }\n if (req.tools?.length) body.tools = req.tools\n const url = `${req.baseUrl.replace(/\\/+$/, '')}/chat/completions`\n const res = await fetch(url, {\n method: 'POST',\n headers: { Authorization: `Bearer ${req.apiKey}`, 'Content-Type': 'application/json' },\n body: JSON.stringify(body),\n signal: req.signal,\n })\n if (!res.ok) {\n const text = await res.text()\n throw new Error(`router ${res.status}: ${text.slice(0, 300)}`)\n }\n const json = (await res.json()) as {\n choices: Array<{ message: { content?: string | null; tool_calls?: RouterToolCall[] } }>\n usage?: { prompt_tokens?: number; completion_tokens?: number }\n }\n const choice = json.choices[0]\n if (!choice) throw new Error(`router returned no choices: ${JSON.stringify(json).slice(0, 200)}`)\n return { message: choice.message, usage: json.usage }\n}\n\n// Rough per-model cost estimator. Used for cost-ceiling enforcement.\n// Underestimates Anthropic, overestimates oss models — fine for ceilings.\nexport function estimateRouterCost(\n model: string,\n usage?: { prompt_tokens?: number; completion_tokens?: number },\n): number {\n if (!usage) return 0\n const inputTok = usage.prompt_tokens ?? 0\n const outputTok = usage.completion_tokens ?? 0\n let inPer1k = 0.003\n let outPer1k = 0.015\n if (model.includes('gpt-4o-mini')) {\n inPer1k = 0.00015\n outPer1k = 0.0006\n } else if (model.includes('gpt-5.4') || model.includes('claude-sonnet')) {\n inPer1k = 0.003\n outPer1k = 0.015\n } else if (model.includes('kimi') || model.includes('glm') || model.includes('deepseek')) {\n inPer1k = 0.0005\n outPer1k = 0.002\n }\n return (inputTok * inPer1k + outputTok * outPer1k) / 1000\n}\n\nexport function defaultRouterBaseUrl(): string {\n return (process.env.TANGLE_ROUTER_BASE_URL ?? 
'https://router.tangle.tools/v1').replace(\n /\\/+$/,\n '',\n )\n}\n\nexport function requireRouterApiKey(): string {\n const key = process.env.TANGLE_API_KEY\n if (!key) throw new Error('multishot requires TANGLE_API_KEY (router-scoped sk-tan-* key)')\n return key\n}\n","// Default delegate_research + delegate_code tools and their inline executors.\n//\n// Consumers can override either by passing their own tools + executors to\n// runMultishot. The defaults are sufficient for most domains — point the\n// researcher system prompt at your domain's citation style and the coder\n// at your preferred language.\n\nimport { estimateRouterCost, routerCompletion } from './router'\nimport type { MultishotToolDefinition, MultishotToolExecutor } from './types'\n\nexport const DEFAULT_RESEARCHER_MODEL = 'openai/gpt-4o-mini'\nexport const DEFAULT_CODER_MODEL = 'openai/gpt-4o-mini'\n\nexport interface DefaultResearcherConfig {\n /** Replace the system prompt to bias the researcher toward a domain's\n * citation style. Defaults to a generic \"cite sources by name\" prompt. */\n systemPrompt?: string\n model?: string\n}\n\nexport interface DefaultCoderConfig {\n /** Replace the system prompt to bias the coder toward a language /\n * framework / artifact style. */\n systemPrompt?: string\n model?: string\n}\n\nconst GENERIC_RESEARCHER_SYSTEM =\n 'You are a research specialist. Return a markdown brief with 3-5 findings. Each finding cites a specific source by name. Add a confidence level (high/medium/low) per finding. No fluff, no preamble.'\n\nconst GENERIC_CODER_SYSTEM =\n 'You are an expert engineer. Output ONE fenced code block containing the complete solution. Inline-comment non-obvious decisions. No explanation outside the block.'\n\nexport const DEFAULT_DELEGATE_RESEARCH_TOOL: MultishotToolDefinition = {\n type: 'function',\n function: {\n name: 'delegate_research',\n description:\n 'Research a topic deeply via specialist. Returns evidence-bearing items with citations. 
Use for audience research, competitive intel, regulatory landscape, market data, citation-grounded analysis.',\n parameters: {\n type: 'object',\n properties: {\n question: { type: 'string', description: 'Specific question to research' },\n scope: {\n type: 'string',\n description: 'Optional scope: time window, geography, jurisdiction, segment',\n },\n },\n required: ['question'],\n },\n },\n}\n\nexport const DEFAULT_DELEGATE_CODE_TOOL: MultishotToolDefinition = {\n type: 'function',\n function: {\n name: 'delegate_code',\n description:\n 'Generate a runnable script, template, pipeline, or tool via specialist. Returns complete working code or structured markdown. Use for content pipelines, calc snippets, dashboards, compliance checklists, deadline trackers.',\n parameters: {\n type: 'object',\n properties: {\n goal: { type: 'string', description: 'What the code must accomplish' },\n language: {\n type: 'string',\n description: 'Optional language preference (default: TypeScript)',\n },\n },\n required: ['goal'],\n },\n },\n}\n\nexport function createResearchExecutor(\n config: DefaultResearcherConfig = {},\n): MultishotToolExecutor {\n const systemPrompt = config.systemPrompt ?? GENERIC_RESEARCHER_SYSTEM\n const model = config.model ?? DEFAULT_RESEARCHER_MODEL\n return async (args, ctx) => {\n const question = String(args.question ?? '')\n const scope = args.scope ? String(args.scope) : undefined\n const { message, usage } = await routerCompletion({\n apiKey: ctx.apiKey,\n baseUrl: ctx.baseUrl,\n model,\n temperature: 0.3,\n maxTokens: 1800,\n messages: [\n { role: 'system', content: systemPrompt },\n { role: 'user', content: `Research: ${question}${scope ? `\\nScope: ${scope}` : ''}` },\n ],\n signal: ctx.signal,\n })\n return { content: message.content ?? '', costUsd: estimateRouterCost(model, usage) }\n }\n}\n\nexport function createCodeExecutor(config: DefaultCoderConfig = {}): MultishotToolExecutor {\n const systemPrompt = config.systemPrompt ?? 
GENERIC_CODER_SYSTEM\n const model = config.model ?? DEFAULT_CODER_MODEL\n return async (args, ctx) => {\n const goal = String(args.goal ?? '')\n const language = args.language ? String(args.language) : 'TypeScript'\n const { message, usage } = await routerCompletion({\n apiKey: ctx.apiKey,\n baseUrl: ctx.baseUrl,\n model,\n temperature: 0.2,\n maxTokens: 2000,\n messages: [\n { role: 'system', content: `${systemPrompt}\\n\\nLanguage: ${language}` },\n { role: 'user', content: `Produce: ${goal}` },\n ],\n signal: ctx.signal,\n })\n return { content: message.content ?? '', costUsd: estimateRouterCost(model, usage) }\n }\n}\n\nexport interface DefaultToolsConfig {\n research?: DefaultResearcherConfig\n code?: DefaultCoderConfig\n /** When true (default), each tool result is recorded as a typed artifact:\n * research → type='research', code → type='code'. */\n recordArtifacts?: boolean\n}\n\nexport interface DefaultToolsBundle {\n tools: MultishotToolDefinition[]\n executors: Record<string, MultishotToolExecutor>\n artifactTypeFor: (toolName: string) => string | undefined\n}\n\nexport function defaultDelegationTools(config: DefaultToolsConfig = {}): DefaultToolsBundle {\n return {\n tools: [DEFAULT_DELEGATE_RESEARCH_TOOL, DEFAULT_DELEGATE_CODE_TOOL],\n executors: {\n delegate_research: createResearchExecutor(config.research),\n delegate_code: createCodeExecutor(config.code),\n },\n artifactTypeFor: (name) =>\n name === 'delegate_research' ? 'research' : name === 'delegate_code' ? 
'code' : undefined,\n }\n}\n\nexport { defaultRouterBaseUrl } from './router'\n","// Generic judge runner — domain consumers configure dimensions + prompts.\n//\n// Three judge slots are conventional for multishot eval:\n// - conversation (scores the full transcript)\n// - codeReview (scores each code artifact)\n// - contentQuality (scores each non-code artifact)\n//\n// But the runJudge primitive is fully generic — any T → JudgeScore mapping.\n\nimport { defaultRouterBaseUrl, requireRouterApiKey, routerCompletion } from './router'\n\nexport const DEFAULT_JUDGE_MODEL = 'openai/gpt-4o-mini'\n\nexport interface JudgeDimension {\n /** JSON field name + score key. */\n key: string\n /** Description shown in the judge's user prompt. */\n description: string\n}\n\nexport interface JudgeConfig<TInput> {\n /** Display name (for trace + log). */\n name: string\n /** Model used for this judge. */\n model?: string\n /** 0-10 scored dimensions. */\n dimensions: JudgeDimension[]\n /** Judge system prompt — sets persona + JSON-only constraint. */\n systemPrompt: string\n /** Build the user prompt from the typed input. Must include \"Respond with\n * ONLY this JSON: { ... }\" listing each dimension key. */\n buildPrompt: (input: TInput) => string\n /** Optional model + api overrides. */\n apiKey?: string\n baseUrl?: string\n}\n\nexport interface JudgeScore {\n /** Per-dimension 0-10 score. Missing dims default to 0. */\n dimensions: Record<string, number>\n /** Mean across dimensions. */\n composite: number\n /** Free-form 1-2 sentence critique from the judge (when provided). */\n notes: string\n}\n\nconst ZERO_SCORE: JudgeScore = { dimensions: {}, composite: 0, notes: 'parse failed' }\n\nexport async function runJudge<TInput>(\n judge: JudgeConfig<TInput>,\n input: TInput,\n): Promise<JudgeScore> {\n const apiKey = judge.apiKey ?? requireRouterApiKey()\n const baseUrl = judge.baseUrl ?? defaultRouterBaseUrl()\n const model = judge.model ?? process.env.JUDGE_MODEL ?? 
DEFAULT_JUDGE_MODEL\n const prompt = judge.buildPrompt(input)\n let raw = ''\n try {\n const { message } = await routerCompletion({\n apiKey,\n baseUrl,\n model,\n temperature: 0,\n maxTokens: 1500,\n messages: [\n { role: 'system', content: judge.systemPrompt },\n { role: 'user', content: prompt },\n ],\n })\n raw = (message.content ?? '').trim()\n } catch (err) {\n return {\n ...ZERO_SCORE,\n notes: `judge ${judge.name} call failed: ${err instanceof Error ? err.message : String(err)}`,\n }\n }\n\n let parsed: Record<string, unknown> | null = null\n try {\n const cleaned = raw\n .replace(/^```json\\s*/i, '')\n .replace(/```\\s*$/, '')\n .trim()\n parsed = JSON.parse(cleaned) as Record<string, unknown>\n } catch {\n return { ...ZERO_SCORE, notes: `judge ${judge.name} returned non-JSON: ${raw.slice(0, 200)}` }\n }\n\n const dimensions: Record<string, number> = {}\n let sum = 0\n for (const dim of judge.dimensions) {\n const v = Number(parsed[dim.key] ?? 0)\n const clamped = Number.isFinite(v) ? Math.max(0, Math.min(10, v)) : 0\n dimensions[dim.key] = clamped\n sum += clamped\n }\n return {\n dimensions,\n composite: judge.dimensions.length === 0 ? 0 : sum / judge.dimensions.length,\n notes: typeof parsed.notes === 'string' ? parsed.notes : '',\n }\n}\n\n/** Convenience: stringified dimension list for inclusion in a judge prompt.\n * Returns lines like `- audience_fit: Does this match what the audience cares about? (0-10)`. */\nexport function renderDimensions(dims: readonly JudgeDimension[]): string {\n return dims.map((d) => `- ${d.key}: ${d.description}`).join('\\n')\n}\n\n/** Convenience: build the \"Respond with ONLY this JSON\" footer for a judge prompt. 
*/\nexport function renderJsonFooter(dims: readonly JudgeDimension[]): string {\n const fields = dims.map((d) => `\"${d.key}\":N`).join(',')\n return `Respond with ONLY this JSON (no markdown, no preamble):\\n{${fields},\"notes\":\"1-2 sentence critique\"}`\n}\n","// Multishot matrix wrapper — sweeps profiles × personas × reps, runs\n// the driver-agent loop per cell, applies up to three configured judges,\n// persists per-cell artifacts, and aggregates by axis.\n//\n// Uses runAgentMatrix from @tangle-network/agent-eval/matrix under the\n// hood so cell scheduling + concurrency + cost ceiling are unified with\n// other matrix consumers.\n\nimport { mkdirSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type { AgentProfile } from '@tangle-network/sandbox'\nimport type { MatrixResult } from '../matrix'\nimport { runAgentMatrix } from '../matrix'\nimport { type JudgeConfig, type JudgeScore, runJudge } from './judges'\nimport { runMultishot } from './multishot'\nimport type {\n MultishotArtifact,\n MultishotMessage,\n MultishotPersona,\n MultishotShape,\n MultishotToolDefinition,\n MultishotToolExecutor,\n} from './types'\n\nexport interface ConversationJudgeInput<TPersona extends MultishotPersona> {\n transcript: MultishotMessage[]\n persona: TPersona\n}\n\nexport interface ArtifactJudgeInput<TPersona extends MultishotPersona> {\n artifact: MultishotArtifact\n persona: TPersona\n}\n\nexport interface MultishotJudges<TPersona extends MultishotPersona> {\n /** Scores the full transcript end-to-end (always runs). */\n conversation: JudgeConfig<ConversationJudgeInput<TPersona>>\n /** Scores each code-type artifact. Optional — omit when domain has no code artifacts. */\n codeReview?: JudgeConfig<ArtifactJudgeInput<TPersona>>\n /** Scores each non-code (research/content/template) artifact. Optional. */\n contentQuality?: JudgeConfig<ArtifactJudgeInput<TPersona>>\n /** Which artifact types route to codeReview. Defaults to ['code']. 
*/\n codeArtifactTypes?: string[]\n /** Which artifact types route to contentQuality. Defaults to ['research']. */\n contentArtifactTypes?: string[]\n}\n\nexport interface CellCompositeScore {\n composite: number\n conversation: JudgeScore\n codeReview?: {\n perArtifact: Array<JudgeScore & { turn: number; type: string }>\n composite: number\n }\n contentQuality?: {\n perArtifact: Array<JudgeScore & { turn: number; type: string }>\n composite: number\n }\n}\n\nexport interface RunMultishotMatrixOptions<TPersona extends MultishotPersona> {\n /** AgentProfile axis (matrix primary). */\n profiles: Array<{ id: string; value: AgentProfile }>\n /** Persona axis. */\n personas: TPersona[]\n /** Persona-shaping callbacks. */\n shape: MultishotShape<TPersona>\n /** Judge configurations. */\n judges: MultishotJudges<TPersona>\n /** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */\n tools?: MultishotToolDefinition[]\n /** Map from tool name → inline executor. Must align with `tools`. */\n toolExecutors?: Record<string, MultishotToolExecutor>\n /** Tool name → artifact type label. Defaults to research/code mapping. */\n artifactTypeFor?: (toolName: string) => string | undefined\n /** Where per-cell artifacts land. Cells write to `<runDir>/<profileId>/<personaId>/rep-N/`. */\n runDir: string\n /** Replicates per (profile, persona) cell. */\n reps?: number\n /** Max conversation turns per cell. */\n maxTurns?: number\n /** Max concurrent cells. */\n maxConcurrency?: number\n /** Total $ ceiling across the matrix; cells aborted past this. */\n costCeiling?: number\n /** Agent model. */\n agentModel?: string\n /** Driver model. */\n driverModel?: string\n /** Pass-thru fields. 
*/\n apiKey?: string\n baseUrl?: string\n}\n\ninterface CellOutput {\n turns: number\n toolCalls: number\n artifactCount: number\n}\n\nexport interface RunMultishotMatrixResult {\n matrix: MatrixResult<CellOutput>\n}\n\nexport async function runMultishotMatrix<TPersona extends MultishotPersona>(\n opts: RunMultishotMatrixOptions<TPersona>,\n): Promise<RunMultishotMatrixResult> {\n const codeTypes = new Set(opts.judges.codeArtifactTypes ?? ['code'])\n const contentTypes = new Set(opts.judges.contentArtifactTypes ?? ['research'])\n mkdirSync(opts.runDir, { recursive: true })\n\n const matrix = await runAgentMatrix<CellOutput>({\n axes: [\n { name: 'profile', values: opts.profiles },\n { name: 'persona', values: opts.personas.map((p) => ({ id: p.id, value: p })) },\n ],\n reps: opts.reps ?? 1,\n maxConcurrency: opts.maxConcurrency ?? 2,\n costCeiling: opts.costCeiling,\n async runCell(cell) {\n const profile = cell.axes.profile?.value as AgentProfile\n const persona = cell.axes.persona?.value as TPersona\n const profileId = String(cell.axes.profile?.id ?? 'unknown')\n const personaId = String(cell.axes.persona?.id ?? 'unknown')\n\n const sim = await runMultishot({\n profile,\n persona,\n shape: opts.shape,\n tools: opts.tools,\n toolExecutors: opts.toolExecutors,\n artifactTypeFor: opts.artifactTypeFor,\n maxTurns: opts.maxTurns,\n agentModel: opts.agentModel,\n driverModel: opts.driverModel,\n apiKey: opts.apiKey,\n baseUrl: opts.baseUrl,\n })\n\n const codeArtifacts = sim.artifacts.filter((a) => codeTypes.has(a.type))\n const contentArtifacts = sim.artifacts.filter((a) => contentTypes.has(a.type))\n\n const [conversation, codeReviews, contentReviews] = await Promise.all([\n runJudge(opts.judges.conversation, { transcript: sim.transcript, persona }),\n opts.judges.codeReview\n ? 
Promise.all(\n codeArtifacts.map((artifact) =>\n runJudge(opts.judges.codeReview!, { artifact, persona }).then((s) => ({\n ...s,\n turn: artifact.turn,\n type: artifact.type,\n })),\n ),\n )\n : Promise.resolve([] as Array<JudgeScore & { turn: number; type: string }>),\n opts.judges.contentQuality\n ? Promise.all(\n contentArtifacts.map((artifact) =>\n runJudge(opts.judges.contentQuality!, { artifact, persona }).then((s) => ({\n ...s,\n turn: artifact.turn,\n type: artifact.type,\n })),\n ),\n )\n : Promise.resolve([] as Array<JudgeScore & { turn: number; type: string }>),\n ])\n\n const codeComposite =\n codeReviews.length === 0\n ? 0\n : codeReviews.reduce((s, r) => s + r.composite, 0) / codeReviews.length\n const contentComposite =\n contentReviews.length === 0\n ? 0\n : contentReviews.reduce((s, r) => s + r.composite, 0) / contentReviews.length\n\n // Composite = mean of (conversation, code, content) — empty judges count 0.\n const judgeCount = 1 + (opts.judges.codeReview ? 1 : 0) + (opts.judges.contentQuality ? 
1 : 0)\n const composite = (conversation.composite + codeComposite + contentComposite) / judgeCount\n\n const cellScore: CellCompositeScore = { composite, conversation }\n if (opts.judges.codeReview)\n cellScore.codeReview = { perArtifact: codeReviews, composite: codeComposite }\n if (opts.judges.contentQuality)\n cellScore.contentQuality = { perArtifact: contentReviews, composite: contentComposite }\n\n const cellDir = join(opts.runDir, profileId, personaId, `rep-${cell.rep}`)\n mkdirSync(cellDir, { recursive: true })\n writeFileSync(join(cellDir, 'transcript.json'), JSON.stringify(sim.transcript, null, 2))\n writeFileSync(join(cellDir, 'artifacts.json'), JSON.stringify(sim.artifacts, null, 2))\n writeFileSync(join(cellDir, 'scores.json'), JSON.stringify(cellScore, null, 2))\n\n const notes = [`convo=${conversation.composite.toFixed(1)}`]\n if (opts.judges.codeReview) notes.push(`code=${codeComposite.toFixed(1)}`)\n if (opts.judges.contentQuality) notes.push(`content=${contentComposite.toFixed(1)}`)\n\n return {\n output: {\n turns: sim.transcript.length,\n toolCalls: sim.toolCalls,\n artifactCount: sim.artifacts.length,\n },\n verdict: { valid: composite >= 5, score: composite, notes: notes.join(' ') },\n costUsd: sim.costUsd,\n durationMs: sim.durationMs,\n }\n },\n })\n\n // Persist top-level summary.\n const summary = {\n cells: matrix.summary.totalCells,\n passRate: matrix.summary.overallPassRate,\n meanScore: matrix.summary.overallMeanScore,\n totalCostUsd: matrix.summary.totalCostUsd,\n durationMs: matrix.summary.durationMs,\n runsExecuted: matrix.summary.runsExecuted,\n cellsSkipped: matrix.summary.cellsSkipped,\n byProfile: matrix.byAxis.profile,\n byPersona: matrix.byAxis.persona,\n }\n writeFileSync(join(opts.runDir, 'summary.json'), JSON.stringify(summary, null, 2))\n\n const md: string[] = [\n `# Multishot matrix`,\n ``,\n `**Cells**: ${matrix.summary.totalCells} | **Pass rate**: ${(matrix.summary.overallPassRate * 100).toFixed(0)}% | **Mean**: 
${matrix.summary.overallMeanScore.toFixed(2)} | **Cost**: $${matrix.summary.totalCostUsd.toFixed(2)} | **Duration**: ${(matrix.summary.durationMs / 1000).toFixed(0)}s`,\n ``,\n `## By profile`,\n ``,\n '| profile | pass | mean | cost |',\n '|---|---|---|---|',\n ...Object.entries(matrix.byAxis.profile ?? {}).map(\n ([id, s]) =>\n `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`,\n ),\n ``,\n `## By persona`,\n ``,\n '| persona | pass | mean | cost |',\n '|---|---|---|---|',\n ...Object.entries(matrix.byAxis.persona ?? {}).map(\n ([id, s]) =>\n `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`,\n ),\n ``,\n ]\n writeFileSync(join(opts.runDir, 'summary.md'), md.join('\\n'))\n\n return { matrix }\n}\n","// Public types for the multishot substrate.\n\nexport interface MultishotMessage {\n role: 'user' | 'assistant' | 'tool'\n content: string\n toolCallId?: string\n toolCalls?: Array<{ id: string; name: string; args: Record<string, unknown> }>\n}\n\nexport interface MultishotArtifact {\n type: string\n turn: number\n invocation: { name: string; args: Record<string, unknown> }\n content: string\n}\n\nexport interface MultishotResult {\n transcript: MultishotMessage[]\n artifacts: MultishotArtifact[]\n toolCalls: number\n durationMs: number\n costUsd: number\n}\n\nexport interface MultishotToolDefinition {\n type: 'function'\n function: {\n name: string\n description: string\n parameters: Record<string, unknown>\n }\n}\n\nexport type MultishotToolExecutor = (\n args: Record<string, unknown>,\n ctx: { apiKey: string; baseUrl: string; signal?: AbortSignal },\n) => Promise<{ content: string; costUsd: number }>\n\nexport interface MultishotPersona {\n /** Stable identifier — used for per-cell artifact paths + matrix axis keys. */\n id: string\n /** Per-domain payload (income/profile/voice/etc.) shaped by the consumer. 
*/\n [k: string]: unknown\n}\n\nexport interface MultishotShape<TPersona extends MultishotPersona> {\n /** Opening user message (turn 0) — the persona's first ask. */\n buildOpener: (persona: TPersona) => string\n /** System prompt the driver LLM uses to roleplay the persona. Should set\n * voice, goals, constraints, time-pressure, and the \"never go silent\" rule. */\n buildDriverSystemPrompt: (persona: TPersona) => string\n}\n\nexport class MultishotDriverEmptyError extends Error {\n constructor(public readonly turn: number) {\n super(`multishot: driver returned empty content twice at turn ${turn} — failing loud`)\n this.name = 'MultishotDriverEmptyError'\n }\n}\n","// Multi-turn driver-agent simulation with inline tool execution.\n//\n// The driver = LLM acting as the persona (reactive, non-deterministic).\n// The agent = the product agent under test (router call with profile's\n// systemPrompt + the configured tools).\n// Tool calls execute inline via the configured executors and feed back\n// into the agent's message log so the agent integrates the result.\n\nimport type { AgentProfile } from '@tangle-network/sandbox'\nimport { defaultDelegationTools } from './default-tools'\nimport {\n defaultRouterBaseUrl,\n estimateRouterCost,\n requireRouterApiKey,\n routerCompletion,\n} from './router'\nimport {\n type MultishotArtifact,\n MultishotDriverEmptyError,\n type MultishotMessage,\n type MultishotPersona,\n type MultishotResult,\n type MultishotShape,\n type MultishotToolDefinition,\n type MultishotToolExecutor,\n} from './types'\n\nexport interface RunMultishotOptions<TPersona extends MultishotPersona> {\n profile: AgentProfile\n persona: TPersona\n shape: MultishotShape<TPersona>\n /** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */\n tools?: MultishotToolDefinition[]\n /** Map from tool name → executor invoked inline when the agent emits a tool_call. 
*/\n toolExecutors?: Record<string, MultishotToolExecutor>\n /** Map from tool name → artifact type label written into MultishotArtifact.type.\n * Tools without a mapping still execute, but their results aren't surfaced as\n * typed artifacts (only as tool messages in the transcript). */\n artifactTypeFor?: (toolName: string) => string | undefined\n maxTurns?: number\n agentModel?: string\n driverModel?: string\n apiKey?: string\n baseUrl?: string\n signal?: AbortSignal\n}\n\nexport async function runMultishot<TPersona extends MultishotPersona>(\n opts: RunMultishotOptions<TPersona>,\n): Promise<MultishotResult> {\n const apiKey = opts.apiKey ?? requireRouterApiKey()\n const baseUrl = opts.baseUrl ?? defaultRouterBaseUrl()\n const maxTurns = opts.maxTurns ?? 10\n const agentModel = opts.agentModel ?? 'openai/gpt-5.4'\n const driverModel = opts.driverModel ?? 'openai/gpt-4o-mini'\n\n const bundle =\n opts.tools && opts.toolExecutors\n ? {\n tools: opts.tools,\n executors: opts.toolExecutors,\n artifactTypeFor: opts.artifactTypeFor ?? (() => undefined),\n }\n : defaultDelegationTools()\n const tools = opts.tools ?? bundle.tools\n const executors = opts.toolExecutors ?? bundle.executors\n const artifactTypeFor = opts.artifactTypeFor ?? bundle.artifactTypeFor\n\n const start = Date.now()\n const transcript: MultishotMessage[] = []\n const artifacts: MultishotArtifact[] = []\n let toolCalls = 0\n let totalCostUsd = 0\n\n const opener = opts.shape.buildOpener(opts.persona)\n transcript.push({ role: 'user', content: opener })\n\n const systemPrompt = opts.profile.prompt?.systemPrompt ?? 
''\n const agentMessages: Array<Record<string, unknown>> = [\n { role: 'system', content: systemPrompt },\n { role: 'user', content: opener },\n ]\n\n for (let turn = 0; turn < maxTurns; turn++) {\n if (opts.signal?.aborted) throw new Error('multishot aborted')\n\n const { message: agentMsg, usage: agentUsage } = await routerCompletion({\n apiKey,\n baseUrl,\n model: agentModel,\n messages: agentMessages,\n tools,\n temperature: 0.7,\n maxTokens: 2500,\n signal: opts.signal,\n })\n totalCostUsd += estimateRouterCost(agentModel, agentUsage)\n\n const agentText = (agentMsg.content ?? '').trim()\n const agentToolCalls = (agentMsg.tool_calls ?? []).map((tc) => ({\n id: tc.id,\n name: tc.function.name,\n args: (() => {\n try {\n return JSON.parse(tc.function.arguments) as Record<string, unknown>\n } catch {\n return {} as Record<string, unknown>\n }\n })(),\n }))\n\n agentMessages.push({\n role: 'assistant',\n content: agentText || null,\n ...(agentMsg.tool_calls?.length ? { tool_calls: agentMsg.tool_calls } : {}),\n })\n transcript.push({\n role: 'assistant',\n content: agentText,\n toolCalls: agentToolCalls.length > 0 ? agentToolCalls : undefined,\n })\n\n for (const tc of agentToolCalls) {\n toolCalls++\n let toolResult = ''\n try {\n const executor = executors[tc.name]\n if (!executor) {\n toolResult = JSON.stringify({ error: `unknown tool ${tc.name}` })\n } else {\n const r = await executor(tc.args, { apiKey, baseUrl, signal: opts.signal })\n toolResult = r.content\n totalCostUsd += r.costUsd\n const artifactType = artifactTypeFor(tc.name)\n if (artifactType) {\n artifacts.push({\n type: artifactType,\n turn,\n invocation: { name: tc.name, args: tc.args },\n content: toolResult,\n })\n }\n }\n } catch (err) {\n toolResult = JSON.stringify({ error: err instanceof Error ? 
err.message : String(err) })\n }\n agentMessages.push({ role: 'tool', tool_call_id: tc.id, content: toolResult || 'done' })\n transcript.push({ role: 'tool', content: toolResult || 'done', toolCallId: tc.id })\n }\n\n // If the agent emitted tool_calls, give it a follow-up turn to integrate the results.\n if (agentToolCalls.length > 0) {\n const followUp = await routerCompletion({\n apiKey,\n baseUrl,\n model: agentModel,\n messages: agentMessages,\n temperature: 0.7,\n maxTokens: 2000,\n signal: opts.signal,\n })\n totalCostUsd += estimateRouterCost(agentModel, followUp.usage)\n const followUpText = (followUp.message.content ?? '').trim()\n agentMessages.push({ role: 'assistant', content: followUpText })\n transcript.push({ role: 'assistant', content: followUpText })\n }\n\n if (turn < maxTurns - 1) {\n const driver = await driverTurn({\n apiKey,\n baseUrl,\n persona: opts.persona,\n shape: opts.shape,\n transcript,\n turn,\n model: driverModel,\n signal: opts.signal,\n })\n totalCostUsd += driver.costUsd\n agentMessages.push({ role: 'user', content: driver.content })\n transcript.push({ role: 'user', content: driver.content })\n }\n }\n\n return { transcript, artifacts, toolCalls, durationMs: Date.now() - start, costUsd: totalCostUsd }\n}\n\nasync function driverTurn<TPersona extends MultishotPersona>(opts: {\n apiKey: string\n baseUrl: string\n persona: TPersona\n shape: MultishotShape<TPersona>\n transcript: MultishotMessage[]\n turn: number\n model: string\n signal?: AbortSignal\n}): Promise<{ content: string; costUsd: number }> {\n const driverSystem = opts.shape.buildDriverSystemPrompt(opts.persona)\n\n // Translate transcript to driver POV: agent's `assistant` messages become\n // `user` (the agent talking TO the driver); the driver's prior `user`\n // messages become `assistant` (the driver's prior responses).\n const driverMessages: Array<Record<string, unknown>> = [{ role: 'system', content: driverSystem }]\n for (const msg of opts.transcript) {\n if 
(msg.role === 'tool') continue\n if (msg.role === 'assistant') driverMessages.push({ role: 'user', content: msg.content })\n else if (msg.role === 'user') driverMessages.push({ role: 'assistant', content: msg.content })\n }\n\n // Driver must never go silent. Retry once on empty content; then fail loud.\n for (let attempt = 0; attempt < 2; attempt++) {\n const { message, usage } = await routerCompletion({\n apiKey: opts.apiKey,\n baseUrl: opts.baseUrl,\n model: opts.model,\n messages: driverMessages,\n temperature: 0.9,\n maxTokens: 600,\n signal: opts.signal,\n })\n const content = (message.content ?? '').trim()\n if (content.length > 0) return { content, costUsd: estimateRouterCost(opts.model, usage) }\n }\n throw new MultishotDriverEmptyError(opts.turn)\n}\n"],"mappings":";;;;;;AA4BA,eAAsB,iBACpB,KACmC;AACnC,QAAM,OAAgC;AAAA,IACpC,OAAO,IAAI;AAAA,IACX,UAAU,IAAI;AAAA,IACd,aAAa,IAAI,eAAe;AAAA,IAChC,YAAY,IAAI,aAAa;AAAA,EAC/B;AACA,MAAI,IAAI,OAAO,OAAQ,MAAK,QAAQ,IAAI;AACxC,QAAM,MAAM,GAAG,IAAI,QAAQ,QAAQ,QAAQ,EAAE,CAAC;AAC9C,QAAM,MAAM,MAAM,MAAM,KAAK;AAAA,IAC3B,QAAQ;AAAA,IACR,SAAS,EAAE,eAAe,UAAU,IAAI,MAAM,IAAI,gBAAgB,mBAAmB;AAAA,IACrF,MAAM,KAAK,UAAU,IAAI;AAAA,IACzB,QAAQ,IAAI;AAAA,EACd,CAAC;AACD,MAAI,CAAC,IAAI,IAAI;AACX,UAAM,OAAO,MAAM,IAAI,KAAK;AAC5B,UAAM,IAAI,MAAM,UAAU,IAAI,MAAM,KAAK,KAAK,MAAM,GAAG,GAAG,CAAC,EAAE;AAAA,EAC/D;AACA,QAAM,OAAQ,MAAM,IAAI,KAAK;AAI7B,QAAM,SAAS,KAAK,QAAQ,CAAC;AAC7B,MAAI,CAAC,OAAQ,OAAM,IAAI,MAAM,+BAA+B,KAAK,UAAU,IAAI,EAAE,MAAM,GAAG,GAAG,CAAC,EAAE;AAChG,SAAO,EAAE,SAAS,OAAO,SAAS,OAAO,KAAK,MAAM;AACtD;AAIO,SAAS,mBACd,OACA,OACQ;AACR,MAAI,CAAC,MAAO,QAAO;AACnB,QAAM,WAAW,MAAM,iBAAiB;AACxC,QAAM,YAAY,MAAM,qBAAqB;AAC7C,MAAI,UAAU;AACd,MAAI,WAAW;AACf,MAAI,MAAM,SAAS,aAAa,GAAG;AACjC,cAAU;AACV,eAAW;AAAA,EACb,WAAW,MAAM,SAAS,SAAS,KAAK,MAAM,SAAS,eAAe,GAAG;AACvE,cAAU;AACV,eAAW;AAAA,EACb,WAAW,MAAM,SAAS,MAAM,KAAK,MAAM,SAAS,KAAK,KAAK,MAAM,SAAS,UAAU,GAAG;AACxF,cAAU;AACV,eAAW;AAAA,EACb;AACA,UAAQ,WAAW,UAAU,YAAY,YAAY;AACvD;AAEO,SAAS,uBAA+B;AAC7C,UAAQ,QAAQ,IAAI,0BAA0B,kCAAkC;AA
AA,IAC9E;AAAA,IACA;AAAA,EACF;AACF;AAEO,SAAS,sBAA8B;AAC5C,QAAM,MAAM,QAAQ,IAAI;AACxB,MAAI,CAAC,IAAK,OAAM,IAAI,MAAM,gEAAgE;AAC1F,SAAO;AACT;;;ACnFO,IAAM,2BAA2B;AACjC,IAAM,sBAAsB;AAgBnC,IAAM,4BACJ;AAEF,IAAM,uBACJ;AAEK,IAAM,iCAA0D;AAAA,EACrE,MAAM;AAAA,EACN,UAAU;AAAA,IACR,MAAM;AAAA,IACN,aACE;AAAA,IACF,YAAY;AAAA,MACV,MAAM;AAAA,MACN,YAAY;AAAA,QACV,UAAU,EAAE,MAAM,UAAU,aAAa,gCAAgC;AAAA,QACzE,OAAO;AAAA,UACL,MAAM;AAAA,UACN,aAAa;AAAA,QACf;AAAA,MACF;AAAA,MACA,UAAU,CAAC,UAAU;AAAA,IACvB;AAAA,EACF;AACF;AAEO,IAAM,6BAAsD;AAAA,EACjE,MAAM;AAAA,EACN,UAAU;AAAA,IACR,MAAM;AAAA,IACN,aACE;AAAA,IACF,YAAY;AAAA,MACV,MAAM;AAAA,MACN,YAAY;AAAA,QACV,MAAM,EAAE,MAAM,UAAU,aAAa,gCAAgC;AAAA,QACrE,UAAU;AAAA,UACR,MAAM;AAAA,UACN,aAAa;AAAA,QACf;AAAA,MACF;AAAA,MACA,UAAU,CAAC,MAAM;AAAA,IACnB;AAAA,EACF;AACF;AAEO,SAAS,uBACd,SAAkC,CAAC,GACZ;AACvB,QAAM,eAAe,OAAO,gBAAgB;AAC5C,QAAM,QAAQ,OAAO,SAAS;AAC9B,SAAO,OAAO,MAAM,QAAQ;AAC1B,UAAM,WAAW,OAAO,KAAK,YAAY,EAAE;AAC3C,UAAM,QAAQ,KAAK,QAAQ,OAAO,KAAK,KAAK,IAAI;AAChD,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI;AAAA,MACb;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,aAAa;AAAA,QACxC,EAAE,MAAM,QAAQ,SAAS,aAAa,QAAQ,GAAG,QAAQ;AAAA,SAAY,KAAK,KAAK,EAAE,GAAG;AAAA,MACtF;AAAA,MACA,QAAQ,IAAI;AAAA,IACd,CAAC;AACD,WAAO,EAAE,SAAS,QAAQ,WAAW,IAAI,SAAS,mBAAmB,OAAO,KAAK,EAAE;AAAA,EACrF;AACF;AAEO,SAAS,mBAAmB,SAA6B,CAAC,GAA0B;AACzF,QAAM,eAAe,OAAO,gBAAgB;AAC5C,QAAM,QAAQ,OAAO,SAAS;AAC9B,SAAO,OAAO,MAAM,QAAQ;AAC1B,UAAM,OAAO,OAAO,KAAK,QAAQ,EAAE;AACnC,UAAM,WAAW,KAAK,WAAW,OAAO,KAAK,QAAQ,IAAI;AACzD,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI;AAAA,MACb;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,GAAG,YAAY;AAAA;AAAA,YAAiB,QAAQ,GAAG;AAAA,QACtE,EAAE,MAAM,QAAQ,SAAS,YAAY,IAAI,GAAG;AAAA,MAC9C;AAAA,MACA,QAAQ,IAAI;AAAA,IACd,CAAC;AACD,WAAO,EAAE,SAAS,QAAQ,WAAW,IAAI,SAAS,mBAAmB,OAAO,KAAK,EAAE;AAAA,EACrF;AACF;AAgBO,SAAS,uBAAuB,SAA6B,CAAC,GAAuB;AAC1F,SAAO;AAAA,IACL,OAAO,CAAC,gCAAgC,0BAA0B;AAAA
,IAClE,WAAW;AAAA,MACT,mBAAmB,uBAAuB,OAAO,QAAQ;AAAA,MACzD,eAAe,mBAAmB,OAAO,IAAI;AAAA,IAC/C;AAAA,IACA,iBAAiB,CAAC,SAChB,SAAS,sBAAsB,aAAa,SAAS,kBAAkB,SAAS;AAAA,EACpF;AACF;;;ACpIO,IAAM,sBAAsB;AAmCnC,IAAM,aAAyB,EAAE,YAAY,CAAC,GAAG,WAAW,GAAG,OAAO,eAAe;AAErF,eAAsB,SACpB,OACA,OACqB;AACrB,QAAM,SAAS,MAAM,UAAU,oBAAoB;AACnD,QAAM,UAAU,MAAM,WAAW,qBAAqB;AACtD,QAAM,QAAQ,MAAM,SAAS,QAAQ,IAAI,eAAe;AACxD,QAAM,SAAS,MAAM,YAAY,KAAK;AACtC,MAAI,MAAM;AACV,MAAI;AACF,UAAM,EAAE,QAAQ,IAAI,MAAM,iBAAiB;AAAA,MACzC;AAAA,MACA;AAAA,MACA;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,MAAM,aAAa;AAAA,QAC9C,EAAE,MAAM,QAAQ,SAAS,OAAO;AAAA,MAClC;AAAA,IACF,CAAC;AACD,WAAO,QAAQ,WAAW,IAAI,KAAK;AAAA,EACrC,SAAS,KAAK;AACZ,WAAO;AAAA,MACL,GAAG;AAAA,MACH,OAAO,SAAS,MAAM,IAAI,iBAAiB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,IAC7F;AAAA,EACF;AAEA,MAAI,SAAyC;AAC7C,MAAI;AACF,UAAM,UAAU,IACb,QAAQ,gBAAgB,EAAE,EAC1B,QAAQ,WAAW,EAAE,EACrB,KAAK;AACR,aAAS,KAAK,MAAM,OAAO;AAAA,EAC7B,QAAQ;AACN,WAAO,EAAE,GAAG,YAAY,OAAO,SAAS,MAAM,IAAI,uBAAuB,IAAI,MAAM,GAAG,GAAG,CAAC,GAAG;AAAA,EAC/F;AAEA,QAAM,aAAqC,CAAC;AAC5C,MAAI,MAAM;AACV,aAAW,OAAO,MAAM,YAAY;AAClC,UAAM,IAAI,OAAO,OAAO,IAAI,GAAG,KAAK,CAAC;AACrC,UAAM,UAAU,OAAO,SAAS,CAAC,IAAI,KAAK,IAAI,GAAG,KAAK,IAAI,IAAI,CAAC,CAAC,IAAI;AACpE,eAAW,IAAI,GAAG,IAAI;AACtB,WAAO;AAAA,EACT;AACA,SAAO;AAAA,IACL;AAAA,IACA,WAAW,MAAM,WAAW,WAAW,IAAI,IAAI,MAAM,MAAM,WAAW;AAAA,IACtE,OAAO,OAAO,OAAO,UAAU,WAAW,OAAO,QAAQ;AAAA,EAC3D;AACF;AAIO,SAAS,iBAAiB,MAAyC;AACxE,SAAO,KAAK,IAAI,CAAC,MAAM,KAAK,EAAE,GAAG,KAAK,EAAE,WAAW,EAAE,EAAE,KAAK,IAAI;AAClE;AAGO,SAAS,iBAAiB,MAAyC;AACxE,QAAM,SAAS,KAAK,IAAI,CAAC,MAAM,IAAI,EAAE,GAAG,KAAK,EAAE,KAAK,GAAG;AACvD,SAAO;AAAA,GAA6D,MAAM;AAC5E;;;ACzGA,SAAS,WAAW,qBAAqB;AACzC,SAAS,YAAY;;;AC4Cd,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YAA4B,MAAc;AACxC,UAAM,0DAA0D,IAAI,sBAAiB;AAD3D;AAE1B,SAAK,OAAO;AAAA,EACd;AAAA,EAH4B;AAI9B;;;ACXA,eAAsB,aACpB,MAC0B;AAC1B,QAAM,SAAS,KAAK,UAAU,oBAAoB;AAClD,QAAM,UAAU,KAAK,WAAW,qBAAqB;AACrD,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,aAAa,KAAK,cAAc;AACtC,QAAM,cAAc,KAAK,e
AAe;AAExC,QAAM,SACJ,KAAK,SAAS,KAAK,gBACf;AAAA,IACE,OAAO,KAAK;AAAA,IACZ,WAAW,KAAK;AAAA,IAChB,iBAAiB,KAAK,oBAAoB,MAAM;AAAA,EAClD,IACA,uBAAuB;AAC7B,QAAM,QAAQ,KAAK,SAAS,OAAO;AACnC,QAAM,YAAY,KAAK,iBAAiB,OAAO;AAC/C,QAAM,kBAAkB,KAAK,mBAAmB,OAAO;AAEvD,QAAM,QAAQ,KAAK,IAAI;AACvB,QAAM,aAAiC,CAAC;AACxC,QAAM,YAAiC,CAAC;AACxC,MAAI,YAAY;AAChB,MAAI,eAAe;AAEnB,QAAM,SAAS,KAAK,MAAM,YAAY,KAAK,OAAO;AAClD,aAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,CAAC;AAEjD,QAAM,eAAe,KAAK,QAAQ,QAAQ,gBAAgB;AAC1D,QAAM,gBAAgD;AAAA,IACpD,EAAE,MAAM,UAAU,SAAS,aAAa;AAAA,IACxC,EAAE,MAAM,QAAQ,SAAS,OAAO;AAAA,EAClC;AAEA,WAAS,OAAO,GAAG,OAAO,UAAU,QAAQ;AAC1C,QAAI,KAAK,QAAQ,QAAS,OAAM,IAAI,MAAM,mBAAmB;AAE7D,UAAM,EAAE,SAAS,UAAU,OAAO,WAAW,IAAI,MAAM,iBAAiB;AAAA,MACtE;AAAA,MACA;AAAA,MACA,OAAO;AAAA,MACP,UAAU;AAAA,MACV;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,QAAQ,KAAK;AAAA,IACf,CAAC;AACD,oBAAgB,mBAAmB,YAAY,UAAU;AAEzD,UAAM,aAAa,SAAS,WAAW,IAAI,KAAK;AAChD,UAAM,kBAAkB,SAAS,cAAc,CAAC,GAAG,IAAI,CAAC,QAAQ;AAAA,MAC9D,IAAI,GAAG;AAAA,MACP,MAAM,GAAG,SAAS;AAAA,MAClB,OAAO,MAAM;AACX,YAAI;AACF,iBAAO,KAAK,MAAM,GAAG,SAAS,SAAS;AAAA,QACzC,QAAQ;AACN,iBAAO,CAAC;AAAA,QACV;AAAA,MACF,GAAG;AAAA,IACL,EAAE;AAEF,kBAAc,KAAK;AAAA,MACjB,MAAM;AAAA,MACN,SAAS,aAAa;AAAA,MACtB,GAAI,SAAS,YAAY,SAAS,EAAE,YAAY,SAAS,WAAW,IAAI,CAAC;AAAA,IAC3E,CAAC;AACD,eAAW,KAAK;AAAA,MACd,MAAM;AAAA,MACN,SAAS;AAAA,MACT,WAAW,eAAe,SAAS,IAAI,iBAAiB;AAAA,IAC1D,CAAC;AAED,eAAW,MAAM,gBAAgB;AAC/B;AACA,UAAI,aAAa;AACjB,UAAI;AACF,cAAM,WAAW,UAAU,GAAG,IAAI;AAClC,YAAI,CAAC,UAAU;AACb,uBAAa,KAAK,UAAU,EAAE,OAAO,gBAAgB,GAAG,IAAI,GAAG,CAAC;AAAA,QAClE,OAAO;AACL,gBAAM,IAAI,MAAM,SAAS,GAAG,MAAM,EAAE,QAAQ,SAAS,QAAQ,KAAK,OAAO,CAAC;AAC1E,uBAAa,EAAE;AACf,0BAAgB,EAAE;AAClB,gBAAM,eAAe,gBAAgB,GAAG,IAAI;AAC5C,cAAI,cAAc;AAChB,sBAAU,KAAK;AAAA,cACb,MAAM;AAAA,cACN;AAAA,cACA,YAAY,EAAE,MAAM,GAAG,MAAM,MAAM,GAAG,KAAK;AAAA,cAC3C,SAAS;AAAA,YACX,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF,SAAS,KAAK;AACZ,qBAAa,KAAK,UAAU,EAAE,OAAO,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,EAAE,CAAC;AAAA,MACzF;AACA,oBAAc,KAAK,EAAE,MAAM,QAAQ,cAAc,GAAG,IAAI,SAAS,cAAc,OAAO,CA
AC;AACvF,iBAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,cAAc,QAAQ,YAAY,GAAG,GAAG,CAAC;AAAA,IACpF;AAGA,QAAI,eAAe,SAAS,GAAG;AAC7B,YAAM,WAAW,MAAM,iBAAiB;AAAA,QACtC;AAAA,QACA;AAAA,QACA,OAAO;AAAA,QACP,UAAU;AAAA,QACV,aAAa;AAAA,QACb,WAAW;AAAA,QACX,QAAQ,KAAK;AAAA,MACf,CAAC;AACD,sBAAgB,mBAAmB,YAAY,SAAS,KAAK;AAC7D,YAAM,gBAAgB,SAAS,QAAQ,WAAW,IAAI,KAAK;AAC3D,oBAAc,KAAK,EAAE,MAAM,aAAa,SAAS,aAAa,CAAC;AAC/D,iBAAW,KAAK,EAAE,MAAM,aAAa,SAAS,aAAa,CAAC;AAAA,IAC9D;AAEA,QAAI,OAAO,WAAW,GAAG;AACvB,YAAM,SAAS,MAAM,WAAW;AAAA,QAC9B;AAAA,QACA;AAAA,QACA,SAAS,KAAK;AAAA,QACd,OAAO,KAAK;AAAA,QACZ;AAAA,QACA;AAAA,QACA,OAAO;AAAA,QACP,QAAQ,KAAK;AAAA,MACf,CAAC;AACD,sBAAgB,OAAO;AACvB,oBAAc,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,QAAQ,CAAC;AAC5D,iBAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,QAAQ,CAAC;AAAA,IAC3D;AAAA,EACF;AAEA,SAAO,EAAE,YAAY,WAAW,WAAW,YAAY,KAAK,IAAI,IAAI,OAAO,SAAS,aAAa;AACnG;AAEA,eAAe,WAA8C,MASX;AAChD,QAAM,eAAe,KAAK,MAAM,wBAAwB,KAAK,OAAO;AAKpE,QAAM,iBAAiD,CAAC,EAAE,MAAM,UAAU,SAAS,aAAa,CAAC;AACjG,aAAW,OAAO,KAAK,YAAY;AACjC,QAAI,IAAI,SAAS,OAAQ;AACzB,QAAI,IAAI,SAAS,YAAa,gBAAe,KAAK,EAAE,MAAM,QAAQ,SAAS,IAAI,QAAQ,CAAC;AAAA,aAC/E,IAAI,SAAS,OAAQ,gBAAe,KAAK,EAAE,MAAM,aAAa,SAAS,IAAI,QAAQ,CAAC;AAAA,EAC/F;AAGA,WAAS,UAAU,GAAG,UAAU,GAAG,WAAW;AAC5C,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,KAAK;AAAA,MACb,SAAS,KAAK;AAAA,MACd,OAAO,KAAK;AAAA,MACZ,UAAU;AAAA,MACV,aAAa;AAAA,MACb,WAAW;AAAA,MACX,QAAQ,KAAK;AAAA,IACf,CAAC;AACD,UAAM,WAAW,QAAQ,WAAW,IAAI,KAAK;AAC7C,QAAI,QAAQ,SAAS,EAAG,QAAO,EAAE,SAAS,SAAS,mBAAmB,KAAK,OAAO,KAAK,EAAE;AAAA,EAC3F;AACA,QAAM,IAAI,0BAA0B,KAAK,IAAI;AAC/C;;;AFxHA,eAAsB,mBACpB,MACmC;AACnC,QAAM,YAAY,IAAI,IAAI,KAAK,OAAO,qBAAqB,CAAC,MAAM,CAAC;AACnE,QAAM,eAAe,IAAI,IAAI,KAAK,OAAO,wBAAwB,CAAC,UAAU,CAAC;AAC7E,YAAU,KAAK,QAAQ,EAAE,WAAW,KAAK,CAAC;AAE1C,QAAM,SAAS,MAAM,eAA2B;AAAA,IAC9C,MAAM;AAAA,MACJ,EAAE,MAAM,WAAW,QAAQ,KAAK,SAAS;AAAA,MACzC,EAAE,MAAM,WAAW,QAAQ,KAAK,SAAS,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,OAAO,EAAE,EAAE,EAAE;AAAA,IAChF;AAAA,IACA,MAAM,KAAK,QAAQ;AAAA,IACnB,gBAAgB,KAAK,kBAAkB;AAAA,IACvC,aAAa,KAAK;AAAA,IAClB,MAAM,QAAQ,MA
AM;AAClB,YAAM,UAAU,KAAK,KAAK,SAAS;AACnC,YAAM,UAAU,KAAK,KAAK,SAAS;AACnC,YAAM,YAAY,OAAO,KAAK,KAAK,SAAS,MAAM,SAAS;AAC3D,YAAM,YAAY,OAAO,KAAK,KAAK,SAAS,MAAM,SAAS;AAE3D,YAAM,MAAM,MAAM,aAAa;AAAA,QAC7B;AAAA,QACA;AAAA,QACA,OAAO,KAAK;AAAA,QACZ,OAAO,KAAK;AAAA,QACZ,eAAe,KAAK;AAAA,QACpB,iBAAiB,KAAK;AAAA,QACtB,UAAU,KAAK;AAAA,QACf,YAAY,KAAK;AAAA,QACjB,aAAa,KAAK;AAAA,QAClB,QAAQ,KAAK;AAAA,QACb,SAAS,KAAK;AAAA,MAChB,CAAC;AAED,YAAM,gBAAgB,IAAI,UAAU,OAAO,CAAC,MAAM,UAAU,IAAI,EAAE,IAAI,CAAC;AACvE,YAAM,mBAAmB,IAAI,UAAU,OAAO,CAAC,MAAM,aAAa,IAAI,EAAE,IAAI,CAAC;AAE7E,YAAM,CAAC,cAAc,aAAa,cAAc,IAAI,MAAM,QAAQ,IAAI;AAAA,QACpE,SAAS,KAAK,OAAO,cAAc,EAAE,YAAY,IAAI,YAAY,QAAQ,CAAC;AAAA,QAC1E,KAAK,OAAO,aACR,QAAQ;AAAA,UACN,cAAc;AAAA,YAAI,CAAC,aACjB,SAAS,KAAK,OAAO,YAAa,EAAE,UAAU,QAAQ,CAAC,EAAE,KAAK,CAAC,OAAO;AAAA,cACpE,GAAG;AAAA,cACH,MAAM,SAAS;AAAA,cACf,MAAM,SAAS;AAAA,YACjB,EAAE;AAAA,UACJ;AAAA,QACF,IACA,QAAQ,QAAQ,CAAC,CAAuD;AAAA,QAC5E,KAAK,OAAO,iBACR,QAAQ;AAAA,UACN,iBAAiB;AAAA,YAAI,CAAC,aACpB,SAAS,KAAK,OAAO,gBAAiB,EAAE,UAAU,QAAQ,CAAC,EAAE,KAAK,CAAC,OAAO;AAAA,cACxE,GAAG;AAAA,cACH,MAAM,SAAS;AAAA,cACf,MAAM,SAAS;AAAA,YACjB,EAAE;AAAA,UACJ;AAAA,QACF,IACA,QAAQ,QAAQ,CAAC,CAAuD;AAAA,MAC9E,CAAC;AAED,YAAM,gBACJ,YAAY,WAAW,IACnB,IACA,YAAY,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,YAAY;AACrE,YAAM,mBACJ,eAAe,WAAW,IACtB,IACA,eAAe,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,eAAe;AAG3E,YAAM,aAAa,KAAK,KAAK,OAAO,aAAa,IAAI,MAAM,KAAK,OAAO,iBAAiB,IAAI;AAC5F,YAAM,aAAa,aAAa,YAAY,gBAAgB,oBAAoB;AAEhF,YAAM,YAAgC,EAAE,WAAW,aAAa;AAChE,UAAI,KAAK,OAAO;AACd,kBAAU,aAAa,EAAE,aAAa,aAAa,WAAW,cAAc;AAC9E,UAAI,KAAK,OAAO;AACd,kBAAU,iBAAiB,EAAE,aAAa,gBAAgB,WAAW,iBAAiB;AAExF,YAAM,UAAU,KAAK,KAAK,QAAQ,WAAW,WAAW,OAAO,KAAK,GAAG,EAAE;AACzE,gBAAU,SAAS,EAAE,WAAW,KAAK,CAAC;AACtC,oBAAc,KAAK,SAAS,iBAAiB,GAAG,KAAK,UAAU,IAAI,YAAY,MAAM,CAAC,CAAC;AACvF,oBAAc,KAAK,SAAS,gBAAgB,GAAG,KAAK,UAAU,IAAI,WAAW,MAAM,CAAC,CAAC;AACrF,oBAAc,KAAK,SAAS,aAAa,GAAG,KAAK,UAAU,WAAW,MAAM,CAAC,CAAC;AAE9E,YAAM,QAAQ,CAAC,SAAS,aAAa,UAAU,QAAQ,CAAC,CAAC,EAAE;AAC3D,UAAI,KAAK,OAAO,WAAY,O
AAM,KAAK,QAAQ,cAAc,QAAQ,CAAC,CAAC,EAAE;AACzE,UAAI,KAAK,OAAO,eAAgB,OAAM,KAAK,WAAW,iBAAiB,QAAQ,CAAC,CAAC,EAAE;AAEnF,aAAO;AAAA,QACL,QAAQ;AAAA,UACN,OAAO,IAAI,WAAW;AAAA,UACtB,WAAW,IAAI;AAAA,UACf,eAAe,IAAI,UAAU;AAAA,QAC/B;AAAA,QACA,SAAS,EAAE,OAAO,aAAa,GAAG,OAAO,WAAW,OAAO,MAAM,KAAK,GAAG,EAAE;AAAA,QAC3E,SAAS,IAAI;AAAA,QACb,YAAY,IAAI;AAAA,MAClB;AAAA,IACF;AAAA,EACF,CAAC;AAGD,QAAM,UAAU;AAAA,IACd,OAAO,OAAO,QAAQ;AAAA,IACtB,UAAU,OAAO,QAAQ;AAAA,IACzB,WAAW,OAAO,QAAQ;AAAA,IAC1B,cAAc,OAAO,QAAQ;AAAA,IAC7B,YAAY,OAAO,QAAQ;AAAA,IAC3B,cAAc,OAAO,QAAQ;AAAA,IAC7B,cAAc,OAAO,QAAQ;AAAA,IAC7B,WAAW,OAAO,OAAO;AAAA,IACzB,WAAW,OAAO,OAAO;AAAA,EAC3B;AACA,gBAAc,KAAK,KAAK,QAAQ,cAAc,GAAG,KAAK,UAAU,SAAS,MAAM,CAAC,CAAC;AAEjF,QAAM,KAAe;AAAA,IACnB;AAAA,IACA;AAAA,IACA,cAAc,OAAO,QAAQ,UAAU,sBAAsB,OAAO,QAAQ,kBAAkB,KAAK,QAAQ,CAAC,CAAC,iBAAiB,OAAO,QAAQ,iBAAiB,QAAQ,CAAC,CAAC,iBAAiB,OAAO,QAAQ,aAAa,QAAQ,CAAC,CAAC,qBAAqB,OAAO,QAAQ,aAAa,KAAM,QAAQ,CAAC,CAAC;AAAA,IAChS;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,OAAO,QAAQ,OAAO,OAAO,WAAW,CAAC,CAAC,EAAE;AAAA,MAC7C,CAAC,CAAC,IAAI,CAAC,MACL,KAAK,EAAE,OAAO,EAAE,WAAW,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,UAAU,QAAQ,CAAC,CAAC,OAAO,EAAE,aAAa,QAAQ,CAAC,CAAC;AAAA,IAC3G;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,OAAO,QAAQ,OAAO,OAAO,WAAW,CAAC,CAAC,EAAE;AAAA,MAC7C,CAAC,CAAC,IAAI,CAAC,MACL,KAAK,EAAE,OAAO,EAAE,WAAW,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,UAAU,QAAQ,CAAC,CAAC,OAAO,EAAE,aAAa,QAAQ,CAAC,CAAC;AAAA,IAC3G;AAAA,IACA;AAAA,EACF;AACA,gBAAc,KAAK,KAAK,QAAQ,YAAY,GAAG,GAAG,KAAK,IAAI,CAAC;AAE5D,SAAO,EAAE,OAAO;AAClB;","names":[]}
1
+ {"version":3,"sources":["../../src/multishot/router.ts","../../src/multishot/default-tools.ts","../../src/multishot/judges.ts","../../src/multishot/matrix.ts","../../src/multishot/types.ts","../../src/multishot/multishot.ts"],"sourcesContent":["// Router fetch helper — single source of truth for OpenAI-compat calls\n// against the Tangle router. Used by the driver, agent, judges, and the\n// default tool executors.\n\nimport type { MultishotToolDefinition } from './types'\n\nexport interface RouterCompletionRequest {\n apiKey: string\n baseUrl: string\n model: string\n messages: Array<Record<string, unknown>>\n tools?: MultishotToolDefinition[]\n temperature?: number\n maxTokens?: number\n signal?: AbortSignal\n}\n\nexport interface RouterToolCall {\n id: string\n type: 'function'\n function: { name: string; arguments: string }\n}\n\nexport interface RouterCompletionResponse {\n message: { content?: string | null; tool_calls?: RouterToolCall[] }\n usage?: { prompt_tokens?: number; completion_tokens?: number }\n}\n\nexport async function routerCompletion(\n req: RouterCompletionRequest,\n): Promise<RouterCompletionResponse> {\n const body: Record<string, unknown> = {\n model: req.model,\n messages: req.messages,\n temperature: req.temperature ?? 0.7,\n max_tokens: req.maxTokens ?? 
2000,\n }\n if (req.tools?.length) body.tools = req.tools\n const url = `${req.baseUrl.replace(/\\/+$/, '')}/chat/completions`\n const res = await fetch(url, {\n method: 'POST',\n headers: { Authorization: `Bearer ${req.apiKey}`, 'Content-Type': 'application/json' },\n body: JSON.stringify(body),\n signal: req.signal,\n })\n if (!res.ok) {\n const text = await res.text()\n throw new Error(`router ${res.status}: ${text.slice(0, 300)}`)\n }\n const json = (await res.json()) as {\n choices: Array<{ message: { content?: string | null; tool_calls?: RouterToolCall[] } }>\n usage?: { prompt_tokens?: number; completion_tokens?: number }\n }\n const choice = json.choices[0]\n if (!choice) throw new Error(`router returned no choices: ${JSON.stringify(json).slice(0, 200)}`)\n return { message: choice.message, usage: json.usage }\n}\n\n// Rough per-model cost estimator. Used for cost-ceiling enforcement.\n// Underestimates Anthropic, overestimates oss models — fine for ceilings.\nexport function estimateRouterCost(\n model: string,\n usage?: { prompt_tokens?: number; completion_tokens?: number },\n): number {\n if (!usage) return 0\n const inputTok = usage.prompt_tokens ?? 0\n const outputTok = usage.completion_tokens ?? 0\n let inPer1k = 0.003\n let outPer1k = 0.015\n if (model.includes('gpt-4o-mini')) {\n inPer1k = 0.00015\n outPer1k = 0.0006\n } else if (model.includes('gpt-5.4') || model.includes('claude-sonnet')) {\n inPer1k = 0.003\n outPer1k = 0.015\n } else if (model.includes('kimi') || model.includes('glm') || model.includes('deepseek')) {\n inPer1k = 0.0005\n outPer1k = 0.002\n }\n return (inputTok * inPer1k + outputTok * outPer1k) / 1000\n}\n\nexport function defaultRouterBaseUrl(): string {\n return (process.env.TANGLE_ROUTER_BASE_URL ?? 
'https://router.tangle.tools/v1').replace(\n /\\/+$/,\n '',\n )\n}\n\nexport function requireRouterApiKey(): string {\n const key = process.env.TANGLE_API_KEY\n if (!key) throw new Error('multishot requires TANGLE_API_KEY (router-scoped sk-tan-* key)')\n return key\n}\n","// Default delegate_research + delegate_code tools and their inline executors.\n//\n// Consumers can override either by passing their own tools + executors to\n// runMultishot. The defaults are sufficient for most domains — point the\n// researcher system prompt at your domain's citation style and the coder\n// at your preferred language.\n\nimport { estimateRouterCost, routerCompletion } from './router'\nimport type { MultishotToolDefinition, MultishotToolExecutor } from './types'\n\nexport const DEFAULT_RESEARCHER_MODEL = 'openai/gpt-4o-mini'\nexport const DEFAULT_CODER_MODEL = 'openai/gpt-4o-mini'\n\nexport interface DefaultResearcherConfig {\n /** Replace the system prompt to bias the researcher toward a domain's\n * citation style. Defaults to a generic \"cite sources by name\" prompt. */\n systemPrompt?: string\n model?: string\n}\n\nexport interface DefaultCoderConfig {\n /** Replace the system prompt to bias the coder toward a language /\n * framework / artifact style. */\n systemPrompt?: string\n model?: string\n}\n\nconst GENERIC_RESEARCHER_SYSTEM =\n 'You are a research specialist. Return a markdown brief with 3-5 findings. Each finding cites a specific source by name. Add a confidence level (high/medium/low) per finding. No fluff, no preamble.'\n\nconst GENERIC_CODER_SYSTEM =\n 'You are an expert engineer. Output ONE fenced code block containing the complete solution. Inline-comment non-obvious decisions. No explanation outside the block.'\n\nexport const DEFAULT_DELEGATE_RESEARCH_TOOL: MultishotToolDefinition = {\n type: 'function',\n function: {\n name: 'delegate_research',\n description:\n 'Research a topic deeply via specialist. Returns evidence-bearing items with citations. 
Use for audience research, competitive intel, regulatory landscape, market data, citation-grounded analysis.',\n parameters: {\n type: 'object',\n properties: {\n question: { type: 'string', description: 'Specific question to research' },\n scope: {\n type: 'string',\n description: 'Optional scope: time window, geography, jurisdiction, segment',\n },\n },\n required: ['question'],\n },\n },\n}\n\nexport const DEFAULT_DELEGATE_CODE_TOOL: MultishotToolDefinition = {\n type: 'function',\n function: {\n name: 'delegate_code',\n description:\n 'Generate a runnable script, template, pipeline, or tool via specialist. Returns complete working code or structured markdown. Use for content pipelines, calc snippets, dashboards, compliance checklists, deadline trackers.',\n parameters: {\n type: 'object',\n properties: {\n goal: { type: 'string', description: 'What the code must accomplish' },\n language: {\n type: 'string',\n description: 'Optional language preference (default: TypeScript)',\n },\n },\n required: ['goal'],\n },\n },\n}\n\nexport function createResearchExecutor(\n config: DefaultResearcherConfig = {},\n): MultishotToolExecutor {\n const systemPrompt = config.systemPrompt ?? GENERIC_RESEARCHER_SYSTEM\n const model = config.model ?? DEFAULT_RESEARCHER_MODEL\n return async (args, ctx) => {\n const question = String(args.question ?? '')\n const scope = args.scope ? String(args.scope) : undefined\n const { message, usage } = await routerCompletion({\n apiKey: ctx.apiKey,\n baseUrl: ctx.baseUrl,\n model,\n temperature: 0.3,\n maxTokens: 1800,\n messages: [\n { role: 'system', content: systemPrompt },\n { role: 'user', content: `Research: ${question}${scope ? `\\nScope: ${scope}` : ''}` },\n ],\n signal: ctx.signal,\n })\n return { content: message.content ?? '', costUsd: estimateRouterCost(model, usage) }\n }\n}\n\nexport function createCodeExecutor(config: DefaultCoderConfig = {}): MultishotToolExecutor {\n const systemPrompt = config.systemPrompt ?? 
GENERIC_CODER_SYSTEM\n const model = config.model ?? DEFAULT_CODER_MODEL\n return async (args, ctx) => {\n const goal = String(args.goal ?? '')\n const language = args.language ? String(args.language) : 'TypeScript'\n const { message, usage } = await routerCompletion({\n apiKey: ctx.apiKey,\n baseUrl: ctx.baseUrl,\n model,\n temperature: 0.2,\n maxTokens: 2000,\n messages: [\n { role: 'system', content: `${systemPrompt}\\n\\nLanguage: ${language}` },\n { role: 'user', content: `Produce: ${goal}` },\n ],\n signal: ctx.signal,\n })\n return { content: message.content ?? '', costUsd: estimateRouterCost(model, usage) }\n }\n}\n\nexport interface DefaultToolsConfig {\n research?: DefaultResearcherConfig\n code?: DefaultCoderConfig\n /** When true (default), each tool result is recorded as a typed artifact:\n * research → type='research', code → type='code'. */\n recordArtifacts?: boolean\n}\n\nexport interface DefaultToolsBundle {\n tools: MultishotToolDefinition[]\n executors: Record<string, MultishotToolExecutor>\n artifactTypeFor: (toolName: string) => string | undefined\n}\n\nexport function defaultDelegationTools(config: DefaultToolsConfig = {}): DefaultToolsBundle {\n return {\n tools: [DEFAULT_DELEGATE_RESEARCH_TOOL, DEFAULT_DELEGATE_CODE_TOOL],\n executors: {\n delegate_research: createResearchExecutor(config.research),\n delegate_code: createCodeExecutor(config.code),\n },\n artifactTypeFor: (name) =>\n name === 'delegate_research' ? 'research' : name === 'delegate_code' ? 
'code' : undefined,\n }\n}\n\nexport { defaultRouterBaseUrl } from './router'\n","// Generic judge runner — domain consumers configure dimensions + prompts.\n//\n// Three judge slots are conventional for multishot eval:\n// - conversation (scores the full transcript)\n// - codeReview (scores each code artifact)\n// - contentQuality (scores each non-code artifact)\n//\n// But the runJudge primitive is fully generic — any T → JudgeScore mapping.\n\nimport { defaultRouterBaseUrl, requireRouterApiKey, routerCompletion } from './router'\n\nexport const DEFAULT_JUDGE_MODEL = 'openai/gpt-4o-mini'\n\nexport interface JudgeDimension {\n /** JSON field name + score key. */\n key: string\n /** Description shown in the judge's user prompt. */\n description: string\n}\n\nexport interface JudgeConfig<TInput> {\n /** Display name (for trace + log). */\n name: string\n /** Model used for this judge. */\n model?: string\n /** 0-10 scored dimensions. */\n dimensions: JudgeDimension[]\n /** Judge system prompt — sets persona + JSON-only constraint. */\n systemPrompt: string\n /** Build the user prompt from the typed input. Must include \"Respond with\n * ONLY this JSON: { ... }\" listing each dimension key. */\n buildPrompt: (input: TInput) => string\n /** Optional model + api overrides. */\n apiKey?: string\n baseUrl?: string\n}\n\nexport interface JudgeScore {\n /** Per-dimension 0-10 score. Missing dims default to 0. */\n dimensions: Record<string, number>\n /** Mean across dimensions. */\n composite: number\n /** Free-form 1-2 sentence critique from the judge (when provided). */\n notes: string\n}\n\nconst ZERO_SCORE: JudgeScore = { dimensions: {}, composite: 0, notes: 'parse failed' }\n\nexport async function runJudge<TInput>(\n judge: JudgeConfig<TInput>,\n input: TInput,\n): Promise<JudgeScore> {\n const apiKey = judge.apiKey ?? requireRouterApiKey()\n const baseUrl = judge.baseUrl ?? defaultRouterBaseUrl()\n const model = judge.model ?? process.env.JUDGE_MODEL ?? 
DEFAULT_JUDGE_MODEL\n const prompt = judge.buildPrompt(input)\n let raw = ''\n try {\n const { message } = await routerCompletion({\n apiKey,\n baseUrl,\n model,\n temperature: 0,\n maxTokens: 1500,\n messages: [\n { role: 'system', content: judge.systemPrompt },\n { role: 'user', content: prompt },\n ],\n })\n raw = (message.content ?? '').trim()\n } catch (err) {\n return {\n ...ZERO_SCORE,\n notes: `judge ${judge.name} call failed: ${err instanceof Error ? err.message : String(err)}`,\n }\n }\n\n let parsed: Record<string, unknown> | null = null\n try {\n const cleaned = raw\n .replace(/^```json\\s*/i, '')\n .replace(/```\\s*$/, '')\n .trim()\n parsed = JSON.parse(cleaned) as Record<string, unknown>\n } catch {\n return { ...ZERO_SCORE, notes: `judge ${judge.name} returned non-JSON: ${raw.slice(0, 200)}` }\n }\n\n const dimensions: Record<string, number> = {}\n let sum = 0\n for (const dim of judge.dimensions) {\n const v = Number(parsed[dim.key] ?? 0)\n const clamped = Number.isFinite(v) ? Math.max(0, Math.min(10, v)) : 0\n dimensions[dim.key] = clamped\n sum += clamped\n }\n return {\n dimensions,\n composite: judge.dimensions.length === 0 ? 0 : sum / judge.dimensions.length,\n notes: typeof parsed.notes === 'string' ? parsed.notes : '',\n }\n}\n\n/** Convenience: stringified dimension list for inclusion in a judge prompt.\n * Returns lines like `- audience_fit: Does this match what the audience cares about? (0-10)`. */\nexport function renderDimensions(dims: readonly JudgeDimension[]): string {\n return dims.map((d) => `- ${d.key}: ${d.description}`).join('\\n')\n}\n\n/** Convenience: build the \"Respond with ONLY this JSON\" footer for a judge prompt. 
*/\nexport function renderJsonFooter(dims: readonly JudgeDimension[]): string {\n const fields = dims.map((d) => `\"${d.key}\":N`).join(',')\n return `Respond with ONLY this JSON (no markdown, no preamble):\\n{${fields},\"notes\":\"1-2 sentence critique\"}`\n}\n","// Multishot matrix wrapper — sweeps profiles × personas × reps, runs\n// the driver-agent loop per cell, applies up to three configured judges,\n// persists per-cell artifacts, and aggregates by axis.\n//\n// Uses runAgentMatrix from @tangle-network/agent-eval/matrix under the\n// hood so cell scheduling + concurrency + cost ceiling are unified with\n// other matrix consumers.\n\nimport { mkdirSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type { AgentProfile as SandboxAgentProfile } from '@tangle-network/sandbox'\nimport type { MatrixResult } from '../matrix'\nimport { runAgentMatrix } from '../matrix'\nimport { type JudgeConfig, type JudgeScore, runJudge } from './judges'\nimport { runMultishot } from './multishot'\nimport type {\n MultishotArtifact,\n MultishotMessage,\n MultishotPersona,\n MultishotShape,\n MultishotToolDefinition,\n MultishotToolExecutor,\n} from './types'\n\nexport interface ConversationJudgeInput<TPersona extends MultishotPersona> {\n transcript: MultishotMessage[]\n persona: TPersona\n}\n\nexport interface ArtifactJudgeInput<TPersona extends MultishotPersona> {\n artifact: MultishotArtifact\n persona: TPersona\n}\n\nexport interface MultishotJudges<TPersona extends MultishotPersona> {\n /** Scores the full transcript end-to-end (always runs). */\n conversation: JudgeConfig<ConversationJudgeInput<TPersona>>\n /** Scores each code-type artifact. Optional — omit when domain has no code artifacts. */\n codeReview?: JudgeConfig<ArtifactJudgeInput<TPersona>>\n /** Scores each non-code (research/content/template) artifact. Optional. */\n contentQuality?: JudgeConfig<ArtifactJudgeInput<TPersona>>\n /** Which artifact types route to codeReview. 
Defaults to ['code']. */\n codeArtifactTypes?: string[]\n /** Which artifact types route to contentQuality. Defaults to ['research']. */\n contentArtifactTypes?: string[]\n}\n\nexport interface CellCompositeScore {\n composite: number\n conversation: JudgeScore\n codeReview?: {\n perArtifact: Array<JudgeScore & { turn: number; type: string }>\n composite: number\n }\n contentQuality?: {\n perArtifact: Array<JudgeScore & { turn: number; type: string }>\n composite: number\n }\n}\n\nexport interface RunMultishotMatrixOptions<TPersona extends MultishotPersona> {\n /** AgentProfile axis (matrix primary). */\n profiles: Array<{ id: string; value: SandboxAgentProfile }>\n /** Persona axis. */\n personas: TPersona[]\n /** Persona-shaping callbacks. */\n shape: MultishotShape<TPersona>\n /** Judge configurations. */\n judges: MultishotJudges<TPersona>\n /** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */\n tools?: MultishotToolDefinition[]\n /** Map from tool name → inline executor. Must align with `tools`. */\n toolExecutors?: Record<string, MultishotToolExecutor>\n /** Tool name → artifact type label. Defaults to research/code mapping. */\n artifactTypeFor?: (toolName: string) => string | undefined\n /** Where per-cell artifacts land. Cells write to `<runDir>/<profileId>/<personaId>/rep-N/`. */\n runDir: string\n /** Replicates per (profile, persona) cell. */\n reps?: number\n /** Max conversation turns per cell. */\n maxTurns?: number\n /** Max concurrent cells. */\n maxConcurrency?: number\n /** Total $ ceiling across the matrix; cells aborted past this. */\n costCeiling?: number\n /** Agent model. */\n agentModel?: string\n /** Driver model. */\n driverModel?: string\n /** Pass-thru fields. 
*/\n apiKey?: string\n baseUrl?: string\n}\n\ninterface CellOutput {\n turns: number\n toolCalls: number\n artifactCount: number\n}\n\nexport interface RunMultishotMatrixResult {\n matrix: MatrixResult<CellOutput>\n}\n\nexport async function runMultishotMatrix<TPersona extends MultishotPersona>(\n opts: RunMultishotMatrixOptions<TPersona>,\n): Promise<RunMultishotMatrixResult> {\n const codeTypes = new Set(opts.judges.codeArtifactTypes ?? ['code'])\n const contentTypes = new Set(opts.judges.contentArtifactTypes ?? ['research'])\n mkdirSync(opts.runDir, { recursive: true })\n\n const matrix = await runAgentMatrix<CellOutput>({\n axes: [\n { name: 'profile', values: opts.profiles },\n { name: 'persona', values: opts.personas.map((p) => ({ id: p.id, value: p })) },\n ],\n reps: opts.reps ?? 1,\n maxConcurrency: opts.maxConcurrency ?? 2,\n costCeiling: opts.costCeiling,\n async runCell(cell) {\n const profile = cell.axes.profile?.value as SandboxAgentProfile\n const persona = cell.axes.persona?.value as TPersona\n const profileId = String(cell.axes.profile?.id ?? 'unknown')\n const personaId = String(cell.axes.persona?.id ?? 'unknown')\n\n const sim = await runMultishot({\n profile,\n persona,\n shape: opts.shape,\n tools: opts.tools,\n toolExecutors: opts.toolExecutors,\n artifactTypeFor: opts.artifactTypeFor,\n maxTurns: opts.maxTurns,\n agentModel: opts.agentModel,\n driverModel: opts.driverModel,\n apiKey: opts.apiKey,\n baseUrl: opts.baseUrl,\n })\n\n const codeArtifacts = sim.artifacts.filter((a) => codeTypes.has(a.type))\n const contentArtifacts = sim.artifacts.filter((a) => contentTypes.has(a.type))\n\n const [conversation, codeReviews, contentReviews] = await Promise.all([\n runJudge(opts.judges.conversation, { transcript: sim.transcript, persona }),\n opts.judges.codeReview\n ? 
Promise.all(\n codeArtifacts.map((artifact) =>\n runJudge(opts.judges.codeReview!, { artifact, persona }).then((s) => ({\n ...s,\n turn: artifact.turn,\n type: artifact.type,\n })),\n ),\n )\n : Promise.resolve([] as Array<JudgeScore & { turn: number; type: string }>),\n opts.judges.contentQuality\n ? Promise.all(\n contentArtifacts.map((artifact) =>\n runJudge(opts.judges.contentQuality!, { artifact, persona }).then((s) => ({\n ...s,\n turn: artifact.turn,\n type: artifact.type,\n })),\n ),\n )\n : Promise.resolve([] as Array<JudgeScore & { turn: number; type: string }>),\n ])\n\n const codeComposite =\n codeReviews.length === 0\n ? 0\n : codeReviews.reduce((s, r) => s + r.composite, 0) / codeReviews.length\n const contentComposite =\n contentReviews.length === 0\n ? 0\n : contentReviews.reduce((s, r) => s + r.composite, 0) / contentReviews.length\n\n // Composite = mean of (conversation, code, content) — empty judges count 0.\n const judgeCount = 1 + (opts.judges.codeReview ? 1 : 0) + (opts.judges.contentQuality ? 
1 : 0)\n const composite = (conversation.composite + codeComposite + contentComposite) / judgeCount\n\n const cellScore: CellCompositeScore = { composite, conversation }\n if (opts.judges.codeReview)\n cellScore.codeReview = { perArtifact: codeReviews, composite: codeComposite }\n if (opts.judges.contentQuality)\n cellScore.contentQuality = { perArtifact: contentReviews, composite: contentComposite }\n\n const cellDir = join(opts.runDir, profileId, personaId, `rep-${cell.rep}`)\n mkdirSync(cellDir, { recursive: true })\n writeFileSync(join(cellDir, 'transcript.json'), JSON.stringify(sim.transcript, null, 2))\n writeFileSync(join(cellDir, 'artifacts.json'), JSON.stringify(sim.artifacts, null, 2))\n writeFileSync(join(cellDir, 'scores.json'), JSON.stringify(cellScore, null, 2))\n\n const notes = [`convo=${conversation.composite.toFixed(1)}`]\n if (opts.judges.codeReview) notes.push(`code=${codeComposite.toFixed(1)}`)\n if (opts.judges.contentQuality) notes.push(`content=${contentComposite.toFixed(1)}`)\n\n return {\n output: {\n turns: sim.transcript.length,\n toolCalls: sim.toolCalls,\n artifactCount: sim.artifacts.length,\n },\n verdict: { valid: composite >= 5, score: composite, notes: notes.join(' ') },\n costUsd: sim.costUsd,\n durationMs: sim.durationMs,\n }\n },\n })\n\n // Persist top-level summary.\n const summary = {\n cells: matrix.summary.totalCells,\n passRate: matrix.summary.overallPassRate,\n meanScore: matrix.summary.overallMeanScore,\n totalCostUsd: matrix.summary.totalCostUsd,\n durationMs: matrix.summary.durationMs,\n runsExecuted: matrix.summary.runsExecuted,\n cellsSkipped: matrix.summary.cellsSkipped,\n byProfile: matrix.byAxis.profile,\n byPersona: matrix.byAxis.persona,\n }\n writeFileSync(join(opts.runDir, 'summary.json'), JSON.stringify(summary, null, 2))\n\n const md: string[] = [\n `# Multishot matrix`,\n ``,\n `**Cells**: ${matrix.summary.totalCells} | **Pass rate**: ${(matrix.summary.overallPassRate * 100).toFixed(0)}% | **Mean**: 
${matrix.summary.overallMeanScore.toFixed(2)} | **Cost**: $${matrix.summary.totalCostUsd.toFixed(2)} | **Duration**: ${(matrix.summary.durationMs / 1000).toFixed(0)}s`,\n ``,\n `## By profile`,\n ``,\n '| profile | pass | mean | cost |',\n '|---|---|---|---|',\n ...Object.entries(matrix.byAxis.profile ?? {}).map(\n ([id, s]) =>\n `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`,\n ),\n ``,\n `## By persona`,\n ``,\n '| persona | pass | mean | cost |',\n '|---|---|---|---|',\n ...Object.entries(matrix.byAxis.persona ?? {}).map(\n ([id, s]) =>\n `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`,\n ),\n ``,\n ]\n writeFileSync(join(opts.runDir, 'summary.md'), md.join('\\n'))\n\n return { matrix }\n}\n","// Public types for the multishot substrate.\n\nexport interface MultishotMessage {\n role: 'user' | 'assistant' | 'tool'\n content: string\n toolCallId?: string\n toolCalls?: Array<{ id: string; name: string; args: Record<string, unknown> }>\n}\n\nexport interface MultishotArtifact {\n type: string\n turn: number\n invocation: { name: string; args: Record<string, unknown> }\n content: string\n}\n\nexport interface MultishotResult {\n transcript: MultishotMessage[]\n artifacts: MultishotArtifact[]\n toolCalls: number\n durationMs: number\n costUsd: number\n}\n\nexport interface MultishotToolDefinition {\n type: 'function'\n function: {\n name: string\n description: string\n parameters: Record<string, unknown>\n }\n}\n\nexport type MultishotToolExecutor = (\n args: Record<string, unknown>,\n ctx: { apiKey: string; baseUrl: string; signal?: AbortSignal },\n) => Promise<{ content: string; costUsd: number }>\n\nexport interface MultishotPersona {\n /** Stable identifier — used for per-cell artifact paths + matrix axis keys. */\n id: string\n /** Per-domain payload (income/profile/voice/etc.) shaped by the consumer. 
*/\n [k: string]: unknown\n}\n\nexport interface MultishotShape<TPersona extends MultishotPersona> {\n /** Opening user message (turn 0) — the persona's first ask. */\n buildOpener: (persona: TPersona) => string\n /** System prompt the driver LLM uses to roleplay the persona. Should set\n * voice, goals, constraints, time-pressure, and the \"never go silent\" rule. */\n buildDriverSystemPrompt: (persona: TPersona) => string\n}\n\nexport class MultishotDriverEmptyError extends Error {\n constructor(public readonly turn: number) {\n super(`multishot: driver returned empty content twice at turn ${turn} — failing loud`)\n this.name = 'MultishotDriverEmptyError'\n }\n}\n","// Multi-turn driver-agent simulation with inline tool execution.\n//\n// The driver = LLM acting as the persona (reactive, non-deterministic).\n// The agent = the product agent under test (router call with profile's\n// systemPrompt + the configured tools).\n// Tool calls execute inline via the configured executors and feed back\n// into the agent's message log so the agent integrates the result.\n\nimport type { AgentProfile as SandboxAgentProfile } from '@tangle-network/sandbox'\nimport { defaultDelegationTools } from './default-tools'\nimport {\n defaultRouterBaseUrl,\n estimateRouterCost,\n requireRouterApiKey,\n routerCompletion,\n} from './router'\nimport {\n type MultishotArtifact,\n MultishotDriverEmptyError,\n type MultishotMessage,\n type MultishotPersona,\n type MultishotResult,\n type MultishotShape,\n type MultishotToolDefinition,\n type MultishotToolExecutor,\n} from './types'\n\nexport interface RunMultishotOptions<TPersona extends MultishotPersona> {\n profile: SandboxAgentProfile\n persona: TPersona\n shape: MultishotShape<TPersona>\n /** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */\n tools?: MultishotToolDefinition[]\n /** Map from tool name → executor invoked inline when the agent emits a tool_call. 
*/\n toolExecutors?: Record<string, MultishotToolExecutor>\n /** Map from tool name → artifact type label written into MultishotArtifact.type.\n * Tools without a mapping still execute, but their results aren't surfaced as\n * typed artifacts (only as tool messages in the transcript). */\n artifactTypeFor?: (toolName: string) => string | undefined\n maxTurns?: number\n agentModel?: string\n driverModel?: string\n apiKey?: string\n baseUrl?: string\n signal?: AbortSignal\n}\n\nexport async function runMultishot<TPersona extends MultishotPersona>(\n opts: RunMultishotOptions<TPersona>,\n): Promise<MultishotResult> {\n const apiKey = opts.apiKey ?? requireRouterApiKey()\n const baseUrl = opts.baseUrl ?? defaultRouterBaseUrl()\n const maxTurns = opts.maxTurns ?? 10\n const agentModel = opts.agentModel ?? 'openai/gpt-5.4'\n const driverModel = opts.driverModel ?? 'openai/gpt-4o-mini'\n\n const bundle =\n opts.tools && opts.toolExecutors\n ? {\n tools: opts.tools,\n executors: opts.toolExecutors,\n artifactTypeFor: opts.artifactTypeFor ?? (() => undefined),\n }\n : defaultDelegationTools()\n const tools = opts.tools ?? bundle.tools\n const executors = opts.toolExecutors ?? bundle.executors\n const artifactTypeFor = opts.artifactTypeFor ?? bundle.artifactTypeFor\n\n const start = Date.now()\n const transcript: MultishotMessage[] = []\n const artifacts: MultishotArtifact[] = []\n let toolCalls = 0\n let totalCostUsd = 0\n\n const opener = opts.shape.buildOpener(opts.persona)\n transcript.push({ role: 'user', content: opener })\n\n const systemPrompt = opts.profile.prompt?.systemPrompt ?? 
''\n const agentMessages: Array<Record<string, unknown>> = [\n { role: 'system', content: systemPrompt },\n { role: 'user', content: opener },\n ]\n\n for (let turn = 0; turn < maxTurns; turn++) {\n if (opts.signal?.aborted) throw new Error('multishot aborted')\n\n const { message: agentMsg, usage: agentUsage } = await routerCompletion({\n apiKey,\n baseUrl,\n model: agentModel,\n messages: agentMessages,\n tools,\n temperature: 0.7,\n maxTokens: 2500,\n signal: opts.signal,\n })\n totalCostUsd += estimateRouterCost(agentModel, agentUsage)\n\n const agentText = (agentMsg.content ?? '').trim()\n const agentToolCalls = (agentMsg.tool_calls ?? []).map((tc) => ({\n id: tc.id,\n name: tc.function.name,\n args: (() => {\n try {\n return JSON.parse(tc.function.arguments) as Record<string, unknown>\n } catch {\n return {} as Record<string, unknown>\n }\n })(),\n }))\n\n agentMessages.push({\n role: 'assistant',\n content: agentText || null,\n ...(agentMsg.tool_calls?.length ? { tool_calls: agentMsg.tool_calls } : {}),\n })\n transcript.push({\n role: 'assistant',\n content: agentText,\n toolCalls: agentToolCalls.length > 0 ? agentToolCalls : undefined,\n })\n\n for (const tc of agentToolCalls) {\n toolCalls++\n let toolResult = ''\n try {\n const executor = executors[tc.name]\n if (!executor) {\n toolResult = JSON.stringify({ error: `unknown tool ${tc.name}` })\n } else {\n const r = await executor(tc.args, { apiKey, baseUrl, signal: opts.signal })\n toolResult = r.content\n totalCostUsd += r.costUsd\n const artifactType = artifactTypeFor(tc.name)\n if (artifactType) {\n artifacts.push({\n type: artifactType,\n turn,\n invocation: { name: tc.name, args: tc.args },\n content: toolResult,\n })\n }\n }\n } catch (err) {\n toolResult = JSON.stringify({ error: err instanceof Error ? 
err.message : String(err) })\n }\n agentMessages.push({ role: 'tool', tool_call_id: tc.id, content: toolResult || 'done' })\n transcript.push({ role: 'tool', content: toolResult || 'done', toolCallId: tc.id })\n }\n\n // If the agent emitted tool_calls, give it a follow-up turn to integrate the results.\n if (agentToolCalls.length > 0) {\n const followUp = await routerCompletion({\n apiKey,\n baseUrl,\n model: agentModel,\n messages: agentMessages,\n temperature: 0.7,\n maxTokens: 2000,\n signal: opts.signal,\n })\n totalCostUsd += estimateRouterCost(agentModel, followUp.usage)\n const followUpText = (followUp.message.content ?? '').trim()\n agentMessages.push({ role: 'assistant', content: followUpText })\n transcript.push({ role: 'assistant', content: followUpText })\n }\n\n if (turn < maxTurns - 1) {\n const driver = await driverTurn({\n apiKey,\n baseUrl,\n persona: opts.persona,\n shape: opts.shape,\n transcript,\n turn,\n model: driverModel,\n signal: opts.signal,\n })\n totalCostUsd += driver.costUsd\n agentMessages.push({ role: 'user', content: driver.content })\n transcript.push({ role: 'user', content: driver.content })\n }\n }\n\n return { transcript, artifacts, toolCalls, durationMs: Date.now() - start, costUsd: totalCostUsd }\n}\n\nasync function driverTurn<TPersona extends MultishotPersona>(opts: {\n apiKey: string\n baseUrl: string\n persona: TPersona\n shape: MultishotShape<TPersona>\n transcript: MultishotMessage[]\n turn: number\n model: string\n signal?: AbortSignal\n}): Promise<{ content: string; costUsd: number }> {\n const driverSystem = opts.shape.buildDriverSystemPrompt(opts.persona)\n\n // Translate transcript to driver POV: agent's `assistant` messages become\n // `user` (the agent talking TO the driver); the driver's prior `user`\n // messages become `assistant` (the driver's prior responses).\n const driverMessages: Array<Record<string, unknown>> = [{ role: 'system', content: driverSystem }]\n for (const msg of opts.transcript) {\n if 
(msg.role === 'tool') continue\n if (msg.role === 'assistant') driverMessages.push({ role: 'user', content: msg.content })\n else if (msg.role === 'user') driverMessages.push({ role: 'assistant', content: msg.content })\n }\n\n // Driver must never go silent. Retry once on empty content; then fail loud.\n for (let attempt = 0; attempt < 2; attempt++) {\n const { message, usage } = await routerCompletion({\n apiKey: opts.apiKey,\n baseUrl: opts.baseUrl,\n model: opts.model,\n messages: driverMessages,\n temperature: 0.9,\n maxTokens: 600,\n signal: opts.signal,\n })\n const content = (message.content ?? '').trim()\n if (content.length > 0) return { content, costUsd: estimateRouterCost(opts.model, usage) }\n }\n throw new MultishotDriverEmptyError(opts.turn)\n}\n"],"mappings":";;;;;;AA4BA,eAAsB,iBACpB,KACmC;AACnC,QAAM,OAAgC;AAAA,IACpC,OAAO,IAAI;AAAA,IACX,UAAU,IAAI;AAAA,IACd,aAAa,IAAI,eAAe;AAAA,IAChC,YAAY,IAAI,aAAa;AAAA,EAC/B;AACA,MAAI,IAAI,OAAO,OAAQ,MAAK,QAAQ,IAAI;AACxC,QAAM,MAAM,GAAG,IAAI,QAAQ,QAAQ,QAAQ,EAAE,CAAC;AAC9C,QAAM,MAAM,MAAM,MAAM,KAAK;AAAA,IAC3B,QAAQ;AAAA,IACR,SAAS,EAAE,eAAe,UAAU,IAAI,MAAM,IAAI,gBAAgB,mBAAmB;AAAA,IACrF,MAAM,KAAK,UAAU,IAAI;AAAA,IACzB,QAAQ,IAAI;AAAA,EACd,CAAC;AACD,MAAI,CAAC,IAAI,IAAI;AACX,UAAM,OAAO,MAAM,IAAI,KAAK;AAC5B,UAAM,IAAI,MAAM,UAAU,IAAI,MAAM,KAAK,KAAK,MAAM,GAAG,GAAG,CAAC,EAAE;AAAA,EAC/D;AACA,QAAM,OAAQ,MAAM,IAAI,KAAK;AAI7B,QAAM,SAAS,KAAK,QAAQ,CAAC;AAC7B,MAAI,CAAC,OAAQ,OAAM,IAAI,MAAM,+BAA+B,KAAK,UAAU,IAAI,EAAE,MAAM,GAAG,GAAG,CAAC,EAAE;AAChG,SAAO,EAAE,SAAS,OAAO,SAAS,OAAO,KAAK,MAAM;AACtD;AAIO,SAAS,mBACd,OACA,OACQ;AACR,MAAI,CAAC,MAAO,QAAO;AACnB,QAAM,WAAW,MAAM,iBAAiB;AACxC,QAAM,YAAY,MAAM,qBAAqB;AAC7C,MAAI,UAAU;AACd,MAAI,WAAW;AACf,MAAI,MAAM,SAAS,aAAa,GAAG;AACjC,cAAU;AACV,eAAW;AAAA,EACb,WAAW,MAAM,SAAS,SAAS,KAAK,MAAM,SAAS,eAAe,GAAG;AACvE,cAAU;AACV,eAAW;AAAA,EACb,WAAW,MAAM,SAAS,MAAM,KAAK,MAAM,SAAS,KAAK,KAAK,MAAM,SAAS,UAAU,GAAG;AACxF,cAAU;AACV,eAAW;AAAA,EACb;AACA,UAAQ,WAAW,UAAU,YAAY,YAAY;AACvD;AAEO,SAAS,uBAA+B;AAC7C,UAAQ,QAAQ,IAAI,0BAA0B,kCAAkC;AA
AA,IAC9E;AAAA,IACA;AAAA,EACF;AACF;AAEO,SAAS,sBAA8B;AAC5C,QAAM,MAAM,QAAQ,IAAI;AACxB,MAAI,CAAC,IAAK,OAAM,IAAI,MAAM,gEAAgE;AAC1F,SAAO;AACT;;;ACnFO,IAAM,2BAA2B;AACjC,IAAM,sBAAsB;AAgBnC,IAAM,4BACJ;AAEF,IAAM,uBACJ;AAEK,IAAM,iCAA0D;AAAA,EACrE,MAAM;AAAA,EACN,UAAU;AAAA,IACR,MAAM;AAAA,IACN,aACE;AAAA,IACF,YAAY;AAAA,MACV,MAAM;AAAA,MACN,YAAY;AAAA,QACV,UAAU,EAAE,MAAM,UAAU,aAAa,gCAAgC;AAAA,QACzE,OAAO;AAAA,UACL,MAAM;AAAA,UACN,aAAa;AAAA,QACf;AAAA,MACF;AAAA,MACA,UAAU,CAAC,UAAU;AAAA,IACvB;AAAA,EACF;AACF;AAEO,IAAM,6BAAsD;AAAA,EACjE,MAAM;AAAA,EACN,UAAU;AAAA,IACR,MAAM;AAAA,IACN,aACE;AAAA,IACF,YAAY;AAAA,MACV,MAAM;AAAA,MACN,YAAY;AAAA,QACV,MAAM,EAAE,MAAM,UAAU,aAAa,gCAAgC;AAAA,QACrE,UAAU;AAAA,UACR,MAAM;AAAA,UACN,aAAa;AAAA,QACf;AAAA,MACF;AAAA,MACA,UAAU,CAAC,MAAM;AAAA,IACnB;AAAA,EACF;AACF;AAEO,SAAS,uBACd,SAAkC,CAAC,GACZ;AACvB,QAAM,eAAe,OAAO,gBAAgB;AAC5C,QAAM,QAAQ,OAAO,SAAS;AAC9B,SAAO,OAAO,MAAM,QAAQ;AAC1B,UAAM,WAAW,OAAO,KAAK,YAAY,EAAE;AAC3C,UAAM,QAAQ,KAAK,QAAQ,OAAO,KAAK,KAAK,IAAI;AAChD,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI;AAAA,MACb;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,aAAa;AAAA,QACxC,EAAE,MAAM,QAAQ,SAAS,aAAa,QAAQ,GAAG,QAAQ;AAAA,SAAY,KAAK,KAAK,EAAE,GAAG;AAAA,MACtF;AAAA,MACA,QAAQ,IAAI;AAAA,IACd,CAAC;AACD,WAAO,EAAE,SAAS,QAAQ,WAAW,IAAI,SAAS,mBAAmB,OAAO,KAAK,EAAE;AAAA,EACrF;AACF;AAEO,SAAS,mBAAmB,SAA6B,CAAC,GAA0B;AACzF,QAAM,eAAe,OAAO,gBAAgB;AAC5C,QAAM,QAAQ,OAAO,SAAS;AAC9B,SAAO,OAAO,MAAM,QAAQ;AAC1B,UAAM,OAAO,OAAO,KAAK,QAAQ,EAAE;AACnC,UAAM,WAAW,KAAK,WAAW,OAAO,KAAK,QAAQ,IAAI;AACzD,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI;AAAA,MACb;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,GAAG,YAAY;AAAA;AAAA,YAAiB,QAAQ,GAAG;AAAA,QACtE,EAAE,MAAM,QAAQ,SAAS,YAAY,IAAI,GAAG;AAAA,MAC9C;AAAA,MACA,QAAQ,IAAI;AAAA,IACd,CAAC;AACD,WAAO,EAAE,SAAS,QAAQ,WAAW,IAAI,SAAS,mBAAmB,OAAO,KAAK,EAAE;AAAA,EACrF;AACF;AAgBO,SAAS,uBAAuB,SAA6B,CAAC,GAAuB;AAC1F,SAAO;AAAA,IACL,OAAO,CAAC,gCAAgC,0BAA0B;AAAA
,IAClE,WAAW;AAAA,MACT,mBAAmB,uBAAuB,OAAO,QAAQ;AAAA,MACzD,eAAe,mBAAmB,OAAO,IAAI;AAAA,IAC/C;AAAA,IACA,iBAAiB,CAAC,SAChB,SAAS,sBAAsB,aAAa,SAAS,kBAAkB,SAAS;AAAA,EACpF;AACF;;;ACpIO,IAAM,sBAAsB;AAmCnC,IAAM,aAAyB,EAAE,YAAY,CAAC,GAAG,WAAW,GAAG,OAAO,eAAe;AAErF,eAAsB,SACpB,OACA,OACqB;AACrB,QAAM,SAAS,MAAM,UAAU,oBAAoB;AACnD,QAAM,UAAU,MAAM,WAAW,qBAAqB;AACtD,QAAM,QAAQ,MAAM,SAAS,QAAQ,IAAI,eAAe;AACxD,QAAM,SAAS,MAAM,YAAY,KAAK;AACtC,MAAI,MAAM;AACV,MAAI;AACF,UAAM,EAAE,QAAQ,IAAI,MAAM,iBAAiB;AAAA,MACzC;AAAA,MACA;AAAA,MACA;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,MAAM,aAAa;AAAA,QAC9C,EAAE,MAAM,QAAQ,SAAS,OAAO;AAAA,MAClC;AAAA,IACF,CAAC;AACD,WAAO,QAAQ,WAAW,IAAI,KAAK;AAAA,EACrC,SAAS,KAAK;AACZ,WAAO;AAAA,MACL,GAAG;AAAA,MACH,OAAO,SAAS,MAAM,IAAI,iBAAiB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,IAC7F;AAAA,EACF;AAEA,MAAI,SAAyC;AAC7C,MAAI;AACF,UAAM,UAAU,IACb,QAAQ,gBAAgB,EAAE,EAC1B,QAAQ,WAAW,EAAE,EACrB,KAAK;AACR,aAAS,KAAK,MAAM,OAAO;AAAA,EAC7B,QAAQ;AACN,WAAO,EAAE,GAAG,YAAY,OAAO,SAAS,MAAM,IAAI,uBAAuB,IAAI,MAAM,GAAG,GAAG,CAAC,GAAG;AAAA,EAC/F;AAEA,QAAM,aAAqC,CAAC;AAC5C,MAAI,MAAM;AACV,aAAW,OAAO,MAAM,YAAY;AAClC,UAAM,IAAI,OAAO,OAAO,IAAI,GAAG,KAAK,CAAC;AACrC,UAAM,UAAU,OAAO,SAAS,CAAC,IAAI,KAAK,IAAI,GAAG,KAAK,IAAI,IAAI,CAAC,CAAC,IAAI;AACpE,eAAW,IAAI,GAAG,IAAI;AACtB,WAAO;AAAA,EACT;AACA,SAAO;AAAA,IACL;AAAA,IACA,WAAW,MAAM,WAAW,WAAW,IAAI,IAAI,MAAM,MAAM,WAAW;AAAA,IACtE,OAAO,OAAO,OAAO,UAAU,WAAW,OAAO,QAAQ;AAAA,EAC3D;AACF;AAIO,SAAS,iBAAiB,MAAyC;AACxE,SAAO,KAAK,IAAI,CAAC,MAAM,KAAK,EAAE,GAAG,KAAK,EAAE,WAAW,EAAE,EAAE,KAAK,IAAI;AAClE;AAGO,SAAS,iBAAiB,MAAyC;AACxE,QAAM,SAAS,KAAK,IAAI,CAAC,MAAM,IAAI,EAAE,GAAG,KAAK,EAAE,KAAK,GAAG;AACvD,SAAO;AAAA,GAA6D,MAAM;AAC5E;;;ACzGA,SAAS,WAAW,qBAAqB;AACzC,SAAS,YAAY;;;AC4Cd,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YAA4B,MAAc;AACxC,UAAM,0DAA0D,IAAI,sBAAiB;AAD3D;AAE1B,SAAK,OAAO;AAAA,EACd;AAAA,EAH4B;AAI9B;;;ACXA,eAAsB,aACpB,MAC0B;AAC1B,QAAM,SAAS,KAAK,UAAU,oBAAoB;AAClD,QAAM,UAAU,KAAK,WAAW,qBAAqB;AACrD,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,aAAa,KAAK,cAAc;AACtC,QAAM,cAAc,KAAK,e
AAe;AAExC,QAAM,SACJ,KAAK,SAAS,KAAK,gBACf;AAAA,IACE,OAAO,KAAK;AAAA,IACZ,WAAW,KAAK;AAAA,IAChB,iBAAiB,KAAK,oBAAoB,MAAM;AAAA,EAClD,IACA,uBAAuB;AAC7B,QAAM,QAAQ,KAAK,SAAS,OAAO;AACnC,QAAM,YAAY,KAAK,iBAAiB,OAAO;AAC/C,QAAM,kBAAkB,KAAK,mBAAmB,OAAO;AAEvD,QAAM,QAAQ,KAAK,IAAI;AACvB,QAAM,aAAiC,CAAC;AACxC,QAAM,YAAiC,CAAC;AACxC,MAAI,YAAY;AAChB,MAAI,eAAe;AAEnB,QAAM,SAAS,KAAK,MAAM,YAAY,KAAK,OAAO;AAClD,aAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,CAAC;AAEjD,QAAM,eAAe,KAAK,QAAQ,QAAQ,gBAAgB;AAC1D,QAAM,gBAAgD;AAAA,IACpD,EAAE,MAAM,UAAU,SAAS,aAAa;AAAA,IACxC,EAAE,MAAM,QAAQ,SAAS,OAAO;AAAA,EAClC;AAEA,WAAS,OAAO,GAAG,OAAO,UAAU,QAAQ;AAC1C,QAAI,KAAK,QAAQ,QAAS,OAAM,IAAI,MAAM,mBAAmB;AAE7D,UAAM,EAAE,SAAS,UAAU,OAAO,WAAW,IAAI,MAAM,iBAAiB;AAAA,MACtE;AAAA,MACA;AAAA,MACA,OAAO;AAAA,MACP,UAAU;AAAA,MACV;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,QAAQ,KAAK;AAAA,IACf,CAAC;AACD,oBAAgB,mBAAmB,YAAY,UAAU;AAEzD,UAAM,aAAa,SAAS,WAAW,IAAI,KAAK;AAChD,UAAM,kBAAkB,SAAS,cAAc,CAAC,GAAG,IAAI,CAAC,QAAQ;AAAA,MAC9D,IAAI,GAAG;AAAA,MACP,MAAM,GAAG,SAAS;AAAA,MAClB,OAAO,MAAM;AACX,YAAI;AACF,iBAAO,KAAK,MAAM,GAAG,SAAS,SAAS;AAAA,QACzC,QAAQ;AACN,iBAAO,CAAC;AAAA,QACV;AAAA,MACF,GAAG;AAAA,IACL,EAAE;AAEF,kBAAc,KAAK;AAAA,MACjB,MAAM;AAAA,MACN,SAAS,aAAa;AAAA,MACtB,GAAI,SAAS,YAAY,SAAS,EAAE,YAAY,SAAS,WAAW,IAAI,CAAC;AAAA,IAC3E,CAAC;AACD,eAAW,KAAK;AAAA,MACd,MAAM;AAAA,MACN,SAAS;AAAA,MACT,WAAW,eAAe,SAAS,IAAI,iBAAiB;AAAA,IAC1D,CAAC;AAED,eAAW,MAAM,gBAAgB;AAC/B;AACA,UAAI,aAAa;AACjB,UAAI;AACF,cAAM,WAAW,UAAU,GAAG,IAAI;AAClC,YAAI,CAAC,UAAU;AACb,uBAAa,KAAK,UAAU,EAAE,OAAO,gBAAgB,GAAG,IAAI,GAAG,CAAC;AAAA,QAClE,OAAO;AACL,gBAAM,IAAI,MAAM,SAAS,GAAG,MAAM,EAAE,QAAQ,SAAS,QAAQ,KAAK,OAAO,CAAC;AAC1E,uBAAa,EAAE;AACf,0BAAgB,EAAE;AAClB,gBAAM,eAAe,gBAAgB,GAAG,IAAI;AAC5C,cAAI,cAAc;AAChB,sBAAU,KAAK;AAAA,cACb,MAAM;AAAA,cACN;AAAA,cACA,YAAY,EAAE,MAAM,GAAG,MAAM,MAAM,GAAG,KAAK;AAAA,cAC3C,SAAS;AAAA,YACX,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF,SAAS,KAAK;AACZ,qBAAa,KAAK,UAAU,EAAE,OAAO,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,EAAE,CAAC;AAAA,MACzF;AACA,oBAAc,KAAK,EAAE,MAAM,QAAQ,cAAc,GAAG,IAAI,SAAS,cAAc,OAAO,CA
AC;AACvF,iBAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,cAAc,QAAQ,YAAY,GAAG,GAAG,CAAC;AAAA,IACpF;AAGA,QAAI,eAAe,SAAS,GAAG;AAC7B,YAAM,WAAW,MAAM,iBAAiB;AAAA,QACtC;AAAA,QACA;AAAA,QACA,OAAO;AAAA,QACP,UAAU;AAAA,QACV,aAAa;AAAA,QACb,WAAW;AAAA,QACX,QAAQ,KAAK;AAAA,MACf,CAAC;AACD,sBAAgB,mBAAmB,YAAY,SAAS,KAAK;AAC7D,YAAM,gBAAgB,SAAS,QAAQ,WAAW,IAAI,KAAK;AAC3D,oBAAc,KAAK,EAAE,MAAM,aAAa,SAAS,aAAa,CAAC;AAC/D,iBAAW,KAAK,EAAE,MAAM,aAAa,SAAS,aAAa,CAAC;AAAA,IAC9D;AAEA,QAAI,OAAO,WAAW,GAAG;AACvB,YAAM,SAAS,MAAM,WAAW;AAAA,QAC9B;AAAA,QACA;AAAA,QACA,SAAS,KAAK;AAAA,QACd,OAAO,KAAK;AAAA,QACZ;AAAA,QACA;AAAA,QACA,OAAO;AAAA,QACP,QAAQ,KAAK;AAAA,MACf,CAAC;AACD,sBAAgB,OAAO;AACvB,oBAAc,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,QAAQ,CAAC;AAC5D,iBAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,QAAQ,CAAC;AAAA,IAC3D;AAAA,EACF;AAEA,SAAO,EAAE,YAAY,WAAW,WAAW,YAAY,KAAK,IAAI,IAAI,OAAO,SAAS,aAAa;AACnG;AAEA,eAAe,WAA8C,MASX;AAChD,QAAM,eAAe,KAAK,MAAM,wBAAwB,KAAK,OAAO;AAKpE,QAAM,iBAAiD,CAAC,EAAE,MAAM,UAAU,SAAS,aAAa,CAAC;AACjG,aAAW,OAAO,KAAK,YAAY;AACjC,QAAI,IAAI,SAAS,OAAQ;AACzB,QAAI,IAAI,SAAS,YAAa,gBAAe,KAAK,EAAE,MAAM,QAAQ,SAAS,IAAI,QAAQ,CAAC;AAAA,aAC/E,IAAI,SAAS,OAAQ,gBAAe,KAAK,EAAE,MAAM,aAAa,SAAS,IAAI,QAAQ,CAAC;AAAA,EAC/F;AAGA,WAAS,UAAU,GAAG,UAAU,GAAG,WAAW;AAC5C,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,KAAK;AAAA,MACb,SAAS,KAAK;AAAA,MACd,OAAO,KAAK;AAAA,MACZ,UAAU;AAAA,MACV,aAAa;AAAA,MACb,WAAW;AAAA,MACX,QAAQ,KAAK;AAAA,IACf,CAAC;AACD,UAAM,WAAW,QAAQ,WAAW,IAAI,KAAK;AAC7C,QAAI,QAAQ,SAAS,EAAG,QAAO,EAAE,SAAS,SAAS,mBAAmB,KAAK,OAAO,KAAK,EAAE;AAAA,EAC3F;AACA,QAAM,IAAI,0BAA0B,KAAK,IAAI;AAC/C;;;AFxHA,eAAsB,mBACpB,MACmC;AACnC,QAAM,YAAY,IAAI,IAAI,KAAK,OAAO,qBAAqB,CAAC,MAAM,CAAC;AACnE,QAAM,eAAe,IAAI,IAAI,KAAK,OAAO,wBAAwB,CAAC,UAAU,CAAC;AAC7E,YAAU,KAAK,QAAQ,EAAE,WAAW,KAAK,CAAC;AAE1C,QAAM,SAAS,MAAM,eAA2B;AAAA,IAC9C,MAAM;AAAA,MACJ,EAAE,MAAM,WAAW,QAAQ,KAAK,SAAS;AAAA,MACzC,EAAE,MAAM,WAAW,QAAQ,KAAK,SAAS,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,OAAO,EAAE,EAAE,EAAE;AAAA,IAChF;AAAA,IACA,MAAM,KAAK,QAAQ;AAAA,IACnB,gBAAgB,KAAK,kBAAkB;AAAA,IACvC,aAAa,KAAK;AAAA,IAClB,MAAM,QAAQ,MA
AM;AAClB,YAAM,UAAU,KAAK,KAAK,SAAS;AACnC,YAAM,UAAU,KAAK,KAAK,SAAS;AACnC,YAAM,YAAY,OAAO,KAAK,KAAK,SAAS,MAAM,SAAS;AAC3D,YAAM,YAAY,OAAO,KAAK,KAAK,SAAS,MAAM,SAAS;AAE3D,YAAM,MAAM,MAAM,aAAa;AAAA,QAC7B;AAAA,QACA;AAAA,QACA,OAAO,KAAK;AAAA,QACZ,OAAO,KAAK;AAAA,QACZ,eAAe,KAAK;AAAA,QACpB,iBAAiB,KAAK;AAAA,QACtB,UAAU,KAAK;AAAA,QACf,YAAY,KAAK;AAAA,QACjB,aAAa,KAAK;AAAA,QAClB,QAAQ,KAAK;AAAA,QACb,SAAS,KAAK;AAAA,MAChB,CAAC;AAED,YAAM,gBAAgB,IAAI,UAAU,OAAO,CAAC,MAAM,UAAU,IAAI,EAAE,IAAI,CAAC;AACvE,YAAM,mBAAmB,IAAI,UAAU,OAAO,CAAC,MAAM,aAAa,IAAI,EAAE,IAAI,CAAC;AAE7E,YAAM,CAAC,cAAc,aAAa,cAAc,IAAI,MAAM,QAAQ,IAAI;AAAA,QACpE,SAAS,KAAK,OAAO,cAAc,EAAE,YAAY,IAAI,YAAY,QAAQ,CAAC;AAAA,QAC1E,KAAK,OAAO,aACR,QAAQ;AAAA,UACN,cAAc;AAAA,YAAI,CAAC,aACjB,SAAS,KAAK,OAAO,YAAa,EAAE,UAAU,QAAQ,CAAC,EAAE,KAAK,CAAC,OAAO;AAAA,cACpE,GAAG;AAAA,cACH,MAAM,SAAS;AAAA,cACf,MAAM,SAAS;AAAA,YACjB,EAAE;AAAA,UACJ;AAAA,QACF,IACA,QAAQ,QAAQ,CAAC,CAAuD;AAAA,QAC5E,KAAK,OAAO,iBACR,QAAQ;AAAA,UACN,iBAAiB;AAAA,YAAI,CAAC,aACpB,SAAS,KAAK,OAAO,gBAAiB,EAAE,UAAU,QAAQ,CAAC,EAAE,KAAK,CAAC,OAAO;AAAA,cACxE,GAAG;AAAA,cACH,MAAM,SAAS;AAAA,cACf,MAAM,SAAS;AAAA,YACjB,EAAE;AAAA,UACJ;AAAA,QACF,IACA,QAAQ,QAAQ,CAAC,CAAuD;AAAA,MAC9E,CAAC;AAED,YAAM,gBACJ,YAAY,WAAW,IACnB,IACA,YAAY,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,YAAY;AACrE,YAAM,mBACJ,eAAe,WAAW,IACtB,IACA,eAAe,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,eAAe;AAG3E,YAAM,aAAa,KAAK,KAAK,OAAO,aAAa,IAAI,MAAM,KAAK,OAAO,iBAAiB,IAAI;AAC5F,YAAM,aAAa,aAAa,YAAY,gBAAgB,oBAAoB;AAEhF,YAAM,YAAgC,EAAE,WAAW,aAAa;AAChE,UAAI,KAAK,OAAO;AACd,kBAAU,aAAa,EAAE,aAAa,aAAa,WAAW,cAAc;AAC9E,UAAI,KAAK,OAAO;AACd,kBAAU,iBAAiB,EAAE,aAAa,gBAAgB,WAAW,iBAAiB;AAExF,YAAM,UAAU,KAAK,KAAK,QAAQ,WAAW,WAAW,OAAO,KAAK,GAAG,EAAE;AACzE,gBAAU,SAAS,EAAE,WAAW,KAAK,CAAC;AACtC,oBAAc,KAAK,SAAS,iBAAiB,GAAG,KAAK,UAAU,IAAI,YAAY,MAAM,CAAC,CAAC;AACvF,oBAAc,KAAK,SAAS,gBAAgB,GAAG,KAAK,UAAU,IAAI,WAAW,MAAM,CAAC,CAAC;AACrF,oBAAc,KAAK,SAAS,aAAa,GAAG,KAAK,UAAU,WAAW,MAAM,CAAC,CAAC;AAE9E,YAAM,QAAQ,CAAC,SAAS,aAAa,UAAU,QAAQ,CAAC,CAAC,EAAE;AAC3D,UAAI,KAAK,OAAO,WAAY,O
AAM,KAAK,QAAQ,cAAc,QAAQ,CAAC,CAAC,EAAE;AACzE,UAAI,KAAK,OAAO,eAAgB,OAAM,KAAK,WAAW,iBAAiB,QAAQ,CAAC,CAAC,EAAE;AAEnF,aAAO;AAAA,QACL,QAAQ;AAAA,UACN,OAAO,IAAI,WAAW;AAAA,UACtB,WAAW,IAAI;AAAA,UACf,eAAe,IAAI,UAAU;AAAA,QAC/B;AAAA,QACA,SAAS,EAAE,OAAO,aAAa,GAAG,OAAO,WAAW,OAAO,MAAM,KAAK,GAAG,EAAE;AAAA,QAC3E,SAAS,IAAI;AAAA,QACb,YAAY,IAAI;AAAA,MAClB;AAAA,IACF;AAAA,EACF,CAAC;AAGD,QAAM,UAAU;AAAA,IACd,OAAO,OAAO,QAAQ;AAAA,IACtB,UAAU,OAAO,QAAQ;AAAA,IACzB,WAAW,OAAO,QAAQ;AAAA,IAC1B,cAAc,OAAO,QAAQ;AAAA,IAC7B,YAAY,OAAO,QAAQ;AAAA,IAC3B,cAAc,OAAO,QAAQ;AAAA,IAC7B,cAAc,OAAO,QAAQ;AAAA,IAC7B,WAAW,OAAO,OAAO;AAAA,IACzB,WAAW,OAAO,OAAO;AAAA,EAC3B;AACA,gBAAc,KAAK,KAAK,QAAQ,cAAc,GAAG,KAAK,UAAU,SAAS,MAAM,CAAC,CAAC;AAEjF,QAAM,KAAe;AAAA,IACnB;AAAA,IACA;AAAA,IACA,cAAc,OAAO,QAAQ,UAAU,sBAAsB,OAAO,QAAQ,kBAAkB,KAAK,QAAQ,CAAC,CAAC,iBAAiB,OAAO,QAAQ,iBAAiB,QAAQ,CAAC,CAAC,iBAAiB,OAAO,QAAQ,aAAa,QAAQ,CAAC,CAAC,qBAAqB,OAAO,QAAQ,aAAa,KAAM,QAAQ,CAAC,CAAC;AAAA,IAChS;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,OAAO,QAAQ,OAAO,OAAO,WAAW,CAAC,CAAC,EAAE;AAAA,MAC7C,CAAC,CAAC,IAAI,CAAC,MACL,KAAK,EAAE,OAAO,EAAE,WAAW,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,UAAU,QAAQ,CAAC,CAAC,OAAO,EAAE,aAAa,QAAQ,CAAC,CAAC;AAAA,IAC3G;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,OAAO,QAAQ,OAAO,OAAO,WAAW,CAAC,CAAC,EAAE;AAAA,MAC7C,CAAC,CAAC,IAAI,CAAC,MACL,KAAK,EAAE,OAAO,EAAE,WAAW,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,UAAU,QAAQ,CAAC,CAAC,OAAO,EAAE,aAAa,QAAQ,CAAC,CAAC;AAAA,IAC3G;AAAA,IACA;AAAA,EACF;AACA,gBAAc,KAAK,KAAK,QAAQ,YAAY,GAAG,GAAG,KAAK,IAAI,CAAC;AAE5D,SAAO,EAAE,OAAO;AAClB;","names":[]}
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.61.0",
5
+ "version": "0.63.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -1,8 +1,8 @@
1
- import { S as Scenario, C as CampaignResult, q as GateResult, v as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, n as CampaignTraceWriter, M as MutableSurface, s as GenerationRecord, p as GateDecision } from './types-Beb6KPqZ.js';
1
+ import { S as Scenario, f as CampaignResult, k as GateResult, o as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, g as CampaignTraceWriter, M as MutableSurface, m as GenerationRecord, P as ParetoParent, j as GateDecision } from './types-c2R2kfmv.js';
2
2
  import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
3
3
  import { R as RedTeamCase } from './red-team-DW9Ca_tj.js';
4
- import { R as RunRecord } from './run-record-DgUVo5pw.js';
5
- import { H as HostedClient, T as TraceSpanEvent } from './index-D9dwa00f.js';
4
+ import { R as RunRecord } from './run-record-BgTFzO2r.js';
5
+ import { H as HostedClient, T as TraceSpanEvent } from './index-GISRh500.js';
6
6
 
7
7
  /**
8
8
  * @experimental
@@ -83,13 +83,17 @@ declare function evolutionaryDriver<TFindings = unknown>(opts: EvolutionaryDrive
83
83
  * scores + weakest dimensions, asks an LLM to propose targeted rewrites of
84
84
  * the current surface, and returns them as the next population.
85
85
  *
86
- * Honest scope vs the GEPA paper (Agrawal et al., arXiv:2507.19457):
87
- * this driver implements the *reflection* primitive it does NOT implement
88
- * GEPA's Pareto frontier of candidates, multi-objective non-dominated
89
- * tracking, or the combine-complementary-lessons step. We use "best by
90
- * composite" as the parent each generation; the paper retains a Pareto set
91
- * and combines lessons across non-dominated candidates. Tracked as #101 in
92
- * the substrate roadmap. See `docs/specs/driver-honest-spec.md`.
86
+ * Maps onto the GEPA paper (Agrawal et al., arXiv:2507.19457):
87
+ * - *Reflection*: each generation reflects on the best parent's weakest
88
+ * dimensions + per-scenario top/bottom scores to propose targeted rewrites.
89
+ * - *Pareto frontier*: `runOptimization` maintains the non-dominated set of
90
+ * surfaces across generations (per-scenario objective vectors) and supplies
91
+ * it as `ctx.paretoParents`. A surface uniquely best on one hard scenario
92
+ * survives even when its mean composite is lower.
93
+ * - *Combine complementary lessons*: when the frontier has >1 member, the
94
+ * first population slot is a merge of those parents' strengths (one LLM
95
+ * call citing each parent's winning scenarios). Toggle via `combineParents`.
96
+ * Dominance is computed by the package-canonical `paretoFrontier` (`pareto.ts`).
93
97
  *
94
98
  * Optional `constraints` move structured-doc guards into the driver
95
99
  * (preserve H2 section headings, cap sentence-level edits) — useful when
@@ -140,6 +144,16 @@ interface GepaDriverOptions {
140
144
  /** Structured-doc constraints. Candidates violating any are rejected
141
145
  * post-parse and dropped from the returned population. */
142
146
  constraints?: GepaDriverConstraints;
147
+ /** GEPA combine-complementary-lessons: when the loop supplies a Pareto
148
+ * frontier of >1 non-dominated parents (`ctx.paretoParents`), spend one
149
+ * slot of the population on a merge of their strengths. Default `true` —
150
+ * this is the GEPA-faithful behavior; the merge only fires once the
151
+ * frontier has more than one member (generation ≥ 1). Set `false` for
152
+ * pure single-parent reflection. */
153
+ combineParents?: boolean;
154
+ /** Cap on how many frontier parents feed one combine prompt (highest
155
+ * composite first), to bound prompt size. Default 4. */
156
+ combineMaxParents?: number;
143
157
  }
144
158
  declare function gepaDriver(opts: GepaDriverOptions): ImprovementDriver;
145
159
  /** Extract H2 headings (`## Foo`) from a markdown surface. Exported for
@@ -295,6 +309,17 @@ interface RunCampaignOptions<TScenario extends Scenario, TArtifact> {
295
309
  * at `<runDir>/traces/`. `'off'` disables capture entirely — substrate
296
310
  * refuses this when the caller wires `autoOnPromote !== 'none'`. */
297
311
  tracing?: 'on' | 'off';
312
+ /**
313
+ * Per-cell usage expectation — the early, fine-grained sibling of the
314
+ * batch `assertRealBackend` guard. A cell that produced an artifact (no
315
+ * error) but reported `costUsd === 0` AND zero tokens is a stub: the
316
+ * dispatch never reported LLM activity via `ctx.cost`. Modes:
317
+ * - `'warn'` (default) — log the offending cell loudly, keep going.
318
+ * - `'assert'` — throw `BackendIntegrityError` on the first such cell
319
+ * (fail-fast; recommended for CI campaigns expecting real LLM calls).
320
+ * - `'off'` — no check (replay / deterministic-only / offline analysis).
321
+ */
322
+ expectUsage?: 'assert' | 'warn' | 'off';
298
323
  /** Test seam — override the wall clock for deterministic tests. */
299
324
  now?: () => Date;
300
325
  /** Test seam — override per-cell trace writer factory. */
@@ -399,6 +424,12 @@ interface RunOptimizationResult<TArtifact, TScenario extends Scenario> {
399
424
  * emitted provenance record. Absent when the winner is the baseline. */
400
425
  winnerRationale?: string;
401
426
  baselineCampaign: CampaignResult<TArtifact, TScenario>;
427
+ /** The GEPA Pareto frontier across every scored surface (baseline + all
428
+ * generations) by per-scenario objective vector — the non-dominated set.
429
+ * Each generation's `propose()` received the frontier-so-far as
430
+ * `ctx.paretoParents`; this is the final frontier. A surface here that is
431
+ * NOT the winner is uniquely best on some scenario the winner loses on. */
432
+ paretoFrontier: ParetoParent[];
402
433
  }
403
434
  declare function runOptimization<TScenario extends Scenario, TArtifact>(opts: RunOptimizationOptions<TScenario, TArtifact>): Promise<RunOptimizationResult<TArtifact, TScenario>>;
404
435
  declare function surfaceHash(surface: MutableSurface): string;
@@ -649,4 +680,4 @@ interface EmitLoopProvenanceArgs<TArtifact, TScenario extends Scenario> extends
649
680
  */
650
681
  declare function emitLoopProvenance<TArtifact, TScenario extends Scenario>(args: EmitLoopProvenanceArgs<TArtifact, TScenario>): Promise<EmitLoopProvenanceResult>;
651
682
 
652
- export { provenanceSpansPath as A, type BuildLoopProvenanceArgs as B, type CampaignStorage as C, type DefaultProductionGateOptions as D, type EmitLoopProvenanceArgs as E, runCampaign as F, type GepaDriverConstraints as G, type HeldOutGateOptions as H, runEval as I, runImprovementLoop as J, runOptimization as K, type LoopProvenanceBackend as L, surfaceContentHash as M, surfaceHash as N, type OpenAutoPrOptions as O, type RunCampaignOptions as R, type EmitLoopProvenanceResult as a, type EvolutionaryDriverOptions as b, type GepaDriverOptions as c, type LoopProvenanceCandidate as d, type LoopProvenanceRecord as e, type OpenAutoPrResult as f, type RunEvalOptions as g, type RunImprovementLoopOptions as h, type RunImprovementLoopResult as i, type RunOptimizationOptions as j, type RunOptimizationResult as k, buildLoopProvenanceRecord as l, composeGate as m, countSentenceEdits as n, defaultProductionGate as o, defaultRenderDiff as p, emitLoopProvenance as q, evolutionaryDriver as r, extractH2Sections as s, fsCampaignStorage as t, gepaDriver as u, heldOutGate as v, inMemoryCampaignStorage as w, loopProvenanceSpans as x, openAutoPr as y, provenanceRecordPath as z };
683
+ export { loopProvenanceSpans as A, type BuildLoopProvenanceArgs as B, type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, openAutoPr as F, type GepaDriverOptions as G, type HeldOutGateOptions as H, provenanceRecordPath as I, provenanceSpansPath as J, runOptimization as K, type LoopProvenanceRecord as L, surfaceContentHash as M, surfaceHash as N, type OpenAutoPrOptions as O, type RunImprovementLoopResult as R, type RunCampaignOptions as a, type RunEvalOptions as b, type RunImprovementLoopOptions as c, composeGate as d, defaultProductionGate as e, evolutionaryDriver as f, fsCampaignStorage as g, gepaDriver as h, heldOutGate as i, inMemoryCampaignStorage as j, runEval as k, runImprovementLoop as l, type EmitLoopProvenanceArgs as m, type EmitLoopProvenanceResult as n, type GepaDriverConstraints as o, type LoopProvenanceBackend as p, type LoopProvenanceCandidate as q, runCampaign as r, type OpenAutoPrResult as s, type RunOptimizationOptions as t, type RunOptimizationResult as u, buildLoopProvenanceRecord as v, countSentenceEdits as w, defaultRenderDiff as x, emitLoopProvenance as y, extractH2Sections as z };
@@ -1,5 +1,5 @@
1
1
  import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-DbjLfz-K.js';
2
- import { R as RunRecord } from './run-record-DgUVo5pw.js';
2
+ import { R as RunRecord } from './run-record-BgTFzO2r.js';
3
3
  import { T as TraceAnalysisStore } from './store-jzKpMl16.js';
4
4
  import { J as JudgeInput } from './types-DhqpAi_z.js';
5
5
 
@@ -1,8 +1,8 @@
1
1
  import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
2
2
  import { a as JudgeScore } from './types-DhqpAi_z.js';
3
3
  import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-B2kL-fSM.js';
4
- import { m as GateDecision } from './summary-report-BQvXpvaR.js';
5
- import { R as RunRecord, a as RunSplitTag } from './run-record-DgUVo5pw.js';
4
+ import { m as GateDecision } from './summary-report-ByiOUrHj.js';
5
+ import { R as RunRecord, b as RunSplitTag } from './run-record-BgTFzO2r.js';
6
6
 
7
7
  /**
8
8
  * Release confidence gate.
@@ -1,8 +1,8 @@
1
- export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-DgBHWsh7.js';
2
- export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-DszkgvJ3.js';
1
+ export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-D_4BSXGV.js';
2
+ export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-DGoeObZT.js';
3
3
  export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
4
- export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-BQvXpvaR.js';
5
- import './run-record-DgUVo5pw.js';
4
+ export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-ByiOUrHj.js';
5
+ import './run-record-BgTFzO2r.js';
6
6
  import './errors-Dwqw-T_m.js';
7
7
  import './schema-m0gsnbt3.js';
8
8
  import './outcome-store-D6KWmYvj.js';
@@ -1,6 +1,6 @@
1
- import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-DgUVo5pw.js';
1
+ import { b as RunSplitTag, a as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-BgTFzO2r.js';
2
2
  import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-DbjLfz-K.js';
3
- import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-BQvXpvaR.js';
3
+ import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-ByiOUrHj.js';
4
4
  import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DEZwY14K.js';
5
5
  import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CJzrpUua.js';
6
6
  import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
package/dist/rl.d.ts CHANGED
@@ -1,17 +1,17 @@
1
- import { R as RunRecord, a as RunSplitTag } from './run-record-DgUVo5pw.js';
2
- import { C as CampaignResult } from './types-Beb6KPqZ.js';
3
- import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-BaVsy0sW.js';
4
- export { r as runEvalCampaign } from './researcher-BaVsy0sW.js';
1
+ import { R as RunRecord, b as RunSplitTag } from './run-record-BgTFzO2r.js';
2
+ import { f as CampaignResult } from './types-c2R2kfmv.js';
3
+ import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-WJvIpX3L.js';
4
+ export { r as runEvalCampaign } from './researcher-WJvIpX3L.js';
5
5
  import { S as Span } from './schema-m0gsnbt3.js';
6
6
  import { T as TraceStore } from './store-CKUAgsJz.js';
7
7
  import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
8
8
  export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-D6KWmYvj.js';
9
- import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-DgBHWsh7.js';
9
+ import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-D_4BSXGV.js';
10
10
  import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
11
11
  import './errors-Dwqw-T_m.js';
12
12
  import './llm-client-DbjLfz-K.js';
13
13
  import './raw-provider-sink-C46HDghv.js';
14
- import './summary-report-BQvXpvaR.js';
14
+ import './summary-report-ByiOUrHj.js';
15
15
  import './failure-cluster-CL7IVgkJ.js';
16
16
  import './emitter-DEZwY14K.js';
17
17
  import './integrity-CJzrpUua.js';
@@ -1,4 +1,4 @@
1
- import { R as RunRecord } from './run-record-DgUVo5pw.js';
1
+ import { R as RunRecord } from './run-record-BgTFzO2r.js';
2
2
  import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
3
3
 
4
4
  /**
@@ -1,10 +1,11 @@
1
1
  import {
2
2
  runCampaign
3
- } from "./chunk-OLULBECP.js";
3
+ } from "./chunk-7TPYV2ER.js";
4
+ import "./chunk-E22YUOAL.js";
4
5
  import "./chunk-ITBRCT73.js";
5
6
  import "./chunk-3BFEG2F6.js";
6
7
  import "./chunk-PZ5AY32C.js";
7
8
  export {
8
9
  runCampaign
9
10
  };
10
- //# sourceMappingURL=run-campaign-HXPJAUZ3.js.map
11
+ //# sourceMappingURL=run-campaign-5J3ED2UJ.js.map
@@ -304,4 +304,4 @@ declare function parseRunRecordSafe(input: unknown): {
304
304
  /** Round-trip helper — `JSON.parse(JSON.stringify(record))` then validate. */
305
305
  declare function roundTripRunRecord(record: RunRecord): RunRecord;
306
306
 
307
- export { type AgentProfileCell as A, validateAgentProfileCell as B, validateRunRecord as C, verifyAgentProfileCell as D, type JudgeScoresRecord as J, type RunRecord as R, type SandboxAgentProfileLike as S, type RunSplitTag as a, type RunTokenUsage as b, type RunJudgeMetadata as c, type AgentProfileCellInput as d, AGENT_PROFILE_KINDS as e, type AgentProfileCellSchemaVersion as f, AgentProfileCellValidationError as g, type AgentProfileDimensionValue as h, type AgentProfileHarness as i, type AgentProfileJson as j, type AgentProfileKind as k, type AgentProfileSource as l, type AgentProfileSourceInput as m, type RunOutcome as n, RunRecordValidationError as o, agentProfileCellHashMaterial as p, agentProfileCellKey as q, assertRunAgentProfileCell as r, buildAgentProfileCell as s, buildSandboxAgentProfileCell as t, groupRunsByAgentProfileCell as u, isRunRecord as v, parseRunRecordSafe as w, requireAgentProfileCell as x, roundTripRunRecord as y, toAgentProfileJson as z };
307
+ export { type AgentProfileCell as A, validateAgentProfileCell as B, validateRunRecord as C, verifyAgentProfileCell as D, type JudgeScoresRecord as J, type RunRecord as R, type SandboxAgentProfileLike as S, type RunTokenUsage as a, type RunSplitTag as b, type RunJudgeMetadata as c, type AgentProfileCellInput as d, AGENT_PROFILE_KINDS as e, type AgentProfileCellSchemaVersion as f, AgentProfileCellValidationError as g, type AgentProfileDimensionValue as h, type AgentProfileHarness as i, type AgentProfileJson as j, type AgentProfileKind as k, type AgentProfileSource as l, type AgentProfileSourceInput as m, type RunOutcome as n, RunRecordValidationError as o, agentProfileCellHashMaterial as p, agentProfileCellKey as q, assertRunAgentProfileCell as r, buildAgentProfileCell as s, buildSandboxAgentProfileCell as t, groupRunsByAgentProfileCell as u, isRunRecord as v, parseRunRecordSafe as w, requireAgentProfileCell as x, roundTripRunRecord as y, toAgentProfileJson as z };
@@ -1,4 +1,4 @@
1
- import { R as RunRecord } from './run-record-DgUVo5pw.js';
1
+ import { R as RunRecord } from './run-record-BgTFzO2r.js';
2
2
  import { F as FailureClusterReport } from './failure-cluster-CL7IVgkJ.js';
3
3
 
4
4
  /**
@@ -1,3 +1,5 @@
1
+ import { a as RunTokenUsage } from './run-record-BgTFzO2r.js';
2
+
1
3
  /**
2
4
  * @experimental
3
5
  *
@@ -17,6 +19,7 @@
17
19
  * No new architecture vs 0.38 — Pass A formalizes the shapes so consumers
18
20
  * can build dashboards / CI gates / regression diffs against a stable schema.
19
21
  */
22
+
20
23
  /** @experimental Stable identifier + kind tag for any scenario. Consumers
21
24
  * extend with their per-domain payload (persona, task, requirement, ...). */
22
25
  interface Scenario {
@@ -136,6 +139,28 @@ interface ProposedCandidate {
136
139
  /** @experimental Type guard: a proposal carrying its rationale vs a bare
137
140
  * surface. The loop branches on this to populate `GenerationCandidate`. */
138
141
  declare function isProposedCandidate(value: MutableSurface | ProposedCandidate): value is ProposedCandidate;
142
+ /** @experimental A non-dominated parent on the GEPA Pareto frontier — a
143
+ * surface that, across the per-scenario objective vectors, no other tried
144
+ * surface beats on every scenario. A candidate worse on the mean composite
145
+ * but uniquely best on one hard scenario is non-dominated and survives here;
146
+ * the composite-best ranking would discard the lesson it carries. The loop
147
+ * computes the frontier across ALL generations and hands it to the driver so
148
+ * a reflective driver can combine complementary lessons (GEPA, Agrawal et
149
+ * al., arXiv:2507.19457). See `pareto.ts` (`paretoFrontier`). */
150
+ interface ParetoParent {
151
+ surface: MutableSurface;
152
+ surfaceHash: string;
153
+ /** The objective vector: per-scenario composite (higher is better). The
154
+ * axes the frontier is computed over. */
155
+ objectives: Record<string, number>;
156
+ /** Mean composite across the objective scenarios — the scalar summary used
157
+ * for ordering + display, NOT for dominance. */
158
+ composite: number;
159
+ /** Generation that produced this surface (`-1` for the baseline). */
160
+ generation: number;
161
+ label?: string;
162
+ rationale?: string;
163
+ }
139
164
  /** @experimental Stateless surface mutation — given findings + current
140
165
  * surface, return N candidate surfaces. Pure transform, no generation
141
166
  * awareness. Reflective-mutation, `runMultiShotOptimization`, `AxGEPA`
@@ -174,6 +199,13 @@ interface ProposeContext<TFindings = unknown> {
174
199
  * 1 = single-shot; >1 = it may iterate on its own change before handing it
175
200
  * back to be measured. */
176
201
  maxImprovementShots?: number;
202
+ /** GEPA Pareto frontier across ALL generations so far — the non-dominated
203
+ * surfaces by per-scenario objective vector. Empty/absent on generation 0
204
+ * (only the baseline is scored). A reflective driver combines the
205
+ * complementary lessons of these parents (each excels on different
206
+ * scenarios) into a merged candidate. Drivers doing pure single-parent
207
+ * reflection may ignore it. See {@link ParetoParent}. */
208
+ paretoParents?: ParetoParent[];
177
209
  }
178
210
  /** @experimental A surface-improvement strategy — the DRIVER of the
179
211
  * improvement loop. Given the current best surface, the history of what's
@@ -257,17 +289,18 @@ interface CampaignArtifactWriter {
257
289
  write(path: string, content: string | Uint8Array): Promise<string>;
258
290
  writeJson(path: string, value: unknown): Promise<string>;
259
291
  }
260
- /** Token usage accumulated for a cell. Structurally mirrors `RunTokenUsage`
261
- * (run-record.ts) so a cell maps cleanly onto a `RunRecord` for the
262
- * backend-integrity guard without coupling the campaign module to it. */
263
- interface CampaignTokenUsage {
264
- input: number;
265
- output: number;
266
- cached?: number;
267
- }
268
- /** @experimental Cell-scoped cost meter. Substrate auto-tracks LLM costs
269
- * via the cost-ledger backend hooks; consumers can record additional
270
- * spend (sandbox time, tool costs) via `observe`. */
292
+ /** Token usage accumulated for a cell. Aliased to the canonical `RunTokenUsage`
293
+ * (run-record.ts, same package) so a cell maps onto a `RunRecord` for the
294
+ * backend-integrity guard with ONE source of truth — a field added to
295
+ * `RunTokenUsage` is a compile error here, not a silent drift. */
296
+ type CampaignTokenUsage = RunTokenUsage;
297
+ /** @experimental Cell-scoped cost meter. NOTHING is captured automatically —
298
+ * the substrate does not intercept the LLM call, so it cannot see cost or
299
+ * tokens unless the dispatch reports them. Every LLM cost MUST be reported via
300
+ * `observe` and every token count via `observeTokens`; a dispatch that reports
301
+ * neither yields a `{cost:0, tokens:0}` cell, which the backend-integrity
302
+ * guard (`assertRealBackend`) correctly reads as a stub. Also use `observe`
303
+ * for non-LLM spend (sandbox time, tool costs). */
271
304
  interface CampaignCostMeter {
272
305
  observe(amountUsd: number, source: string): void;
273
306
  /** Record LLM token usage for this cell; accumulates across calls. A cell
@@ -450,4 +483,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
450
483
  scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
451
484
  }
452
485
 
453
- export { labelTrustRank as A, type CampaignResult as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ProposeContext as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type LabeledScenarioWrite as c, type LabeledScenarioSampleArgs as d, type LabeledScenarioRecord as e, type LabelTrust as f, type LabeledScenarioSource as g, type CodeSurface as h, type CampaignAggregates as i, type CampaignArtifactWriter as j, type CampaignCellResult as k, type CampaignCostMeter as l, type CampaignTokenUsage as m, type CampaignTraceWriter as n, type GateContext as o, type GateDecision as p, type GateResult as q, type GenerationCandidate as r, type GenerationRecord as s, type JudgeAggregate as t, type JudgeDimension as u, type Mutator as v, type ProposedCandidate as w, type ScenarioAggregate as x, type SessionScript as y, isProposedCandidate as z };
486
+ export { isProposedCandidate as A, labelTrustRank as B, type CampaignAggregates as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ParetoParent as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type CampaignArtifactWriter as c, type CampaignCellResult as d, type CampaignCostMeter as e, type CampaignResult as f, type CampaignTraceWriter as g, type CodeSurface as h, type GateContext as i, type GateDecision as j, type GateResult as k, type GenerationCandidate as l, type GenerationRecord as m, type JudgeDimension as n, type Mutator as o, type SessionScript as p, type LabeledScenarioWrite as q, type LabeledScenarioSampleArgs as r, type LabeledScenarioRecord as s, type LabelTrust as t, type LabeledScenarioSource as u, type CampaignTokenUsage as v, type JudgeAggregate as w, type ProposeContext as x, type ProposedCandidate as y, type ScenarioAggregate as z };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.61.0",
3
+ "version": "0.63.0",
4
4
  "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
5
5
  "homepage": "https://github.com/tangle-network/agent-eval#readme",
6
6
  "repository": {