@tangle-network/agent-eval 0.60.0 → 0.62.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +27 -0
- package/dist/adapters/http.d.ts +4 -1
- package/dist/adapters/langchain.d.ts +4 -1
- package/dist/adapters/otel.d.ts +5 -5
- package/dist/agent-profile-DzcPHR1Z.d.ts +114 -0
- package/dist/benchmarks/index.d.ts +3 -3
- package/dist/builder-eval/index.js +2 -2
- package/dist/campaign/index.d.ts +151 -11
- package/dist/campaign/index.js +212 -10
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-QDOSODID.js → chunk-3B7Y5AUR.js} +2 -2
- package/dist/{chunk-QYJT52YW.js → chunk-3BFEG2F6.js} +1 -1
- package/dist/chunk-3BFEG2F6.js.map +1 -0
- package/dist/{chunk-J4DIMSRK.js → chunk-6EKXFFGQ.js} +2 -2
- package/dist/{chunk-63EPZQUZ.js → chunk-6REHLN5J.js} +2 -2
- package/dist/{chunk-NOPYCRNG.js → chunk-7TPYV2ER.js} +39 -2
- package/dist/chunk-7TPYV2ER.js.map +1 -0
- package/dist/{chunk-GM476SZU.js → chunk-AIWHLG7J.js} +5 -5
- package/dist/{chunk-AIXHUIHG.js → chunk-B26KI423.js} +3 -3
- package/dist/{chunk-LBSXXH56.js → chunk-CV2BS2OV.js} +8 -6
- package/dist/chunk-CV2BS2OV.js.map +1 -0
- package/dist/chunk-E22YUOAL.js +111 -0
- package/dist/chunk-E22YUOAL.js.map +1 -0
- package/dist/{chunk-NCK5QLGT.js → chunk-F3SRAAZO.js} +2 -2
- package/dist/{chunk-VXNVVBZO.js → chunk-IHDHUN2X.js} +2 -2
- package/dist/{chunk-S3SDD56V.js → chunk-ITBRCT73.js} +2 -2
- package/dist/{chunk-OLIBRKRD.js → chunk-KX6F6NCG.js} +2 -2
- package/dist/chunk-PQV2TKC3.js +27 -0
- package/dist/chunk-PQV2TKC3.js.map +1 -0
- package/dist/{chunk-UBPIXOC4.js → chunk-SBCB6VZY.js} +2 -2
- package/dist/{chunk-GBHRUAOF.js → chunk-SS2SOBBT.js} +2 -107
- package/dist/chunk-SS2SOBBT.js.map +1 -0
- package/dist/{chunk-YTMXBHFM.js → chunk-T375SUOZ.js} +2 -2
- package/dist/{chunk-PIEAE33T.js → chunk-Z4ZCBC7M.js} +2 -2
- package/dist/cli.js +3 -3
- package/dist/contract/index.d.ts +13 -13
- package/dist/contract/index.js +8 -7
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-DjEgwWNo.d.ts → control-DxvZeV5X.d.ts} +2 -2
- package/dist/control.d.ts +5 -5
- package/dist/control.js +3 -3
- package/dist/{dataset-BlwAtYYf.d.ts → dataset-B2kL-fSM.d.ts} +1 -1
- package/dist/{errors-mje_cKOs.d.ts → errors-Dwqw-T_m.d.ts} +1 -1
- package/dist/{feedback-trajectory-DpUmE90J.d.ts → feedback-trajectory-8hKC5EOb.d.ts} +1 -1
- package/dist/governance/index.d.ts +3 -3
- package/dist/hosted/index.d.ts +5 -5
- package/dist/{index-wlaiph9Y.d.ts → index-DsnOpCO6.d.ts} +1 -1
- package/dist/{index-BIkvdkSU.d.ts → index-DxfmYUjC.d.ts} +2 -2
- package/dist/index.d.ts +108 -132
- package/dist/index.js +339 -73
- package/dist/index.js.map +1 -1
- package/dist/{integrity-CfXjSqEv.d.ts → integrity-CJzrpUua.d.ts} +1 -1
- package/dist/{llm-client-BXVRUZyX.d.ts → llm-client-DbjLfz-K.d.ts} +1 -1
- package/dist/meta-eval/index.d.ts +3 -3
- package/dist/multishot/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.js +3 -3
- package/dist/{provenance-BM8vmMBa.d.ts → provenance-CYBV9Ox6.d.ts} +16 -5
- package/dist/{red-team-CrC5MZYd.d.ts → red-team-DW9Ca_tj.d.ts} +1 -1
- package/dist/{registry-DK9kqXvb.d.ts → registry-DPly4_hZ.d.ts} +2 -2
- package/dist/{release-report-DmPjIce3.d.ts → release-report-DGoeObZT.d.ts} +3 -3
- package/dist/reporting.d.ts +6 -6
- package/dist/reporting.js +4 -4
- package/dist/{researcher-JP8EvnLv.d.ts → researcher-WJvIpX3L.d.ts} +4 -4
- package/dist/rl.d.ts +9 -9
- package/dist/rl.js +7 -7
- package/dist/{rubric-predictive-validity-B3qNa4aY.d.ts → rubric-predictive-validity-D_4BSXGV.d.ts} +1 -1
- package/dist/run-campaign-5J3ED2UJ.js +11 -0
- package/dist/{run-record-etiCMsUq.d.ts → run-record-BgTFzO2r.d.ts} +2 -2
- package/dist/{summary-report-DLxh4yWk.d.ts → summary-report-ByiOUrHj.d.ts} +1 -1
- package/dist/traces.d.ts +2 -2
- package/dist/traces.js +3 -3
- package/dist/{types-VCIXx_yo.d.ts → types-DH22o8hM.d.ts} +28 -4
- package/dist/wire/index.d.ts +3 -3
- package/dist/wire/index.js +3 -3
- package/package.json +12 -25
- package/dist/chunk-GBHRUAOF.js.map +0 -1
- package/dist/chunk-LBSXXH56.js.map +0 -1
- package/dist/chunk-NOPYCRNG.js.map +0 -1
- package/dist/chunk-QYJT52YW.js.map +0 -1
- package/dist/run-campaign-5XENUKRF.js +0 -10
- /package/dist/{chunk-QDOSODID.js.map → chunk-3B7Y5AUR.js.map} +0 -0
- /package/dist/{chunk-J4DIMSRK.js.map → chunk-6EKXFFGQ.js.map} +0 -0
- /package/dist/{chunk-63EPZQUZ.js.map → chunk-6REHLN5J.js.map} +0 -0
- /package/dist/{chunk-GM476SZU.js.map → chunk-AIWHLG7J.js.map} +0 -0
- /package/dist/{chunk-AIXHUIHG.js.map → chunk-B26KI423.js.map} +0 -0
- /package/dist/{chunk-NCK5QLGT.js.map → chunk-F3SRAAZO.js.map} +0 -0
- /package/dist/{chunk-VXNVVBZO.js.map → chunk-IHDHUN2X.js.map} +0 -0
- /package/dist/{chunk-S3SDD56V.js.map → chunk-ITBRCT73.js.map} +0 -0
- /package/dist/{chunk-OLIBRKRD.js.map → chunk-KX6F6NCG.js.map} +0 -0
- /package/dist/{chunk-UBPIXOC4.js.map → chunk-SBCB6VZY.js.map} +0 -0
- /package/dist/{chunk-YTMXBHFM.js.map → chunk-T375SUOZ.js.map} +0 -0
- /package/dist/{chunk-PIEAE33T.js.map → chunk-Z4ZCBC7M.js.map} +0 -0
- /package/dist/{run-campaign-5XENUKRF.js.map → run-campaign-5J3ED2UJ.js.map} +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-
|
|
1
|
+
import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-Dwqw-T_m.js';
|
|
2
2
|
import { R as RawProviderSink, P as ProviderRedactor } from './raw-provider-sink-C46HDghv.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
@@ -2,9 +2,9 @@ import { T as TraceStore } from '../store-CKUAgsJz.js';
|
|
|
2
2
|
import { R as Run } from '../schema-m0gsnbt3.js';
|
|
3
3
|
import { a as OutcomeFilter, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
|
|
4
4
|
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-D6KWmYvj.js';
|
|
5
|
-
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-
|
|
6
|
-
import '../run-record-
|
|
7
|
-
import '../errors-
|
|
5
|
+
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-D_4BSXGV.js';
|
|
6
|
+
import '../run-record-BgTFzO2r.js';
|
|
7
|
+
import '../errors-Dwqw-T_m.js';
|
|
8
8
|
|
|
9
9
|
/**
|
|
10
10
|
* Correlation study — "does our eval score predict real-world outcomes?"
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/multishot/router.ts","../../src/multishot/default-tools.ts","../../src/multishot/judges.ts","../../src/multishot/matrix.ts","../../src/multishot/types.ts","../../src/multishot/multishot.ts"],"sourcesContent":["// Router fetch helper — single source of truth for OpenAI-compat calls\n// against the Tangle router. Used by the driver, agent, judges, and the\n// default tool executors.\n\nimport type { MultishotToolDefinition } from './types'\n\nexport interface RouterCompletionRequest {\n apiKey: string\n baseUrl: string\n model: string\n messages: Array<Record<string, unknown>>\n tools?: MultishotToolDefinition[]\n temperature?: number\n maxTokens?: number\n signal?: AbortSignal\n}\n\nexport interface RouterToolCall {\n id: string\n type: 'function'\n function: { name: string; arguments: string }\n}\n\nexport interface RouterCompletionResponse {\n message: { content?: string | null; tool_calls?: RouterToolCall[] }\n usage?: { prompt_tokens?: number; completion_tokens?: number }\n}\n\nexport async function routerCompletion(\n req: RouterCompletionRequest,\n): Promise<RouterCompletionResponse> {\n const body: Record<string, unknown> = {\n model: req.model,\n messages: req.messages,\n temperature: req.temperature ?? 0.7,\n max_tokens: req.maxTokens ?? 2000,\n }\n if (req.tools?.length) body.tools = req.tools\n const url = `${req.baseUrl.replace(/\\/+$/, '')}/chat/completions`\n const res = await fetch(url, {\n method: 'POST',\n headers: { Authorization: `Bearer ${req.apiKey}`, 'Content-Type': 'application/json' },\n body: JSON.stringify(body),\n signal: req.signal,\n })\n if (!res.ok) {\n const text = await res.text()\n throw new Error(`router ${res.status}: ${text.slice(0, 300)}`)\n }\n const json = (await res.json()) as {\n choices: Array<{ message: { content?: string | null; tool_calls?: RouterToolCall[] } }>\n usage?: { prompt_tokens?: number; completion_tokens?: number }\n }\n const choice = json.choices[0]\n if (!choice) throw new Error(`router returned no choices: ${JSON.stringify(json).slice(0, 200)}`)\n return { message: choice.message, usage: json.usage }\n}\n\n// Rough per-model cost estimator. Used for cost-ceiling enforcement.\n// Underestimates Anthropic, overestimates oss models — fine for ceilings.\nexport function estimateRouterCost(\n model: string,\n usage?: { prompt_tokens?: number; completion_tokens?: number },\n): number {\n if (!usage) return 0\n const inputTok = usage.prompt_tokens ?? 0\n const outputTok = usage.completion_tokens ?? 0\n let inPer1k = 0.003\n let outPer1k = 0.015\n if (model.includes('gpt-4o-mini')) {\n inPer1k = 0.00015\n outPer1k = 0.0006\n } else if (model.includes('gpt-5.4') || model.includes('claude-sonnet')) {\n inPer1k = 0.003\n outPer1k = 0.015\n } else if (model.includes('kimi') || model.includes('glm') || model.includes('deepseek')) {\n inPer1k = 0.0005\n outPer1k = 0.002\n }\n return (inputTok * inPer1k + outputTok * outPer1k) / 1000\n}\n\nexport function defaultRouterBaseUrl(): string {\n return (process.env.TANGLE_ROUTER_BASE_URL ?? 'https://router.tangle.tools/v1').replace(\n /\\/+$/,\n '',\n )\n}\n\nexport function requireRouterApiKey(): string {\n const key = process.env.TANGLE_API_KEY\n if (!key) throw new Error('multishot requires TANGLE_API_KEY (router-scoped sk-tan-* key)')\n return key\n}\n","// Default delegate_research + delegate_code tools and their inline executors.\n//\n// Consumers can override either by passing their own tools + executors to\n// runMultishot. The defaults are sufficient for most domains — point the\n// researcher system prompt at your domain's citation style and the coder\n// at your preferred language.\n\nimport { estimateRouterCost, routerCompletion } from './router'\nimport type { MultishotToolDefinition, MultishotToolExecutor } from './types'\n\nexport const DEFAULT_RESEARCHER_MODEL = 'openai/gpt-4o-mini'\nexport const DEFAULT_CODER_MODEL = 'openai/gpt-4o-mini'\n\nexport interface DefaultResearcherConfig {\n /** Replace the system prompt to bias the researcher toward a domain's\n * citation style. Defaults to a generic \"cite sources by name\" prompt. */\n systemPrompt?: string\n model?: string\n}\n\nexport interface DefaultCoderConfig {\n /** Replace the system prompt to bias the coder toward a language /\n * framework / artifact style. */\n systemPrompt?: string\n model?: string\n}\n\nconst GENERIC_RESEARCHER_SYSTEM =\n 'You are a research specialist. Return a markdown brief with 3-5 findings. Each finding cites a specific source by name. Add a confidence level (high/medium/low) per finding. No fluff, no preamble.'\n\nconst GENERIC_CODER_SYSTEM =\n 'You are an expert engineer. Output ONE fenced code block containing the complete solution. Inline-comment non-obvious decisions. No explanation outside the block.'\n\nexport const DEFAULT_DELEGATE_RESEARCH_TOOL: MultishotToolDefinition = {\n type: 'function',\n function: {\n name: 'delegate_research',\n description:\n 'Research a topic deeply via specialist. Returns evidence-bearing items with citations. Use for audience research, competitive intel, regulatory landscape, market data, citation-grounded analysis.',\n parameters: {\n type: 'object',\n properties: {\n question: { type: 'string', description: 'Specific question to research' },\n scope: {\n type: 'string',\n description: 'Optional scope: time window, geography, jurisdiction, segment',\n },\n },\n required: ['question'],\n },\n },\n}\n\nexport const DEFAULT_DELEGATE_CODE_TOOL: MultishotToolDefinition = {\n type: 'function',\n function: {\n name: 'delegate_code',\n description:\n 'Generate a runnable script, template, pipeline, or tool via specialist. Returns complete working code or structured markdown. Use for content pipelines, calc snippets, dashboards, compliance checklists, deadline trackers.',\n parameters: {\n type: 'object',\n properties: {\n goal: { type: 'string', description: 'What the code must accomplish' },\n language: {\n type: 'string',\n description: 'Optional language preference (default: TypeScript)',\n },\n },\n required: ['goal'],\n },\n },\n}\n\nexport function createResearchExecutor(\n config: DefaultResearcherConfig = {},\n): MultishotToolExecutor {\n const systemPrompt = config.systemPrompt ?? GENERIC_RESEARCHER_SYSTEM\n const model = config.model ?? DEFAULT_RESEARCHER_MODEL\n return async (args, ctx) => {\n const question = String(args.question ?? '')\n const scope = args.scope ? String(args.scope) : undefined\n const { message, usage } = await routerCompletion({\n apiKey: ctx.apiKey,\n baseUrl: ctx.baseUrl,\n model,\n temperature: 0.3,\n maxTokens: 1800,\n messages: [\n { role: 'system', content: systemPrompt },\n { role: 'user', content: `Research: ${question}${scope ? `\\nScope: ${scope}` : ''}` },\n ],\n signal: ctx.signal,\n })\n return { content: message.content ?? '', costUsd: estimateRouterCost(model, usage) }\n }\n}\n\nexport function createCodeExecutor(config: DefaultCoderConfig = {}): MultishotToolExecutor {\n const systemPrompt = config.systemPrompt ?? GENERIC_CODER_SYSTEM\n const model = config.model ?? DEFAULT_CODER_MODEL\n return async (args, ctx) => {\n const goal = String(args.goal ?? '')\n const language = args.language ? String(args.language) : 'TypeScript'\n const { message, usage } = await routerCompletion({\n apiKey: ctx.apiKey,\n baseUrl: ctx.baseUrl,\n model,\n temperature: 0.2,\n maxTokens: 2000,\n messages: [\n { role: 'system', content: `${systemPrompt}\\n\\nLanguage: ${language}` },\n { role: 'user', content: `Produce: ${goal}` },\n ],\n signal: ctx.signal,\n })\n return { content: message.content ?? '', costUsd: estimateRouterCost(model, usage) }\n }\n}\n\nexport interface DefaultToolsConfig {\n research?: DefaultResearcherConfig\n code?: DefaultCoderConfig\n /** When true (default), each tool result is recorded as a typed artifact:\n * research → type='research', code → type='code'. */\n recordArtifacts?: boolean\n}\n\nexport interface DefaultToolsBundle {\n tools: MultishotToolDefinition[]\n executors: Record<string, MultishotToolExecutor>\n artifactTypeFor: (toolName: string) => string | undefined\n}\n\nexport function defaultDelegationTools(config: DefaultToolsConfig = {}): DefaultToolsBundle {\n return {\n tools: [DEFAULT_DELEGATE_RESEARCH_TOOL, DEFAULT_DELEGATE_CODE_TOOL],\n executors: {\n delegate_research: createResearchExecutor(config.research),\n delegate_code: createCodeExecutor(config.code),\n },\n artifactTypeFor: (name) =>\n name === 'delegate_research' ? 'research' : name === 'delegate_code' ? 'code' : undefined,\n }\n}\n\nexport { defaultRouterBaseUrl } from './router'\n","// Generic judge runner — domain consumers configure dimensions + prompts.\n//\n// Three judge slots are conventional for multishot eval:\n// - conversation (scores the full transcript)\n// - codeReview (scores each code artifact)\n// - contentQuality (scores each non-code artifact)\n//\n// But the runJudge primitive is fully generic — any T → JudgeScore mapping.\n\nimport { defaultRouterBaseUrl, requireRouterApiKey, routerCompletion } from './router'\n\nexport const DEFAULT_JUDGE_MODEL = 'openai/gpt-4o-mini'\n\nexport interface JudgeDimension {\n /** JSON field name + score key. */\n key: string\n /** Description shown in the judge's user prompt. */\n description: string\n}\n\nexport interface JudgeConfig<TInput> {\n /** Display name (for trace + log). */\n name: string\n /** Model used for this judge. */\n model?: string\n /** 0-10 scored dimensions. */\n dimensions: JudgeDimension[]\n /** Judge system prompt — sets persona + JSON-only constraint. */\n systemPrompt: string\n /** Build the user prompt from the typed input. Must include \"Respond with\n * ONLY this JSON: { ... }\" listing each dimension key. */\n buildPrompt: (input: TInput) => string\n /** Optional model + api overrides. */\n apiKey?: string\n baseUrl?: string\n}\n\nexport interface JudgeScore {\n /** Per-dimension 0-10 score. Missing dims default to 0. */\n dimensions: Record<string, number>\n /** Mean across dimensions. */\n composite: number\n /** Free-form 1-2 sentence critique from the judge (when provided). */\n notes: string\n}\n\nconst ZERO_SCORE: JudgeScore = { dimensions: {}, composite: 0, notes: 'parse failed' }\n\nexport async function runJudge<TInput>(\n judge: JudgeConfig<TInput>,\n input: TInput,\n): Promise<JudgeScore> {\n const apiKey = judge.apiKey ?? requireRouterApiKey()\n const baseUrl = judge.baseUrl ?? defaultRouterBaseUrl()\n const model = judge.model ?? process.env.JUDGE_MODEL ?? DEFAULT_JUDGE_MODEL\n const prompt = judge.buildPrompt(input)\n let raw = ''\n try {\n const { message } = await routerCompletion({\n apiKey,\n baseUrl,\n model,\n temperature: 0,\n maxTokens: 1500,\n messages: [\n { role: 'system', content: judge.systemPrompt },\n { role: 'user', content: prompt },\n ],\n })\n raw = (message.content ?? '').trim()\n } catch (err) {\n return {\n ...ZERO_SCORE,\n notes: `judge ${judge.name} call failed: ${err instanceof Error ? err.message : String(err)}`,\n }\n }\n\n let parsed: Record<string, unknown> | null = null\n try {\n const cleaned = raw\n .replace(/^```json\\s*/i, '')\n .replace(/```\\s*$/, '')\n .trim()\n parsed = JSON.parse(cleaned) as Record<string, unknown>\n } catch {\n return { ...ZERO_SCORE, notes: `judge ${judge.name} returned non-JSON: ${raw.slice(0, 200)}` }\n }\n\n const dimensions: Record<string, number> = {}\n let sum = 0\n for (const dim of judge.dimensions) {\n const v = Number(parsed[dim.key] ?? 0)\n const clamped = Number.isFinite(v) ? Math.max(0, Math.min(10, v)) : 0\n dimensions[dim.key] = clamped\n sum += clamped\n }\n return {\n dimensions,\n composite: judge.dimensions.length === 0 ? 0 : sum / judge.dimensions.length,\n notes: typeof parsed.notes === 'string' ? parsed.notes : '',\n }\n}\n\n/** Convenience: stringified dimension list for inclusion in a judge prompt.\n * Returns lines like `- audience_fit: Does this match what the audience cares about? (0-10)`. */\nexport function renderDimensions(dims: readonly JudgeDimension[]): string {\n return dims.map((d) => `- ${d.key}: ${d.description}`).join('\\n')\n}\n\n/** Convenience: build the \"Respond with ONLY this JSON\" footer for a judge prompt. */\nexport function renderJsonFooter(dims: readonly JudgeDimension[]): string {\n const fields = dims.map((d) => `\"${d.key}\":N`).join(',')\n return `Respond with ONLY this JSON (no markdown, no preamble):\\n{${fields},\"notes\":\"1-2 sentence critique\"}`\n}\n","// Multishot matrix wrapper — sweeps profiles × personas × reps, runs\n// the driver-agent loop per cell, applies up to three configured judges,\n// persists per-cell artifacts, and aggregates by axis.\n//\n// Uses runAgentMatrix from @tangle-network/agent-eval/matrix under the\n// hood so cell scheduling + concurrency + cost ceiling are unified with\n// other matrix consumers.\n\nimport { mkdirSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type { AgentProfile } from '@tangle-network/sandbox'\nimport type { MatrixResult } from '../matrix'\nimport { runAgentMatrix } from '../matrix'\nimport { type JudgeConfig, type JudgeScore, runJudge } from './judges'\nimport { runMultishot } from './multishot'\nimport type {\n MultishotArtifact,\n MultishotMessage,\n MultishotPersona,\n MultishotShape,\n MultishotToolDefinition,\n MultishotToolExecutor,\n} from './types'\n\nexport interface ConversationJudgeInput<TPersona extends MultishotPersona> {\n transcript: MultishotMessage[]\n persona: TPersona\n}\n\nexport interface ArtifactJudgeInput<TPersona extends MultishotPersona> {\n artifact: MultishotArtifact\n persona: TPersona\n}\n\nexport interface MultishotJudges<TPersona extends MultishotPersona> {\n /** Scores the full transcript end-to-end (always runs). */\n conversation: JudgeConfig<ConversationJudgeInput<TPersona>>\n /** Scores each code-type artifact. Optional — omit when domain has no code artifacts. */\n codeReview?: JudgeConfig<ArtifactJudgeInput<TPersona>>\n /** Scores each non-code (research/content/template) artifact. Optional. */\n contentQuality?: JudgeConfig<ArtifactJudgeInput<TPersona>>\n /** Which artifact types route to codeReview. Defaults to ['code']. */\n codeArtifactTypes?: string[]\n /** Which artifact types route to contentQuality. Defaults to ['research']. */\n contentArtifactTypes?: string[]\n}\n\nexport interface CellCompositeScore {\n composite: number\n conversation: JudgeScore\n codeReview?: {\n perArtifact: Array<JudgeScore & { turn: number; type: string }>\n composite: number\n }\n contentQuality?: {\n perArtifact: Array<JudgeScore & { turn: number; type: string }>\n composite: number\n }\n}\n\nexport interface RunMultishotMatrixOptions<TPersona extends MultishotPersona> {\n /** AgentProfile axis (matrix primary). */\n profiles: Array<{ id: string; value: AgentProfile }>\n /** Persona axis. */\n personas: TPersona[]\n /** Persona-shaping callbacks. */\n shape: MultishotShape<TPersona>\n /** Judge configurations. */\n judges: MultishotJudges<TPersona>\n /** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */\n tools?: MultishotToolDefinition[]\n /** Map from tool name → inline executor. Must align with `tools`. */\n toolExecutors?: Record<string, MultishotToolExecutor>\n /** Tool name → artifact type label. Defaults to research/code mapping. */\n artifactTypeFor?: (toolName: string) => string | undefined\n /** Where per-cell artifacts land. Cells write to `<runDir>/<profileId>/<personaId>/rep-N/`. */\n runDir: string\n /** Replicates per (profile, persona) cell. */\n reps?: number\n /** Max conversation turns per cell. */\n maxTurns?: number\n /** Max concurrent cells. */\n maxConcurrency?: number\n /** Total $ ceiling across the matrix; cells aborted past this. */\n costCeiling?: number\n /** Agent model. */\n agentModel?: string\n /** Driver model. */\n driverModel?: string\n /** Pass-thru fields. */\n apiKey?: string\n baseUrl?: string\n}\n\ninterface CellOutput {\n turns: number\n toolCalls: number\n artifactCount: number\n}\n\nexport interface RunMultishotMatrixResult {\n matrix: MatrixResult<CellOutput>\n}\n\nexport async function runMultishotMatrix<TPersona extends MultishotPersona>(\n opts: RunMultishotMatrixOptions<TPersona>,\n): Promise<RunMultishotMatrixResult> {\n const codeTypes = new Set(opts.judges.codeArtifactTypes ?? ['code'])\n const contentTypes = new Set(opts.judges.contentArtifactTypes ?? ['research'])\n mkdirSync(opts.runDir, { recursive: true })\n\n const matrix = await runAgentMatrix<CellOutput>({\n axes: [\n { name: 'profile', values: opts.profiles },\n { name: 'persona', values: opts.personas.map((p) => ({ id: p.id, value: p })) },\n ],\n reps: opts.reps ?? 1,\n maxConcurrency: opts.maxConcurrency ?? 2,\n costCeiling: opts.costCeiling,\n async runCell(cell) {\n const profile = cell.axes.profile?.value as AgentProfile\n const persona = cell.axes.persona?.value as TPersona\n const profileId = String(cell.axes.profile?.id ?? 'unknown')\n const personaId = String(cell.axes.persona?.id ?? 'unknown')\n\n const sim = await runMultishot({\n profile,\n persona,\n shape: opts.shape,\n tools: opts.tools,\n toolExecutors: opts.toolExecutors,\n artifactTypeFor: opts.artifactTypeFor,\n maxTurns: opts.maxTurns,\n agentModel: opts.agentModel,\n driverModel: opts.driverModel,\n apiKey: opts.apiKey,\n baseUrl: opts.baseUrl,\n })\n\n const codeArtifacts = sim.artifacts.filter((a) => codeTypes.has(a.type))\n const contentArtifacts = sim.artifacts.filter((a) => contentTypes.has(a.type))\n\n const [conversation, codeReviews, contentReviews] = await Promise.all([\n runJudge(opts.judges.conversation, { transcript: sim.transcript, persona }),\n opts.judges.codeReview\n ? Promise.all(\n codeArtifacts.map((artifact) =>\n runJudge(opts.judges.codeReview!, { artifact, persona }).then((s) => ({\n ...s,\n turn: artifact.turn,\n type: artifact.type,\n })),\n ),\n )\n : Promise.resolve([] as Array<JudgeScore & { turn: number; type: string }>),\n opts.judges.contentQuality\n ? Promise.all(\n contentArtifacts.map((artifact) =>\n runJudge(opts.judges.contentQuality!, { artifact, persona }).then((s) => ({\n ...s,\n turn: artifact.turn,\n type: artifact.type,\n })),\n ),\n )\n : Promise.resolve([] as Array<JudgeScore & { turn: number; type: string }>),\n ])\n\n const codeComposite =\n codeReviews.length === 0\n ? 0\n : codeReviews.reduce((s, r) => s + r.composite, 0) / codeReviews.length\n const contentComposite =\n contentReviews.length === 0\n ? 0\n : contentReviews.reduce((s, r) => s + r.composite, 0) / contentReviews.length\n\n // Composite = mean of (conversation, code, content) — empty judges count 0.\n const judgeCount = 1 + (opts.judges.codeReview ? 1 : 0) + (opts.judges.contentQuality ? 1 : 0)\n const composite = (conversation.composite + codeComposite + contentComposite) / judgeCount\n\n const cellScore: CellCompositeScore = { composite, conversation }\n if (opts.judges.codeReview)\n cellScore.codeReview = { perArtifact: codeReviews, composite: codeComposite }\n if (opts.judges.contentQuality)\n cellScore.contentQuality = { perArtifact: contentReviews, composite: contentComposite }\n\n const cellDir = join(opts.runDir, profileId, personaId, `rep-${cell.rep}`)\n mkdirSync(cellDir, { recursive: true })\n writeFileSync(join(cellDir, 'transcript.json'), JSON.stringify(sim.transcript, null, 2))\n writeFileSync(join(cellDir, 'artifacts.json'), JSON.stringify(sim.artifacts, null, 2))\n writeFileSync(join(cellDir, 'scores.json'), JSON.stringify(cellScore, null, 2))\n\n const notes = [`convo=${conversation.composite.toFixed(1)}`]\n if (opts.judges.codeReview) notes.push(`code=${codeComposite.toFixed(1)}`)\n if (opts.judges.contentQuality) notes.push(`content=${contentComposite.toFixed(1)}`)\n\n return {\n output: {\n turns: sim.transcript.length,\n toolCalls: sim.toolCalls,\n artifactCount: sim.artifacts.length,\n },\n verdict: { valid: composite >= 5, score: composite, notes: notes.join(' ') },\n costUsd: sim.costUsd,\n durationMs: sim.durationMs,\n }\n },\n })\n\n // Persist top-level summary.\n const summary = {\n cells: matrix.summary.totalCells,\n passRate: matrix.summary.overallPassRate,\n meanScore: matrix.summary.overallMeanScore,\n totalCostUsd: matrix.summary.totalCostUsd,\n durationMs: matrix.summary.durationMs,\n runsExecuted: matrix.summary.runsExecuted,\n cellsSkipped: matrix.summary.cellsSkipped,\n byProfile: matrix.byAxis.profile,\n byPersona: matrix.byAxis.persona,\n }\n writeFileSync(join(opts.runDir, 'summary.json'), JSON.stringify(summary, null, 2))\n\n const md: string[] = [\n `# Multishot matrix`,\n ``,\n `**Cells**: ${matrix.summary.totalCells} | **Pass rate**: ${(matrix.summary.overallPassRate * 100).toFixed(0)}% | **Mean**: ${matrix.summary.overallMeanScore.toFixed(2)} | **Cost**: $${matrix.summary.totalCostUsd.toFixed(2)} | **Duration**: ${(matrix.summary.durationMs / 1000).toFixed(0)}s`,\n ``,\n `## By profile`,\n ``,\n '| profile | pass | mean | cost |',\n '|---|---|---|---|',\n ...Object.entries(matrix.byAxis.profile ?? {}).map(\n ([id, s]) =>\n `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`,\n ),\n ``,\n `## By persona`,\n ``,\n '| persona | pass | mean | cost |',\n '|---|---|---|---|',\n ...Object.entries(matrix.byAxis.persona ?? {}).map(\n ([id, s]) =>\n `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`,\n ),\n ``,\n ]\n writeFileSync(join(opts.runDir, 'summary.md'), md.join('\\n'))\n\n return { matrix }\n}\n","// Public types for the multishot substrate.\n\nexport interface MultishotMessage {\n role: 'user' | 'assistant' | 'tool'\n content: string\n toolCallId?: string\n toolCalls?: Array<{ id: string; name: string; args: Record<string, unknown> }>\n}\n\nexport interface MultishotArtifact {\n type: string\n turn: number\n invocation: { name: string; args: Record<string, unknown> }\n content: string\n}\n\nexport interface MultishotResult {\n transcript: MultishotMessage[]\n artifacts: MultishotArtifact[]\n toolCalls: number\n durationMs: number\n costUsd: number\n}\n\nexport interface MultishotToolDefinition {\n type: 'function'\n function: {\n name: string\n description: string\n parameters: Record<string, unknown>\n }\n}\n\nexport type MultishotToolExecutor = (\n args: Record<string, unknown>,\n ctx: { apiKey: string; baseUrl: string; signal?: AbortSignal },\n) => Promise<{ content: string; costUsd: number }>\n\nexport interface MultishotPersona {\n /** Stable identifier — used for per-cell artifact paths + matrix axis keys. */\n id: string\n /** Per-domain payload (income/profile/voice/etc.) shaped by the consumer. */\n [k: string]: unknown\n}\n\nexport interface MultishotShape<TPersona extends MultishotPersona> {\n /** Opening user message (turn 0) — the persona's first ask. */\n buildOpener: (persona: TPersona) => string\n /** System prompt the driver LLM uses to roleplay the persona. Should set\n * voice, goals, constraints, time-pressure, and the \"never go silent\" rule. */\n buildDriverSystemPrompt: (persona: TPersona) => string\n}\n\nexport class MultishotDriverEmptyError extends Error {\n constructor(public readonly turn: number) {\n super(`multishot: driver returned empty content twice at turn ${turn} — failing loud`)\n this.name = 'MultishotDriverEmptyError'\n }\n}\n","// Multi-turn driver-agent simulation with inline tool execution.\n//\n// The driver = LLM acting as the persona (reactive, non-deterministic).\n// The agent = the product agent under test (router call with profile's\n// systemPrompt + the configured tools).\n// Tool calls execute inline via the configured executors and feed back\n// into the agent's message log so the agent integrates the result.\n\nimport type { AgentProfile } from '@tangle-network/sandbox'\nimport { defaultDelegationTools } from './default-tools'\nimport {\n defaultRouterBaseUrl,\n estimateRouterCost,\n requireRouterApiKey,\n routerCompletion,\n} from './router'\nimport {\n type MultishotArtifact,\n MultishotDriverEmptyError,\n type MultishotMessage,\n type MultishotPersona,\n type MultishotResult,\n type MultishotShape,\n type MultishotToolDefinition,\n type MultishotToolExecutor,\n} from './types'\n\nexport interface RunMultishotOptions<TPersona extends MultishotPersona> {\n profile: AgentProfile\n persona: TPersona\n shape: MultishotShape<TPersona>\n /** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */\n tools?: MultishotToolDefinition[]\n /** Map from tool name → executor invoked inline when the agent emits a tool_call. */\n toolExecutors?: Record<string, MultishotToolExecutor>\n /** Map from tool name → artifact type label written into MultishotArtifact.type.\n * Tools without a mapping still execute, but their results aren't surfaced as\n * typed artifacts (only as tool messages in the transcript). */\n artifactTypeFor?: (toolName: string) => string | undefined\n maxTurns?: number\n agentModel?: string\n driverModel?: string\n apiKey?: string\n baseUrl?: string\n signal?: AbortSignal\n}\n\nexport async function runMultishot<TPersona extends MultishotPersona>(\n opts: RunMultishotOptions<TPersona>,\n): Promise<MultishotResult> {\n const apiKey = opts.apiKey ?? requireRouterApiKey()\n const baseUrl = opts.baseUrl ?? defaultRouterBaseUrl()\n const maxTurns = opts.maxTurns ?? 10\n const agentModel = opts.agentModel ?? 'openai/gpt-5.4'\n const driverModel = opts.driverModel ?? 'openai/gpt-4o-mini'\n\n const bundle =\n opts.tools && opts.toolExecutors\n ? {\n tools: opts.tools,\n executors: opts.toolExecutors,\n artifactTypeFor: opts.artifactTypeFor ?? (() => undefined),\n }\n : defaultDelegationTools()\n const tools = opts.tools ?? bundle.tools\n const executors = opts.toolExecutors ?? bundle.executors\n const artifactTypeFor = opts.artifactTypeFor ?? bundle.artifactTypeFor\n\n const start = Date.now()\n const transcript: MultishotMessage[] = []\n const artifacts: MultishotArtifact[] = []\n let toolCalls = 0\n let totalCostUsd = 0\n\n const opener = opts.shape.buildOpener(opts.persona)\n transcript.push({ role: 'user', content: opener })\n\n const systemPrompt = opts.profile.prompt?.systemPrompt ?? ''\n const agentMessages: Array<Record<string, unknown>> = [\n { role: 'system', content: systemPrompt },\n { role: 'user', content: opener },\n ]\n\n for (let turn = 0; turn < maxTurns; turn++) {\n if (opts.signal?.aborted) throw new Error('multishot aborted')\n\n const { message: agentMsg, usage: agentUsage } = await routerCompletion({\n apiKey,\n baseUrl,\n model: agentModel,\n messages: agentMessages,\n tools,\n temperature: 0.7,\n maxTokens: 2500,\n signal: opts.signal,\n })\n totalCostUsd += estimateRouterCost(agentModel, agentUsage)\n\n const agentText = (agentMsg.content ?? '').trim()\n const agentToolCalls = (agentMsg.tool_calls ?? []).map((tc) => ({\n id: tc.id,\n name: tc.function.name,\n args: (() => {\n try {\n return JSON.parse(tc.function.arguments) as Record<string, unknown>\n } catch {\n return {} as Record<string, unknown>\n }\n })(),\n }))\n\n agentMessages.push({\n role: 'assistant',\n content: agentText || null,\n ...(agentMsg.tool_calls?.length ? { tool_calls: agentMsg.tool_calls } : {}),\n })\n transcript.push({\n role: 'assistant',\n content: agentText,\n toolCalls: agentToolCalls.length > 0 ? agentToolCalls : undefined,\n })\n\n for (const tc of agentToolCalls) {\n toolCalls++\n let toolResult = ''\n try {\n const executor = executors[tc.name]\n if (!executor) {\n toolResult = JSON.stringify({ error: `unknown tool ${tc.name}` })\n } else {\n const r = await executor(tc.args, { apiKey, baseUrl, signal: opts.signal })\n toolResult = r.content\n totalCostUsd += r.costUsd\n const artifactType = artifactTypeFor(tc.name)\n if (artifactType) {\n artifacts.push({\n type: artifactType,\n turn,\n invocation: { name: tc.name, args: tc.args },\n content: toolResult,\n })\n }\n }\n } catch (err) {\n toolResult = JSON.stringify({ error: err instanceof Error ? err.message : String(err) })\n }\n agentMessages.push({ role: 'tool', tool_call_id: tc.id, content: toolResult || 'done' })\n transcript.push({ role: 'tool', content: toolResult || 'done', toolCallId: tc.id })\n }\n\n // If the agent emitted tool_calls, give it a follow-up turn to integrate the results.\n if (agentToolCalls.length > 0) {\n const followUp = await routerCompletion({\n apiKey,\n baseUrl,\n model: agentModel,\n messages: agentMessages,\n temperature: 0.7,\n maxTokens: 2000,\n signal: opts.signal,\n })\n totalCostUsd += estimateRouterCost(agentModel, followUp.usage)\n const followUpText = (followUp.message.content ?? '').trim()\n agentMessages.push({ role: 'assistant', content: followUpText })\n transcript.push({ role: 'assistant', content: followUpText })\n }\n\n if (turn < maxTurns - 1) {\n const driver = await driverTurn({\n apiKey,\n baseUrl,\n persona: opts.persona,\n shape: opts.shape,\n transcript,\n turn,\n model: driverModel,\n signal: opts.signal,\n })\n totalCostUsd += driver.costUsd\n agentMessages.push({ role: 'user', content: driver.content })\n transcript.push({ role: 'user', content: driver.content })\n }\n }\n\n return { transcript, artifacts, toolCalls, durationMs: Date.now() - start, costUsd: totalCostUsd }\n}\n\nasync function driverTurn<TPersona extends MultishotPersona>(opts: {\n apiKey: string\n baseUrl: string\n persona: TPersona\n shape: MultishotShape<TPersona>\n transcript: MultishotMessage[]\n turn: number\n model: string\n signal?: AbortSignal\n}): Promise<{ content: string; costUsd: number }> {\n const driverSystem = opts.shape.buildDriverSystemPrompt(opts.persona)\n\n // Translate transcript to driver POV: agent's `assistant` messages become\n // `user` (the agent talking TO the driver); the driver's prior `user`\n // messages become `assistant` (the driver's prior responses).\n const driverMessages: Array<Record<string, unknown>> = [{ role: 'system', content: driverSystem }]\n for (const msg of opts.transcript) {\n if (msg.role === 'tool') continue\n if (msg.role === 'assistant') driverMessages.push({ role: 'user', content: msg.content })\n else if (msg.role === 'user') driverMessages.push({ role: 'assistant', content: msg.content })\n }\n\n // Driver must never go silent. Retry once on empty content; then fail loud.\n for (let attempt = 0; attempt < 2; attempt++) {\n const { message, usage } = await routerCompletion({\n apiKey: opts.apiKey,\n baseUrl: opts.baseUrl,\n model: opts.model,\n messages: driverMessages,\n temperature: 0.9,\n maxTokens: 600,\n signal: opts.signal,\n })\n const content = (message.content ?? '').trim()\n if (content.length > 0) return { content, costUsd: estimateRouterCost(opts.model, usage) }\n }\n throw new MultishotDriverEmptyError(opts.turn)\n}\n"],"mappings":";;;;;;AA4BA,eAAsB,iBACpB,KACmC;AACnC,QAAM,OAAgC;AAAA,IACpC,OAAO,IAAI;AAAA,IACX,UAAU,IAAI;AAAA,IACd,aAAa,IAAI,eAAe;AAAA,IAChC,YAAY,IAAI,aAAa;AAAA,EAC/B;AACA,MAAI,IAAI,OAAO,OAAQ,MAAK,QAAQ,IAAI;AACxC,QAAM,MAAM,GAAG,IAAI,QAAQ,QAAQ,QAAQ,EAAE,CAAC;AAC9C,QAAM,MAAM,MAAM,MAAM,KAAK;AAAA,IAC3B,QAAQ;AAAA,IACR,SAAS,EAAE,eAAe,UAAU,IAAI,MAAM,IAAI,gBAAgB,mBAAmB;AAAA,IACrF,MAAM,KAAK,UAAU,IAAI;AAAA,IACzB,QAAQ,IAAI;AAAA,EACd,CAAC;AACD,MAAI,CAAC,IAAI,IAAI;AACX,UAAM,OAAO,MAAM,IAAI,KAAK;AAC5B,UAAM,IAAI,MAAM,UAAU,IAAI,MAAM,KAAK,KAAK,MAAM,GAAG,GAAG,CAAC,EAAE;AAAA,EAC/D;AACA,QAAM,OAAQ,MAAM,IAAI,KAAK;AAI7B,QAAM,SAAS,KAAK,QAAQ,CAAC;AAC7B,MAAI,CAAC,OAAQ,OAAM,IAAI,MAAM,+BAA+B,KAAK,UAAU,IAAI,EAAE,MAAM,GAAG,GAAG,CAAC,EAAE;AAChG,SAAO,EAAE,SAAS,OAAO,SAAS,OAAO,KAAK,MAAM;AACtD;AAIO,SAAS,mBACd,OACA,OACQ;AACR,MAAI,CAAC,MAAO,QAAO;AACnB,QAAM,WAAW,MAAM,iBAAiB;AACxC,QAAM,YAAY,MAAM,qBAAqB;AAC7C,MAAI,UAAU;AACd,MAAI,WAAW;AACf,MAAI,MAAM,SAAS,aAAa,GAAG;AACjC,cAAU;AACV,eAAW;AAAA,EACb,WAAW,MAAM,SAAS,SAAS,KAAK,MAAM,SAAS,eAAe,GAAG;AACvE,cAAU;AACV,eAAW;AAAA,EACb,WAAW,MAAM,SAAS,MAAM,KAAK,MAAM,SAAS,KAAK,KAAK,MAAM,SAAS,UAAU,GAAG;AACxF,cAAU;AACV,eAAW;AAAA,EACb;AACA,UAAQ,WAAW,UAAU,YAAY,YAAY;AACvD;AAEO,SAAS,uBAA+B;AAC7C,UAAQ,QAAQ,IAAI,0BAA0B,kCAAkC;AAAA,IAC9E;AAAA,IACA;AAAA,EACF;AACF;AAEO,SAAS,sBAA8B;AAC5C,QAAM,MAAM,QAAQ,IAAI;AACxB,MAAI,CAAC,IAAK,OAAM,IAAI,MAAM,gEAAgE;AAC1F,SAAO;AACT;;;ACnFO,IAAM,2BAA2B;AACjC,IAAM,sBAAsB;AAgBnC,IAAM,4BACJ;AAEF,IAAM,uBACJ;AAEK,IAAM,iCAA0D;AAAA,EACrE,MAAM;AAAA,EACN,UAAU;AAAA,IACR,MAAM;AAAA,IACN,aACE;AAAA,IACF,YAAY;AAAA,MACV,MAAM;AAAA,MACN,YAAY;AAAA,QACV,UAAU,EAAE,MAAM,UAAU,aAAa,gCAAgC;AAAA,QACzE,OAAO;AAAA,UACL,MAAM;AAAA,UACN,aAAa;AAAA,QACf;AAAA,MACF;AAAA,MACA,UAAU,CAAC,UAAU;AAAA,IACvB;AAAA,EACF;AACF;AAEO,IAAM,6BAAsD;AAAA,EACjE,MAAM;AAAA,EACN,UAAU;AAAA,IACR,MAAM;AAAA,IACN,aACE;AAAA,IACF,YAAY;AAAA,MACV,MAAM;AAAA,MACN,YAAY;AAAA,QACV,MAAM,EAAE,MAAM,UAAU,aAAa,gCAAgC;AAAA,QACrE,UAAU;AAAA,UACR,MAAM;AAAA,UACN,aAAa;AAAA,QACf;AAAA,MACF;AAAA,MACA,UAAU,CAAC,MAAM;AAAA,IACnB;AAAA,EACF;AACF;AAEO,SAAS,uBACd,SAAkC,CAAC,GACZ;AACvB,QAAM,eAAe,OAAO,gBAAgB;AAC5C,QAAM,QAAQ,OAAO,SAAS;AAC9B,SAAO,OAAO,MAAM,QAAQ;AAC1B,UAAM,WAAW,OAAO,KAAK,YAAY,EAAE;AAC3C,UAAM,QAAQ,KAAK,QAAQ,OAAO,KAAK,KAAK,IAAI;AAChD,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI;AAAA,MACb;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,aAAa;AAAA,QACxC,EAAE,MAAM,QAAQ,SAAS,aAAa,QAAQ,GAAG,QAAQ;AAAA,SAAY,KAAK,KAAK,EAAE,GAAG;AAAA,MACtF;AAAA,MACA,QAAQ,IAAI;AAAA,IACd,CAAC;AACD,WAAO,EAAE,SAAS,QAAQ,WAAW,IAAI,SAAS,mBAAmB,OAAO,KAAK,EAAE;AAAA,EACrF;AACF;AAEO,SAAS,mBAAmB,SAA6B,CAAC,GAA0B;AACzF,QAAM,eAAe,OAAO,gBAAgB;AAC5C,QAAM,QAAQ,OAAO,SAAS;AAC9B,SAAO,OAAO,MAAM,QAAQ;AAC1B,UAAM,OAAO,OAAO,KAAK,QAAQ,EAAE;AACnC,UAAM,WAAW,KAAK,WAAW,OAAO,KAAK,QAAQ,IAAI;AACzD,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI;AAAA,MACb;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,GAAG,YAAY;AAAA;AAAA,YAAiB,QAAQ,GAAG;AAAA,QACtE,EAAE,MAAM,QAAQ,SAAS,YAAY,IAAI,GAAG;AAAA,MAC9C;AAAA,MACA,QAAQ,IAAI;AAAA,IACd,CAAC;AACD,WAAO,EAAE,SAAS,QAAQ,WAAW,IAAI,SAAS,mBAAmB,OAAO,KAAK,EAAE;AAAA,EACrF;AACF;AAgBO,SAAS,uBAAuB,SAA6B,CAAC,GAAuB;AAC1F,SAAO;AAAA,IACL,OAAO,CAAC,gCAAgC,0BAA0B;AAAA,IAClE,WAAW;AAAA,MACT,mBAAmB,uBAAuB,OAAO,QAAQ;AAAA,MACzD,eAAe,mBAAmB,OAAO,IAAI;AAAA,IAC/C;AAAA,IACA,iBAAiB,CAAC,SAChB,SAAS,sBAAsB,aAAa,SAAS,kBAAkB,SAAS;AAAA,EACpF;AACF;;;ACpIO,IAAM,sBAAsB;AAmCnC,IAAM,aAAyB,EAAE,YAAY,CAAC,GAAG,WAAW,GAAG,OAAO,eAAe;AAErF,eAAsB,SACpB,OACA,OACqB;AACrB,QAAM,SAAS,MAAM,UAAU,oBAAoB;AACnD,QAAM,UAAU,MAAM,WAAW,qBAAqB;AACtD,QAAM,QAAQ,MAAM,SAAS,QAAQ,IAAI,eAAe;AACxD,QAAM,SAAS,MAAM,YAAY,KAAK;AACtC,MAAI,MAAM;AACV,MAAI;AACF,UAAM,EAAE,QAAQ,IAAI,MAAM,iBAAiB;AAAA,MACzC;AAAA,MACA;AAAA,MACA;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,MAAM,aAAa;AAAA,QAC9C,EAAE,MAAM,QAAQ,SAAS,OAAO;AAAA,MAClC;AAAA,IACF,CAAC;AACD,WAAO,QAAQ,WAAW,IAAI,KAAK;AAAA,EACrC,SAAS,KAAK;AACZ,WAAO;AAAA,MACL,GAAG;AAAA,MACH,OAAO,SAAS,MAAM,IAAI,iBAAiB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,IAC7F;AAAA,EACF;AAEA,MAAI,SAAyC;AAC7C,MAAI;AACF,UAAM,UAAU,IACb,QAAQ,gBAAgB,EAAE,EAC1B,QAAQ,WAAW,EAAE,EACrB,KAAK;AACR,aAAS,KAAK,MAAM,OAAO;AAAA,EAC7B,QAAQ;AACN,WAAO,EAAE,GAAG,YAAY,OAAO,SAAS,MAAM,IAAI,uBAAuB,IAAI,MAAM,GAAG,GAAG,CAAC,GAAG;AAAA,EAC/F;AAEA,QAAM,aAAqC,CAAC;AAC5C,MAAI,MAAM;AACV,aAAW,OAAO,MAAM,YAAY;AAClC,UAAM,IAAI,OAAO,OAAO,IAAI,GAAG,KAAK,CAAC;AACrC,UAAM,UAAU,OAAO,SAAS,CAAC,IAAI,KAAK,IAAI,GAAG,KAAK,IAAI,IAAI,CAAC,CAAC,IAAI;AACpE,eAAW,IAAI,GAAG,IAAI;AACtB,WAAO;AAAA,EACT;AACA,SAAO;AAAA,IACL;AAAA,IACA,WAAW,MAAM,WAAW,WAAW,IAAI,IAAI,MAAM,MAAM,WAAW;AAAA,IACtE,OAAO,OAAO,OAAO,UAAU,WAAW,OAAO,QAAQ;AAAA,EAC3D;AACF;AAIO,SAAS,iBAAiB,MAAyC;AACxE,SAAO,KAAK,IAAI,CAAC,MAAM,KAAK,EAAE,GAAG,KAAK,EAAE,WAAW,EAAE,EAAE,KAAK,IAAI;AAClE;AAGO,SAAS,iBAAiB,MAAyC;AACxE,QAAM,SAAS,KAAK,IAAI,CAAC,MAAM,IAAI,EAAE,GAAG,KAAK,EAAE,KAAK,GAAG;AACvD,SAAO;AAAA,GAA6D,MAAM;AAC5E;;;ACzGA,SAAS,WAAW,qBAAqB;AACzC,SAAS,YAAY;;;AC4Cd,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YAA4B,MAAc;AACxC,UAAM,0DAA0D,IAAI,sBAAiB;AAD3D;AAE1B,SAAK,OAAO;AAAA,EACd;AAAA,EAH4B;AAI9B;;;ACXA,eAAsB,aACpB,MAC0B;AAC1B,QAAM,SAAS,KAAK,UAAU,oBAAoB;AAClD,QAAM,UAAU,KAAK,WAAW,qBAAqB;AACrD,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,aAAa,KAAK,cAAc;AACtC,QAAM,cAAc,KAAK,eAAe;AAExC,QAAM,SACJ,KAAK,SAAS,KAAK,gBACf;AAAA,IACE,OAAO,KAAK;AAAA,IACZ,WAAW,KAAK;AAAA,IAChB,iBAAiB,KAAK,oBAAoB,MAAM;AAAA,EAClD,IACA,uBAAuB;AAC7B,QAAM,QAAQ,KAAK,SAAS,OAAO;AACnC,QAAM,YAAY,KAAK,iBAAiB,OAAO;AAC/C,QAAM,kBAAkB,KAAK,mBAAmB,OAAO;AAEvD,QAAM,QAAQ,KAAK,IAAI;AACvB,QAAM,aAAiC,CAAC;AACxC,QAAM,YAAiC,CAAC;AACxC,MAAI,YAAY;AAChB,MAAI,eAAe;AAEnB,QAAM,SAAS,KAAK,MAAM,YAAY,KAAK,OAAO;AAClD,aAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,CAAC;AAEjD,QAAM,eAAe,KAAK,QAAQ,QAAQ,gBAAgB;AAC1D,QAAM,gBAAgD;AAAA,IACpD,EAAE,MAAM,UAAU,SAAS,aAAa;AAAA,IACxC,EAAE,MAAM,QAAQ,SAAS,OAAO;AAAA,EAClC;AAEA,WAAS,OAAO,GAAG,OAAO,UAAU,QAAQ;AAC1C,QAAI,KAAK,QAAQ,QAAS,OAAM,IAAI,MAAM,mBAAmB;AAE7D,UAAM,EAAE,SAAS,UAAU,OAAO,WAAW,IAAI,MAAM,iBAAiB;AAAA,MACtE;AAAA,MACA;AAAA,MACA,OAAO;AAAA,MACP,UAAU;AAAA,MACV;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,QAAQ,KAAK;AAAA,IACf,CAAC;AACD,oBAAgB,mBAAmB,YAAY,UAAU;AAEzD,UAAM,aAAa,SAAS,WAAW,IAAI,KAAK;AAChD,UAAM,kBAAkB,SAAS,cAAc,CAAC,GAAG,IAAI,CAAC,QAAQ;AAAA,MAC9D,IAAI,GAAG;AAAA,MACP,MAAM,GAAG,SAAS;AAAA,MAClB,OAAO,MAAM;AACX,YAAI;AACF,iBAAO,KAAK,MAAM,GAAG,SAAS,SAAS;AAAA,QACzC,QAAQ;AACN,iBAAO,CAAC;AAAA,QACV;AAAA,MACF,GAAG;AAAA,IACL,EAAE;AAEF,kBAAc,KAAK;AAAA,MACjB,MAAM;AAAA,MACN,SAAS,aAAa;AAAA,MACtB,GAAI,SAAS,YAAY,SAAS,EAAE,YAAY,SAAS,WAAW,IAAI,CAAC;AAAA,IAC3E,CAAC;AACD,eAAW,KAAK;AAAA,MACd,MAAM;AAAA,MACN,SAAS;AAAA,MACT,WAAW,eAAe,SAAS,IAAI,iBAAiB;AAAA,IAC1D,CAAC;AAED,eAAW,MAAM,gBAAgB;AAC/B;AACA,UAAI,aAAa;AACjB,UAAI;AACF,cAAM,WAAW,UAAU,GAAG,IAAI;AAClC,YAAI,CAAC,UAAU;AACb,uBAAa,KAAK,UAAU,EAAE,OAAO,gBAAgB,GAAG,IAAI,GAAG,CAAC;AAAA,QAClE,OAAO;AACL,gBAAM,IAAI,MAAM,SAAS,GAAG,MAAM,EAAE,QAAQ,SAAS,QAAQ,KAAK,OAAO,CAAC;AAC1E,uBAAa,EAAE;AACf,0BAAgB,EAAE;AAClB,gBAAM,eAAe,gBAAgB,GAAG,IAAI;AAC5C,cAAI,cAAc;AAChB,sBAAU,KAAK;AAAA,cACb,MAAM;AAAA,cACN;AAAA,cACA,YAAY,EAAE,MAAM,GAAG,MAAM,MAAM,GAAG,KAAK;AAAA,cAC3C,SAAS;AAAA,YACX,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF,SAAS,KAAK;AACZ,qBAAa,KAAK,UAAU,EAAE,OAAO,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,EAAE,CAAC;AAAA,MACzF;AACA,oBAAc,KAAK,EAAE,MAAM,QAAQ,cAAc,GAAG,IAAI,SAAS,cAAc,OAAO,CAAC;AACvF,iBAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,cAAc,QAAQ,YAAY,GAAG,GAAG,CAAC;AAAA,IACpF;AAGA,QAAI,eAAe,SAAS,GAAG;AAC7B,YAAM,WAAW,MAAM,iBAAiB;AAAA,QACtC;AAAA,QACA;AAAA,QACA,OAAO;AAAA,QACP,UAAU;AAAA,QACV,aAAa;AAAA,QACb,WAAW;AAAA,QACX,QAAQ,KAAK;AAAA,MACf,CAAC;AACD,sBAAgB,mBAAmB,YAAY,SAAS,KAAK;AAC7D,YAAM,gBAAgB,SAAS,QAAQ,WAAW,IAAI,KAAK;AAC3D,oBAAc,KAAK,EAAE,MAAM,aAAa,SAAS,aAAa,CAAC;AAC/D,iBAAW,KAAK,EAAE,MAAM,aAAa,SAAS,aAAa,CAAC;AAAA,IAC9D;AAEA,QAAI,OAAO,WAAW,GAAG;AACvB,YAAM,SAAS,MAAM,WAAW;AAAA,QAC9B;AAAA,QACA;AAAA,QACA,SAAS,KAAK;AAAA,QACd,OAAO,KAAK;AAAA,QACZ;AAAA,QACA;AAAA,QACA,OAAO;AAAA,QACP,QAAQ,KAAK;AAAA,MACf,CAAC;AACD,sBAAgB,OAAO;AACvB,oBAAc,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,QAAQ,CAAC;AAC5D,iBAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,QAAQ,CAAC;AAAA,IAC3D;AAAA,EACF;AAEA,SAAO,EAAE,YAAY,WAAW,WAAW,YAAY,KAAK,IAAI,IAAI,OAAO,SAAS,aAAa;AACnG;AAEA,eAAe,WAA8C,MASX;AAChD,QAAM,eAAe,KAAK,MAAM,wBAAwB,KAAK,OAAO;AAKpE,QAAM,iBAAiD,CAAC,EAAE,MAAM,UAAU,SAAS,aAAa,CAAC;AACjG,aAAW,OAAO,KAAK,YAAY;AACjC,QAAI,IAAI,SAAS,OAAQ;AACzB,QAAI,IAAI,SAAS,YAAa,gBAAe,KAAK,EAAE,MAAM,QAAQ,SAAS,IAAI,QAAQ,CAAC;AAAA,aAC/E,IAAI,SAAS,OAAQ,gBAAe,KAAK,EAAE,MAAM,aAAa,SAAS,IAAI,QAAQ,CAAC;AAAA,EAC/F;AAGA,WAAS,UAAU,GAAG,UAAU,GAAG,WAAW;AAC5C,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,KAAK;AAAA,MACb,SAAS,KAAK;AAAA,MACd,OAAO,KAAK;AAAA,MACZ,UAAU;AAAA,MACV,aAAa;AAAA,MACb,WAAW;AAAA,MACX,QAAQ,KAAK;AAAA,IACf,CAAC;AACD,UAAM,WAAW,QAAQ,WAAW,IAAI,KAAK;AAC7C,QAAI,QAAQ,SAAS,EAAG,QAAO,EAAE,SAAS,SAAS,mBAAmB,KAAK,OAAO,KAAK,EAAE;AAAA,EAC3F;AACA,QAAM,IAAI,0BAA0B,KAAK,IAAI;AAC/C;;;AFxHA,eAAsB,mBACpB,MACmC;AACnC,QAAM,YAAY,IAAI,IAAI,KAAK,OAAO,qBAAqB,CAAC,MAAM,CAAC;AACnE,QAAM,eAAe,IAAI,IAAI,KAAK,OAAO,wBAAwB,CAAC,UAAU,CAAC;AAC7E,YAAU,KAAK,QAAQ,EAAE,WAAW,KAAK,CAAC;AAE1C,QAAM,SAAS,MAAM,eAA2B;AAAA,IAC9C,MAAM;AAAA,MACJ,EAAE,MAAM,WAAW,QAAQ,KAAK,SAAS;AAAA,MACzC,EAAE,MAAM,WAAW,QAAQ,KAAK,SAAS,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,OAAO,EAAE,EAAE,EAAE;AAAA,IAChF;AAAA,IACA,MAAM,KAAK,QAAQ;AAAA,IACnB,gBAAgB,KAAK,kBAAkB;AAAA,IACvC,aAAa,KAAK;AAAA,IAClB,MAAM,QAAQ,MAAM;AAClB,YAAM,UAAU,KAAK,KAAK,SAAS;AACnC,YAAM,UAAU,KAAK,KAAK,SAAS;AACnC,YAAM,YAAY,OAAO,KAAK,KAAK,SAAS,MAAM,SAAS;AAC3D,YAAM,YAAY,OAAO,KAAK,KAAK,SAAS,MAAM,SAAS;AAE3D,YAAM,MAAM,MAAM,aAAa;AAAA,QAC7B;AAAA,QACA;AAAA,QACA,OAAO,KAAK;AAAA,QACZ,OAAO,KAAK;AAAA,QACZ,eAAe,KAAK;AAAA,QACpB,iBAAiB,KAAK;AAAA,QACtB,UAAU,KAAK;AAAA,QACf,YAAY,KAAK;AAAA,QACjB,aAAa,KAAK;AAAA,QAClB,QAAQ,KAAK;AAAA,QACb,SAAS,KAAK;AAAA,MAChB,CAAC;AAED,YAAM,gBAAgB,IAAI,UAAU,OAAO,CAAC,MAAM,UAAU,IAAI,EAAE,IAAI,CAAC;AACvE,YAAM,mBAAmB,IAAI,UAAU,OAAO,CAAC,MAAM,aAAa,IAAI,EAAE,IAAI,CAAC;AAE7E,YAAM,CAAC,cAAc,aAAa,cAAc,IAAI,MAAM,QAAQ,IAAI;AAAA,QACpE,SAAS,KAAK,OAAO,cAAc,EAAE,YAAY,IAAI,YAAY,QAAQ,CAAC;AAAA,QAC1E,KAAK,OAAO,aACR,QAAQ;AAAA,UACN,cAAc;AAAA,YAAI,CAAC,aACjB,SAAS,KAAK,OAAO,YAAa,EAAE,UAAU,QAAQ,CAAC,EAAE,KAAK,CAAC,OAAO;AAAA,cACpE,GAAG;AAAA,cACH,MAAM,SAAS;AAAA,cACf,MAAM,SAAS;AAAA,YACjB,EAAE;AAAA,UACJ;AAAA,QACF,IACA,QAAQ,QAAQ,CAAC,CAAuD;AAAA,QAC5E,KAAK,OAAO,iBACR,QAAQ;AAAA,UACN,iBAAiB;AAAA,YAAI,CAAC,aACpB,SAAS,KAAK,OAAO,gBAAiB,EAAE,UAAU,QAAQ,CAAC,EAAE,KAAK,CAAC,OAAO;AAAA,cACxE,GAAG;AAAA,cACH,MAAM,SAAS;AAAA,cACf,MAAM,SAAS;AAAA,YACjB,EAAE;AAAA,UACJ;AAAA,QACF,IACA,QAAQ,QAAQ,CAAC,CAAuD;AAAA,MAC9E,CAAC;AAED,YAAM,gBACJ,YAAY,WAAW,IACnB,IACA,YAAY,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,YAAY;AACrE,YAAM,mBACJ,eAAe,WAAW,IACtB,IACA,eAAe,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,eAAe;AAG3E,YAAM,aAAa,KAAK,KAAK,OAAO,aAAa,IAAI,MAAM,KAAK,OAAO,iBAAiB,IAAI;AAC5F,YAAM,aAAa,aAAa,YAAY,gBAAgB,oBAAoB;AAEhF,YAAM,YAAgC,EAAE,WAAW,aAAa;AAChE,UAAI,KAAK,OAAO;AACd,kBAAU,aAAa,EAAE,aAAa,aAAa,WAAW,cAAc;AAC9E,UAAI,KAAK,OAAO;AACd,kBAAU,iBAAiB,EAAE,aAAa,gBAAgB,WAAW,iBAAiB;AAExF,YAAM,UAAU,KAAK,KAAK,QAAQ,WAAW,WAAW,OAAO,KAAK,GAAG,EAAE;AACzE,gBAAU,SAAS,EAAE,WAAW,KAAK,CAAC;AACtC,oBAAc,KAAK,SAAS,iBAAiB,GAAG,KAAK,UAAU,IAAI,YAAY,MAAM,CAAC,CAAC;AACvF,oBAAc,KAAK,SAAS,gBAAgB,GAAG,KAAK,UAAU,IAAI,WAAW,MAAM,CAAC,CAAC;AACrF,oBAAc,KAAK,SAAS,aAAa,GAAG,KAAK,UAAU,WAAW,MAAM,CAAC,CAAC;AAE9E,YAAM,QAAQ,CAAC,SAAS,aAAa,UAAU,QAAQ,CAAC,CAAC,EAAE;AAC3D,UAAI,KAAK,OAAO,WAAY,OAAM,KAAK,QAAQ,cAAc,QAAQ,CAAC,CAAC,EAAE;AACzE,UAAI,KAAK,OAAO,eAAgB,OAAM,KAAK,WAAW,iBAAiB,QAAQ,CAAC,CAAC,EAAE;AAEnF,aAAO;AAAA,QACL,QAAQ;AAAA,UACN,OAAO,IAAI,WAAW;AAAA,UACtB,WAAW,IAAI;AAAA,UACf,eAAe,IAAI,UAAU;AAAA,QAC/B;AAAA,QACA,SAAS,EAAE,OAAO,aAAa,GAAG,OAAO,WAAW,OAAO,MAAM,KAAK,GAAG,EAAE;AAAA,QAC3E,SAAS,IAAI;AAAA,QACb,YAAY,IAAI;AAAA,MAClB;AAAA,IACF;AAAA,EACF,CAAC;AAGD,QAAM,UAAU;AAAA,IACd,OAAO,OAAO,QAAQ;AAAA,IACtB,UAAU,OAAO,QAAQ;AAAA,IACzB,WAAW,OAAO,QAAQ;AAAA,IAC1B,cAAc,OAAO,QAAQ;AAAA,IAC7B,YAAY,OAAO,QAAQ;AAAA,IAC3B,cAAc,OAAO,QAAQ;AAAA,IAC7B,cAAc,OAAO,QAAQ;AAAA,IAC7B,WAAW,OAAO,OAAO;AAAA,IACzB,WAAW,OAAO,OAAO;AAAA,EAC3B;AACA,gBAAc,KAAK,KAAK,QAAQ,cAAc,GAAG,KAAK,UAAU,SAAS,MAAM,CAAC,CAAC;AAEjF,QAAM,KAAe;AAAA,IACnB;AAAA,IACA;AAAA,IACA,cAAc,OAAO,QAAQ,UAAU,sBAAsB,OAAO,QAAQ,kBAAkB,KAAK,QAAQ,CAAC,CAAC,iBAAiB,OAAO,QAAQ,iBAAiB,QAAQ,CAAC,CAAC,iBAAiB,OAAO,QAAQ,aAAa,QAAQ,CAAC,CAAC,qBAAqB,OAAO,QAAQ,aAAa,KAAM,QAAQ,CAAC,CAAC;AAAA,IAChS;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,OAAO,QAAQ,OAAO,OAAO,WAAW,CAAC,CAAC,EAAE;AAAA,MAC7C,CAAC,CAAC,IAAI,CAAC,MACL,KAAK,EAAE,OAAO,EAAE,WAAW,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,UAAU,QAAQ,CAAC,CAAC,OAAO,EAAE,aAAa,QAAQ,CAAC,CAAC;AAAA,IAC3G;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,OAAO,QAAQ,OAAO,OAAO,WAAW,CAAC,CAAC,EAAE;AAAA,MAC7C,CAAC,CAAC,IAAI,CAAC,MACL,KAAK,EAAE,OAAO,EAAE,WAAW,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,UAAU,QAAQ,CAAC,CAAC,OAAO,EAAE,aAAa,QAAQ,CAAC,CAAC;AAAA,IAC3G;AAAA,IACA;AAAA,EACF;AACA,gBAAc,KAAK,KAAK,QAAQ,YAAY,GAAG,GAAG,KAAK,IAAI,CAAC;AAE5D,SAAO,EAAE,OAAO;AAClB;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../../src/multishot/router.ts","../../src/multishot/default-tools.ts","../../src/multishot/judges.ts","../../src/multishot/matrix.ts","../../src/multishot/types.ts","../../src/multishot/multishot.ts"],"sourcesContent":["// Router fetch helper — single source of truth for OpenAI-compat calls\n// against the Tangle router. Used by the driver, agent, judges, and the\n// default tool executors.\n\nimport type { MultishotToolDefinition } from './types'\n\nexport interface RouterCompletionRequest {\n apiKey: string\n baseUrl: string\n model: string\n messages: Array<Record<string, unknown>>\n tools?: MultishotToolDefinition[]\n temperature?: number\n maxTokens?: number\n signal?: AbortSignal\n}\n\nexport interface RouterToolCall {\n id: string\n type: 'function'\n function: { name: string; arguments: string }\n}\n\nexport interface RouterCompletionResponse {\n message: { content?: string | null; tool_calls?: RouterToolCall[] }\n usage?: { prompt_tokens?: number; completion_tokens?: number }\n}\n\nexport async function routerCompletion(\n req: RouterCompletionRequest,\n): Promise<RouterCompletionResponse> {\n const body: Record<string, unknown> = {\n model: req.model,\n messages: req.messages,\n temperature: req.temperature ?? 0.7,\n max_tokens: req.maxTokens ?? 2000,\n }\n if (req.tools?.length) body.tools = req.tools\n const url = `${req.baseUrl.replace(/\\/+$/, '')}/chat/completions`\n const res = await fetch(url, {\n method: 'POST',\n headers: { Authorization: `Bearer ${req.apiKey}`, 'Content-Type': 'application/json' },\n body: JSON.stringify(body),\n signal: req.signal,\n })\n if (!res.ok) {\n const text = await res.text()\n throw new Error(`router ${res.status}: ${text.slice(0, 300)}`)\n }\n const json = (await res.json()) as {\n choices: Array<{ message: { content?: string | null; tool_calls?: RouterToolCall[] } }>\n usage?: { prompt_tokens?: number; completion_tokens?: number }\n }\n const choice = json.choices[0]\n if (!choice) throw new Error(`router returned no choices: ${JSON.stringify(json).slice(0, 200)}`)\n return { message: choice.message, usage: json.usage }\n}\n\n// Rough per-model cost estimator. Used for cost-ceiling enforcement.\n// Underestimates Anthropic, overestimates oss models — fine for ceilings.\nexport function estimateRouterCost(\n model: string,\n usage?: { prompt_tokens?: number; completion_tokens?: number },\n): number {\n if (!usage) return 0\n const inputTok = usage.prompt_tokens ?? 0\n const outputTok = usage.completion_tokens ?? 0\n let inPer1k = 0.003\n let outPer1k = 0.015\n if (model.includes('gpt-4o-mini')) {\n inPer1k = 0.00015\n outPer1k = 0.0006\n } else if (model.includes('gpt-5.4') || model.includes('claude-sonnet')) {\n inPer1k = 0.003\n outPer1k = 0.015\n } else if (model.includes('kimi') || model.includes('glm') || model.includes('deepseek')) {\n inPer1k = 0.0005\n outPer1k = 0.002\n }\n return (inputTok * inPer1k + outputTok * outPer1k) / 1000\n}\n\nexport function defaultRouterBaseUrl(): string {\n return (process.env.TANGLE_ROUTER_BASE_URL ?? 'https://router.tangle.tools/v1').replace(\n /\\/+$/,\n '',\n )\n}\n\nexport function requireRouterApiKey(): string {\n const key = process.env.TANGLE_API_KEY\n if (!key) throw new Error('multishot requires TANGLE_API_KEY (router-scoped sk-tan-* key)')\n return key\n}\n","// Default delegate_research + delegate_code tools and their inline executors.\n//\n// Consumers can override either by passing their own tools + executors to\n// runMultishot. The defaults are sufficient for most domains — point the\n// researcher system prompt at your domain's citation style and the coder\n// at your preferred language.\n\nimport { estimateRouterCost, routerCompletion } from './router'\nimport type { MultishotToolDefinition, MultishotToolExecutor } from './types'\n\nexport const DEFAULT_RESEARCHER_MODEL = 'openai/gpt-4o-mini'\nexport const DEFAULT_CODER_MODEL = 'openai/gpt-4o-mini'\n\nexport interface DefaultResearcherConfig {\n /** Replace the system prompt to bias the researcher toward a domain's\n * citation style. Defaults to a generic \"cite sources by name\" prompt. */\n systemPrompt?: string\n model?: string\n}\n\nexport interface DefaultCoderConfig {\n /** Replace the system prompt to bias the coder toward a language /\n * framework / artifact style. */\n systemPrompt?: string\n model?: string\n}\n\nconst GENERIC_RESEARCHER_SYSTEM =\n 'You are a research specialist. Return a markdown brief with 3-5 findings. Each finding cites a specific source by name. Add a confidence level (high/medium/low) per finding. No fluff, no preamble.'\n\nconst GENERIC_CODER_SYSTEM =\n 'You are an expert engineer. Output ONE fenced code block containing the complete solution. Inline-comment non-obvious decisions. No explanation outside the block.'\n\nexport const DEFAULT_DELEGATE_RESEARCH_TOOL: MultishotToolDefinition = {\n type: 'function',\n function: {\n name: 'delegate_research',\n description:\n 'Research a topic deeply via specialist. Returns evidence-bearing items with citations. Use for audience research, competitive intel, regulatory landscape, market data, citation-grounded analysis.',\n parameters: {\n type: 'object',\n properties: {\n question: { type: 'string', description: 'Specific question to research' },\n scope: {\n type: 'string',\n description: 'Optional scope: time window, geography, jurisdiction, segment',\n },\n },\n required: ['question'],\n },\n },\n}\n\nexport const DEFAULT_DELEGATE_CODE_TOOL: MultishotToolDefinition = {\n type: 'function',\n function: {\n name: 'delegate_code',\n description:\n 'Generate a runnable script, template, pipeline, or tool via specialist. Returns complete working code or structured markdown. Use for content pipelines, calc snippets, dashboards, compliance checklists, deadline trackers.',\n parameters: {\n type: 'object',\n properties: {\n goal: { type: 'string', description: 'What the code must accomplish' },\n language: {\n type: 'string',\n description: 'Optional language preference (default: TypeScript)',\n },\n },\n required: ['goal'],\n },\n },\n}\n\nexport function createResearchExecutor(\n config: DefaultResearcherConfig = {},\n): MultishotToolExecutor {\n const systemPrompt = config.systemPrompt ?? GENERIC_RESEARCHER_SYSTEM\n const model = config.model ?? DEFAULT_RESEARCHER_MODEL\n return async (args, ctx) => {\n const question = String(args.question ?? '')\n const scope = args.scope ? String(args.scope) : undefined\n const { message, usage } = await routerCompletion({\n apiKey: ctx.apiKey,\n baseUrl: ctx.baseUrl,\n model,\n temperature: 0.3,\n maxTokens: 1800,\n messages: [\n { role: 'system', content: systemPrompt },\n { role: 'user', content: `Research: ${question}${scope ? `\\nScope: ${scope}` : ''}` },\n ],\n signal: ctx.signal,\n })\n return { content: message.content ?? '', costUsd: estimateRouterCost(model, usage) }\n }\n}\n\nexport function createCodeExecutor(config: DefaultCoderConfig = {}): MultishotToolExecutor {\n const systemPrompt = config.systemPrompt ?? GENERIC_CODER_SYSTEM\n const model = config.model ?? DEFAULT_CODER_MODEL\n return async (args, ctx) => {\n const goal = String(args.goal ?? '')\n const language = args.language ? String(args.language) : 'TypeScript'\n const { message, usage } = await routerCompletion({\n apiKey: ctx.apiKey,\n baseUrl: ctx.baseUrl,\n model,\n temperature: 0.2,\n maxTokens: 2000,\n messages: [\n { role: 'system', content: `${systemPrompt}\\n\\nLanguage: ${language}` },\n { role: 'user', content: `Produce: ${goal}` },\n ],\n signal: ctx.signal,\n })\n return { content: message.content ?? '', costUsd: estimateRouterCost(model, usage) }\n }\n}\n\nexport interface DefaultToolsConfig {\n research?: DefaultResearcherConfig\n code?: DefaultCoderConfig\n /** When true (default), each tool result is recorded as a typed artifact:\n * research → type='research', code → type='code'. */\n recordArtifacts?: boolean\n}\n\nexport interface DefaultToolsBundle {\n tools: MultishotToolDefinition[]\n executors: Record<string, MultishotToolExecutor>\n artifactTypeFor: (toolName: string) => string | undefined\n}\n\nexport function defaultDelegationTools(config: DefaultToolsConfig = {}): DefaultToolsBundle {\n return {\n tools: [DEFAULT_DELEGATE_RESEARCH_TOOL, DEFAULT_DELEGATE_CODE_TOOL],\n executors: {\n delegate_research: createResearchExecutor(config.research),\n delegate_code: createCodeExecutor(config.code),\n },\n artifactTypeFor: (name) =>\n name === 'delegate_research' ? 'research' : name === 'delegate_code' ? 'code' : undefined,\n }\n}\n\nexport { defaultRouterBaseUrl } from './router'\n","// Generic judge runner — domain consumers configure dimensions + prompts.\n//\n// Three judge slots are conventional for multishot eval:\n// - conversation (scores the full transcript)\n// - codeReview (scores each code artifact)\n// - contentQuality (scores each non-code artifact)\n//\n// But the runJudge primitive is fully generic — any T → JudgeScore mapping.\n\nimport { defaultRouterBaseUrl, requireRouterApiKey, routerCompletion } from './router'\n\nexport const DEFAULT_JUDGE_MODEL = 'openai/gpt-4o-mini'\n\nexport interface JudgeDimension {\n /** JSON field name + score key. */\n key: string\n /** Description shown in the judge's user prompt. */\n description: string\n}\n\nexport interface JudgeConfig<TInput> {\n /** Display name (for trace + log). */\n name: string\n /** Model used for this judge. */\n model?: string\n /** 0-10 scored dimensions. */\n dimensions: JudgeDimension[]\n /** Judge system prompt — sets persona + JSON-only constraint. */\n systemPrompt: string\n /** Build the user prompt from the typed input. Must include \"Respond with\n * ONLY this JSON: { ... }\" listing each dimension key. */\n buildPrompt: (input: TInput) => string\n /** Optional model + api overrides. */\n apiKey?: string\n baseUrl?: string\n}\n\nexport interface JudgeScore {\n /** Per-dimension 0-10 score. Missing dims default to 0. */\n dimensions: Record<string, number>\n /** Mean across dimensions. */\n composite: number\n /** Free-form 1-2 sentence critique from the judge (when provided). */\n notes: string\n}\n\nconst ZERO_SCORE: JudgeScore = { dimensions: {}, composite: 0, notes: 'parse failed' }\n\nexport async function runJudge<TInput>(\n judge: JudgeConfig<TInput>,\n input: TInput,\n): Promise<JudgeScore> {\n const apiKey = judge.apiKey ?? requireRouterApiKey()\n const baseUrl = judge.baseUrl ?? defaultRouterBaseUrl()\n const model = judge.model ?? process.env.JUDGE_MODEL ?? DEFAULT_JUDGE_MODEL\n const prompt = judge.buildPrompt(input)\n let raw = ''\n try {\n const { message } = await routerCompletion({\n apiKey,\n baseUrl,\n model,\n temperature: 0,\n maxTokens: 1500,\n messages: [\n { role: 'system', content: judge.systemPrompt },\n { role: 'user', content: prompt },\n ],\n })\n raw = (message.content ?? '').trim()\n } catch (err) {\n return {\n ...ZERO_SCORE,\n notes: `judge ${judge.name} call failed: ${err instanceof Error ? err.message : String(err)}`,\n }\n }\n\n let parsed: Record<string, unknown> | null = null\n try {\n const cleaned = raw\n .replace(/^```json\\s*/i, '')\n .replace(/```\\s*$/, '')\n .trim()\n parsed = JSON.parse(cleaned) as Record<string, unknown>\n } catch {\n return { ...ZERO_SCORE, notes: `judge ${judge.name} returned non-JSON: ${raw.slice(0, 200)}` }\n }\n\n const dimensions: Record<string, number> = {}\n let sum = 0\n for (const dim of judge.dimensions) {\n const v = Number(parsed[dim.key] ?? 0)\n const clamped = Number.isFinite(v) ? Math.max(0, Math.min(10, v)) : 0\n dimensions[dim.key] = clamped\n sum += clamped\n }\n return {\n dimensions,\n composite: judge.dimensions.length === 0 ? 0 : sum / judge.dimensions.length,\n notes: typeof parsed.notes === 'string' ? parsed.notes : '',\n }\n}\n\n/** Convenience: stringified dimension list for inclusion in a judge prompt.\n * Returns lines like `- audience_fit: Does this match what the audience cares about? (0-10)`. */\nexport function renderDimensions(dims: readonly JudgeDimension[]): string {\n return dims.map((d) => `- ${d.key}: ${d.description}`).join('\\n')\n}\n\n/** Convenience: build the \"Respond with ONLY this JSON\" footer for a judge prompt. */\nexport function renderJsonFooter(dims: readonly JudgeDimension[]): string {\n const fields = dims.map((d) => `\"${d.key}\":N`).join(',')\n return `Respond with ONLY this JSON (no markdown, no preamble):\\n{${fields},\"notes\":\"1-2 sentence critique\"}`\n}\n","// Multishot matrix wrapper — sweeps profiles × personas × reps, runs\n// the driver-agent loop per cell, applies up to three configured judges,\n// persists per-cell artifacts, and aggregates by axis.\n//\n// Uses runAgentMatrix from @tangle-network/agent-eval/matrix under the\n// hood so cell scheduling + concurrency + cost ceiling are unified with\n// other matrix consumers.\n\nimport { mkdirSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type { AgentProfile as SandboxAgentProfile } from '@tangle-network/sandbox'\nimport type { MatrixResult } from '../matrix'\nimport { runAgentMatrix } from '../matrix'\nimport { type JudgeConfig, type JudgeScore, runJudge } from './judges'\nimport { runMultishot } from './multishot'\nimport type {\n MultishotArtifact,\n MultishotMessage,\n MultishotPersona,\n MultishotShape,\n MultishotToolDefinition,\n MultishotToolExecutor,\n} from './types'\n\nexport interface ConversationJudgeInput<TPersona extends MultishotPersona> {\n transcript: MultishotMessage[]\n persona: TPersona\n}\n\nexport interface ArtifactJudgeInput<TPersona extends MultishotPersona> {\n artifact: MultishotArtifact\n persona: TPersona\n}\n\nexport interface MultishotJudges<TPersona extends MultishotPersona> {\n /** Scores the full transcript end-to-end (always runs). */\n conversation: JudgeConfig<ConversationJudgeInput<TPersona>>\n /** Scores each code-type artifact. Optional — omit when domain has no code artifacts. */\n codeReview?: JudgeConfig<ArtifactJudgeInput<TPersona>>\n /** Scores each non-code (research/content/template) artifact. Optional. */\n contentQuality?: JudgeConfig<ArtifactJudgeInput<TPersona>>\n /** Which artifact types route to codeReview. Defaults to ['code']. */\n codeArtifactTypes?: string[]\n /** Which artifact types route to contentQuality. Defaults to ['research']. */\n contentArtifactTypes?: string[]\n}\n\nexport interface CellCompositeScore {\n composite: number\n conversation: JudgeScore\n codeReview?: {\n perArtifact: Array<JudgeScore & { turn: number; type: string }>\n composite: number\n }\n contentQuality?: {\n perArtifact: Array<JudgeScore & { turn: number; type: string }>\n composite: number\n }\n}\n\nexport interface RunMultishotMatrixOptions<TPersona extends MultishotPersona> {\n /** AgentProfile axis (matrix primary). */\n profiles: Array<{ id: string; value: SandboxAgentProfile }>\n /** Persona axis. */\n personas: TPersona[]\n /** Persona-shaping callbacks. */\n shape: MultishotShape<TPersona>\n /** Judge configurations. */\n judges: MultishotJudges<TPersona>\n /** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */\n tools?: MultishotToolDefinition[]\n /** Map from tool name → inline executor. Must align with `tools`. */\n toolExecutors?: Record<string, MultishotToolExecutor>\n /** Tool name → artifact type label. Defaults to research/code mapping. */\n artifactTypeFor?: (toolName: string) => string | undefined\n /** Where per-cell artifacts land. Cells write to `<runDir>/<profileId>/<personaId>/rep-N/`. */\n runDir: string\n /** Replicates per (profile, persona) cell. */\n reps?: number\n /** Max conversation turns per cell. */\n maxTurns?: number\n /** Max concurrent cells. */\n maxConcurrency?: number\n /** Total $ ceiling across the matrix; cells aborted past this. */\n costCeiling?: number\n /** Agent model. */\n agentModel?: string\n /** Driver model. */\n driverModel?: string\n /** Pass-thru fields. */\n apiKey?: string\n baseUrl?: string\n}\n\ninterface CellOutput {\n turns: number\n toolCalls: number\n artifactCount: number\n}\n\nexport interface RunMultishotMatrixResult {\n matrix: MatrixResult<CellOutput>\n}\n\nexport async function runMultishotMatrix<TPersona extends MultishotPersona>(\n opts: RunMultishotMatrixOptions<TPersona>,\n): Promise<RunMultishotMatrixResult> {\n const codeTypes = new Set(opts.judges.codeArtifactTypes ?? ['code'])\n const contentTypes = new Set(opts.judges.contentArtifactTypes ?? ['research'])\n mkdirSync(opts.runDir, { recursive: true })\n\n const matrix = await runAgentMatrix<CellOutput>({\n axes: [\n { name: 'profile', values: opts.profiles },\n { name: 'persona', values: opts.personas.map((p) => ({ id: p.id, value: p })) },\n ],\n reps: opts.reps ?? 1,\n maxConcurrency: opts.maxConcurrency ?? 2,\n costCeiling: opts.costCeiling,\n async runCell(cell) {\n const profile = cell.axes.profile?.value as SandboxAgentProfile\n const persona = cell.axes.persona?.value as TPersona\n const profileId = String(cell.axes.profile?.id ?? 'unknown')\n const personaId = String(cell.axes.persona?.id ?? 'unknown')\n\n const sim = await runMultishot({\n profile,\n persona,\n shape: opts.shape,\n tools: opts.tools,\n toolExecutors: opts.toolExecutors,\n artifactTypeFor: opts.artifactTypeFor,\n maxTurns: opts.maxTurns,\n agentModel: opts.agentModel,\n driverModel: opts.driverModel,\n apiKey: opts.apiKey,\n baseUrl: opts.baseUrl,\n })\n\n const codeArtifacts = sim.artifacts.filter((a) => codeTypes.has(a.type))\n const contentArtifacts = sim.artifacts.filter((a) => contentTypes.has(a.type))\n\n const [conversation, codeReviews, contentReviews] = await Promise.all([\n runJudge(opts.judges.conversation, { transcript: sim.transcript, persona }),\n opts.judges.codeReview\n ? Promise.all(\n codeArtifacts.map((artifact) =>\n runJudge(opts.judges.codeReview!, { artifact, persona }).then((s) => ({\n ...s,\n turn: artifact.turn,\n type: artifact.type,\n })),\n ),\n )\n : Promise.resolve([] as Array<JudgeScore & { turn: number; type: string }>),\n opts.judges.contentQuality\n ? Promise.all(\n contentArtifacts.map((artifact) =>\n runJudge(opts.judges.contentQuality!, { artifact, persona }).then((s) => ({\n ...s,\n turn: artifact.turn,\n type: artifact.type,\n })),\n ),\n )\n : Promise.resolve([] as Array<JudgeScore & { turn: number; type: string }>),\n ])\n\n const codeComposite =\n codeReviews.length === 0\n ? 0\n : codeReviews.reduce((s, r) => s + r.composite, 0) / codeReviews.length\n const contentComposite =\n contentReviews.length === 0\n ? 0\n : contentReviews.reduce((s, r) => s + r.composite, 0) / contentReviews.length\n\n // Composite = mean of (conversation, code, content) — empty judges count 0.\n const judgeCount = 1 + (opts.judges.codeReview ? 1 : 0) + (opts.judges.contentQuality ? 1 : 0)\n const composite = (conversation.composite + codeComposite + contentComposite) / judgeCount\n\n const cellScore: CellCompositeScore = { composite, conversation }\n if (opts.judges.codeReview)\n cellScore.codeReview = { perArtifact: codeReviews, composite: codeComposite }\n if (opts.judges.contentQuality)\n cellScore.contentQuality = { perArtifact: contentReviews, composite: contentComposite }\n\n const cellDir = join(opts.runDir, profileId, personaId, `rep-${cell.rep}`)\n mkdirSync(cellDir, { recursive: true })\n writeFileSync(join(cellDir, 'transcript.json'), JSON.stringify(sim.transcript, null, 2))\n writeFileSync(join(cellDir, 'artifacts.json'), JSON.stringify(sim.artifacts, null, 2))\n writeFileSync(join(cellDir, 'scores.json'), JSON.stringify(cellScore, null, 2))\n\n const notes = [`convo=${conversation.composite.toFixed(1)}`]\n if (opts.judges.codeReview) notes.push(`code=${codeComposite.toFixed(1)}`)\n if (opts.judges.contentQuality) notes.push(`content=${contentComposite.toFixed(1)}`)\n\n return {\n output: {\n turns: sim.transcript.length,\n toolCalls: sim.toolCalls,\n artifactCount: sim.artifacts.length,\n },\n verdict: { valid: composite >= 5, score: composite, notes: notes.join(' ') },\n costUsd: sim.costUsd,\n durationMs: sim.durationMs,\n }\n },\n })\n\n // Persist top-level summary.\n const summary = {\n cells: matrix.summary.totalCells,\n passRate: matrix.summary.overallPassRate,\n meanScore: matrix.summary.overallMeanScore,\n totalCostUsd: matrix.summary.totalCostUsd,\n durationMs: matrix.summary.durationMs,\n runsExecuted: matrix.summary.runsExecuted,\n cellsSkipped: matrix.summary.cellsSkipped,\n byProfile: matrix.byAxis.profile,\n byPersona: matrix.byAxis.persona,\n }\n writeFileSync(join(opts.runDir, 'summary.json'), JSON.stringify(summary, null, 2))\n\n const md: string[] = [\n `# Multishot matrix`,\n ``,\n `**Cells**: ${matrix.summary.totalCells} | **Pass rate**: ${(matrix.summary.overallPassRate * 100).toFixed(0)}% | **Mean**: ${matrix.summary.overallMeanScore.toFixed(2)} | **Cost**: $${matrix.summary.totalCostUsd.toFixed(2)} | **Duration**: ${(matrix.summary.durationMs / 1000).toFixed(0)}s`,\n ``,\n `## By profile`,\n ``,\n '| profile | pass | mean | cost |',\n '|---|---|---|---|',\n ...Object.entries(matrix.byAxis.profile ?? {}).map(\n ([id, s]) =>\n `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`,\n ),\n ``,\n `## By persona`,\n ``,\n '| persona | pass | mean | cost |',\n '|---|---|---|---|',\n ...Object.entries(matrix.byAxis.persona ?? {}).map(\n ([id, s]) =>\n `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`,\n ),\n ``,\n ]\n writeFileSync(join(opts.runDir, 'summary.md'), md.join('\\n'))\n\n return { matrix }\n}\n","// Public types for the multishot substrate.\n\nexport interface MultishotMessage {\n role: 'user' | 'assistant' | 'tool'\n content: string\n toolCallId?: string\n toolCalls?: Array<{ id: string; name: string; args: Record<string, unknown> }>\n}\n\nexport interface MultishotArtifact {\n type: string\n turn: number\n invocation: { name: string; args: Record<string, unknown> }\n content: string\n}\n\nexport interface MultishotResult {\n transcript: MultishotMessage[]\n artifacts: MultishotArtifact[]\n toolCalls: number\n durationMs: number\n costUsd: number\n}\n\nexport interface MultishotToolDefinition {\n type: 'function'\n function: {\n name: string\n description: string\n parameters: Record<string, unknown>\n }\n}\n\nexport type MultishotToolExecutor = (\n args: Record<string, unknown>,\n ctx: { apiKey: string; baseUrl: string; signal?: AbortSignal },\n) => Promise<{ content: string; costUsd: number }>\n\nexport interface MultishotPersona {\n /** Stable identifier — used for per-cell artifact paths + matrix axis keys. */\n id: string\n /** Per-domain payload (income/profile/voice/etc.) shaped by the consumer. */\n [k: string]: unknown\n}\n\nexport interface MultishotShape<TPersona extends MultishotPersona> {\n /** Opening user message (turn 0) — the persona's first ask. */\n buildOpener: (persona: TPersona) => string\n /** System prompt the driver LLM uses to roleplay the persona. Should set\n * voice, goals, constraints, time-pressure, and the \"never go silent\" rule. */\n buildDriverSystemPrompt: (persona: TPersona) => string\n}\n\nexport class MultishotDriverEmptyError extends Error {\n constructor(public readonly turn: number) {\n super(`multishot: driver returned empty content twice at turn ${turn} — failing loud`)\n this.name = 'MultishotDriverEmptyError'\n }\n}\n","// Multi-turn driver-agent simulation with inline tool execution.\n//\n// The driver = LLM acting as the persona (reactive, non-deterministic).\n// The agent = the product agent under test (router call with profile's\n// systemPrompt + the configured tools).\n// Tool calls execute inline via the configured executors and feed back\n// into the agent's message log so the agent integrates the result.\n\nimport type { AgentProfile as SandboxAgentProfile } from '@tangle-network/sandbox'\nimport { defaultDelegationTools } from './default-tools'\nimport {\n defaultRouterBaseUrl,\n estimateRouterCost,\n requireRouterApiKey,\n routerCompletion,\n} from './router'\nimport {\n type MultishotArtifact,\n MultishotDriverEmptyError,\n type MultishotMessage,\n type MultishotPersona,\n type MultishotResult,\n type MultishotShape,\n type MultishotToolDefinition,\n type MultishotToolExecutor,\n} from './types'\n\nexport interface RunMultishotOptions<TPersona extends MultishotPersona> {\n profile: SandboxAgentProfile\n persona: TPersona\n shape: MultishotShape<TPersona>\n /** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */\n tools?: MultishotToolDefinition[]\n /** Map from tool name → executor invoked inline when the agent emits a tool_call. */\n toolExecutors?: Record<string, MultishotToolExecutor>\n /** Map from tool name → artifact type label written into MultishotArtifact.type.\n * Tools without a mapping still execute, but their results aren't surfaced as\n * typed artifacts (only as tool messages in the transcript). */\n artifactTypeFor?: (toolName: string) => string | undefined\n maxTurns?: number\n agentModel?: string\n driverModel?: string\n apiKey?: string\n baseUrl?: string\n signal?: AbortSignal\n}\n\nexport async function runMultishot<TPersona extends MultishotPersona>(\n opts: RunMultishotOptions<TPersona>,\n): Promise<MultishotResult> {\n const apiKey = opts.apiKey ?? requireRouterApiKey()\n const baseUrl = opts.baseUrl ?? defaultRouterBaseUrl()\n const maxTurns = opts.maxTurns ?? 10\n const agentModel = opts.agentModel ?? 'openai/gpt-5.4'\n const driverModel = opts.driverModel ?? 'openai/gpt-4o-mini'\n\n const bundle =\n opts.tools && opts.toolExecutors\n ? {\n tools: opts.tools,\n executors: opts.toolExecutors,\n artifactTypeFor: opts.artifactTypeFor ?? (() => undefined),\n }\n : defaultDelegationTools()\n const tools = opts.tools ?? bundle.tools\n const executors = opts.toolExecutors ?? bundle.executors\n const artifactTypeFor = opts.artifactTypeFor ?? bundle.artifactTypeFor\n\n const start = Date.now()\n const transcript: MultishotMessage[] = []\n const artifacts: MultishotArtifact[] = []\n let toolCalls = 0\n let totalCostUsd = 0\n\n const opener = opts.shape.buildOpener(opts.persona)\n transcript.push({ role: 'user', content: opener })\n\n const systemPrompt = opts.profile.prompt?.systemPrompt ?? ''\n const agentMessages: Array<Record<string, unknown>> = [\n { role: 'system', content: systemPrompt },\n { role: 'user', content: opener },\n ]\n\n for (let turn = 0; turn < maxTurns; turn++) {\n if (opts.signal?.aborted) throw new Error('multishot aborted')\n\n const { message: agentMsg, usage: agentUsage } = await routerCompletion({\n apiKey,\n baseUrl,\n model: agentModel,\n messages: agentMessages,\n tools,\n temperature: 0.7,\n maxTokens: 2500,\n signal: opts.signal,\n })\n totalCostUsd += estimateRouterCost(agentModel, agentUsage)\n\n const agentText = (agentMsg.content ?? '').trim()\n const agentToolCalls = (agentMsg.tool_calls ?? []).map((tc) => ({\n id: tc.id,\n name: tc.function.name,\n args: (() => {\n try {\n return JSON.parse(tc.function.arguments) as Record<string, unknown>\n } catch {\n return {} as Record<string, unknown>\n }\n })(),\n }))\n\n agentMessages.push({\n role: 'assistant',\n content: agentText || null,\n ...(agentMsg.tool_calls?.length ? { tool_calls: agentMsg.tool_calls } : {}),\n })\n transcript.push({\n role: 'assistant',\n content: agentText,\n toolCalls: agentToolCalls.length > 0 ? agentToolCalls : undefined,\n })\n\n for (const tc of agentToolCalls) {\n toolCalls++\n let toolResult = ''\n try {\n const executor = executors[tc.name]\n if (!executor) {\n toolResult = JSON.stringify({ error: `unknown tool ${tc.name}` })\n } else {\n const r = await executor(tc.args, { apiKey, baseUrl, signal: opts.signal })\n toolResult = r.content\n totalCostUsd += r.costUsd\n const artifactType = artifactTypeFor(tc.name)\n if (artifactType) {\n artifacts.push({\n type: artifactType,\n turn,\n invocation: { name: tc.name, args: tc.args },\n content: toolResult,\n })\n }\n }\n } catch (err) {\n toolResult = JSON.stringify({ error: err instanceof Error ? err.message : String(err) })\n }\n agentMessages.push({ role: 'tool', tool_call_id: tc.id, content: toolResult || 'done' })\n transcript.push({ role: 'tool', content: toolResult || 'done', toolCallId: tc.id })\n }\n\n // If the agent emitted tool_calls, give it a follow-up turn to integrate the results.\n if (agentToolCalls.length > 0) {\n const followUp = await routerCompletion({\n apiKey,\n baseUrl,\n model: agentModel,\n messages: agentMessages,\n temperature: 0.7,\n maxTokens: 2000,\n signal: opts.signal,\n })\n totalCostUsd += estimateRouterCost(agentModel, followUp.usage)\n const followUpText = (followUp.message.content ?? '').trim()\n agentMessages.push({ role: 'assistant', content: followUpText })\n transcript.push({ role: 'assistant', content: followUpText })\n }\n\n if (turn < maxTurns - 1) {\n const driver = await driverTurn({\n apiKey,\n baseUrl,\n persona: opts.persona,\n shape: opts.shape,\n transcript,\n turn,\n model: driverModel,\n signal: opts.signal,\n })\n totalCostUsd += driver.costUsd\n agentMessages.push({ role: 'user', content: driver.content })\n transcript.push({ role: 'user', content: driver.content })\n }\n }\n\n return { transcript, artifacts, toolCalls, durationMs: Date.now() - start, costUsd: totalCostUsd }\n}\n\nasync function driverTurn<TPersona extends MultishotPersona>(opts: {\n apiKey: string\n baseUrl: string\n persona: TPersona\n shape: MultishotShape<TPersona>\n transcript: MultishotMessage[]\n turn: number\n model: string\n signal?: AbortSignal\n}): Promise<{ content: string; costUsd: number }> {\n const driverSystem = opts.shape.buildDriverSystemPrompt(opts.persona)\n\n // Translate transcript to driver POV: agent's `assistant` messages become\n // `user` (the agent talking TO the driver); the driver's prior `user`\n // messages become `assistant` (the driver's prior responses).\n const driverMessages: Array<Record<string, unknown>> = [{ role: 'system', content: driverSystem }]\n for (const msg of opts.transcript) {\n if (msg.role === 'tool') continue\n if (msg.role === 'assistant') driverMessages.push({ role: 'user', content: msg.content })\n else if (msg.role === 'user') driverMessages.push({ role: 'assistant', content: msg.content })\n }\n\n // Driver must never go silent. Retry once on empty content; then fail loud.\n for (let attempt = 0; attempt < 2; attempt++) {\n const { message, usage } = await routerCompletion({\n apiKey: opts.apiKey,\n baseUrl: opts.baseUrl,\n model: opts.model,\n messages: driverMessages,\n temperature: 0.9,\n maxTokens: 600,\n signal: opts.signal,\n })\n const content = (message.content ?? '').trim()\n if (content.length > 0) return { content, costUsd: estimateRouterCost(opts.model, usage) }\n }\n throw new MultishotDriverEmptyError(opts.turn)\n}\n"],"mappings":";;;;;;AA4BA,eAAsB,iBACpB,KACmC;AACnC,QAAM,OAAgC;AAAA,IACpC,OAAO,IAAI;AAAA,IACX,UAAU,IAAI;AAAA,IACd,aAAa,IAAI,eAAe;AAAA,IAChC,YAAY,IAAI,aAAa;AAAA,EAC/B;AACA,MAAI,IAAI,OAAO,OAAQ,MAAK,QAAQ,IAAI;AACxC,QAAM,MAAM,GAAG,IAAI,QAAQ,QAAQ,QAAQ,EAAE,CAAC;AAC9C,QAAM,MAAM,MAAM,MAAM,KAAK;AAAA,IAC3B,QAAQ;AAAA,IACR,SAAS,EAAE,eAAe,UAAU,IAAI,MAAM,IAAI,gBAAgB,mBAAmB;AAAA,IACrF,MAAM,KAAK,UAAU,IAAI;AAAA,IACzB,QAAQ,IAAI;AAAA,EACd,CAAC;AACD,MAAI,CAAC,IAAI,IAAI;AACX,UAAM,OAAO,MAAM,IAAI,KAAK;AAC5B,UAAM,IAAI,MAAM,UAAU,IAAI,MAAM,KAAK,KAAK,MAAM,GAAG,GAAG,CAAC,EAAE;AAAA,EAC/D;AACA,QAAM,OAAQ,MAAM,IAAI,KAAK;AAI7B,QAAM,SAAS,KAAK,QAAQ,CAAC;AAC7B,MAAI,CAAC,OAAQ,OAAM,IAAI,MAAM,+BAA+B,KAAK,UAAU,IAAI,EAAE,MAAM,GAAG,GAAG,CAAC,EAAE;AAChG,SAAO,EAAE,SAAS,OAAO,SAAS,OAAO,KAAK,MAAM;AACtD;AAIO,SAAS,mBACd,OACA,OACQ;AACR,MAAI,CAAC,MAAO,QAAO;AACnB,QAAM,WAAW,MAAM,iBAAiB;AACxC,QAAM,YAAY,MAAM,qBAAqB;AAC7C,MAAI,UAAU;AACd,MAAI,WAAW;AACf,MAAI,MAAM,SAAS,aAAa,GAAG;AACjC,cAAU;AACV,eAAW;AAAA,EACb,WAAW,MAAM,SAAS,SAAS,KAAK,MAAM,SAAS,eAAe,GAAG;AACvE,cAAU;AACV,eAAW;AAAA,EACb,WAAW,MAAM,SAAS,MAAM,KAAK,MAAM,SAAS,KAAK,KAAK,MAAM,SAAS,UAAU,GAAG;AACxF,cAAU;AACV,eAAW;AAAA,EACb;AACA,UAAQ,WAAW,UAAU,YAAY,YAAY;AACvD;AAEO,SAAS,uBAA+B;AAC7C,UAAQ,QAAQ,IAAI,0BAA0B,kCAAkC;AAAA,IAC9E;AAAA,IACA;AAAA,EACF;AACF;AAEO,SAAS,sBAA8B;AAC5C,QAAM,MAAM,QAAQ,IAAI;AACxB,MAAI,CAAC,IAAK,OAAM,IAAI,MAAM,gEAAgE;AAC1F,SAAO;AACT;;;ACnFO,IAAM,2BAA2B;AACjC,IAAM,sBAAsB;AAgBnC,IAAM,4BACJ;AAEF,IAAM,uBACJ;AAEK,IAAM,iCAA0D;AAAA,EACrE,MAAM;AAAA,EACN,UAAU;AAAA,IACR,MAAM;AAAA,IACN,aACE;AAAA,IACF,YAAY;AAAA,MACV,MAAM;AAAA,MACN,YAAY;AAAA,QACV,UAAU,EAAE,MAAM,UAAU,aAAa,gCAAgC;AAAA,QACzE,OAAO;AAAA,UACL,MAAM;AAAA,UACN,aAAa;AAAA,QACf;AAAA,MACF;AAAA,MACA,UAAU,CAAC,UAAU;AAAA,IACvB;AAAA,EACF;AACF;AAEO,IAAM,6BAAsD;AAAA,EACjE,MAAM;AAAA,EACN,UAAU;AAAA,IACR,MAAM;AAAA,IACN,aACE;AAAA,IACF,YAAY;AAAA,MACV,MAAM;AAAA,MACN,YAAY;AAAA,QACV,MAAM,EAAE,MAAM,UAAU,aAAa,gCAAgC;AAAA,QACrE,UAAU;AAAA,UACR,MAAM;AAAA,UACN,aAAa;AAAA,QACf;AAAA,MACF;AAAA,MACA,UAAU,CAAC,MAAM;AAAA,IACnB;AAAA,EACF;AACF;AAEO,SAAS,uBACd,SAAkC,CAAC,GACZ;AACvB,QAAM,eAAe,OAAO,gBAAgB;AAC5C,QAAM,QAAQ,OAAO,SAAS;AAC9B,SAAO,OAAO,MAAM,QAAQ;AAC1B,UAAM,WAAW,OAAO,KAAK,YAAY,EAAE;AAC3C,UAAM,QAAQ,KAAK,QAAQ,OAAO,KAAK,KAAK,IAAI;AAChD,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI;AAAA,MACb;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,aAAa;AAAA,QACxC,EAAE,MAAM,QAAQ,SAAS,aAAa,QAAQ,GAAG,QAAQ;AAAA,SAAY,KAAK,KAAK,EAAE,GAAG;AAAA,MACtF;AAAA,MACA,QAAQ,IAAI;AAAA,IACd,CAAC;AACD,WAAO,EAAE,SAAS,QAAQ,WAAW,IAAI,SAAS,mBAAmB,OAAO,KAAK,EAAE;AAAA,EACrF;AACF;AAEO,SAAS,mBAAmB,SAA6B,CAAC,GAA0B;AACzF,QAAM,eAAe,OAAO,gBAAgB;AAC5C,QAAM,QAAQ,OAAO,SAAS;AAC9B,SAAO,OAAO,MAAM,QAAQ;AAC1B,UAAM,OAAO,OAAO,KAAK,QAAQ,EAAE;AACnC,UAAM,WAAW,KAAK,WAAW,OAAO,KAAK,QAAQ,IAAI;AACzD,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI;AAAA,MACb;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,GAAG,YAAY;AAAA;AAAA,YAAiB,QAAQ,GAAG;AAAA,QACtE,EAAE,MAAM,QAAQ,SAAS,YAAY,IAAI,GAAG;AAAA,MAC9C;AAAA,MACA,QAAQ,IAAI;AAAA,IACd,CAAC;AACD,WAAO,EAAE,SAAS,QAAQ,WAAW,IAAI,SAAS,mBAAmB,OAAO,KAAK,EAAE;AAAA,EACrF;AACF;AAgBO,SAAS,uBAAuB,SAA6B,CAAC,GAAuB;AAC1F,SAAO;AAAA,IACL,OAAO,CAAC,gCAAgC,0BAA0B;AAAA,IAClE,WAAW;AAAA,MACT,mBAAmB,uBAAuB,OAAO,QAAQ;AAAA,MACzD,eAAe,mBAAmB,OAAO,IAAI;AAAA,IAC/C;AAAA,IACA,iBAAiB,CAAC,SAChB,SAAS,sBAAsB,aAAa,SAAS,kBAAkB,SAAS;AAAA,EACpF;AACF;;;ACpIO,IAAM,sBAAsB;AAmCnC,IAAM,aAAyB,EAAE,YAAY,CAAC,GAAG,WAAW,GAAG,OAAO,eAAe;AAErF,eAAsB,SACpB,OACA,OACqB;AACrB,QAAM,SAAS,MAAM,UAAU,oBAAoB;AACnD,QAAM,UAAU,MAAM,WAAW,qBAAqB;AACtD,QAAM,QAAQ,MAAM,SAAS,QAAQ,IAAI,eAAe;AACxD,QAAM,SAAS,MAAM,YAAY,KAAK;AACtC,MAAI,MAAM;AACV,MAAI;AACF,UAAM,EAAE,QAAQ,IAAI,MAAM,iBAAiB;AAAA,MACzC;AAAA,MACA;AAAA,MACA;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,MAAM,aAAa;AAAA,QAC9C,EAAE,MAAM,QAAQ,SAAS,OAAO;AAAA,MAClC;AAAA,IACF,CAAC;AACD,WAAO,QAAQ,WAAW,IAAI,KAAK;AAAA,EACrC,SAAS,KAAK;AACZ,WAAO;AAAA,MACL,GAAG;AAAA,MACH,OAAO,SAAS,MAAM,IAAI,iBAAiB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,IAC7F;AAAA,EACF;AAEA,MAAI,SAAyC;AAC7C,MAAI;AACF,UAAM,UAAU,IACb,QAAQ,gBAAgB,EAAE,EAC1B,QAAQ,WAAW,EAAE,EACrB,KAAK;AACR,aAAS,KAAK,MAAM,OAAO;AAAA,EAC7B,QAAQ;AACN,WAAO,EAAE,GAAG,YAAY,OAAO,SAAS,MAAM,IAAI,uBAAuB,IAAI,MAAM,GAAG,GAAG,CAAC,GAAG;AAAA,EAC/F;AAEA,QAAM,aAAqC,CAAC;AAC5C,MAAI,MAAM;AACV,aAAW,OAAO,MAAM,YAAY;AAClC,UAAM,IAAI,OAAO,OAAO,IAAI,GAAG,KAAK,CAAC;AACrC,UAAM,UAAU,OAAO,SAAS,CAAC,IAAI,KAAK,IAAI,GAAG,KAAK,IAAI,IAAI,CAAC,CAAC,IAAI;AACpE,eAAW,IAAI,GAAG,IAAI;AACtB,WAAO;AAAA,EACT;AACA,SAAO;AAAA,IACL;AAAA,IACA,WAAW,MAAM,WAAW,WAAW,IAAI,IAAI,MAAM,MAAM,WAAW;AAAA,IACtE,OAAO,OAAO,OAAO,UAAU,WAAW,OAAO,QAAQ;AAAA,EAC3D;AACF;AAIO,SAAS,iBAAiB,MAAyC;AACxE,SAAO,KAAK,IAAI,CAAC,MAAM,KAAK,EAAE,GAAG,KAAK,EAAE,WAAW,EAAE,EAAE,KAAK,IAAI;AAClE;AAGO,SAAS,iBAAiB,MAAyC;AACxE,QAAM,SAAS,KAAK,IAAI,CAAC,MAAM,IAAI,EAAE,GAAG,KAAK,EAAE,KAAK,GAAG;AACvD,SAAO;AAAA,GAA6D,MAAM;AAC5E;;;ACzGA,SAAS,WAAW,qBAAqB;AACzC,SAAS,YAAY;;;AC4Cd,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YAA4B,MAAc;AACxC,UAAM,0DAA0D,IAAI,sBAAiB;AAD3D;AAE1B,SAAK,OAAO;AAAA,EACd;AAAA,EAH4B;AAI9B;;;ACXA,eAAsB,aACpB,MAC0B;AAC1B,QAAM,SAAS,KAAK,UAAU,oBAAoB;AAClD,QAAM,UAAU,KAAK,WAAW,qBAAqB;AACrD,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,aAAa,KAAK,cAAc;AACtC,QAAM,cAAc,KAAK,eAAe;AAExC,QAAM,SACJ,KAAK,SAAS,KAAK,gBACf;AAAA,IACE,OAAO,KAAK;AAAA,IACZ,WAAW,KAAK;AAAA,IAChB,iBAAiB,KAAK,oBAAoB,MAAM;AAAA,EAClD,IACA,uBAAuB;AAC7B,QAAM,QAAQ,KAAK,SAAS,OAAO;AACnC,QAAM,YAAY,KAAK,iBAAiB,OAAO;AAC/C,QAAM,kBAAkB,KAAK,mBAAmB,OAAO;AAEvD,QAAM,QAAQ,KAAK,IAAI;AACvB,QAAM,aAAiC,CAAC;AACxC,QAAM,YAAiC,CAAC;AACxC,MAAI,YAAY;AAChB,MAAI,eAAe;AAEnB,QAAM,SAAS,KAAK,MAAM,YAAY,KAAK,OAAO;AAClD,aAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,CAAC;AAEjD,QAAM,eAAe,KAAK,QAAQ,QAAQ,gBAAgB;AAC1D,QAAM,gBAAgD;AAAA,IACpD,EAAE,MAAM,UAAU,SAAS,aAAa;AAAA,IACxC,EAAE,MAAM,QAAQ,SAAS,OAAO;AAAA,EAClC;AAEA,WAAS,OAAO,GAAG,OAAO,UAAU,QAAQ;AAC1C,QAAI,KAAK,QAAQ,QAAS,OAAM,IAAI,MAAM,mBAAmB;AAE7D,UAAM,EAAE,SAAS,UAAU,OAAO,WAAW,IAAI,MAAM,iBAAiB;AAAA,MACtE;AAAA,MACA;AAAA,MACA,OAAO;AAAA,MACP,UAAU;AAAA,MACV;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,QAAQ,KAAK;AAAA,IACf,CAAC;AACD,oBAAgB,mBAAmB,YAAY,UAAU;AAEzD,UAAM,aAAa,SAAS,WAAW,IAAI,KAAK;AAChD,UAAM,kBAAkB,SAAS,cAAc,CAAC,GAAG,IAAI,CAAC,QAAQ;AAAA,MAC9D,IAAI,GAAG;AAAA,MACP,MAAM,GAAG,SAAS;AAAA,MAClB,OAAO,MAAM;AACX,YAAI;AACF,iBAAO,KAAK,MAAM,GAAG,SAAS,SAAS;AAAA,QACzC,QAAQ;AACN,iBAAO,CAAC;AAAA,QACV;AAAA,MACF,GAAG;AAAA,IACL,EAAE;AAEF,kBAAc,KAAK;AAAA,MACjB,MAAM;AAAA,MACN,SAAS,aAAa;AAAA,MACtB,GAAI,SAAS,YAAY,SAAS,EAAE,YAAY,SAAS,WAAW,IAAI,CAAC;AAAA,IAC3E,CAAC;AACD,eAAW,KAAK;AAAA,MACd,MAAM;AAAA,MACN,SAAS;AAAA,MACT,WAAW,eAAe,SAAS,IAAI,iBAAiB;AAAA,IAC1D,CAAC;AAED,eAAW,MAAM,gBAAgB;AAC/B;AACA,UAAI,aAAa;AACjB,UAAI;AACF,cAAM,WAAW,UAAU,GAAG,IAAI;AAClC,YAAI,CAAC,UAAU;AACb,uBAAa,KAAK,UAAU,EAAE,OAAO,gBAAgB,GAAG,IAAI,GAAG,CAAC;AAAA,QAClE,OAAO;AACL,gBAAM,IAAI,MAAM,SAAS,GAAG,MAAM,EAAE,QAAQ,SAAS,QAAQ,KAAK,OAAO,CAAC;AAC1E,uBAAa,EAAE;AACf,0BAAgB,EAAE;AAClB,gBAAM,eAAe,gBAAgB,GAAG,IAAI;AAC5C,cAAI,cAAc;AAChB,sBAAU,KAAK;AAAA,cACb,MAAM;AAAA,cACN;AAAA,cACA,YAAY,EAAE,MAAM,GAAG,MAAM,MAAM,GAAG,KAAK;AAAA,cAC3C,SAAS;AAAA,YACX,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF,SAAS,KAAK;AACZ,qBAAa,KAAK,UAAU,EAAE,OAAO,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,EAAE,CAAC;AAAA,MACzF;AACA,oBAAc,KAAK,EAAE,MAAM,QAAQ,cAAc,GAAG,IAAI,SAAS,cAAc,OAAO,CAAC;AACvF,iBAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,cAAc,QAAQ,YAAY,GAAG,GAAG,CAAC;AAAA,IACpF;AAGA,QAAI,eAAe,SAAS,GAAG;AAC7B,YAAM,WAAW,MAAM,iBAAiB;AAAA,QACtC;AAAA,QACA;AAAA,QACA,OAAO;AAAA,QACP,UAAU;AAAA,QACV,aAAa;AAAA,QACb,WAAW;AAAA,QACX,QAAQ,KAAK;AAAA,MACf,CAAC;AACD,sBAAgB,mBAAmB,YAAY,SAAS,KAAK;AAC7D,YAAM,gBAAgB,SAAS,QAAQ,WAAW,IAAI,KAAK;AAC3D,oBAAc,KAAK,EAAE,MAAM,aAAa,SAAS,aAAa,CAAC;AAC/D,iBAAW,KAAK,EAAE,MAAM,aAAa,SAAS,aAAa,CAAC;AAAA,IAC9D;AAEA,QAAI,OAAO,WAAW,GAAG;AACvB,YAAM,SAAS,MAAM,WAAW;AAAA,QAC9B;AAAA,QACA;AAAA,QACA,SAAS,KAAK;AAAA,QACd,OAAO,KAAK;AAAA,QACZ;AAAA,QACA;AAAA,QACA,OAAO;AAAA,QACP,QAAQ,KAAK;AAAA,MACf,CAAC;AACD,sBAAgB,OAAO;AACvB,oBAAc,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,QAAQ,CAAC;AAC5D,iBAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,QAAQ,CAAC;AAAA,IAC3D;AAAA,EACF;AAEA,SAAO,EAAE,YAAY,WAAW,WAAW,YAAY,KAAK,IAAI,IAAI,OAAO,SAAS,aAAa;AACnG;AAEA,eAAe,WAA8C,MASX;AAChD,QAAM,eAAe,KAAK,MAAM,wBAAwB,KAAK,OAAO;AAKpE,QAAM,iBAAiD,CAAC,EAAE,MAAM,UAAU,SAAS,aAAa,CAAC;AACjG,aAAW,OAAO,KAAK,YAAY;AACjC,QAAI,IAAI,SAAS,OAAQ;AACzB,QAAI,IAAI,SAAS,YAAa,gBAAe,KAAK,EAAE,MAAM,QAAQ,SAAS,IAAI,QAAQ,CAAC;AAAA,aAC/E,IAAI,SAAS,OAAQ,gBAAe,KAAK,EAAE,MAAM,aAAa,SAAS,IAAI,QAAQ,CAAC;AAAA,EAC/F;AAGA,WAAS,UAAU,GAAG,UAAU,GAAG,WAAW;AAC5C,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,KAAK;AAAA,MACb,SAAS,KAAK;AAAA,MACd,OAAO,KAAK;AAAA,MACZ,UAAU;AAAA,MACV,aAAa;AAAA,MACb,WAAW;AAAA,MACX,QAAQ,KAAK;AAAA,IACf,CAAC;AACD,UAAM,WAAW,QAAQ,WAAW,IAAI,KAAK;AAC7C,QAAI,QAAQ,SAAS,EAAG,QAAO,EAAE,SAAS,SAAS,mBAAmB,KAAK,OAAO,KAAK,EAAE;AAAA,EAC3F;AACA,QAAM,IAAI,0BAA0B,KAAK,IAAI;AAC/C;;;AFxHA,eAAsB,mBACpB,MACmC;AACnC,QAAM,YAAY,IAAI,IAAI,KAAK,OAAO,qBAAqB,CAAC,MAAM,CAAC;AACnE,QAAM,eAAe,IAAI,IAAI,KAAK,OAAO,wBAAwB,CAAC,UAAU,CAAC;AAC7E,YAAU,KAAK,QAAQ,EAAE,WAAW,KAAK,CAAC;AAE1C,QAAM,SAAS,MAAM,eAA2B;AAAA,IAC9C,MAAM;AAAA,MACJ,EAAE,MAAM,WAAW,QAAQ,KAAK,SAAS;AAAA,MACzC,EAAE,MAAM,WAAW,QAAQ,KAAK,SAAS,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,OAAO,EAAE,EAAE,EAAE;AAAA,IAChF;AAAA,IACA,MAAM,KAAK,QAAQ;AAAA,IACnB,gBAAgB,KAAK,kBAAkB;AAAA,IACvC,aAAa,KAAK;AAAA,IAClB,MAAM,QAAQ,MAAM;AAClB,YAAM,UAAU,KAAK,KAAK,SAAS;AACnC,YAAM,UAAU,KAAK,KAAK,SAAS;AACnC,YAAM,YAAY,OAAO,KAAK,KAAK,SAAS,MAAM,SAAS;AAC3D,YAAM,YAAY,OAAO,KAAK,KAAK,SAAS,MAAM,SAAS;AAE3D,YAAM,MAAM,MAAM,aAAa;AAAA,QAC7B;AAAA,QACA;AAAA,QACA,OAAO,KAAK;AAAA,QACZ,OAAO,KAAK;AAAA,QACZ,eAAe,KAAK;AAAA,QACpB,iBAAiB,KAAK;AAAA,QACtB,UAAU,KAAK;AAAA,QACf,YAAY,KAAK;AAAA,QACjB,aAAa,KAAK;AAAA,QAClB,QAAQ,KAAK;AAAA,QACb,SAAS,KAAK;AAAA,MAChB,CAAC;AAED,YAAM,gBAAgB,IAAI,UAAU,OAAO,CAAC,MAAM,UAAU,IAAI,EAAE,IAAI,CAAC;AACvE,YAAM,mBAAmB,IAAI,UAAU,OAAO,CAAC,MAAM,aAAa,IAAI,EAAE,IAAI,CAAC;AAE7E,YAAM,CAAC,cAAc,aAAa,cAAc,IAAI,MAAM,QAAQ,IAAI;AAAA,QACpE,SAAS,KAAK,OAAO,cAAc,EAAE,YAAY,IAAI,YAAY,QAAQ,CAAC;AAAA,QAC1E,KAAK,OAAO,aACR,QAAQ;AAAA,UACN,cAAc;AAAA,YAAI,CAAC,aACjB,SAAS,KAAK,OAAO,YAAa,EAAE,UAAU,QAAQ,CAAC,EAAE,KAAK,CAAC,OAAO;AAAA,cACpE,GAAG;AAAA,cACH,MAAM,SAAS;AAAA,cACf,MAAM,SAAS;AAAA,YACjB,EAAE;AAAA,UACJ;AAAA,QACF,IACA,QAAQ,QAAQ,CAAC,CAAuD;AAAA,QAC5E,KAAK,OAAO,iBACR,QAAQ;AAAA,UACN,iBAAiB;AAAA,YAAI,CAAC,aACpB,SAAS,KAAK,OAAO,gBAAiB,EAAE,UAAU,QAAQ,CAAC,EAAE,KAAK,CAAC,OAAO;AAAA,cACxE,GAAG;AAAA,cACH,MAAM,SAAS;AAAA,cACf,MAAM,SAAS;AAAA,YACjB,EAAE;AAAA,UACJ;AAAA,QACF,IACA,QAAQ,QAAQ,CAAC,CAAuD;AAAA,MAC9E,CAAC;AAED,YAAM,gBACJ,YAAY,WAAW,IACnB,IACA,YAAY,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,YAAY;AACrE,YAAM,mBACJ,eAAe,WAAW,IACtB,IACA,eAAe,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,eAAe;AAG3E,YAAM,aAAa,KAAK,KAAK,OAAO,aAAa,IAAI,MAAM,KAAK,OAAO,iBAAiB,IAAI;AAC5F,YAAM,aAAa,aAAa,YAAY,gBAAgB,oBAAoB;AAEhF,YAAM,YAAgC,EAAE,WAAW,aAAa;AAChE,UAAI,KAAK,OAAO;AACd,kBAAU,aAAa,EAAE,aAAa,aAAa,WAAW,cAAc;AAC9E,UAAI,KAAK,OAAO;AACd,kBAAU,iBAAiB,EAAE,aAAa,gBAAgB,WAAW,iBAAiB;AAExF,YAAM,UAAU,KAAK,KAAK,QAAQ,WAAW,WAAW,OAAO,KAAK,GAAG,EAAE;AACzE,gBAAU,SAAS,EAAE,WAAW,KAAK,CAAC;AACtC,oBAAc,KAAK,SAAS,iBAAiB,GAAG,KAAK,UAAU,IAAI,YAAY,MAAM,CAAC,CAAC;AACvF,oBAAc,KAAK,SAAS,gBAAgB,GAAG,KAAK,UAAU,IAAI,WAAW,MAAM,CAAC,CAAC;AACrF,oBAAc,KAAK,SAAS,aAAa,GAAG,KAAK,UAAU,WAAW,MAAM,CAAC,CAAC;AAE9E,YAAM,QAAQ,CAAC,SAAS,aAAa,UAAU,QAAQ,CAAC,CAAC,EAAE;AAC3D,UAAI,KAAK,OAAO,WAAY,OAAM,KAAK,QAAQ,cAAc,QAAQ,CAAC,CAAC,EAAE;AACzE,UAAI,KAAK,OAAO,eAAgB,OAAM,KAAK,WAAW,iBAAiB,QAAQ,CAAC,CAAC,EAAE;AAEnF,aAAO;AAAA,QACL,QAAQ;AAAA,UACN,OAAO,IAAI,WAAW;AAAA,UACtB,WAAW,IAAI;AAAA,UACf,eAAe,IAAI,UAAU;AAAA,QAC/B;AAAA,QACA,SAAS,EAAE,OAAO,aAAa,GAAG,OAAO,WAAW,OAAO,MAAM,KAAK,GAAG,EAAE;AAAA,QAC3E,SAAS,IAAI;AAAA,QACb,YAAY,IAAI;AAAA,MAClB;AAAA,IACF;AAAA,EACF,CAAC;AAGD,QAAM,UAAU;AAAA,IACd,OAAO,OAAO,QAAQ;AAAA,IACtB,UAAU,OAAO,QAAQ;AAAA,IACzB,WAAW,OAAO,QAAQ;AAAA,IAC1B,cAAc,OAAO,QAAQ;AAAA,IAC7B,YAAY,OAAO,QAAQ;AAAA,IAC3B,cAAc,OAAO,QAAQ;AAAA,IAC7B,cAAc,OAAO,QAAQ;AAAA,IAC7B,WAAW,OAAO,OAAO;AAAA,IACzB,WAAW,OAAO,OAAO;AAAA,EAC3B;AACA,gBAAc,KAAK,KAAK,QAAQ,cAAc,GAAG,KAAK,UAAU,SAAS,MAAM,CAAC,CAAC;AAEjF,QAAM,KAAe;AAAA,IACnB;AAAA,IACA;AAAA,IACA,cAAc,OAAO,QAAQ,UAAU,sBAAsB,OAAO,QAAQ,kBAAkB,KAAK,QAAQ,CAAC,CAAC,iBAAiB,OAAO,QAAQ,iBAAiB,QAAQ,CAAC,CAAC,iBAAiB,OAAO,QAAQ,aAAa,QAAQ,CAAC,CAAC,qBAAqB,OAAO,QAAQ,aAAa,KAAM,QAAQ,CAAC,CAAC;AAAA,IAChS;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,OAAO,QAAQ,OAAO,OAAO,WAAW,CAAC,CAAC,EAAE;AAAA,MAC7C,CAAC,CAAC,IAAI,CAAC,MACL,KAAK,EAAE,OAAO,EAAE,WAAW,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,UAAU,QAAQ,CAAC,CAAC,OAAO,EAAE,aAAa,QAAQ,CAAC,CAAC;AAAA,IAC3G;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,OAAO,QAAQ,OAAO,OAAO,WAAW,CAAC,CAAC,EAAE;AAAA,MAC7C,CAAC,CAAC,IAAI,CAAC,MACL,KAAK,EAAE,OAAO,EAAE,WAAW,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,UAAU,QAAQ,CAAC,CAAC,OAAO,EAAE,aAAa,QAAQ,CAAC,CAAC;AAAA,IAC3G;AAAA,IACA;AAAA,EACF;AACA,gBAAc,KAAK,KAAK,QAAQ,YAAY,GAAG,GAAG,KAAK,IAAI,CAAC;AAE5D,SAAO,EAAE,OAAO;AAClB;","names":[]}
|
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.
|
|
5
|
+
"version": "0.62.0",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
package/dist/pipelines/index.js
CHANGED
|
@@ -3,13 +3,13 @@ import {
|
|
|
3
3
|
classifyFailure,
|
|
4
4
|
compareToBaseline,
|
|
5
5
|
computeToolUseMetrics
|
|
6
|
-
} from "../chunk-
|
|
6
|
+
} from "../chunk-3B7Y5AUR.js";
|
|
7
7
|
import {
|
|
8
8
|
buildTrajectory
|
|
9
9
|
} from "../chunk-RZTMDUO7.js";
|
|
10
10
|
import {
|
|
11
11
|
interRaterReliability
|
|
12
|
-
} from "../chunk-
|
|
12
|
+
} from "../chunk-ITBRCT73.js";
|
|
13
13
|
import {
|
|
14
14
|
aggregateLlm,
|
|
15
15
|
argHash,
|
|
@@ -18,7 +18,7 @@ import {
|
|
|
18
18
|
toolSpans
|
|
19
19
|
} from "../chunk-47X6LRCE.js";
|
|
20
20
|
import "../chunk-5BKGXME7.js";
|
|
21
|
-
import "../chunk-
|
|
21
|
+
import "../chunk-3BFEG2F6.js";
|
|
22
22
|
import "../chunk-PZ5AY32C.js";
|
|
23
23
|
|
|
24
24
|
// src/pipelines/budget-breach.ts
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import { S as Scenario,
|
|
2
|
-
import { L as LlmClientOptions } from './llm-client-
|
|
3
|
-
import { R as RedTeamCase } from './red-team-
|
|
4
|
-
import { R as RunRecord } from './run-record-
|
|
5
|
-
import { H as HostedClient, T as TraceSpanEvent } from './index-
|
|
1
|
+
import { S as Scenario, C as CampaignResult, q as GateResult, v as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, n as CampaignTraceWriter, M as MutableSurface, s as GenerationRecord, p as GateDecision } from './types-DH22o8hM.js';
|
|
2
|
+
import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
|
|
3
|
+
import { R as RedTeamCase } from './red-team-DW9Ca_tj.js';
|
|
4
|
+
import { R as RunRecord } from './run-record-BgTFzO2r.js';
|
|
5
|
+
import { H as HostedClient, T as TraceSpanEvent } from './index-DxfmYUjC.js';
|
|
6
6
|
|
|
7
7
|
/**
|
|
8
8
|
* @experimental
|
|
@@ -295,6 +295,17 @@ interface RunCampaignOptions<TScenario extends Scenario, TArtifact> {
|
|
|
295
295
|
* at `<runDir>/traces/`. `'off'` disables capture entirely — substrate
|
|
296
296
|
* refuses this when the caller wires `autoOnPromote !== 'none'`. */
|
|
297
297
|
tracing?: 'on' | 'off';
|
|
298
|
+
/**
|
|
299
|
+
* Per-cell usage expectation — the early, fine-grained sibling of the
|
|
300
|
+
* batch `assertRealBackend` guard. A cell that produced an artifact (no
|
|
301
|
+
* error) but reported `costUsd === 0` AND zero tokens is a stub: the
|
|
302
|
+
* dispatch never reported LLM activity via `ctx.cost`. Modes:
|
|
303
|
+
* - `'warn'` (default) — log the offending cell loudly, keep going.
|
|
304
|
+
* - `'assert'` — throw `BackendIntegrityError` on the first such cell
|
|
305
|
+
* (fail-fast; recommended for CI campaigns expecting real LLM calls).
|
|
306
|
+
* - `'off'` — no check (replay / deterministic-only / offline analysis).
|
|
307
|
+
*/
|
|
308
|
+
expectUsage?: 'assert' | 'warn' | 'off';
|
|
298
309
|
/** Test seam — override the wall clock for deterministic tests. */
|
|
299
310
|
now?: () => Date;
|
|
300
311
|
/** Test seam — override per-cell trace writer factory. */
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-
|
|
2
|
-
import { R as RunRecord } from './run-record-
|
|
1
|
+
import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-DbjLfz-K.js';
|
|
2
|
+
import { R as RunRecord } from './run-record-BgTFzO2r.js';
|
|
3
3
|
import { T as TraceAnalysisStore } from './store-jzKpMl16.js';
|
|
4
4
|
import { J as JudgeInput } from './types-DhqpAi_z.js';
|
|
5
5
|
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
|
|
2
2
|
import { a as JudgeScore } from './types-DhqpAi_z.js';
|
|
3
|
-
import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-
|
|
4
|
-
import { m as GateDecision } from './summary-report-
|
|
5
|
-
import { R as RunRecord,
|
|
3
|
+
import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-B2kL-fSM.js';
|
|
4
|
+
import { m as GateDecision } from './summary-report-ByiOUrHj.js';
|
|
5
|
+
import { R as RunRecord, b as RunSplitTag } from './run-record-BgTFzO2r.js';
|
|
6
6
|
|
|
7
7
|
/**
|
|
8
8
|
* Release confidence gate.
|
package/dist/reporting.d.ts
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
|
-
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-
|
|
2
|
-
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-
|
|
1
|
+
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-D_4BSXGV.js';
|
|
2
|
+
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-DGoeObZT.js';
|
|
3
3
|
export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
|
|
4
|
-
export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-
|
|
5
|
-
import './run-record-
|
|
6
|
-
import './errors-
|
|
4
|
+
export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-ByiOUrHj.js';
|
|
5
|
+
import './run-record-BgTFzO2r.js';
|
|
6
|
+
import './errors-Dwqw-T_m.js';
|
|
7
7
|
import './schema-m0gsnbt3.js';
|
|
8
8
|
import './outcome-store-D6KWmYvj.js';
|
|
9
9
|
import './judge-calibration-DilmB3Ml.js';
|
|
10
10
|
import './types-DhqpAi_z.js';
|
|
11
11
|
import '@tangle-network/tcloud';
|
|
12
|
-
import './dataset-
|
|
12
|
+
import './dataset-B2kL-fSM.js';
|
|
13
13
|
import './failure-cluster-CL7IVgkJ.js';
|
|
14
14
|
import './store-CKUAgsJz.js';
|
package/dist/reporting.js
CHANGED
|
@@ -4,7 +4,7 @@ import {
|
|
|
4
4
|
evaluateReleaseConfidence,
|
|
5
5
|
judgeReplayGate,
|
|
6
6
|
renderReleaseReport
|
|
7
|
-
} from "./chunk-
|
|
7
|
+
} from "./chunk-B26KI423.js";
|
|
8
8
|
import {
|
|
9
9
|
rubricPredictiveValidity
|
|
10
10
|
} from "./chunk-YRZ4M5GS.js";
|
|
@@ -18,14 +18,14 @@ import {
|
|
|
18
18
|
paretoChart,
|
|
19
19
|
researchReport,
|
|
20
20
|
summaryTable
|
|
21
|
-
} from "./chunk-
|
|
21
|
+
} from "./chunk-KX6F6NCG.js";
|
|
22
22
|
import {
|
|
23
23
|
benjaminiHochberg,
|
|
24
24
|
pairedBootstrap,
|
|
25
25
|
wilcoxonSignedRank
|
|
26
|
-
} from "./chunk-
|
|
26
|
+
} from "./chunk-ITBRCT73.js";
|
|
27
27
|
import "./chunk-VSMTAMNK.js";
|
|
28
|
-
import "./chunk-
|
|
28
|
+
import "./chunk-3BFEG2F6.js";
|
|
29
29
|
import "./chunk-PZ5AY32C.js";
|
|
30
30
|
export {
|
|
31
31
|
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-
|
|
3
|
-
import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-
|
|
1
|
+
import { b as RunSplitTag, a as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-BgTFzO2r.js';
|
|
2
|
+
import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-DbjLfz-K.js';
|
|
3
|
+
import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-ByiOUrHj.js';
|
|
4
4
|
import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DEZwY14K.js';
|
|
5
|
-
import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-
|
|
5
|
+
import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CJzrpUua.js';
|
|
6
6
|
import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
|
|
7
7
|
import { F as FailureClass } from './schema-m0gsnbt3.js';
|
|
8
8
|
import { T as TraceStore } from './store-CKUAgsJz.js';
|
package/dist/rl.d.ts
CHANGED
|
@@ -1,20 +1,20 @@
|
|
|
1
|
-
import { R as RunRecord,
|
|
2
|
-
import {
|
|
3
|
-
import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-
|
|
4
|
-
export { r as runEvalCampaign } from './researcher-
|
|
1
|
+
import { R as RunRecord, b as RunSplitTag } from './run-record-BgTFzO2r.js';
|
|
2
|
+
import { C as CampaignResult } from './types-DH22o8hM.js';
|
|
3
|
+
import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-WJvIpX3L.js';
|
|
4
|
+
export { r as runEvalCampaign } from './researcher-WJvIpX3L.js';
|
|
5
5
|
import { S as Span } from './schema-m0gsnbt3.js';
|
|
6
6
|
import { T as TraceStore } from './store-CKUAgsJz.js';
|
|
7
7
|
import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
8
8
|
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
9
|
-
import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-
|
|
9
|
+
import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-D_4BSXGV.js';
|
|
10
10
|
import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
|
|
11
|
-
import './errors-
|
|
12
|
-
import './llm-client-
|
|
11
|
+
import './errors-Dwqw-T_m.js';
|
|
12
|
+
import './llm-client-DbjLfz-K.js';
|
|
13
13
|
import './raw-provider-sink-C46HDghv.js';
|
|
14
|
-
import './summary-report-
|
|
14
|
+
import './summary-report-ByiOUrHj.js';
|
|
15
15
|
import './failure-cluster-CL7IVgkJ.js';
|
|
16
16
|
import './emitter-DEZwY14K.js';
|
|
17
|
-
import './integrity-
|
|
17
|
+
import './integrity-CJzrpUua.js';
|
|
18
18
|
|
|
19
19
|
/**
|
|
20
20
|
* Test-time compute scaling curves.
|
package/dist/rl.js
CHANGED
|
@@ -10,27 +10,27 @@ import {
|
|
|
10
10
|
} from "./chunk-3RF76KTD.js";
|
|
11
11
|
import {
|
|
12
12
|
runEvalCampaign
|
|
13
|
-
} from "./chunk-
|
|
14
|
-
import "./chunk-
|
|
13
|
+
} from "./chunk-AIWHLG7J.js";
|
|
14
|
+
import "./chunk-F3SRAAZO.js";
|
|
15
15
|
import {
|
|
16
16
|
rubricPredictiveValidity
|
|
17
17
|
} from "./chunk-YRZ4M5GS.js";
|
|
18
18
|
import {
|
|
19
19
|
evaluateInterimReleaseConfidence
|
|
20
20
|
} from "./chunk-MAZ26DC7.js";
|
|
21
|
-
import "./chunk-
|
|
21
|
+
import "./chunk-KX6F6NCG.js";
|
|
22
22
|
import {
|
|
23
23
|
benjaminiHochberg,
|
|
24
24
|
wilcoxonSignedRank
|
|
25
|
-
} from "./chunk-
|
|
26
|
-
import "./chunk-
|
|
25
|
+
} from "./chunk-ITBRCT73.js";
|
|
26
|
+
import "./chunk-SBCB6VZY.js";
|
|
27
27
|
import "./chunk-TVVP3ZZQ.js";
|
|
28
28
|
import "./chunk-VSMTAMNK.js";
|
|
29
|
-
import "./chunk-
|
|
29
|
+
import "./chunk-IHDHUN2X.js";
|
|
30
30
|
import "./chunk-PC4UYEBM.js";
|
|
31
31
|
import {
|
|
32
32
|
ValidationError
|
|
33
|
-
} from "./chunk-
|
|
33
|
+
} from "./chunk-3BFEG2F6.js";
|
|
34
34
|
import "./chunk-PZ5AY32C.js";
|
|
35
35
|
|
|
36
36
|
// src/rl/compute-curves.ts
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import {
|
|
2
|
+
runCampaign
|
|
3
|
+
} from "./chunk-7TPYV2ER.js";
|
|
4
|
+
import "./chunk-E22YUOAL.js";
|
|
5
|
+
import "./chunk-ITBRCT73.js";
|
|
6
|
+
import "./chunk-3BFEG2F6.js";
|
|
7
|
+
import "./chunk-PZ5AY32C.js";
|
|
8
|
+
export {
|
|
9
|
+
runCampaign
|
|
10
|
+
};
|
|
11
|
+
//# sourceMappingURL=run-campaign-5J3ED2UJ.js.map
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { V as ValidationError } from './errors-
|
|
1
|
+
import { V as ValidationError } from './errors-Dwqw-T_m.js';
|
|
2
2
|
import { F as FailureClass } from './schema-m0gsnbt3.js';
|
|
3
3
|
|
|
4
4
|
type AgentProfileCellSchemaVersion = 'agent-profile-cell/v1';
|
|
@@ -304,4 +304,4 @@ declare function parseRunRecordSafe(input: unknown): {
|
|
|
304
304
|
/** Round-trip helper — `JSON.parse(JSON.stringify(record))` then validate. */
|
|
305
305
|
declare function roundTripRunRecord(record: RunRecord): RunRecord;
|
|
306
306
|
|
|
307
|
-
export { type AgentProfileCell as A, validateAgentProfileCell as B, validateRunRecord as C, verifyAgentProfileCell as D, type JudgeScoresRecord as J, type RunRecord as R, type SandboxAgentProfileLike as S, type
|
|
307
|
+
export { type AgentProfileCell as A, validateAgentProfileCell as B, validateRunRecord as C, verifyAgentProfileCell as D, type JudgeScoresRecord as J, type RunRecord as R, type SandboxAgentProfileLike as S, type RunTokenUsage as a, type RunSplitTag as b, type RunJudgeMetadata as c, type AgentProfileCellInput as d, AGENT_PROFILE_KINDS as e, type AgentProfileCellSchemaVersion as f, AgentProfileCellValidationError as g, type AgentProfileDimensionValue as h, type AgentProfileHarness as i, type AgentProfileJson as j, type AgentProfileKind as k, type AgentProfileSource as l, type AgentProfileSourceInput as m, type RunOutcome as n, RunRecordValidationError as o, agentProfileCellHashMaterial as p, agentProfileCellKey as q, assertRunAgentProfileCell as r, buildAgentProfileCell as s, buildSandboxAgentProfileCell as t, groupRunsByAgentProfileCell as u, isRunRecord as v, parseRunRecordSafe as w, requireAgentProfileCell as x, roundTripRunRecord as y, toAgentProfileJson as z };
|
package/dist/traces.d.ts
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import { N as NotFoundError, R as ReplayError } from './errors-
|
|
1
|
+
import { N as NotFoundError, R as ReplayError } from './errors-Dwqw-T_m.js';
|
|
2
2
|
import { P as ProviderRedactor, R as RawProviderSink, d as RawProviderEvent } from './raw-provider-sink-C46HDghv.js';
|
|
3
3
|
export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, c as RawProviderDirection, e as RawProviderSinkFilter, f as defaultProviderRedactor, p as providerFromBaseUrl } from './raw-provider-sink-C46HDghv.js';
|
|
4
4
|
import { R as RunCompleteHook, a as RunCompleteHookContext } from './emitter-DEZwY14K.js';
|
|
5
5
|
export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DEZwY14K.js';
|
|
6
|
-
export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-
|
|
6
|
+
export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CJzrpUua.js';
|
|
7
7
|
import { T as TraceStore } from './store-CKUAgsJz.js';
|
|
8
8
|
export { E as EventFilter, F as FileSystemTraceStore, a as FileSystemTraceStoreOptions, I as InMemoryTraceStore, R as RunFilter, S as SpanFilter } from './store-CKUAgsJz.js';
|
|
9
9
|
export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-CqTxMwDw.js';
|
package/dist/traces.js
CHANGED
|
@@ -34,7 +34,7 @@ import {
|
|
|
34
34
|
tokenizeDomainWords,
|
|
35
35
|
traceAnalystFunctionGroup,
|
|
36
36
|
traceAnalystOnRunComplete
|
|
37
|
-
} from "./chunk-
|
|
37
|
+
} from "./chunk-Z4ZCBC7M.js";
|
|
38
38
|
import {
|
|
39
39
|
DEFAULT_REDACTION_RULES,
|
|
40
40
|
REDACTION_VERSION,
|
|
@@ -64,7 +64,7 @@ import {
|
|
|
64
64
|
RunIntegrityError,
|
|
65
65
|
assertRunCaptured,
|
|
66
66
|
throwIfRunIncomplete
|
|
67
|
-
} from "./chunk-
|
|
67
|
+
} from "./chunk-SBCB6VZY.js";
|
|
68
68
|
import {
|
|
69
69
|
TraceEmitter,
|
|
70
70
|
llmSpanFromProvider
|
|
@@ -77,7 +77,7 @@ import {
|
|
|
77
77
|
defaultProviderRedactor,
|
|
78
78
|
providerFromBaseUrl
|
|
79
79
|
} from "./chunk-PC4UYEBM.js";
|
|
80
|
-
import "./chunk-
|
|
80
|
+
import "./chunk-3BFEG2F6.js";
|
|
81
81
|
import "./chunk-PZ5AY32C.js";
|
|
82
82
|
export {
|
|
83
83
|
DEFAULT_REDACTION_RULES,
|