@tangle-network/agent-eval 0.38.0 → 0.40.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/dist/campaign/index.d.ts +775 -0
  2. package/dist/campaign/index.js +807 -0
  3. package/dist/campaign/index.js.map +1 -0
  4. package/dist/chunk-5U2DOJU4.js +565 -0
  5. package/dist/chunk-5U2DOJU4.js.map +1 -0
  6. package/dist/{chunk-KE7TDJUO.js → chunk-AU2JLNSZ.js} +2 -2
  7. package/dist/{chunk-TSPOEDM3.js → chunk-BWZEGTES.js} +2 -5
  8. package/dist/chunk-BWZEGTES.js.map +1 -0
  9. package/dist/{chunk-3HYQXPC2.js → chunk-DMW5VENN.js} +3 -3
  10. package/dist/{chunk-TQL7BAOY.js → chunk-EGIPWXHL.js} +2 -2
  11. package/dist/chunk-GGE4NNQT.js +65 -0
  12. package/dist/chunk-GGE4NNQT.js.map +1 -0
  13. package/dist/{chunk-7PR3WPWE.js → chunk-L7XMNXLO.js} +2 -2
  14. package/dist/{chunk-RL6TERL2.js → chunk-LCIDRYGP.js} +3 -3
  15. package/dist/{chunk-L5UNCDAJ.js → chunk-MAOZCN36.js} +2 -64
  16. package/dist/chunk-MAOZCN36.js.map +1 -0
  17. package/dist/{chunk-LGAPK7NA.js → chunk-NKLGKF2Q.js} +2 -2
  18. package/dist/chunk-TMXPFWC7.js +305 -0
  19. package/dist/chunk-TMXPFWC7.js.map +1 -0
  20. package/dist/{chunk-KHZRNY3F.js → chunk-WP7SY7AI.js} +5 -4
  21. package/dist/chunk-WP7SY7AI.js.map +1 -0
  22. package/dist/chunk-YV7J7X5N.js +313 -0
  23. package/dist/chunk-YV7J7X5N.js.map +1 -0
  24. package/dist/{control-DVrmvM_k.d.ts → control-CmLJk3IG.d.ts} +1 -1
  25. package/dist/control.d.ts +3 -3
  26. package/dist/control.js +2 -2
  27. package/dist/{dataset-ueRVTUoY.d.ts → dataset-BlwAtYYf.d.ts} +1 -1
  28. package/dist/{feedback-trajectory-iATEAHmc.d.ts → feedback-trajectory-Dvy-bt7x.d.ts} +1 -1
  29. package/dist/governance/index.d.ts +133 -5
  30. package/dist/index.d.ts +35 -34
  31. package/dist/index.js +97 -630
  32. package/dist/index.js.map +1 -1
  33. package/dist/multishot/index.d.ts +21 -21
  34. package/dist/multishot/index.js +64 -15
  35. package/dist/multishot/index.js.map +1 -1
  36. package/dist/openapi.json +1 -1
  37. package/dist/optimization.d.ts +2 -2
  38. package/dist/optimization.js +5 -5
  39. package/dist/pipelines/index.js +2 -2
  40. package/dist/red-team-30II1T4o.d.ts +63 -0
  41. package/dist/{release-report-D2ykiLSe.d.ts → release-report-Di84bXD7.d.ts} +5 -2
  42. package/dist/reporting.d.ts +2 -2
  43. package/dist/reporting.js +3 -3
  44. package/dist/rl.js +15 -315
  45. package/dist/rl.js.map +1 -1
  46. package/dist/run-campaign-JYJXYHHL.js +10 -0
  47. package/dist/run-campaign-JYJXYHHL.js.map +1 -0
  48. package/dist/traces.js +7 -5
  49. package/dist/wire/index.d.ts +2 -2
  50. package/docs/design/loop-taxonomy.md +233 -0
  51. package/docs/design/self-improvement-engine.md +130 -0
  52. package/package.json +33 -24
  53. package/dist/chunk-KHZRNY3F.js.map +0 -1
  54. package/dist/chunk-L5UNCDAJ.js.map +0 -1
  55. package/dist/chunk-TSPOEDM3.js.map +0 -1
  56. package/dist/index-CN2agEaO.d.ts +0 -191
  57. /package/dist/{chunk-KE7TDJUO.js.map → chunk-AU2JLNSZ.js.map} +0 -0
  58. /package/dist/{chunk-3HYQXPC2.js.map → chunk-DMW5VENN.js.map} +0 -0
  59. /package/dist/{chunk-TQL7BAOY.js.map → chunk-EGIPWXHL.js.map} +0 -0
  60. /package/dist/{chunk-7PR3WPWE.js.map → chunk-L7XMNXLO.js.map} +0 -0
  61. /package/dist/{chunk-RL6TERL2.js.map → chunk-LCIDRYGP.js.map} +0 -0
  62. /package/dist/{chunk-LGAPK7NA.js.map → chunk-NKLGKF2Q.js.map} +0 -0
@@ -168,27 +168,6 @@ declare function renderDimensions(dims: readonly JudgeDimension[]): string;
168
168
  /** Convenience: build the "Respond with ONLY this JSON" footer for a judge prompt. */
169
169
  declare function renderJsonFooter(dims: readonly JudgeDimension[]): string;
170
170
 
171
- interface RunMultishotOptions<TPersona extends MultishotPersona> {
172
- profile: AgentProfile;
173
- persona: TPersona;
174
- shape: MultishotShape<TPersona>;
175
- /** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */
176
- tools?: MultishotToolDefinition[];
177
- /** Map from tool name → executor invoked inline when the agent emits a tool_call. */
178
- toolExecutors?: Record<string, MultishotToolExecutor>;
179
- /** Map from tool name → artifact type label written into MultishotArtifact.type.
180
- * Tools without a mapping still execute, but their results aren't surfaced as
181
- * typed artifacts (only as tool messages in the transcript). */
182
- artifactTypeFor?: (toolName: string) => string | undefined;
183
- maxTurns?: number;
184
- agentModel?: string;
185
- driverModel?: string;
186
- apiKey?: string;
187
- baseUrl?: string;
188
- signal?: AbortSignal;
189
- }
190
- declare function runMultishot<TPersona extends MultishotPersona>(opts: RunMultishotOptions<TPersona>): Promise<MultishotResult>;
191
-
192
171
  interface ConversationJudgeInput<TPersona extends MultishotPersona> {
193
172
  transcript: MultishotMessage[];
194
173
  persona: TPersona;
@@ -273,4 +252,25 @@ interface RunMultishotMatrixResult {
273
252
  }
274
253
  declare function runMultishotMatrix<TPersona extends MultishotPersona>(opts: RunMultishotMatrixOptions<TPersona>): Promise<RunMultishotMatrixResult>;
275
254
 
255
+ interface RunMultishotOptions<TPersona extends MultishotPersona> {
256
+ profile: AgentProfile;
257
+ persona: TPersona;
258
+ shape: MultishotShape<TPersona>;
259
+ /** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */
260
+ tools?: MultishotToolDefinition[];
261
+ /** Map from tool name → executor invoked inline when the agent emits a tool_call. */
262
+ toolExecutors?: Record<string, MultishotToolExecutor>;
263
+ /** Map from tool name → artifact type label written into MultishotArtifact.type.
264
+ * Tools without a mapping still execute, but their results aren't surfaced as
265
+ * typed artifacts (only as tool messages in the transcript). */
266
+ artifactTypeFor?: (toolName: string) => string | undefined;
267
+ maxTurns?: number;
268
+ agentModel?: string;
269
+ driverModel?: string;
270
+ apiKey?: string;
271
+ baseUrl?: string;
272
+ signal?: AbortSignal;
273
+ }
274
+ declare function runMultishot<TPersona extends MultishotPersona>(opts: RunMultishotOptions<TPersona>): Promise<MultishotResult>;
275
+
276
276
  export { type ArtifactJudgeInput, type CellCompositeScore, type ConversationJudgeInput, DEFAULT_CODER_MODEL, DEFAULT_DELEGATE_CODE_TOOL, DEFAULT_DELEGATE_RESEARCH_TOOL, DEFAULT_JUDGE_MODEL, DEFAULT_RESEARCHER_MODEL, type DefaultCoderConfig, type DefaultResearcherConfig, type DefaultToolsBundle, type DefaultToolsConfig, type JudgeConfig, type JudgeDimension, type JudgeScore, type MultishotArtifact, MultishotDriverEmptyError, type MultishotJudges, type MultishotMessage, type MultishotPersona, type MultishotResult, type MultishotShape, type MultishotToolDefinition, type MultishotToolExecutor, type RouterCompletionRequest, type RouterCompletionResponse, type RouterToolCall, type RunMultishotMatrixOptions, type RunMultishotMatrixResult, type RunMultishotOptions, createCodeExecutor, createResearchExecutor, defaultDelegationTools, defaultRouterBaseUrl, estimateRouterCost, renderDimensions, renderJsonFooter, requireRouterApiKey, routerCompletion, runJudge, runMultishot, runMultishotMatrix };
@@ -47,7 +47,10 @@ function estimateRouterCost(model, usage) {
47
47
  return (inputTok * inPer1k + outputTok * outPer1k) / 1e3;
48
48
  }
49
49
  function defaultRouterBaseUrl() {
50
- return (process.env.TANGLE_ROUTER_BASE_URL ?? "https://router.tangle.tools/v1").replace(/\/+$/, "");
50
+ return (process.env.TANGLE_ROUTER_BASE_URL ?? "https://router.tangle.tools/v1").replace(
51
+ /\/+$/,
52
+ ""
53
+ );
51
54
  }
52
55
  function requireRouterApiKey() {
53
56
  const key = process.env.TANGLE_API_KEY;
@@ -69,7 +72,10 @@ var DEFAULT_DELEGATE_RESEARCH_TOOL = {
69
72
  type: "object",
70
73
  properties: {
71
74
  question: { type: "string", description: "Specific question to research" },
72
- scope: { type: "string", description: "Optional scope: time window, geography, jurisdiction, segment" }
75
+ scope: {
76
+ type: "string",
77
+ description: "Optional scope: time window, geography, jurisdiction, segment"
78
+ }
73
79
  },
74
80
  required: ["question"]
75
81
  }
@@ -84,7 +90,10 @@ var DEFAULT_DELEGATE_CODE_TOOL = {
84
90
  type: "object",
85
91
  properties: {
86
92
  goal: { type: "string", description: "What the code must accomplish" },
87
- language: { type: "string", description: "Optional language preference (default: TypeScript)" }
93
+ language: {
94
+ type: "string",
95
+ description: "Optional language preference (default: TypeScript)"
96
+ }
88
97
  },
89
98
  required: ["goal"]
90
99
  }
@@ -169,7 +178,10 @@ async function runJudge(judge, input) {
169
178
  });
170
179
  raw = (message.content ?? "").trim();
171
180
  } catch (err) {
172
- return { ...ZERO_SCORE, notes: `judge ${judge.name} call failed: ${err instanceof Error ? err.message : String(err)}` };
181
+ return {
182
+ ...ZERO_SCORE,
183
+ notes: `judge ${judge.name} call failed: ${err instanceof Error ? err.message : String(err)}`
184
+ };
173
185
  }
174
186
  let parsed = null;
175
187
  try {
@@ -201,6 +213,10 @@ function renderJsonFooter(dims) {
201
213
  {${fields},"notes":"1-2 sentence critique"}`;
202
214
  }
203
215
 
216
+ // src/multishot/matrix.ts
217
+ import { mkdirSync, writeFileSync } from "fs";
218
+ import { join } from "path";
219
+
204
220
  // src/multishot/types.ts
205
221
  var MultishotDriverEmptyError = class extends Error {
206
222
  constructor(turn) {
@@ -218,7 +234,11 @@ async function runMultishot(opts) {
218
234
  const maxTurns = opts.maxTurns ?? 10;
219
235
  const agentModel = opts.agentModel ?? "openai/gpt-5.4";
220
236
  const driverModel = opts.driverModel ?? "openai/gpt-4o-mini";
221
- const bundle = opts.tools && opts.toolExecutors ? { tools: opts.tools, executors: opts.toolExecutors, artifactTypeFor: opts.artifactTypeFor ?? (() => void 0) } : defaultDelegationTools();
237
+ const bundle = opts.tools && opts.toolExecutors ? {
238
+ tools: opts.tools,
239
+ executors: opts.toolExecutors,
240
+ artifactTypeFor: opts.artifactTypeFor ?? (() => void 0)
241
+ } : defaultDelegationTools();
222
242
  const tools = opts.tools ?? bundle.tools;
223
243
  const executors = opts.toolExecutors ?? bundle.executors;
224
244
  const artifactTypeFor = opts.artifactTypeFor ?? bundle.artifactTypeFor;
@@ -282,7 +302,12 @@ async function runMultishot(opts) {
282
302
  totalCostUsd += r.costUsd;
283
303
  const artifactType = artifactTypeFor(tc.name);
284
304
  if (artifactType) {
285
- artifacts.push({ type: artifactType, turn, invocation: { name: tc.name, args: tc.args }, content: toolResult });
305
+ artifacts.push({
306
+ type: artifactType,
307
+ turn,
308
+ invocation: { name: tc.name, args: tc.args },
309
+ content: toolResult
310
+ });
286
311
  }
287
312
  }
288
313
  } catch (err) {
@@ -349,8 +374,6 @@ async function driverTurn(opts) {
349
374
  }
350
375
 
351
376
  // src/multishot/matrix.ts
352
- import { mkdirSync, writeFileSync } from "fs";
353
- import { join } from "path";
354
377
  async function runMultishotMatrix(opts) {
355
378
  const codeTypes = new Set(opts.judges.codeArtifactTypes ?? ["code"]);
356
379
  const contentTypes = new Set(opts.judges.contentArtifactTypes ?? ["research"]);
@@ -385,16 +408,34 @@ async function runMultishotMatrix(opts) {
385
408
  const contentArtifacts = sim.artifacts.filter((a) => contentTypes.has(a.type));
386
409
  const [conversation, codeReviews, contentReviews] = await Promise.all([
387
410
  runJudge(opts.judges.conversation, { transcript: sim.transcript, persona }),
388
- opts.judges.codeReview ? Promise.all(codeArtifacts.map((artifact) => runJudge(opts.judges.codeReview, { artifact, persona }).then((s) => ({ ...s, turn: artifact.turn, type: artifact.type })))) : Promise.resolve([]),
389
- opts.judges.contentQuality ? Promise.all(contentArtifacts.map((artifact) => runJudge(opts.judges.contentQuality, { artifact, persona }).then((s) => ({ ...s, turn: artifact.turn, type: artifact.type })))) : Promise.resolve([])
411
+ opts.judges.codeReview ? Promise.all(
412
+ codeArtifacts.map(
413
+ (artifact) => runJudge(opts.judges.codeReview, { artifact, persona }).then((s) => ({
414
+ ...s,
415
+ turn: artifact.turn,
416
+ type: artifact.type
417
+ }))
418
+ )
419
+ ) : Promise.resolve([]),
420
+ opts.judges.contentQuality ? Promise.all(
421
+ contentArtifacts.map(
422
+ (artifact) => runJudge(opts.judges.contentQuality, { artifact, persona }).then((s) => ({
423
+ ...s,
424
+ turn: artifact.turn,
425
+ type: artifact.type
426
+ }))
427
+ )
428
+ ) : Promise.resolve([])
390
429
  ]);
391
430
  const codeComposite = codeReviews.length === 0 ? 0 : codeReviews.reduce((s, r) => s + r.composite, 0) / codeReviews.length;
392
431
  const contentComposite = contentReviews.length === 0 ? 0 : contentReviews.reduce((s, r) => s + r.composite, 0) / contentReviews.length;
393
432
  const judgeCount = 1 + (opts.judges.codeReview ? 1 : 0) + (opts.judges.contentQuality ? 1 : 0);
394
433
  const composite = (conversation.composite + codeComposite + contentComposite) / judgeCount;
395
434
  const cellScore = { composite, conversation };
396
- if (opts.judges.codeReview) cellScore.codeReview = { perArtifact: codeReviews, composite: codeComposite };
397
- if (opts.judges.contentQuality) cellScore.contentQuality = { perArtifact: contentReviews, composite: contentComposite };
435
+ if (opts.judges.codeReview)
436
+ cellScore.codeReview = { perArtifact: codeReviews, composite: codeComposite };
437
+ if (opts.judges.contentQuality)
438
+ cellScore.contentQuality = { perArtifact: contentReviews, composite: contentComposite };
398
439
  const cellDir = join(opts.runDir, profileId, personaId, `rep-${cell.rep}`);
399
440
  mkdirSync(cellDir, { recursive: true });
400
441
  writeFileSync(join(cellDir, "transcript.json"), JSON.stringify(sim.transcript, null, 2));
@@ -404,7 +445,11 @@ async function runMultishotMatrix(opts) {
404
445
  if (opts.judges.codeReview) notes.push(`code=${codeComposite.toFixed(1)}`);
405
446
  if (opts.judges.contentQuality) notes.push(`content=${contentComposite.toFixed(1)}`);
406
447
  return {
407
- output: { turns: sim.transcript.length, toolCalls: sim.toolCalls, artifactCount: sim.artifacts.length },
448
+ output: {
449
+ turns: sim.transcript.length,
450
+ toolCalls: sim.toolCalls,
451
+ artifactCount: sim.artifacts.length
452
+ },
408
453
  verdict: { valid: composite >= 5, score: composite, notes: notes.join(" ") },
409
454
  costUsd: sim.costUsd,
410
455
  durationMs: sim.durationMs
@@ -432,13 +477,17 @@ async function runMultishotMatrix(opts) {
432
477
  ``,
433
478
  "| profile | pass | mean | cost |",
434
479
  "|---|---|---|---|",
435
- ...Object.entries(matrix.byAxis.profile ?? {}).map(([id, s]) => `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`),
480
+ ...Object.entries(matrix.byAxis.profile ?? {}).map(
481
+ ([id, s]) => `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`
482
+ ),
436
483
  ``,
437
484
  `## By persona`,
438
485
  ``,
439
486
  "| persona | pass | mean | cost |",
440
487
  "|---|---|---|---|",
441
- ...Object.entries(matrix.byAxis.persona ?? {}).map(([id, s]) => `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`),
488
+ ...Object.entries(matrix.byAxis.persona ?? {}).map(
489
+ ([id, s]) => `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`
490
+ ),
442
491
  ``
443
492
  ];
444
493
  writeFileSync(join(opts.runDir, "summary.md"), md.join("\n"));
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/multishot/router.ts","../../src/multishot/default-tools.ts","../../src/multishot/judges.ts","../../src/multishot/types.ts","../../src/multishot/multishot.ts","../../src/multishot/matrix.ts"],"sourcesContent":["// Router fetch helper — single source of truth for OpenAI-compat calls\n// against the Tangle router. Used by the driver, agent, judges, and the\n// default tool executors.\n\nimport type { MultishotToolDefinition } from './types'\n\nexport interface RouterCompletionRequest {\n apiKey: string\n baseUrl: string\n model: string\n messages: Array<Record<string, unknown>>\n tools?: MultishotToolDefinition[]\n temperature?: number\n maxTokens?: number\n signal?: AbortSignal\n}\n\nexport interface RouterToolCall {\n id: string\n type: 'function'\n function: { name: string; arguments: string }\n}\n\nexport interface RouterCompletionResponse {\n message: { content?: string | null; tool_calls?: RouterToolCall[] }\n usage?: { prompt_tokens?: number; completion_tokens?: number }\n}\n\nexport async function routerCompletion(req: RouterCompletionRequest): Promise<RouterCompletionResponse> {\n const body: Record<string, unknown> = {\n model: req.model,\n messages: req.messages,\n temperature: req.temperature ?? 0.7,\n max_tokens: req.maxTokens ?? 
2000,\n }\n if (req.tools?.length) body.tools = req.tools\n const url = `${req.baseUrl.replace(/\\/+$/, '')}/chat/completions`\n const res = await fetch(url, {\n method: 'POST',\n headers: { Authorization: `Bearer ${req.apiKey}`, 'Content-Type': 'application/json' },\n body: JSON.stringify(body),\n signal: req.signal,\n })\n if (!res.ok) {\n const text = await res.text()\n throw new Error(`router ${res.status}: ${text.slice(0, 300)}`)\n }\n const json = (await res.json()) as {\n choices: Array<{ message: { content?: string | null; tool_calls?: RouterToolCall[] } }>\n usage?: { prompt_tokens?: number; completion_tokens?: number }\n }\n const choice = json.choices[0]\n if (!choice) throw new Error(`router returned no choices: ${JSON.stringify(json).slice(0, 200)}`)\n return { message: choice.message, usage: json.usage }\n}\n\n// Rough per-model cost estimator. Used for cost-ceiling enforcement.\n// Underestimates Anthropic, overestimates oss models — fine for ceilings.\nexport function estimateRouterCost(\n model: string,\n usage?: { prompt_tokens?: number; completion_tokens?: number },\n): number {\n if (!usage) return 0\n const inputTok = usage.prompt_tokens ?? 0\n const outputTok = usage.completion_tokens ?? 0\n let inPer1k = 0.003\n let outPer1k = 0.015\n if (model.includes('gpt-4o-mini')) {\n inPer1k = 0.00015\n outPer1k = 0.0006\n } else if (model.includes('gpt-5.4') || model.includes('claude-sonnet')) {\n inPer1k = 0.003\n outPer1k = 0.015\n } else if (model.includes('kimi') || model.includes('glm') || model.includes('deepseek')) {\n inPer1k = 0.0005\n outPer1k = 0.002\n }\n return (inputTok * inPer1k + outputTok * outPer1k) / 1000\n}\n\nexport function defaultRouterBaseUrl(): string {\n return (process.env.TANGLE_ROUTER_BASE_URL ?? 
'https://router.tangle.tools/v1').replace(/\\/+$/, '')\n}\n\nexport function requireRouterApiKey(): string {\n const key = process.env.TANGLE_API_KEY\n if (!key) throw new Error('multishot requires TANGLE_API_KEY (router-scoped sk-tan-* key)')\n return key\n}\n","// Default delegate_research + delegate_code tools and their inline executors.\n//\n// Consumers can override either by passing their own tools + executors to\n// runMultishot. The defaults are sufficient for most domains — point the\n// researcher system prompt at your domain's citation style and the coder\n// at your preferred language.\n\nimport { estimateRouterCost, routerCompletion } from './router'\nimport type { MultishotToolDefinition, MultishotToolExecutor } from './types'\n\nexport const DEFAULT_RESEARCHER_MODEL = 'openai/gpt-4o-mini'\nexport const DEFAULT_CODER_MODEL = 'openai/gpt-4o-mini'\n\nexport interface DefaultResearcherConfig {\n /** Replace the system prompt to bias the researcher toward a domain's\n * citation style. Defaults to a generic \"cite sources by name\" prompt. */\n systemPrompt?: string\n model?: string\n}\n\nexport interface DefaultCoderConfig {\n /** Replace the system prompt to bias the coder toward a language /\n * framework / artifact style. */\n systemPrompt?: string\n model?: string\n}\n\nconst GENERIC_RESEARCHER_SYSTEM =\n 'You are a research specialist. Return a markdown brief with 3-5 findings. Each finding cites a specific source by name. Add a confidence level (high/medium/low) per finding. No fluff, no preamble.'\n\nconst GENERIC_CODER_SYSTEM =\n 'You are an expert engineer. Output ONE fenced code block containing the complete solution. Inline-comment non-obvious decisions. No explanation outside the block.'\n\nexport const DEFAULT_DELEGATE_RESEARCH_TOOL: MultishotToolDefinition = {\n type: 'function',\n function: {\n name: 'delegate_research',\n description:\n 'Research a topic deeply via specialist. Returns evidence-bearing items with citations. 
Use for audience research, competitive intel, regulatory landscape, market data, citation-grounded analysis.',\n parameters: {\n type: 'object',\n properties: {\n question: { type: 'string', description: 'Specific question to research' },\n scope: { type: 'string', description: 'Optional scope: time window, geography, jurisdiction, segment' },\n },\n required: ['question'],\n },\n },\n}\n\nexport const DEFAULT_DELEGATE_CODE_TOOL: MultishotToolDefinition = {\n type: 'function',\n function: {\n name: 'delegate_code',\n description:\n 'Generate a runnable script, template, pipeline, or tool via specialist. Returns complete working code or structured markdown. Use for content pipelines, calc snippets, dashboards, compliance checklists, deadline trackers.',\n parameters: {\n type: 'object',\n properties: {\n goal: { type: 'string', description: 'What the code must accomplish' },\n language: { type: 'string', description: 'Optional language preference (default: TypeScript)' },\n },\n required: ['goal'],\n },\n },\n}\n\nexport function createResearchExecutor(config: DefaultResearcherConfig = {}): MultishotToolExecutor {\n const systemPrompt = config.systemPrompt ?? GENERIC_RESEARCHER_SYSTEM\n const model = config.model ?? DEFAULT_RESEARCHER_MODEL\n return async (args, ctx) => {\n const question = String(args.question ?? '')\n const scope = args.scope ? String(args.scope) : undefined\n const { message, usage } = await routerCompletion({\n apiKey: ctx.apiKey,\n baseUrl: ctx.baseUrl,\n model,\n temperature: 0.3,\n maxTokens: 1800,\n messages: [\n { role: 'system', content: systemPrompt },\n { role: 'user', content: `Research: ${question}${scope ? `\\nScope: ${scope}` : ''}` },\n ],\n signal: ctx.signal,\n })\n return { content: message.content ?? '', costUsd: estimateRouterCost(model, usage) }\n }\n}\n\nexport function createCodeExecutor(config: DefaultCoderConfig = {}): MultishotToolExecutor {\n const systemPrompt = config.systemPrompt ?? 
GENERIC_CODER_SYSTEM\n const model = config.model ?? DEFAULT_CODER_MODEL\n return async (args, ctx) => {\n const goal = String(args.goal ?? '')\n const language = args.language ? String(args.language) : 'TypeScript'\n const { message, usage } = await routerCompletion({\n apiKey: ctx.apiKey,\n baseUrl: ctx.baseUrl,\n model,\n temperature: 0.2,\n maxTokens: 2000,\n messages: [\n { role: 'system', content: `${systemPrompt}\\n\\nLanguage: ${language}` },\n { role: 'user', content: `Produce: ${goal}` },\n ],\n signal: ctx.signal,\n })\n return { content: message.content ?? '', costUsd: estimateRouterCost(model, usage) }\n }\n}\n\nexport interface DefaultToolsConfig {\n research?: DefaultResearcherConfig\n code?: DefaultCoderConfig\n /** When true (default), each tool result is recorded as a typed artifact:\n * research → type='research', code → type='code'. */\n recordArtifacts?: boolean\n}\n\nexport interface DefaultToolsBundle {\n tools: MultishotToolDefinition[]\n executors: Record<string, MultishotToolExecutor>\n artifactTypeFor: (toolName: string) => string | undefined\n}\n\nexport function defaultDelegationTools(config: DefaultToolsConfig = {}): DefaultToolsBundle {\n return {\n tools: [DEFAULT_DELEGATE_RESEARCH_TOOL, DEFAULT_DELEGATE_CODE_TOOL],\n executors: {\n delegate_research: createResearchExecutor(config.research),\n delegate_code: createCodeExecutor(config.code),\n },\n artifactTypeFor: (name) => (name === 'delegate_research' ? 'research' : name === 'delegate_code' ? 
'code' : undefined),\n }\n}\n\nexport { defaultRouterBaseUrl } from './router'\n","// Generic judge runner — domain consumers configure dimensions + prompts.\n//\n// Three judge slots are conventional for multishot eval:\n// - conversation (scores the full transcript)\n// - codeReview (scores each code artifact)\n// - contentQuality (scores each non-code artifact)\n//\n// But the runJudge primitive is fully generic — any T → JudgeScore mapping.\n\nimport { defaultRouterBaseUrl, requireRouterApiKey, routerCompletion } from './router'\n\nexport const DEFAULT_JUDGE_MODEL = 'openai/gpt-4o-mini'\n\nexport interface JudgeDimension {\n /** JSON field name + score key. */\n key: string\n /** Description shown in the judge's user prompt. */\n description: string\n}\n\nexport interface JudgeConfig<TInput> {\n /** Display name (for trace + log). */\n name: string\n /** Model used for this judge. */\n model?: string\n /** 0-10 scored dimensions. */\n dimensions: JudgeDimension[]\n /** Judge system prompt — sets persona + JSON-only constraint. */\n systemPrompt: string\n /** Build the user prompt from the typed input. Must include \"Respond with\n * ONLY this JSON: { ... }\" listing each dimension key. */\n buildPrompt: (input: TInput) => string\n /** Optional model + api overrides. */\n apiKey?: string\n baseUrl?: string\n}\n\nexport interface JudgeScore {\n /** Per-dimension 0-10 score. Missing dims default to 0. */\n dimensions: Record<string, number>\n /** Mean across dimensions. */\n composite: number\n /** Free-form 1-2 sentence critique from the judge (when provided). */\n notes: string\n}\n\nconst ZERO_SCORE: JudgeScore = { dimensions: {}, composite: 0, notes: 'parse failed' }\n\nexport async function runJudge<TInput>(judge: JudgeConfig<TInput>, input: TInput): Promise<JudgeScore> {\n const apiKey = judge.apiKey ?? requireRouterApiKey()\n const baseUrl = judge.baseUrl ?? defaultRouterBaseUrl()\n const model = judge.model ?? process.env.JUDGE_MODEL ?? 
DEFAULT_JUDGE_MODEL\n const prompt = judge.buildPrompt(input)\n let raw = ''\n try {\n const { message } = await routerCompletion({\n apiKey,\n baseUrl,\n model,\n temperature: 0,\n maxTokens: 1500,\n messages: [\n { role: 'system', content: judge.systemPrompt },\n { role: 'user', content: prompt },\n ],\n })\n raw = (message.content ?? '').trim()\n } catch (err) {\n return { ...ZERO_SCORE, notes: `judge ${judge.name} call failed: ${err instanceof Error ? err.message : String(err)}` }\n }\n\n let parsed: Record<string, unknown> | null = null\n try {\n const cleaned = raw.replace(/^```json\\s*/i, '').replace(/```\\s*$/, '').trim()\n parsed = JSON.parse(cleaned) as Record<string, unknown>\n } catch {\n return { ...ZERO_SCORE, notes: `judge ${judge.name} returned non-JSON: ${raw.slice(0, 200)}` }\n }\n\n const dimensions: Record<string, number> = {}\n let sum = 0\n for (const dim of judge.dimensions) {\n const v = Number(parsed[dim.key] ?? 0)\n const clamped = Number.isFinite(v) ? Math.max(0, Math.min(10, v)) : 0\n dimensions[dim.key] = clamped\n sum += clamped\n }\n return {\n dimensions,\n composite: judge.dimensions.length === 0 ? 0 : sum / judge.dimensions.length,\n notes: typeof parsed.notes === 'string' ? parsed.notes : '',\n }\n}\n\n/** Convenience: stringified dimension list for inclusion in a judge prompt.\n * Returns lines like `- audience_fit: Does this match what the audience cares about? (0-10)`. */\nexport function renderDimensions(dims: readonly JudgeDimension[]): string {\n return dims.map((d) => `- ${d.key}: ${d.description}`).join('\\n')\n}\n\n/** Convenience: build the \"Respond with ONLY this JSON\" footer for a judge prompt. 
*/\nexport function renderJsonFooter(dims: readonly JudgeDimension[]): string {\n const fields = dims.map((d) => `\"${d.key}\":N`).join(',')\n return `Respond with ONLY this JSON (no markdown, no preamble):\\n{${fields},\"notes\":\"1-2 sentence critique\"}`\n}\n","// Public types for the multishot substrate.\n\nexport interface MultishotMessage {\n role: 'user' | 'assistant' | 'tool'\n content: string\n toolCallId?: string\n toolCalls?: Array<{ id: string; name: string; args: Record<string, unknown> }>\n}\n\nexport interface MultishotArtifact {\n type: string\n turn: number\n invocation: { name: string; args: Record<string, unknown> }\n content: string\n}\n\nexport interface MultishotResult {\n transcript: MultishotMessage[]\n artifacts: MultishotArtifact[]\n toolCalls: number\n durationMs: number\n costUsd: number\n}\n\nexport interface MultishotToolDefinition {\n type: 'function'\n function: {\n name: string\n description: string\n parameters: Record<string, unknown>\n }\n}\n\nexport type MultishotToolExecutor = (\n args: Record<string, unknown>,\n ctx: { apiKey: string; baseUrl: string; signal?: AbortSignal },\n) => Promise<{ content: string; costUsd: number }>\n\nexport interface MultishotPersona {\n /** Stable identifier — used for per-cell artifact paths + matrix axis keys. */\n id: string\n /** Per-domain payload (income/profile/voice/etc.) shaped by the consumer. */\n [k: string]: unknown\n}\n\nexport interface MultishotShape<TPersona extends MultishotPersona> {\n /** Opening user message (turn 0) — the persona's first ask. */\n buildOpener: (persona: TPersona) => string\n /** System prompt the driver LLM uses to roleplay the persona. Should set\n * voice, goals, constraints, time-pressure, and the \"never go silent\" rule. 
*/\n buildDriverSystemPrompt: (persona: TPersona) => string\n}\n\nexport class MultishotDriverEmptyError extends Error {\n constructor(public readonly turn: number) {\n super(`multishot: driver returned empty content twice at turn ${turn} — failing loud`)\n this.name = 'MultishotDriverEmptyError'\n }\n}\n","// Multi-turn driver-agent simulation with inline tool execution.\n//\n// The driver = LLM acting as the persona (reactive, non-deterministic).\n// The agent = the product agent under test (router call with profile's\n// systemPrompt + the configured tools).\n// Tool calls execute inline via the configured executors and feed back\n// into the agent's message log so the agent integrates the result.\n\nimport type { AgentProfile } from '@tangle-network/sandbox'\nimport { defaultDelegationTools } from './default-tools'\nimport {\n defaultRouterBaseUrl,\n estimateRouterCost,\n requireRouterApiKey,\n routerCompletion,\n} from './router'\nimport {\n MultishotDriverEmptyError,\n type MultishotArtifact,\n type MultishotMessage,\n type MultishotPersona,\n type MultishotResult,\n type MultishotShape,\n type MultishotToolDefinition,\n type MultishotToolExecutor,\n} from './types'\n\nexport interface RunMultishotOptions<TPersona extends MultishotPersona> {\n profile: AgentProfile\n persona: TPersona\n shape: MultishotShape<TPersona>\n /** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */\n tools?: MultishotToolDefinition[]\n /** Map from tool name → executor invoked inline when the agent emits a tool_call. */\n toolExecutors?: Record<string, MultishotToolExecutor>\n /** Map from tool name → artifact type label written into MultishotArtifact.type.\n * Tools without a mapping still execute, but their results aren't surfaced as\n * typed artifacts (only as tool messages in the transcript). 
*/\n artifactTypeFor?: (toolName: string) => string | undefined\n maxTurns?: number\n agentModel?: string\n driverModel?: string\n apiKey?: string\n baseUrl?: string\n signal?: AbortSignal\n}\n\nexport async function runMultishot<TPersona extends MultishotPersona>(\n opts: RunMultishotOptions<TPersona>,\n): Promise<MultishotResult> {\n const apiKey = opts.apiKey ?? requireRouterApiKey()\n const baseUrl = opts.baseUrl ?? defaultRouterBaseUrl()\n const maxTurns = opts.maxTurns ?? 10\n const agentModel = opts.agentModel ?? 'openai/gpt-5.4'\n const driverModel = opts.driverModel ?? 'openai/gpt-4o-mini'\n\n const bundle = opts.tools && opts.toolExecutors\n ? { tools: opts.tools, executors: opts.toolExecutors, artifactTypeFor: opts.artifactTypeFor ?? (() => undefined) }\n : defaultDelegationTools()\n const tools = opts.tools ?? bundle.tools\n const executors = opts.toolExecutors ?? bundle.executors\n const artifactTypeFor = opts.artifactTypeFor ?? bundle.artifactTypeFor\n\n const start = Date.now()\n const transcript: MultishotMessage[] = []\n const artifacts: MultishotArtifact[] = []\n let toolCalls = 0\n let totalCostUsd = 0\n\n const opener = opts.shape.buildOpener(opts.persona)\n transcript.push({ role: 'user', content: opener })\n\n const systemPrompt = opts.profile.prompt?.systemPrompt ?? ''\n const agentMessages: Array<Record<string, unknown>> = [\n { role: 'system', content: systemPrompt },\n { role: 'user', content: opener },\n ]\n\n for (let turn = 0; turn < maxTurns; turn++) {\n if (opts.signal?.aborted) throw new Error('multishot aborted')\n\n const { message: agentMsg, usage: agentUsage } = await routerCompletion({\n apiKey,\n baseUrl,\n model: agentModel,\n messages: agentMessages,\n tools,\n temperature: 0.7,\n maxTokens: 2500,\n signal: opts.signal,\n })\n totalCostUsd += estimateRouterCost(agentModel, agentUsage)\n\n const agentText = (agentMsg.content ?? '').trim()\n const agentToolCalls = (agentMsg.tool_calls ?? 
[]).map((tc) => ({\n id: tc.id,\n name: tc.function.name,\n args: (() => {\n try {\n return JSON.parse(tc.function.arguments) as Record<string, unknown>\n } catch {\n return {} as Record<string, unknown>\n }\n })(),\n }))\n\n agentMessages.push({\n role: 'assistant',\n content: agentText || null,\n ...(agentMsg.tool_calls?.length ? { tool_calls: agentMsg.tool_calls } : {}),\n })\n transcript.push({\n role: 'assistant',\n content: agentText,\n toolCalls: agentToolCalls.length > 0 ? agentToolCalls : undefined,\n })\n\n for (const tc of agentToolCalls) {\n toolCalls++\n let toolResult = ''\n try {\n const executor = executors[tc.name]\n if (!executor) {\n toolResult = JSON.stringify({ error: `unknown tool ${tc.name}` })\n } else {\n const r = await executor(tc.args, { apiKey, baseUrl, signal: opts.signal })\n toolResult = r.content\n totalCostUsd += r.costUsd\n const artifactType = artifactTypeFor(tc.name)\n if (artifactType) {\n artifacts.push({ type: artifactType, turn, invocation: { name: tc.name, args: tc.args }, content: toolResult })\n }\n }\n } catch (err) {\n toolResult = JSON.stringify({ error: err instanceof Error ? err.message : String(err) })\n }\n agentMessages.push({ role: 'tool', tool_call_id: tc.id, content: toolResult || 'done' })\n transcript.push({ role: 'tool', content: toolResult || 'done', toolCallId: tc.id })\n }\n\n // If the agent emitted tool_calls, give it a follow-up turn to integrate the results.\n if (agentToolCalls.length > 0) {\n const followUp = await routerCompletion({\n apiKey,\n baseUrl,\n model: agentModel,\n messages: agentMessages,\n temperature: 0.7,\n maxTokens: 2000,\n signal: opts.signal,\n })\n totalCostUsd += estimateRouterCost(agentModel, followUp.usage)\n const followUpText = (followUp.message.content ?? 
'').trim()\n agentMessages.push({ role: 'assistant', content: followUpText })\n transcript.push({ role: 'assistant', content: followUpText })\n }\n\n if (turn < maxTurns - 1) {\n const driver = await driverTurn({\n apiKey,\n baseUrl,\n persona: opts.persona,\n shape: opts.shape,\n transcript,\n turn,\n model: driverModel,\n signal: opts.signal,\n })\n totalCostUsd += driver.costUsd\n agentMessages.push({ role: 'user', content: driver.content })\n transcript.push({ role: 'user', content: driver.content })\n }\n }\n\n return { transcript, artifacts, toolCalls, durationMs: Date.now() - start, costUsd: totalCostUsd }\n}\n\nasync function driverTurn<TPersona extends MultishotPersona>(opts: {\n apiKey: string\n baseUrl: string\n persona: TPersona\n shape: MultishotShape<TPersona>\n transcript: MultishotMessage[]\n turn: number\n model: string\n signal?: AbortSignal\n}): Promise<{ content: string; costUsd: number }> {\n const driverSystem = opts.shape.buildDriverSystemPrompt(opts.persona)\n\n // Translate transcript to driver POV: agent's `assistant` messages become\n // `user` (the agent talking TO the driver); the driver's prior `user`\n // messages become `assistant` (the driver's prior responses).\n const driverMessages: Array<Record<string, unknown>> = [{ role: 'system', content: driverSystem }]\n for (const msg of opts.transcript) {\n if (msg.role === 'tool') continue\n if (msg.role === 'assistant') driverMessages.push({ role: 'user', content: msg.content })\n else if (msg.role === 'user') driverMessages.push({ role: 'assistant', content: msg.content })\n }\n\n // Driver must never go silent. Retry once on empty content; then fail loud.\n for (let attempt = 0; attempt < 2; attempt++) {\n const { message, usage } = await routerCompletion({\n apiKey: opts.apiKey,\n baseUrl: opts.baseUrl,\n model: opts.model,\n messages: driverMessages,\n temperature: 0.9,\n maxTokens: 600,\n signal: opts.signal,\n })\n const content = (message.content ?? 
'').trim()\n if (content.length > 0) return { content, costUsd: estimateRouterCost(opts.model, usage) }\n }\n throw new MultishotDriverEmptyError(opts.turn)\n}\n","// Multishot matrix wrapper — sweeps profiles × personas × reps, runs\n// the driver-agent loop per cell, applies up to three configured judges,\n// persists per-cell artifacts, and aggregates by axis.\n//\n// Uses runAgentMatrix from @tangle-network/agent-eval/matrix under the\n// hood so cell scheduling + concurrency + cost ceiling are unified with\n// other matrix consumers.\n\nimport { mkdirSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type { AgentProfile } from '@tangle-network/sandbox'\nimport { runAgentMatrix } from '../matrix'\nimport type { MatrixResult } from '../matrix'\nimport { runJudge, type JudgeConfig, type JudgeScore } from './judges'\nimport { runMultishot } from './multishot'\nimport type {\n MultishotArtifact,\n MultishotMessage,\n MultishotPersona,\n MultishotShape,\n MultishotToolDefinition,\n MultishotToolExecutor,\n} from './types'\n\nexport interface ConversationJudgeInput<TPersona extends MultishotPersona> {\n transcript: MultishotMessage[]\n persona: TPersona\n}\n\nexport interface ArtifactJudgeInput<TPersona extends MultishotPersona> {\n artifact: MultishotArtifact\n persona: TPersona\n}\n\nexport interface MultishotJudges<TPersona extends MultishotPersona> {\n /** Scores the full transcript end-to-end (always runs). */\n conversation: JudgeConfig<ConversationJudgeInput<TPersona>>\n /** Scores each code-type artifact. Optional — omit when domain has no code artifacts. */\n codeReview?: JudgeConfig<ArtifactJudgeInput<TPersona>>\n /** Scores each non-code (research/content/template) artifact. Optional. */\n contentQuality?: JudgeConfig<ArtifactJudgeInput<TPersona>>\n /** Which artifact types route to codeReview. Defaults to ['code']. */\n codeArtifactTypes?: string[]\n /** Which artifact types route to contentQuality. Defaults to ['research']. 
*/\n contentArtifactTypes?: string[]\n}\n\nexport interface CellCompositeScore {\n composite: number\n conversation: JudgeScore\n codeReview?: { perArtifact: Array<JudgeScore & { turn: number; type: string }>; composite: number }\n contentQuality?: { perArtifact: Array<JudgeScore & { turn: number; type: string }>; composite: number }\n}\n\nexport interface RunMultishotMatrixOptions<TPersona extends MultishotPersona> {\n /** AgentProfile axis (matrix primary). */\n profiles: Array<{ id: string; value: AgentProfile }>\n /** Persona axis. */\n personas: TPersona[]\n /** Persona-shaping callbacks. */\n shape: MultishotShape<TPersona>\n /** Judge configurations. */\n judges: MultishotJudges<TPersona>\n /** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */\n tools?: MultishotToolDefinition[]\n /** Map from tool name → inline executor. Must align with `tools`. */\n toolExecutors?: Record<string, MultishotToolExecutor>\n /** Tool name → artifact type label. Defaults to research/code mapping. */\n artifactTypeFor?: (toolName: string) => string | undefined\n /** Where per-cell artifacts land. Cells write to `<runDir>/<profileId>/<personaId>/rep-N/`. */\n runDir: string\n /** Replicates per (profile, persona) cell. */\n reps?: number\n /** Max conversation turns per cell. */\n maxTurns?: number\n /** Max concurrent cells. */\n maxConcurrency?: number\n /** Total $ ceiling across the matrix; cells aborted past this. */\n costCeiling?: number\n /** Agent model. */\n agentModel?: string\n /** Driver model. */\n driverModel?: string\n /** Pass-thru fields. 
*/\n apiKey?: string\n baseUrl?: string\n}\n\ninterface CellOutput {\n turns: number\n toolCalls: number\n artifactCount: number\n}\n\nexport interface RunMultishotMatrixResult {\n matrix: MatrixResult<CellOutput>\n}\n\nexport async function runMultishotMatrix<TPersona extends MultishotPersona>(\n opts: RunMultishotMatrixOptions<TPersona>,\n): Promise<RunMultishotMatrixResult> {\n const codeTypes = new Set(opts.judges.codeArtifactTypes ?? ['code'])\n const contentTypes = new Set(opts.judges.contentArtifactTypes ?? ['research'])\n mkdirSync(opts.runDir, { recursive: true })\n\n const matrix = await runAgentMatrix<CellOutput>({\n axes: [\n { name: 'profile', values: opts.profiles },\n { name: 'persona', values: opts.personas.map((p) => ({ id: p.id, value: p })) },\n ],\n reps: opts.reps ?? 1,\n maxConcurrency: opts.maxConcurrency ?? 2,\n costCeiling: opts.costCeiling,\n async runCell(cell) {\n const profile = cell.axes.profile?.value as AgentProfile\n const persona = cell.axes.persona?.value as TPersona\n const profileId = String(cell.axes.profile?.id ?? 'unknown')\n const personaId = String(cell.axes.persona?.id ?? 'unknown')\n\n const sim = await runMultishot({\n profile,\n persona,\n shape: opts.shape,\n tools: opts.tools,\n toolExecutors: opts.toolExecutors,\n artifactTypeFor: opts.artifactTypeFor,\n maxTurns: opts.maxTurns,\n agentModel: opts.agentModel,\n driverModel: opts.driverModel,\n apiKey: opts.apiKey,\n baseUrl: opts.baseUrl,\n })\n\n const codeArtifacts = sim.artifacts.filter((a) => codeTypes.has(a.type))\n const contentArtifacts = sim.artifacts.filter((a) => contentTypes.has(a.type))\n\n const [conversation, codeReviews, contentReviews] = await Promise.all([\n runJudge(opts.judges.conversation, { transcript: sim.transcript, persona }),\n opts.judges.codeReview\n ? 
Promise.all(codeArtifacts.map((artifact) => runJudge(opts.judges.codeReview!, { artifact, persona }).then((s) => ({ ...s, turn: artifact.turn, type: artifact.type }))))\n : Promise.resolve([] as Array<JudgeScore & { turn: number; type: string }>),\n opts.judges.contentQuality\n ? Promise.all(contentArtifacts.map((artifact) => runJudge(opts.judges.contentQuality!, { artifact, persona }).then((s) => ({ ...s, turn: artifact.turn, type: artifact.type }))))\n : Promise.resolve([] as Array<JudgeScore & { turn: number; type: string }>),\n ])\n\n const codeComposite = codeReviews.length === 0 ? 0 : codeReviews.reduce((s, r) => s + r.composite, 0) / codeReviews.length\n const contentComposite = contentReviews.length === 0 ? 0 : contentReviews.reduce((s, r) => s + r.composite, 0) / contentReviews.length\n\n // Composite = mean of (conversation, code, content) — empty judges count 0.\n const judgeCount = 1 + (opts.judges.codeReview ? 1 : 0) + (opts.judges.contentQuality ? 1 : 0)\n const composite = (conversation.composite + codeComposite + contentComposite) / judgeCount\n\n const cellScore: CellCompositeScore = { composite, conversation }\n if (opts.judges.codeReview) cellScore.codeReview = { perArtifact: codeReviews, composite: codeComposite }\n if (opts.judges.contentQuality) cellScore.contentQuality = { perArtifact: contentReviews, composite: contentComposite }\n\n const cellDir = join(opts.runDir, profileId, personaId, `rep-${cell.rep}`)\n mkdirSync(cellDir, { recursive: true })\n writeFileSync(join(cellDir, 'transcript.json'), JSON.stringify(sim.transcript, null, 2))\n writeFileSync(join(cellDir, 'artifacts.json'), JSON.stringify(sim.artifacts, null, 2))\n writeFileSync(join(cellDir, 'scores.json'), JSON.stringify(cellScore, null, 2))\n\n const notes = [`convo=${conversation.composite.toFixed(1)}`]\n if (opts.judges.codeReview) notes.push(`code=${codeComposite.toFixed(1)}`)\n if (opts.judges.contentQuality) notes.push(`content=${contentComposite.toFixed(1)}`)\n\n return 
{\n output: { turns: sim.transcript.length, toolCalls: sim.toolCalls, artifactCount: sim.artifacts.length },\n verdict: { valid: composite >= 5, score: composite, notes: notes.join(' ') },\n costUsd: sim.costUsd,\n durationMs: sim.durationMs,\n }\n },\n })\n\n // Persist top-level summary.\n const summary = {\n cells: matrix.summary.totalCells,\n passRate: matrix.summary.overallPassRate,\n meanScore: matrix.summary.overallMeanScore,\n totalCostUsd: matrix.summary.totalCostUsd,\n durationMs: matrix.summary.durationMs,\n runsExecuted: matrix.summary.runsExecuted,\n cellsSkipped: matrix.summary.cellsSkipped,\n byProfile: matrix.byAxis.profile,\n byPersona: matrix.byAxis.persona,\n }\n writeFileSync(join(opts.runDir, 'summary.json'), JSON.stringify(summary, null, 2))\n\n const md: string[] = [\n `# Multishot matrix`,\n ``,\n `**Cells**: ${matrix.summary.totalCells} | **Pass rate**: ${(matrix.summary.overallPassRate * 100).toFixed(0)}% | **Mean**: ${matrix.summary.overallMeanScore.toFixed(2)} | **Cost**: $${matrix.summary.totalCostUsd.toFixed(2)} | **Duration**: ${(matrix.summary.durationMs / 1000).toFixed(0)}s`,\n ``,\n `## By profile`,\n ``,\n '| profile | pass | mean | cost |',\n '|---|---|---|---|',\n ...Object.entries(matrix.byAxis.profile ?? {}).map(([id, s]) => `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`),\n ``,\n `## By persona`,\n ``,\n '| persona | pass | mean | cost |',\n '|---|---|---|---|',\n ...Object.entries(matrix.byAxis.persona ?? 
{}).map(([id, s]) => `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`),\n ``,\n ]\n writeFileSync(join(opts.runDir, 'summary.md'), md.join('\\n'))\n\n return { matrix }\n}\n"],"mappings":";;;;;;AA4BA,eAAsB,iBAAiB,KAAiE;AACtG,QAAM,OAAgC;AAAA,IACpC,OAAO,IAAI;AAAA,IACX,UAAU,IAAI;AAAA,IACd,aAAa,IAAI,eAAe;AAAA,IAChC,YAAY,IAAI,aAAa;AAAA,EAC/B;AACA,MAAI,IAAI,OAAO,OAAQ,MAAK,QAAQ,IAAI;AACxC,QAAM,MAAM,GAAG,IAAI,QAAQ,QAAQ,QAAQ,EAAE,CAAC;AAC9C,QAAM,MAAM,MAAM,MAAM,KAAK;AAAA,IAC3B,QAAQ;AAAA,IACR,SAAS,EAAE,eAAe,UAAU,IAAI,MAAM,IAAI,gBAAgB,mBAAmB;AAAA,IACrF,MAAM,KAAK,UAAU,IAAI;AAAA,IACzB,QAAQ,IAAI;AAAA,EACd,CAAC;AACD,MAAI,CAAC,IAAI,IAAI;AACX,UAAM,OAAO,MAAM,IAAI,KAAK;AAC5B,UAAM,IAAI,MAAM,UAAU,IAAI,MAAM,KAAK,KAAK,MAAM,GAAG,GAAG,CAAC,EAAE;AAAA,EAC/D;AACA,QAAM,OAAQ,MAAM,IAAI,KAAK;AAI7B,QAAM,SAAS,KAAK,QAAQ,CAAC;AAC7B,MAAI,CAAC,OAAQ,OAAM,IAAI,MAAM,+BAA+B,KAAK,UAAU,IAAI,EAAE,MAAM,GAAG,GAAG,CAAC,EAAE;AAChG,SAAO,EAAE,SAAS,OAAO,SAAS,OAAO,KAAK,MAAM;AACtD;AAIO,SAAS,mBACd,OACA,OACQ;AACR,MAAI,CAAC,MAAO,QAAO;AACnB,QAAM,WAAW,MAAM,iBAAiB;AACxC,QAAM,YAAY,MAAM,qBAAqB;AAC7C,MAAI,UAAU;AACd,MAAI,WAAW;AACf,MAAI,MAAM,SAAS,aAAa,GAAG;AACjC,cAAU;AACV,eAAW;AAAA,EACb,WAAW,MAAM,SAAS,SAAS,KAAK,MAAM,SAAS,eAAe,GAAG;AACvE,cAAU;AACV,eAAW;AAAA,EACb,WAAW,MAAM,SAAS,MAAM,KAAK,MAAM,SAAS,KAAK,KAAK,MAAM,SAAS,UAAU,GAAG;AACxF,cAAU;AACV,eAAW;AAAA,EACb;AACA,UAAQ,WAAW,UAAU,YAAY,YAAY;AACvD;AAEO,SAAS,uBAA+B;AAC7C,UAAQ,QAAQ,IAAI,0BAA0B,kCAAkC,QAAQ,QAAQ,EAAE;AACpG;AAEO,SAAS,sBAA8B;AAC5C,QAAM,MAAM,QAAQ,IAAI;AACxB,MAAI,CAAC,IAAK,OAAM,IAAI,MAAM,gEAAgE;AAC1F,SAAO;AACT;;;AC9EO,IAAM,2BAA2B;AACjC,IAAM,sBAAsB;AAgBnC,IAAM,4BACJ;AAEF,IAAM,uBACJ;AAEK,IAAM,iCAA0D;AAAA,EACrE,MAAM;AAAA,EACN,UAAU;AAAA,IACR,MAAM;AAAA,IACN,aACE;AAAA,IACF,YAAY;AAAA,MACV,MAAM;AAAA,MACN,YAAY;AAAA,QACV,UAAU,EAAE,MAAM,UAAU,aAAa,gCAAgC;AAAA,QACzE,OAAO,EAAE,MAAM,UAAU,aAAa,gEAAgE;AAAA,MACxG;AAAA,MACA,UAAU,CAAC,UAAU;AAAA,IACvB;AAAA,EACF;AACF;AAEO,IAAM,6BAAsD;AAAA,EACjE,MAAM;AAAA,EACN,UAAU;AAAA,IACR,MAAM;AAAA,IACN,aAC
E;AAAA,IACF,YAAY;AAAA,MACV,MAAM;AAAA,MACN,YAAY;AAAA,QACV,MAAM,EAAE,MAAM,UAAU,aAAa,gCAAgC;AAAA,QACrE,UAAU,EAAE,MAAM,UAAU,aAAa,qDAAqD;AAAA,MAChG;AAAA,MACA,UAAU,CAAC,MAAM;AAAA,IACnB;AAAA,EACF;AACF;AAEO,SAAS,uBAAuB,SAAkC,CAAC,GAA0B;AAClG,QAAM,eAAe,OAAO,gBAAgB;AAC5C,QAAM,QAAQ,OAAO,SAAS;AAC9B,SAAO,OAAO,MAAM,QAAQ;AAC1B,UAAM,WAAW,OAAO,KAAK,YAAY,EAAE;AAC3C,UAAM,QAAQ,KAAK,QAAQ,OAAO,KAAK,KAAK,IAAI;AAChD,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI;AAAA,MACb;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,aAAa;AAAA,QACxC,EAAE,MAAM,QAAQ,SAAS,aAAa,QAAQ,GAAG,QAAQ;AAAA,SAAY,KAAK,KAAK,EAAE,GAAG;AAAA,MACtF;AAAA,MACA,QAAQ,IAAI;AAAA,IACd,CAAC;AACD,WAAO,EAAE,SAAS,QAAQ,WAAW,IAAI,SAAS,mBAAmB,OAAO,KAAK,EAAE;AAAA,EACrF;AACF;AAEO,SAAS,mBAAmB,SAA6B,CAAC,GAA0B;AACzF,QAAM,eAAe,OAAO,gBAAgB;AAC5C,QAAM,QAAQ,OAAO,SAAS;AAC9B,SAAO,OAAO,MAAM,QAAQ;AAC1B,UAAM,OAAO,OAAO,KAAK,QAAQ,EAAE;AACnC,UAAM,WAAW,KAAK,WAAW,OAAO,KAAK,QAAQ,IAAI;AACzD,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI;AAAA,MACb;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,GAAG,YAAY;AAAA;AAAA,YAAiB,QAAQ,GAAG;AAAA,QACtE,EAAE,MAAM,QAAQ,SAAS,YAAY,IAAI,GAAG;AAAA,MAC9C;AAAA,MACA,QAAQ,IAAI;AAAA,IACd,CAAC;AACD,WAAO,EAAE,SAAS,QAAQ,WAAW,IAAI,SAAS,mBAAmB,OAAO,KAAK,EAAE;AAAA,EACrF;AACF;AAgBO,SAAS,uBAAuB,SAA6B,CAAC,GAAuB;AAC1F,SAAO;AAAA,IACL,OAAO,CAAC,gCAAgC,0BAA0B;AAAA,IAClE,WAAW;AAAA,MACT,mBAAmB,uBAAuB,OAAO,QAAQ;AAAA,MACzD,eAAe,mBAAmB,OAAO,IAAI;AAAA,IAC/C;AAAA,IACA,iBAAiB,CAAC,SAAU,SAAS,sBAAsB,aAAa,SAAS,kBAAkB,SAAS;AAAA,EAC9G;AACF;;;AC3HO,IAAM,sBAAsB;AAmCnC,IAAM,aAAyB,EAAE,YAAY,CAAC,GAAG,WAAW,GAAG,OAAO,eAAe;AAErF,eAAsB,SAAiB,OAA4B,OAAoC;AACrG,QAAM,SAAS,MAAM,UAAU,oBAAoB;AACnD,QAAM,UAAU,MAAM,WAAW,qBAAqB;AACtD,QAAM,QAAQ,MAAM,SAAS,QAAQ,IAAI,eAAe;AACxD,QAAM,SAAS,MAAM,YAAY,KAAK;AACtC,MAAI,MAAM;AACV,MAAI;AACF,UAAM,EAAE,QAAQ,IAAI,MAAM,iBAAiB;AAAA,MACzC;AAAA,MACA;AAAA,MACA;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU
,SAAS,MAAM,aAAa;AAAA,QAC9C,EAAE,MAAM,QAAQ,SAAS,OAAO;AAAA,MAClC;AAAA,IACF,CAAC;AACD,WAAO,QAAQ,WAAW,IAAI,KAAK;AAAA,EACrC,SAAS,KAAK;AACZ,WAAO,EAAE,GAAG,YAAY,OAAO,SAAS,MAAM,IAAI,iBAAiB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC,GAAG;AAAA,EACxH;AAEA,MAAI,SAAyC;AAC7C,MAAI;AACF,UAAM,UAAU,IAAI,QAAQ,gBAAgB,EAAE,EAAE,QAAQ,WAAW,EAAE,EAAE,KAAK;AAC5E,aAAS,KAAK,MAAM,OAAO;AAAA,EAC7B,QAAQ;AACN,WAAO,EAAE,GAAG,YAAY,OAAO,SAAS,MAAM,IAAI,uBAAuB,IAAI,MAAM,GAAG,GAAG,CAAC,GAAG;AAAA,EAC/F;AAEA,QAAM,aAAqC,CAAC;AAC5C,MAAI,MAAM;AACV,aAAW,OAAO,MAAM,YAAY;AAClC,UAAM,IAAI,OAAO,OAAO,IAAI,GAAG,KAAK,CAAC;AACrC,UAAM,UAAU,OAAO,SAAS,CAAC,IAAI,KAAK,IAAI,GAAG,KAAK,IAAI,IAAI,CAAC,CAAC,IAAI;AACpE,eAAW,IAAI,GAAG,IAAI;AACtB,WAAO;AAAA,EACT;AACA,SAAO;AAAA,IACL;AAAA,IACA,WAAW,MAAM,WAAW,WAAW,IAAI,IAAI,MAAM,MAAM,WAAW;AAAA,IACtE,OAAO,OAAO,OAAO,UAAU,WAAW,OAAO,QAAQ;AAAA,EAC3D;AACF;AAIO,SAAS,iBAAiB,MAAyC;AACxE,SAAO,KAAK,IAAI,CAAC,MAAM,KAAK,EAAE,GAAG,KAAK,EAAE,WAAW,EAAE,EAAE,KAAK,IAAI;AAClE;AAGO,SAAS,iBAAiB,MAAyC;AACxE,QAAM,SAAS,KAAK,IAAI,CAAC,MAAM,IAAI,EAAE,GAAG,KAAK,EAAE,KAAK,GAAG;AACvD,SAAO;AAAA,GAA6D,MAAM;AAC5E;;;ACnDO,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YAA4B,MAAc;AACxC,UAAM,0DAA0D,IAAI,sBAAiB;AAD3D;AAE1B,SAAK,OAAO;AAAA,EACd;AAAA,EAH4B;AAI9B;;;ACXA,eAAsB,aACpB,MAC0B;AAC1B,QAAM,SAAS,KAAK,UAAU,oBAAoB;AAClD,QAAM,UAAU,KAAK,WAAW,qBAAqB;AACrD,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,aAAa,KAAK,cAAc;AACtC,QAAM,cAAc,KAAK,eAAe;AAExC,QAAM,SAAS,KAAK,SAAS,KAAK,gBAC9B,EAAE,OAAO,KAAK,OAAO,WAAW,KAAK,eAAe,iBAAiB,KAAK,oBAAoB,MAAM,QAAW,IAC/G,uBAAuB;AAC3B,QAAM,QAAQ,KAAK,SAAS,OAAO;AACnC,QAAM,YAAY,KAAK,iBAAiB,OAAO;AAC/C,QAAM,kBAAkB,KAAK,mBAAmB,OAAO;AAEvD,QAAM,QAAQ,KAAK,IAAI;AACvB,QAAM,aAAiC,CAAC;AACxC,QAAM,YAAiC,CAAC;AACxC,MAAI,YAAY;AAChB,MAAI,eAAe;AAEnB,QAAM,SAAS,KAAK,MAAM,YAAY,KAAK,OAAO;AAClD,aAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,CAAC;AAEjD,QAAM,eAAe,KAAK,QAAQ,QAAQ,gBAAgB;AAC1D,QAAM,gBAAgD;AAAA,IACpD,EAAE,MAAM,UAAU,SAAS,aAAa;AAAA,IACxC,EAAE,MAAM,QAAQ,SAAS,OAAO;AAAA,EAClC;AAEA,WAAS,OAAO,GAAG,OAAO,UAAU,QAAQ;AAC1C,QAAI,KAAK,QAAQ,QAAS,OAAM,IAAI,MAAM,mBAAmB;AAE7
D,UAAM,EAAE,SAAS,UAAU,OAAO,WAAW,IAAI,MAAM,iBAAiB;AAAA,MACtE;AAAA,MACA;AAAA,MACA,OAAO;AAAA,MACP,UAAU;AAAA,MACV;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,QAAQ,KAAK;AAAA,IACf,CAAC;AACD,oBAAgB,mBAAmB,YAAY,UAAU;AAEzD,UAAM,aAAa,SAAS,WAAW,IAAI,KAAK;AAChD,UAAM,kBAAkB,SAAS,cAAc,CAAC,GAAG,IAAI,CAAC,QAAQ;AAAA,MAC9D,IAAI,GAAG;AAAA,MACP,MAAM,GAAG,SAAS;AAAA,MAClB,OAAO,MAAM;AACX,YAAI;AACF,iBAAO,KAAK,MAAM,GAAG,SAAS,SAAS;AAAA,QACzC,QAAQ;AACN,iBAAO,CAAC;AAAA,QACV;AAAA,MACF,GAAG;AAAA,IACL,EAAE;AAEF,kBAAc,KAAK;AAAA,MACjB,MAAM;AAAA,MACN,SAAS,aAAa;AAAA,MACtB,GAAI,SAAS,YAAY,SAAS,EAAE,YAAY,SAAS,WAAW,IAAI,CAAC;AAAA,IAC3E,CAAC;AACD,eAAW,KAAK;AAAA,MACd,MAAM;AAAA,MACN,SAAS;AAAA,MACT,WAAW,eAAe,SAAS,IAAI,iBAAiB;AAAA,IAC1D,CAAC;AAED,eAAW,MAAM,gBAAgB;AAC/B;AACA,UAAI,aAAa;AACjB,UAAI;AACF,cAAM,WAAW,UAAU,GAAG,IAAI;AAClC,YAAI,CAAC,UAAU;AACb,uBAAa,KAAK,UAAU,EAAE,OAAO,gBAAgB,GAAG,IAAI,GAAG,CAAC;AAAA,QAClE,OAAO;AACL,gBAAM,IAAI,MAAM,SAAS,GAAG,MAAM,EAAE,QAAQ,SAAS,QAAQ,KAAK,OAAO,CAAC;AAC1E,uBAAa,EAAE;AACf,0BAAgB,EAAE;AAClB,gBAAM,eAAe,gBAAgB,GAAG,IAAI;AAC5C,cAAI,cAAc;AAChB,sBAAU,KAAK,EAAE,MAAM,cAAc,MAAM,YAAY,EAAE,MAAM,GAAG,MAAM,MAAM,GAAG,KAAK,GAAG,SAAS,WAAW,CAAC;AAAA,UAChH;AAAA,QACF;AAAA,MACF,SAAS,KAAK;AACZ,qBAAa,KAAK,UAAU,EAAE,OAAO,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,EAAE,CAAC;AAAA,MACzF;AACA,oBAAc,KAAK,EAAE,MAAM,QAAQ,cAAc,GAAG,IAAI,SAAS,cAAc,OAAO,CAAC;AACvF,iBAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,cAAc,QAAQ,YAAY,GAAG,GAAG,CAAC;AAAA,IACpF;AAGA,QAAI,eAAe,SAAS,GAAG;AAC7B,YAAM,WAAW,MAAM,iBAAiB;AAAA,QACtC;AAAA,QACA;AAAA,QACA,OAAO;AAAA,QACP,UAAU;AAAA,QACV,aAAa;AAAA,QACb,WAAW;AAAA,QACX,QAAQ,KAAK;AAAA,MACf,CAAC;AACD,sBAAgB,mBAAmB,YAAY,SAAS,KAAK;AAC7D,YAAM,gBAAgB,SAAS,QAAQ,WAAW,IAAI,KAAK;AAC3D,oBAAc,KAAK,EAAE,MAAM,aAAa,SAAS,aAAa,CAAC;AAC/D,iBAAW,KAAK,EAAE,MAAM,aAAa,SAAS,aAAa,CAAC;AAAA,IAC9D;AAEA,QAAI,OAAO,WAAW,GAAG;AACvB,YAAM,SAAS,MAAM,WAAW;AAAA,QAC9B;AAAA,QACA;AAAA,QACA,SAAS,KAAK;AAAA,QACd,OAAO,KAAK;AAAA,QACZ;AAAA,QACA;AAAA,QACA,OAAO;AAAA,QACP,QAAQ,KAAK;AAAA,MACf,CAAC;AACD,sBAAgB,OAAO;AACvB,oBAAc,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,QAAQ,
CAAC;AAC5D,iBAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,QAAQ,CAAC;AAAA,IAC3D;AAAA,EACF;AAEA,SAAO,EAAE,YAAY,WAAW,WAAW,YAAY,KAAK,IAAI,IAAI,OAAO,SAAS,aAAa;AACnG;AAEA,eAAe,WAA8C,MASX;AAChD,QAAM,eAAe,KAAK,MAAM,wBAAwB,KAAK,OAAO;AAKpE,QAAM,iBAAiD,CAAC,EAAE,MAAM,UAAU,SAAS,aAAa,CAAC;AACjG,aAAW,OAAO,KAAK,YAAY;AACjC,QAAI,IAAI,SAAS,OAAQ;AACzB,QAAI,IAAI,SAAS,YAAa,gBAAe,KAAK,EAAE,MAAM,QAAQ,SAAS,IAAI,QAAQ,CAAC;AAAA,aAC/E,IAAI,SAAS,OAAQ,gBAAe,KAAK,EAAE,MAAM,aAAa,SAAS,IAAI,QAAQ,CAAC;AAAA,EAC/F;AAGA,WAAS,UAAU,GAAG,UAAU,GAAG,WAAW;AAC5C,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,KAAK;AAAA,MACb,SAAS,KAAK;AAAA,MACd,OAAO,KAAK;AAAA,MACZ,UAAU;AAAA,MACV,aAAa;AAAA,MACb,WAAW;AAAA,MACX,QAAQ,KAAK;AAAA,IACf,CAAC;AACD,UAAM,WAAW,QAAQ,WAAW,IAAI,KAAK;AAC7C,QAAI,QAAQ,SAAS,EAAG,QAAO,EAAE,SAAS,SAAS,mBAAmB,KAAK,OAAO,KAAK,EAAE;AAAA,EAC3F;AACA,QAAM,IAAI,0BAA0B,KAAK,IAAI;AAC/C;;;AC9MA,SAAS,WAAW,qBAAqB;AACzC,SAAS,YAAY;AAyFrB,eAAsB,mBACpB,MACmC;AACnC,QAAM,YAAY,IAAI,IAAI,KAAK,OAAO,qBAAqB,CAAC,MAAM,CAAC;AACnE,QAAM,eAAe,IAAI,IAAI,KAAK,OAAO,wBAAwB,CAAC,UAAU,CAAC;AAC7E,YAAU,KAAK,QAAQ,EAAE,WAAW,KAAK,CAAC;AAE1C,QAAM,SAAS,MAAM,eAA2B;AAAA,IAC9C,MAAM;AAAA,MACJ,EAAE,MAAM,WAAW,QAAQ,KAAK,SAAS;AAAA,MACzC,EAAE,MAAM,WAAW,QAAQ,KAAK,SAAS,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,OAAO,EAAE,EAAE,EAAE;AAAA,IAChF;AAAA,IACA,MAAM,KAAK,QAAQ;AAAA,IACnB,gBAAgB,KAAK,kBAAkB;AAAA,IACvC,aAAa,KAAK;AAAA,IAClB,MAAM,QAAQ,MAAM;AAClB,YAAM,UAAU,KAAK,KAAK,SAAS;AACnC,YAAM,UAAU,KAAK,KAAK,SAAS;AACnC,YAAM,YAAY,OAAO,KAAK,KAAK,SAAS,MAAM,SAAS;AAC3D,YAAM,YAAY,OAAO,KAAK,KAAK,SAAS,MAAM,SAAS;AAE3D,YAAM,MAAM,MAAM,aAAa;AAAA,QAC7B;AAAA,QACA;AAAA,QACA,OAAO,KAAK;AAAA,QACZ,OAAO,KAAK;AAAA,QACZ,eAAe,KAAK;AAAA,QACpB,iBAAiB,KAAK;AAAA,QACtB,UAAU,KAAK;AAAA,QACf,YAAY,KAAK;AAAA,QACjB,aAAa,KAAK;AAAA,QAClB,QAAQ,KAAK;AAAA,QACb,SAAS,KAAK;AAAA,MAChB,CAAC;AAED,YAAM,gBAAgB,IAAI,UAAU,OAAO,CAAC,MAAM,UAAU,IAAI,EAAE,IAAI,CAAC;AACvE,YAAM,mBAAmB,IAAI,UAAU,OAAO,CAAC,MAAM,aAAa,IAAI,EAAE,IAAI,CAAC;AAE7E,YAAM,CAAC,cAAc,aAAa,cAAc,IAAI,MAAM,QAAQ,IAAI;AAAA,QACpE,SAAS,KAAK,OAAO,cAAc,EAAE,YAAY,IAAI,
YAAY,QAAQ,CAAC;AAAA,QAC1E,KAAK,OAAO,aACR,QAAQ,IAAI,cAAc,IAAI,CAAC,aAAa,SAAS,KAAK,OAAO,YAAa,EAAE,UAAU,QAAQ,CAAC,EAAE,KAAK,CAAC,OAAO,EAAE,GAAG,GAAG,MAAM,SAAS,MAAM,MAAM,SAAS,KAAK,EAAE,CAAC,CAAC,IACvK,QAAQ,QAAQ,CAAC,CAAuD;AAAA,QAC5E,KAAK,OAAO,iBACR,QAAQ,IAAI,iBAAiB,IAAI,CAAC,aAAa,SAAS,KAAK,OAAO,gBAAiB,EAAE,UAAU,QAAQ,CAAC,EAAE,KAAK,CAAC,OAAO,EAAE,GAAG,GAAG,MAAM,SAAS,MAAM,MAAM,SAAS,KAAK,EAAE,CAAC,CAAC,IAC9K,QAAQ,QAAQ,CAAC,CAAuD;AAAA,MAC9E,CAAC;AAED,YAAM,gBAAgB,YAAY,WAAW,IAAI,IAAI,YAAY,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,YAAY;AACpH,YAAM,mBAAmB,eAAe,WAAW,IAAI,IAAI,eAAe,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,eAAe;AAGhI,YAAM,aAAa,KAAK,KAAK,OAAO,aAAa,IAAI,MAAM,KAAK,OAAO,iBAAiB,IAAI;AAC5F,YAAM,aAAa,aAAa,YAAY,gBAAgB,oBAAoB;AAEhF,YAAM,YAAgC,EAAE,WAAW,aAAa;AAChE,UAAI,KAAK,OAAO,WAAY,WAAU,aAAa,EAAE,aAAa,aAAa,WAAW,cAAc;AACxG,UAAI,KAAK,OAAO,eAAgB,WAAU,iBAAiB,EAAE,aAAa,gBAAgB,WAAW,iBAAiB;AAEtH,YAAM,UAAU,KAAK,KAAK,QAAQ,WAAW,WAAW,OAAO,KAAK,GAAG,EAAE;AACzE,gBAAU,SAAS,EAAE,WAAW,KAAK,CAAC;AACtC,oBAAc,KAAK,SAAS,iBAAiB,GAAG,KAAK,UAAU,IAAI,YAAY,MAAM,CAAC,CAAC;AACvF,oBAAc,KAAK,SAAS,gBAAgB,GAAG,KAAK,UAAU,IAAI,WAAW,MAAM,CAAC,CAAC;AACrF,oBAAc,KAAK,SAAS,aAAa,GAAG,KAAK,UAAU,WAAW,MAAM,CAAC,CAAC;AAE9E,YAAM,QAAQ,CAAC,SAAS,aAAa,UAAU,QAAQ,CAAC,CAAC,EAAE;AAC3D,UAAI,KAAK,OAAO,WAAY,OAAM,KAAK,QAAQ,cAAc,QAAQ,CAAC,CAAC,EAAE;AACzE,UAAI,KAAK,OAAO,eAAgB,OAAM,KAAK,WAAW,iBAAiB,QAAQ,CAAC,CAAC,EAAE;AAEnF,aAAO;AAAA,QACL,QAAQ,EAAE,OAAO,IAAI,WAAW,QAAQ,WAAW,IAAI,WAAW,eAAe,IAAI,UAAU,OAAO;AAAA,QACtG,SAAS,EAAE,OAAO,aAAa,GAAG,OAAO,WAAW,OAAO,MAAM,KAAK,GAAG,EAAE;AAAA,QAC3E,SAAS,IAAI;AAAA,QACb,YAAY,IAAI;AAAA,MAClB;AAAA,IACF;AAAA,EACF,CAAC;AAGD,QAAM,UAAU;AAAA,IACd,OAAO,OAAO,QAAQ;AAAA,IACtB,UAAU,OAAO,QAAQ;AAAA,IACzB,WAAW,OAAO,QAAQ;AAAA,IAC1B,cAAc,OAAO,QAAQ;AAAA,IAC7B,YAAY,OAAO,QAAQ;AAAA,IAC3B,cAAc,OAAO,QAAQ;AAAA,IAC7B,cAAc,OAAO,QAAQ;AAAA,IAC7B,WAAW,OAAO,OAAO;AAAA,IACzB,WAAW,OAAO,OAAO;AAAA,EAC3B;AACA,gBAAc,KAAK,KAAK,QAAQ,cAAc,GAAG,KAAK,UAAU,SAAS,MAAM,CAAC,CAAC;AAEjF,QAAM,KAAe;AAAA,IACnB;AAAA,IACA;AAAA,IACA,cAAc,OAAO
,QAAQ,UAAU,sBAAsB,OAAO,QAAQ,kBAAkB,KAAK,QAAQ,CAAC,CAAC,iBAAiB,OAAO,QAAQ,iBAAiB,QAAQ,CAAC,CAAC,iBAAiB,OAAO,QAAQ,aAAa,QAAQ,CAAC,CAAC,qBAAqB,OAAO,QAAQ,aAAa,KAAM,QAAQ,CAAC,CAAC;AAAA,IAChS;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,OAAO,QAAQ,OAAO,OAAO,WAAW,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,KAAK,EAAE,OAAO,EAAE,WAAW,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,UAAU,QAAQ,CAAC,CAAC,OAAO,EAAE,aAAa,QAAQ,CAAC,CAAC,IAAI;AAAA,IAC3K;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,OAAO,QAAQ,OAAO,OAAO,WAAW,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,KAAK,EAAE,OAAO,EAAE,WAAW,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,UAAU,QAAQ,CAAC,CAAC,OAAO,EAAE,aAAa,QAAQ,CAAC,CAAC,IAAI;AAAA,IAC3K;AAAA,EACF;AACA,gBAAc,KAAK,KAAK,QAAQ,YAAY,GAAG,GAAG,KAAK,IAAI,CAAC;AAE5D,SAAO,EAAE,OAAO;AAClB;","names":[]}
1
+ {"version":3,"sources":["../../src/multishot/router.ts","../../src/multishot/default-tools.ts","../../src/multishot/judges.ts","../../src/multishot/matrix.ts","../../src/multishot/types.ts","../../src/multishot/multishot.ts"],"sourcesContent":["// Router fetch helper — single source of truth for OpenAI-compat calls\n// against the Tangle router. Used by the driver, agent, judges, and the\n// default tool executors.\n\nimport type { MultishotToolDefinition } from './types'\n\nexport interface RouterCompletionRequest {\n apiKey: string\n baseUrl: string\n model: string\n messages: Array<Record<string, unknown>>\n tools?: MultishotToolDefinition[]\n temperature?: number\n maxTokens?: number\n signal?: AbortSignal\n}\n\nexport interface RouterToolCall {\n id: string\n type: 'function'\n function: { name: string; arguments: string }\n}\n\nexport interface RouterCompletionResponse {\n message: { content?: string | null; tool_calls?: RouterToolCall[] }\n usage?: { prompt_tokens?: number; completion_tokens?: number }\n}\n\nexport async function routerCompletion(\n req: RouterCompletionRequest,\n): Promise<RouterCompletionResponse> {\n const body: Record<string, unknown> = {\n model: req.model,\n messages: req.messages,\n temperature: req.temperature ?? 0.7,\n max_tokens: req.maxTokens ?? 
2000,\n }\n if (req.tools?.length) body.tools = req.tools\n const url = `${req.baseUrl.replace(/\\/+$/, '')}/chat/completions`\n const res = await fetch(url, {\n method: 'POST',\n headers: { Authorization: `Bearer ${req.apiKey}`, 'Content-Type': 'application/json' },\n body: JSON.stringify(body),\n signal: req.signal,\n })\n if (!res.ok) {\n const text = await res.text()\n throw new Error(`router ${res.status}: ${text.slice(0, 300)}`)\n }\n const json = (await res.json()) as {\n choices: Array<{ message: { content?: string | null; tool_calls?: RouterToolCall[] } }>\n usage?: { prompt_tokens?: number; completion_tokens?: number }\n }\n const choice = json.choices[0]\n if (!choice) throw new Error(`router returned no choices: ${JSON.stringify(json).slice(0, 200)}`)\n return { message: choice.message, usage: json.usage }\n}\n\n// Rough per-model cost estimator. Used for cost-ceiling enforcement.\n// Underestimates Anthropic, overestimates oss models — fine for ceilings.\nexport function estimateRouterCost(\n model: string,\n usage?: { prompt_tokens?: number; completion_tokens?: number },\n): number {\n if (!usage) return 0\n const inputTok = usage.prompt_tokens ?? 0\n const outputTok = usage.completion_tokens ?? 0\n let inPer1k = 0.003\n let outPer1k = 0.015\n if (model.includes('gpt-4o-mini')) {\n inPer1k = 0.00015\n outPer1k = 0.0006\n } else if (model.includes('gpt-5.4') || model.includes('claude-sonnet')) {\n inPer1k = 0.003\n outPer1k = 0.015\n } else if (model.includes('kimi') || model.includes('glm') || model.includes('deepseek')) {\n inPer1k = 0.0005\n outPer1k = 0.002\n }\n return (inputTok * inPer1k + outputTok * outPer1k) / 1000\n}\n\nexport function defaultRouterBaseUrl(): string {\n return (process.env.TANGLE_ROUTER_BASE_URL ?? 
'https://router.tangle.tools/v1').replace(\n /\\/+$/,\n '',\n )\n}\n\nexport function requireRouterApiKey(): string {\n const key = process.env.TANGLE_API_KEY\n if (!key) throw new Error('multishot requires TANGLE_API_KEY (router-scoped sk-tan-* key)')\n return key\n}\n","// Default delegate_research + delegate_code tools and their inline executors.\n//\n// Consumers can override either by passing their own tools + executors to\n// runMultishot. The defaults are sufficient for most domains — point the\n// researcher system prompt at your domain's citation style and the coder\n// at your preferred language.\n\nimport { estimateRouterCost, routerCompletion } from './router'\nimport type { MultishotToolDefinition, MultishotToolExecutor } from './types'\n\nexport const DEFAULT_RESEARCHER_MODEL = 'openai/gpt-4o-mini'\nexport const DEFAULT_CODER_MODEL = 'openai/gpt-4o-mini'\n\nexport interface DefaultResearcherConfig {\n /** Replace the system prompt to bias the researcher toward a domain's\n * citation style. Defaults to a generic \"cite sources by name\" prompt. */\n systemPrompt?: string\n model?: string\n}\n\nexport interface DefaultCoderConfig {\n /** Replace the system prompt to bias the coder toward a language /\n * framework / artifact style. */\n systemPrompt?: string\n model?: string\n}\n\nconst GENERIC_RESEARCHER_SYSTEM =\n 'You are a research specialist. Return a markdown brief with 3-5 findings. Each finding cites a specific source by name. Add a confidence level (high/medium/low) per finding. No fluff, no preamble.'\n\nconst GENERIC_CODER_SYSTEM =\n 'You are an expert engineer. Output ONE fenced code block containing the complete solution. Inline-comment non-obvious decisions. No explanation outside the block.'\n\nexport const DEFAULT_DELEGATE_RESEARCH_TOOL: MultishotToolDefinition = {\n type: 'function',\n function: {\n name: 'delegate_research',\n description:\n 'Research a topic deeply via specialist. Returns evidence-bearing items with citations. 
Use for audience research, competitive intel, regulatory landscape, market data, citation-grounded analysis.',\n parameters: {\n type: 'object',\n properties: {\n question: { type: 'string', description: 'Specific question to research' },\n scope: {\n type: 'string',\n description: 'Optional scope: time window, geography, jurisdiction, segment',\n },\n },\n required: ['question'],\n },\n },\n}\n\nexport const DEFAULT_DELEGATE_CODE_TOOL: MultishotToolDefinition = {\n type: 'function',\n function: {\n name: 'delegate_code',\n description:\n 'Generate a runnable script, template, pipeline, or tool via specialist. Returns complete working code or structured markdown. Use for content pipelines, calc snippets, dashboards, compliance checklists, deadline trackers.',\n parameters: {\n type: 'object',\n properties: {\n goal: { type: 'string', description: 'What the code must accomplish' },\n language: {\n type: 'string',\n description: 'Optional language preference (default: TypeScript)',\n },\n },\n required: ['goal'],\n },\n },\n}\n\nexport function createResearchExecutor(\n config: DefaultResearcherConfig = {},\n): MultishotToolExecutor {\n const systemPrompt = config.systemPrompt ?? GENERIC_RESEARCHER_SYSTEM\n const model = config.model ?? DEFAULT_RESEARCHER_MODEL\n return async (args, ctx) => {\n const question = String(args.question ?? '')\n const scope = args.scope ? String(args.scope) : undefined\n const { message, usage } = await routerCompletion({\n apiKey: ctx.apiKey,\n baseUrl: ctx.baseUrl,\n model,\n temperature: 0.3,\n maxTokens: 1800,\n messages: [\n { role: 'system', content: systemPrompt },\n { role: 'user', content: `Research: ${question}${scope ? `\\nScope: ${scope}` : ''}` },\n ],\n signal: ctx.signal,\n })\n return { content: message.content ?? '', costUsd: estimateRouterCost(model, usage) }\n }\n}\n\nexport function createCodeExecutor(config: DefaultCoderConfig = {}): MultishotToolExecutor {\n const systemPrompt = config.systemPrompt ?? 
GENERIC_CODER_SYSTEM\n const model = config.model ?? DEFAULT_CODER_MODEL\n return async (args, ctx) => {\n const goal = String(args.goal ?? '')\n const language = args.language ? String(args.language) : 'TypeScript'\n const { message, usage } = await routerCompletion({\n apiKey: ctx.apiKey,\n baseUrl: ctx.baseUrl,\n model,\n temperature: 0.2,\n maxTokens: 2000,\n messages: [\n { role: 'system', content: `${systemPrompt}\\n\\nLanguage: ${language}` },\n { role: 'user', content: `Produce: ${goal}` },\n ],\n signal: ctx.signal,\n })\n return { content: message.content ?? '', costUsd: estimateRouterCost(model, usage) }\n }\n}\n\nexport interface DefaultToolsConfig {\n research?: DefaultResearcherConfig\n code?: DefaultCoderConfig\n /** When true (default), each tool result is recorded as a typed artifact:\n * research → type='research', code → type='code'. */\n recordArtifacts?: boolean\n}\n\nexport interface DefaultToolsBundle {\n tools: MultishotToolDefinition[]\n executors: Record<string, MultishotToolExecutor>\n artifactTypeFor: (toolName: string) => string | undefined\n}\n\nexport function defaultDelegationTools(config: DefaultToolsConfig = {}): DefaultToolsBundle {\n return {\n tools: [DEFAULT_DELEGATE_RESEARCH_TOOL, DEFAULT_DELEGATE_CODE_TOOL],\n executors: {\n delegate_research: createResearchExecutor(config.research),\n delegate_code: createCodeExecutor(config.code),\n },\n artifactTypeFor: (name) =>\n name === 'delegate_research' ? 'research' : name === 'delegate_code' ? 
'code' : undefined,\n }\n}\n\nexport { defaultRouterBaseUrl } from './router'\n","// Generic judge runner — domain consumers configure dimensions + prompts.\n//\n// Three judge slots are conventional for multishot eval:\n// - conversation (scores the full transcript)\n// - codeReview (scores each code artifact)\n// - contentQuality (scores each non-code artifact)\n//\n// But the runJudge primitive is fully generic — any T → JudgeScore mapping.\n\nimport { defaultRouterBaseUrl, requireRouterApiKey, routerCompletion } from './router'\n\nexport const DEFAULT_JUDGE_MODEL = 'openai/gpt-4o-mini'\n\nexport interface JudgeDimension {\n /** JSON field name + score key. */\n key: string\n /** Description shown in the judge's user prompt. */\n description: string\n}\n\nexport interface JudgeConfig<TInput> {\n /** Display name (for trace + log). */\n name: string\n /** Model used for this judge. */\n model?: string\n /** 0-10 scored dimensions. */\n dimensions: JudgeDimension[]\n /** Judge system prompt — sets persona + JSON-only constraint. */\n systemPrompt: string\n /** Build the user prompt from the typed input. Must include \"Respond with\n * ONLY this JSON: { ... }\" listing each dimension key. */\n buildPrompt: (input: TInput) => string\n /** Optional model + api overrides. */\n apiKey?: string\n baseUrl?: string\n}\n\nexport interface JudgeScore {\n /** Per-dimension 0-10 score. Missing dims default to 0. */\n dimensions: Record<string, number>\n /** Mean across dimensions. */\n composite: number\n /** Free-form 1-2 sentence critique from the judge (when provided). */\n notes: string\n}\n\nconst ZERO_SCORE: JudgeScore = { dimensions: {}, composite: 0, notes: 'parse failed' }\n\nexport async function runJudge<TInput>(\n judge: JudgeConfig<TInput>,\n input: TInput,\n): Promise<JudgeScore> {\n const apiKey = judge.apiKey ?? requireRouterApiKey()\n const baseUrl = judge.baseUrl ?? defaultRouterBaseUrl()\n const model = judge.model ?? process.env.JUDGE_MODEL ?? 
DEFAULT_JUDGE_MODEL\n const prompt = judge.buildPrompt(input)\n let raw = ''\n try {\n const { message } = await routerCompletion({\n apiKey,\n baseUrl,\n model,\n temperature: 0,\n maxTokens: 1500,\n messages: [\n { role: 'system', content: judge.systemPrompt },\n { role: 'user', content: prompt },\n ],\n })\n raw = (message.content ?? '').trim()\n } catch (err) {\n return {\n ...ZERO_SCORE,\n notes: `judge ${judge.name} call failed: ${err instanceof Error ? err.message : String(err)}`,\n }\n }\n\n let parsed: Record<string, unknown> | null = null\n try {\n const cleaned = raw\n .replace(/^```json\\s*/i, '')\n .replace(/```\\s*$/, '')\n .trim()\n parsed = JSON.parse(cleaned) as Record<string, unknown>\n } catch {\n return { ...ZERO_SCORE, notes: `judge ${judge.name} returned non-JSON: ${raw.slice(0, 200)}` }\n }\n\n const dimensions: Record<string, number> = {}\n let sum = 0\n for (const dim of judge.dimensions) {\n const v = Number(parsed[dim.key] ?? 0)\n const clamped = Number.isFinite(v) ? Math.max(0, Math.min(10, v)) : 0\n dimensions[dim.key] = clamped\n sum += clamped\n }\n return {\n dimensions,\n composite: judge.dimensions.length === 0 ? 0 : sum / judge.dimensions.length,\n notes: typeof parsed.notes === 'string' ? parsed.notes : '',\n }\n}\n\n/** Convenience: stringified dimension list for inclusion in a judge prompt.\n * Returns lines like `- audience_fit: Does this match what the audience cares about? (0-10)`. */\nexport function renderDimensions(dims: readonly JudgeDimension[]): string {\n return dims.map((d) => `- ${d.key}: ${d.description}`).join('\\n')\n}\n\n/** Convenience: build the \"Respond with ONLY this JSON\" footer for a judge prompt. 
*/\nexport function renderJsonFooter(dims: readonly JudgeDimension[]): string {\n const fields = dims.map((d) => `\"${d.key}\":N`).join(',')\n return `Respond with ONLY this JSON (no markdown, no preamble):\\n{${fields},\"notes\":\"1-2 sentence critique\"}`\n}\n","// Multishot matrix wrapper — sweeps profiles × personas × reps, runs\n// the driver-agent loop per cell, applies up to three configured judges,\n// persists per-cell artifacts, and aggregates by axis.\n//\n// Uses runAgentMatrix from @tangle-network/agent-eval/matrix under the\n// hood so cell scheduling + concurrency + cost ceiling are unified with\n// other matrix consumers.\n\nimport { mkdirSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type { AgentProfile } from '@tangle-network/sandbox'\nimport type { MatrixResult } from '../matrix'\nimport { runAgentMatrix } from '../matrix'\nimport { type JudgeConfig, type JudgeScore, runJudge } from './judges'\nimport { runMultishot } from './multishot'\nimport type {\n MultishotArtifact,\n MultishotMessage,\n MultishotPersona,\n MultishotShape,\n MultishotToolDefinition,\n MultishotToolExecutor,\n} from './types'\n\nexport interface ConversationJudgeInput<TPersona extends MultishotPersona> {\n transcript: MultishotMessage[]\n persona: TPersona\n}\n\nexport interface ArtifactJudgeInput<TPersona extends MultishotPersona> {\n artifact: MultishotArtifact\n persona: TPersona\n}\n\nexport interface MultishotJudges<TPersona extends MultishotPersona> {\n /** Scores the full transcript end-to-end (always runs). */\n conversation: JudgeConfig<ConversationJudgeInput<TPersona>>\n /** Scores each code-type artifact. Optional — omit when domain has no code artifacts. */\n codeReview?: JudgeConfig<ArtifactJudgeInput<TPersona>>\n /** Scores each non-code (research/content/template) artifact. Optional. */\n contentQuality?: JudgeConfig<ArtifactJudgeInput<TPersona>>\n /** Which artifact types route to codeReview. Defaults to ['code']. 
*/\n codeArtifactTypes?: string[]\n /** Which artifact types route to contentQuality. Defaults to ['research']. */\n contentArtifactTypes?: string[]\n}\n\nexport interface CellCompositeScore {\n composite: number\n conversation: JudgeScore\n codeReview?: {\n perArtifact: Array<JudgeScore & { turn: number; type: string }>\n composite: number\n }\n contentQuality?: {\n perArtifact: Array<JudgeScore & { turn: number; type: string }>\n composite: number\n }\n}\n\nexport interface RunMultishotMatrixOptions<TPersona extends MultishotPersona> {\n /** AgentProfile axis (matrix primary). */\n profiles: Array<{ id: string; value: AgentProfile }>\n /** Persona axis. */\n personas: TPersona[]\n /** Persona-shaping callbacks. */\n shape: MultishotShape<TPersona>\n /** Judge configurations. */\n judges: MultishotJudges<TPersona>\n /** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */\n tools?: MultishotToolDefinition[]\n /** Map from tool name → inline executor. Must align with `tools`. */\n toolExecutors?: Record<string, MultishotToolExecutor>\n /** Tool name → artifact type label. Defaults to research/code mapping. */\n artifactTypeFor?: (toolName: string) => string | undefined\n /** Where per-cell artifacts land. Cells write to `<runDir>/<profileId>/<personaId>/rep-N/`. */\n runDir: string\n /** Replicates per (profile, persona) cell. */\n reps?: number\n /** Max conversation turns per cell. */\n maxTurns?: number\n /** Max concurrent cells. */\n maxConcurrency?: number\n /** Total $ ceiling across the matrix; cells aborted past this. */\n costCeiling?: number\n /** Agent model. */\n agentModel?: string\n /** Driver model. */\n driverModel?: string\n /** Pass-thru fields. 
*/\n apiKey?: string\n baseUrl?: string\n}\n\ninterface CellOutput {\n turns: number\n toolCalls: number\n artifactCount: number\n}\n\nexport interface RunMultishotMatrixResult {\n matrix: MatrixResult<CellOutput>\n}\n\nexport async function runMultishotMatrix<TPersona extends MultishotPersona>(\n opts: RunMultishotMatrixOptions<TPersona>,\n): Promise<RunMultishotMatrixResult> {\n const codeTypes = new Set(opts.judges.codeArtifactTypes ?? ['code'])\n const contentTypes = new Set(opts.judges.contentArtifactTypes ?? ['research'])\n mkdirSync(opts.runDir, { recursive: true })\n\n const matrix = await runAgentMatrix<CellOutput>({\n axes: [\n { name: 'profile', values: opts.profiles },\n { name: 'persona', values: opts.personas.map((p) => ({ id: p.id, value: p })) },\n ],\n reps: opts.reps ?? 1,\n maxConcurrency: opts.maxConcurrency ?? 2,\n costCeiling: opts.costCeiling,\n async runCell(cell) {\n const profile = cell.axes.profile?.value as AgentProfile\n const persona = cell.axes.persona?.value as TPersona\n const profileId = String(cell.axes.profile?.id ?? 'unknown')\n const personaId = String(cell.axes.persona?.id ?? 'unknown')\n\n const sim = await runMultishot({\n profile,\n persona,\n shape: opts.shape,\n tools: opts.tools,\n toolExecutors: opts.toolExecutors,\n artifactTypeFor: opts.artifactTypeFor,\n maxTurns: opts.maxTurns,\n agentModel: opts.agentModel,\n driverModel: opts.driverModel,\n apiKey: opts.apiKey,\n baseUrl: opts.baseUrl,\n })\n\n const codeArtifacts = sim.artifacts.filter((a) => codeTypes.has(a.type))\n const contentArtifacts = sim.artifacts.filter((a) => contentTypes.has(a.type))\n\n const [conversation, codeReviews, contentReviews] = await Promise.all([\n runJudge(opts.judges.conversation, { transcript: sim.transcript, persona }),\n opts.judges.codeReview\n ? 
Promise.all(\n codeArtifacts.map((artifact) =>\n runJudge(opts.judges.codeReview!, { artifact, persona }).then((s) => ({\n ...s,\n turn: artifact.turn,\n type: artifact.type,\n })),\n ),\n )\n : Promise.resolve([] as Array<JudgeScore & { turn: number; type: string }>),\n opts.judges.contentQuality\n ? Promise.all(\n contentArtifacts.map((artifact) =>\n runJudge(opts.judges.contentQuality!, { artifact, persona }).then((s) => ({\n ...s,\n turn: artifact.turn,\n type: artifact.type,\n })),\n ),\n )\n : Promise.resolve([] as Array<JudgeScore & { turn: number; type: string }>),\n ])\n\n const codeComposite =\n codeReviews.length === 0\n ? 0\n : codeReviews.reduce((s, r) => s + r.composite, 0) / codeReviews.length\n const contentComposite =\n contentReviews.length === 0\n ? 0\n : contentReviews.reduce((s, r) => s + r.composite, 0) / contentReviews.length\n\n // Composite = mean of (conversation, code, content) — empty judges count 0.\n const judgeCount = 1 + (opts.judges.codeReview ? 1 : 0) + (opts.judges.contentQuality ? 
1 : 0)\n const composite = (conversation.composite + codeComposite + contentComposite) / judgeCount\n\n const cellScore: CellCompositeScore = { composite, conversation }\n if (opts.judges.codeReview)\n cellScore.codeReview = { perArtifact: codeReviews, composite: codeComposite }\n if (opts.judges.contentQuality)\n cellScore.contentQuality = { perArtifact: contentReviews, composite: contentComposite }\n\n const cellDir = join(opts.runDir, profileId, personaId, `rep-${cell.rep}`)\n mkdirSync(cellDir, { recursive: true })\n writeFileSync(join(cellDir, 'transcript.json'), JSON.stringify(sim.transcript, null, 2))\n writeFileSync(join(cellDir, 'artifacts.json'), JSON.stringify(sim.artifacts, null, 2))\n writeFileSync(join(cellDir, 'scores.json'), JSON.stringify(cellScore, null, 2))\n\n const notes = [`convo=${conversation.composite.toFixed(1)}`]\n if (opts.judges.codeReview) notes.push(`code=${codeComposite.toFixed(1)}`)\n if (opts.judges.contentQuality) notes.push(`content=${contentComposite.toFixed(1)}`)\n\n return {\n output: {\n turns: sim.transcript.length,\n toolCalls: sim.toolCalls,\n artifactCount: sim.artifacts.length,\n },\n verdict: { valid: composite >= 5, score: composite, notes: notes.join(' ') },\n costUsd: sim.costUsd,\n durationMs: sim.durationMs,\n }\n },\n })\n\n // Persist top-level summary.\n const summary = {\n cells: matrix.summary.totalCells,\n passRate: matrix.summary.overallPassRate,\n meanScore: matrix.summary.overallMeanScore,\n totalCostUsd: matrix.summary.totalCostUsd,\n durationMs: matrix.summary.durationMs,\n runsExecuted: matrix.summary.runsExecuted,\n cellsSkipped: matrix.summary.cellsSkipped,\n byProfile: matrix.byAxis.profile,\n byPersona: matrix.byAxis.persona,\n }\n writeFileSync(join(opts.runDir, 'summary.json'), JSON.stringify(summary, null, 2))\n\n const md: string[] = [\n `# Multishot matrix`,\n ``,\n `**Cells**: ${matrix.summary.totalCells} | **Pass rate**: ${(matrix.summary.overallPassRate * 100).toFixed(0)}% | **Mean**: 
${matrix.summary.overallMeanScore.toFixed(2)} | **Cost**: $${matrix.summary.totalCostUsd.toFixed(2)} | **Duration**: ${(matrix.summary.durationMs / 1000).toFixed(0)}s`,\n ``,\n `## By profile`,\n ``,\n '| profile | pass | mean | cost |',\n '|---|---|---|---|',\n ...Object.entries(matrix.byAxis.profile ?? {}).map(\n ([id, s]) =>\n `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`,\n ),\n ``,\n `## By persona`,\n ``,\n '| persona | pass | mean | cost |',\n '|---|---|---|---|',\n ...Object.entries(matrix.byAxis.persona ?? {}).map(\n ([id, s]) =>\n `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`,\n ),\n ``,\n ]\n writeFileSync(join(opts.runDir, 'summary.md'), md.join('\\n'))\n\n return { matrix }\n}\n","// Public types for the multishot substrate.\n\nexport interface MultishotMessage {\n role: 'user' | 'assistant' | 'tool'\n content: string\n toolCallId?: string\n toolCalls?: Array<{ id: string; name: string; args: Record<string, unknown> }>\n}\n\nexport interface MultishotArtifact {\n type: string\n turn: number\n invocation: { name: string; args: Record<string, unknown> }\n content: string\n}\n\nexport interface MultishotResult {\n transcript: MultishotMessage[]\n artifacts: MultishotArtifact[]\n toolCalls: number\n durationMs: number\n costUsd: number\n}\n\nexport interface MultishotToolDefinition {\n type: 'function'\n function: {\n name: string\n description: string\n parameters: Record<string, unknown>\n }\n}\n\nexport type MultishotToolExecutor = (\n args: Record<string, unknown>,\n ctx: { apiKey: string; baseUrl: string; signal?: AbortSignal },\n) => Promise<{ content: string; costUsd: number }>\n\nexport interface MultishotPersona {\n /** Stable identifier — used for per-cell artifact paths + matrix axis keys. */\n id: string\n /** Per-domain payload (income/profile/voice/etc.) shaped by the consumer. 
*/\n [k: string]: unknown\n}\n\nexport interface MultishotShape<TPersona extends MultishotPersona> {\n /** Opening user message (turn 0) — the persona's first ask. */\n buildOpener: (persona: TPersona) => string\n /** System prompt the driver LLM uses to roleplay the persona. Should set\n * voice, goals, constraints, time-pressure, and the \"never go silent\" rule. */\n buildDriverSystemPrompt: (persona: TPersona) => string\n}\n\nexport class MultishotDriverEmptyError extends Error {\n constructor(public readonly turn: number) {\n super(`multishot: driver returned empty content twice at turn ${turn} — failing loud`)\n this.name = 'MultishotDriverEmptyError'\n }\n}\n","// Multi-turn driver-agent simulation with inline tool execution.\n//\n// The driver = LLM acting as the persona (reactive, non-deterministic).\n// The agent = the product agent under test (router call with profile's\n// systemPrompt + the configured tools).\n// Tool calls execute inline via the configured executors and feed back\n// into the agent's message log so the agent integrates the result.\n\nimport type { AgentProfile } from '@tangle-network/sandbox'\nimport { defaultDelegationTools } from './default-tools'\nimport {\n defaultRouterBaseUrl,\n estimateRouterCost,\n requireRouterApiKey,\n routerCompletion,\n} from './router'\nimport {\n type MultishotArtifact,\n MultishotDriverEmptyError,\n type MultishotMessage,\n type MultishotPersona,\n type MultishotResult,\n type MultishotShape,\n type MultishotToolDefinition,\n type MultishotToolExecutor,\n} from './types'\n\nexport interface RunMultishotOptions<TPersona extends MultishotPersona> {\n profile: AgentProfile\n persona: TPersona\n shape: MultishotShape<TPersona>\n /** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */\n tools?: MultishotToolDefinition[]\n /** Map from tool name → executor invoked inline when the agent emits a tool_call. 
*/\n toolExecutors?: Record<string, MultishotToolExecutor>\n /** Map from tool name → artifact type label written into MultishotArtifact.type.\n * Tools without a mapping still execute, but their results aren't surfaced as\n * typed artifacts (only as tool messages in the transcript). */\n artifactTypeFor?: (toolName: string) => string | undefined\n maxTurns?: number\n agentModel?: string\n driverModel?: string\n apiKey?: string\n baseUrl?: string\n signal?: AbortSignal\n}\n\nexport async function runMultishot<TPersona extends MultishotPersona>(\n opts: RunMultishotOptions<TPersona>,\n): Promise<MultishotResult> {\n const apiKey = opts.apiKey ?? requireRouterApiKey()\n const baseUrl = opts.baseUrl ?? defaultRouterBaseUrl()\n const maxTurns = opts.maxTurns ?? 10\n const agentModel = opts.agentModel ?? 'openai/gpt-5.4'\n const driverModel = opts.driverModel ?? 'openai/gpt-4o-mini'\n\n const bundle =\n opts.tools && opts.toolExecutors\n ? {\n tools: opts.tools,\n executors: opts.toolExecutors,\n artifactTypeFor: opts.artifactTypeFor ?? (() => undefined),\n }\n : defaultDelegationTools()\n const tools = opts.tools ?? bundle.tools\n const executors = opts.toolExecutors ?? bundle.executors\n const artifactTypeFor = opts.artifactTypeFor ?? bundle.artifactTypeFor\n\n const start = Date.now()\n const transcript: MultishotMessage[] = []\n const artifacts: MultishotArtifact[] = []\n let toolCalls = 0\n let totalCostUsd = 0\n\n const opener = opts.shape.buildOpener(opts.persona)\n transcript.push({ role: 'user', content: opener })\n\n const systemPrompt = opts.profile.prompt?.systemPrompt ?? 
''\n const agentMessages: Array<Record<string, unknown>> = [\n { role: 'system', content: systemPrompt },\n { role: 'user', content: opener },\n ]\n\n for (let turn = 0; turn < maxTurns; turn++) {\n if (opts.signal?.aborted) throw new Error('multishot aborted')\n\n const { message: agentMsg, usage: agentUsage } = await routerCompletion({\n apiKey,\n baseUrl,\n model: agentModel,\n messages: agentMessages,\n tools,\n temperature: 0.7,\n maxTokens: 2500,\n signal: opts.signal,\n })\n totalCostUsd += estimateRouterCost(agentModel, agentUsage)\n\n const agentText = (agentMsg.content ?? '').trim()\n const agentToolCalls = (agentMsg.tool_calls ?? []).map((tc) => ({\n id: tc.id,\n name: tc.function.name,\n args: (() => {\n try {\n return JSON.parse(tc.function.arguments) as Record<string, unknown>\n } catch {\n return {} as Record<string, unknown>\n }\n })(),\n }))\n\n agentMessages.push({\n role: 'assistant',\n content: agentText || null,\n ...(agentMsg.tool_calls?.length ? { tool_calls: agentMsg.tool_calls } : {}),\n })\n transcript.push({\n role: 'assistant',\n content: agentText,\n toolCalls: agentToolCalls.length > 0 ? agentToolCalls : undefined,\n })\n\n for (const tc of agentToolCalls) {\n toolCalls++\n let toolResult = ''\n try {\n const executor = executors[tc.name]\n if (!executor) {\n toolResult = JSON.stringify({ error: `unknown tool ${tc.name}` })\n } else {\n const r = await executor(tc.args, { apiKey, baseUrl, signal: opts.signal })\n toolResult = r.content\n totalCostUsd += r.costUsd\n const artifactType = artifactTypeFor(tc.name)\n if (artifactType) {\n artifacts.push({\n type: artifactType,\n turn,\n invocation: { name: tc.name, args: tc.args },\n content: toolResult,\n })\n }\n }\n } catch (err) {\n toolResult = JSON.stringify({ error: err instanceof Error ? 
err.message : String(err) })\n }\n agentMessages.push({ role: 'tool', tool_call_id: tc.id, content: toolResult || 'done' })\n transcript.push({ role: 'tool', content: toolResult || 'done', toolCallId: tc.id })\n }\n\n // If the agent emitted tool_calls, give it a follow-up turn to integrate the results.\n if (agentToolCalls.length > 0) {\n const followUp = await routerCompletion({\n apiKey,\n baseUrl,\n model: agentModel,\n messages: agentMessages,\n temperature: 0.7,\n maxTokens: 2000,\n signal: opts.signal,\n })\n totalCostUsd += estimateRouterCost(agentModel, followUp.usage)\n const followUpText = (followUp.message.content ?? '').trim()\n agentMessages.push({ role: 'assistant', content: followUpText })\n transcript.push({ role: 'assistant', content: followUpText })\n }\n\n if (turn < maxTurns - 1) {\n const driver = await driverTurn({\n apiKey,\n baseUrl,\n persona: opts.persona,\n shape: opts.shape,\n transcript,\n turn,\n model: driverModel,\n signal: opts.signal,\n })\n totalCostUsd += driver.costUsd\n agentMessages.push({ role: 'user', content: driver.content })\n transcript.push({ role: 'user', content: driver.content })\n }\n }\n\n return { transcript, artifacts, toolCalls, durationMs: Date.now() - start, costUsd: totalCostUsd }\n}\n\nasync function driverTurn<TPersona extends MultishotPersona>(opts: {\n apiKey: string\n baseUrl: string\n persona: TPersona\n shape: MultishotShape<TPersona>\n transcript: MultishotMessage[]\n turn: number\n model: string\n signal?: AbortSignal\n}): Promise<{ content: string; costUsd: number }> {\n const driverSystem = opts.shape.buildDriverSystemPrompt(opts.persona)\n\n // Translate transcript to driver POV: agent's `assistant` messages become\n // `user` (the agent talking TO the driver); the driver's prior `user`\n // messages become `assistant` (the driver's prior responses).\n const driverMessages: Array<Record<string, unknown>> = [{ role: 'system', content: driverSystem }]\n for (const msg of opts.transcript) {\n if 
(msg.role === 'tool') continue\n if (msg.role === 'assistant') driverMessages.push({ role: 'user', content: msg.content })\n else if (msg.role === 'user') driverMessages.push({ role: 'assistant', content: msg.content })\n }\n\n // Driver must never go silent. Retry once on empty content; then fail loud.\n for (let attempt = 0; attempt < 2; attempt++) {\n const { message, usage } = await routerCompletion({\n apiKey: opts.apiKey,\n baseUrl: opts.baseUrl,\n model: opts.model,\n messages: driverMessages,\n temperature: 0.9,\n maxTokens: 600,\n signal: opts.signal,\n })\n const content = (message.content ?? '').trim()\n if (content.length > 0) return { content, costUsd: estimateRouterCost(opts.model, usage) }\n }\n throw new MultishotDriverEmptyError(opts.turn)\n}\n"],"mappings":";;;;;;AA4BA,eAAsB,iBACpB,KACmC;AACnC,QAAM,OAAgC;AAAA,IACpC,OAAO,IAAI;AAAA,IACX,UAAU,IAAI;AAAA,IACd,aAAa,IAAI,eAAe;AAAA,IAChC,YAAY,IAAI,aAAa;AAAA,EAC/B;AACA,MAAI,IAAI,OAAO,OAAQ,MAAK,QAAQ,IAAI;AACxC,QAAM,MAAM,GAAG,IAAI,QAAQ,QAAQ,QAAQ,EAAE,CAAC;AAC9C,QAAM,MAAM,MAAM,MAAM,KAAK;AAAA,IAC3B,QAAQ;AAAA,IACR,SAAS,EAAE,eAAe,UAAU,IAAI,MAAM,IAAI,gBAAgB,mBAAmB;AAAA,IACrF,MAAM,KAAK,UAAU,IAAI;AAAA,IACzB,QAAQ,IAAI;AAAA,EACd,CAAC;AACD,MAAI,CAAC,IAAI,IAAI;AACX,UAAM,OAAO,MAAM,IAAI,KAAK;AAC5B,UAAM,IAAI,MAAM,UAAU,IAAI,MAAM,KAAK,KAAK,MAAM,GAAG,GAAG,CAAC,EAAE;AAAA,EAC/D;AACA,QAAM,OAAQ,MAAM,IAAI,KAAK;AAI7B,QAAM,SAAS,KAAK,QAAQ,CAAC;AAC7B,MAAI,CAAC,OAAQ,OAAM,IAAI,MAAM,+BAA+B,KAAK,UAAU,IAAI,EAAE,MAAM,GAAG,GAAG,CAAC,EAAE;AAChG,SAAO,EAAE,SAAS,OAAO,SAAS,OAAO,KAAK,MAAM;AACtD;AAIO,SAAS,mBACd,OACA,OACQ;AACR,MAAI,CAAC,MAAO,QAAO;AACnB,QAAM,WAAW,MAAM,iBAAiB;AACxC,QAAM,YAAY,MAAM,qBAAqB;AAC7C,MAAI,UAAU;AACd,MAAI,WAAW;AACf,MAAI,MAAM,SAAS,aAAa,GAAG;AACjC,cAAU;AACV,eAAW;AAAA,EACb,WAAW,MAAM,SAAS,SAAS,KAAK,MAAM,SAAS,eAAe,GAAG;AACvE,cAAU;AACV,eAAW;AAAA,EACb,WAAW,MAAM,SAAS,MAAM,KAAK,MAAM,SAAS,KAAK,KAAK,MAAM,SAAS,UAAU,GAAG;AACxF,cAAU;AACV,eAAW;AAAA,EACb;AACA,UAAQ,WAAW,UAAU,YAAY,YAAY;AACvD;AAEO,SAAS,uBAA+B;AAC7C,UAAQ,QAAQ,IAAI,0BAA0B,kCAAkC;AA
AA,IAC9E;AAAA,IACA;AAAA,EACF;AACF;AAEO,SAAS,sBAA8B;AAC5C,QAAM,MAAM,QAAQ,IAAI;AACxB,MAAI,CAAC,IAAK,OAAM,IAAI,MAAM,gEAAgE;AAC1F,SAAO;AACT;;;ACnFO,IAAM,2BAA2B;AACjC,IAAM,sBAAsB;AAgBnC,IAAM,4BACJ;AAEF,IAAM,uBACJ;AAEK,IAAM,iCAA0D;AAAA,EACrE,MAAM;AAAA,EACN,UAAU;AAAA,IACR,MAAM;AAAA,IACN,aACE;AAAA,IACF,YAAY;AAAA,MACV,MAAM;AAAA,MACN,YAAY;AAAA,QACV,UAAU,EAAE,MAAM,UAAU,aAAa,gCAAgC;AAAA,QACzE,OAAO;AAAA,UACL,MAAM;AAAA,UACN,aAAa;AAAA,QACf;AAAA,MACF;AAAA,MACA,UAAU,CAAC,UAAU;AAAA,IACvB;AAAA,EACF;AACF;AAEO,IAAM,6BAAsD;AAAA,EACjE,MAAM;AAAA,EACN,UAAU;AAAA,IACR,MAAM;AAAA,IACN,aACE;AAAA,IACF,YAAY;AAAA,MACV,MAAM;AAAA,MACN,YAAY;AAAA,QACV,MAAM,EAAE,MAAM,UAAU,aAAa,gCAAgC;AAAA,QACrE,UAAU;AAAA,UACR,MAAM;AAAA,UACN,aAAa;AAAA,QACf;AAAA,MACF;AAAA,MACA,UAAU,CAAC,MAAM;AAAA,IACnB;AAAA,EACF;AACF;AAEO,SAAS,uBACd,SAAkC,CAAC,GACZ;AACvB,QAAM,eAAe,OAAO,gBAAgB;AAC5C,QAAM,QAAQ,OAAO,SAAS;AAC9B,SAAO,OAAO,MAAM,QAAQ;AAC1B,UAAM,WAAW,OAAO,KAAK,YAAY,EAAE;AAC3C,UAAM,QAAQ,KAAK,QAAQ,OAAO,KAAK,KAAK,IAAI;AAChD,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI;AAAA,MACb;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,aAAa;AAAA,QACxC,EAAE,MAAM,QAAQ,SAAS,aAAa,QAAQ,GAAG,QAAQ;AAAA,SAAY,KAAK,KAAK,EAAE,GAAG;AAAA,MACtF;AAAA,MACA,QAAQ,IAAI;AAAA,IACd,CAAC;AACD,WAAO,EAAE,SAAS,QAAQ,WAAW,IAAI,SAAS,mBAAmB,OAAO,KAAK,EAAE;AAAA,EACrF;AACF;AAEO,SAAS,mBAAmB,SAA6B,CAAC,GAA0B;AACzF,QAAM,eAAe,OAAO,gBAAgB;AAC5C,QAAM,QAAQ,OAAO,SAAS;AAC9B,SAAO,OAAO,MAAM,QAAQ;AAC1B,UAAM,OAAO,OAAO,KAAK,QAAQ,EAAE;AACnC,UAAM,WAAW,KAAK,WAAW,OAAO,KAAK,QAAQ,IAAI;AACzD,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI;AAAA,MACb;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,GAAG,YAAY;AAAA;AAAA,YAAiB,QAAQ,GAAG;AAAA,QACtE,EAAE,MAAM,QAAQ,SAAS,YAAY,IAAI,GAAG;AAAA,MAC9C;AAAA,MACA,QAAQ,IAAI;AAAA,IACd,CAAC;AACD,WAAO,EAAE,SAAS,QAAQ,WAAW,IAAI,SAAS,mBAAmB,OAAO,KAAK,EAAE;AAAA,EACrF;AACF;AAgBO,SAAS,uBAAuB,SAA6B,CAAC,GAAuB;AAC1F,SAAO;AAAA,IACL,OAAO,CAAC,gCAAgC,0BAA0B;AAAA
,IAClE,WAAW;AAAA,MACT,mBAAmB,uBAAuB,OAAO,QAAQ;AAAA,MACzD,eAAe,mBAAmB,OAAO,IAAI;AAAA,IAC/C;AAAA,IACA,iBAAiB,CAAC,SAChB,SAAS,sBAAsB,aAAa,SAAS,kBAAkB,SAAS;AAAA,EACpF;AACF;;;ACpIO,IAAM,sBAAsB;AAmCnC,IAAM,aAAyB,EAAE,YAAY,CAAC,GAAG,WAAW,GAAG,OAAO,eAAe;AAErF,eAAsB,SACpB,OACA,OACqB;AACrB,QAAM,SAAS,MAAM,UAAU,oBAAoB;AACnD,QAAM,UAAU,MAAM,WAAW,qBAAqB;AACtD,QAAM,QAAQ,MAAM,SAAS,QAAQ,IAAI,eAAe;AACxD,QAAM,SAAS,MAAM,YAAY,KAAK;AACtC,MAAI,MAAM;AACV,MAAI;AACF,UAAM,EAAE,QAAQ,IAAI,MAAM,iBAAiB;AAAA,MACzC;AAAA,MACA;AAAA,MACA;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,MAAM,aAAa;AAAA,QAC9C,EAAE,MAAM,QAAQ,SAAS,OAAO;AAAA,MAClC;AAAA,IACF,CAAC;AACD,WAAO,QAAQ,WAAW,IAAI,KAAK;AAAA,EACrC,SAAS,KAAK;AACZ,WAAO;AAAA,MACL,GAAG;AAAA,MACH,OAAO,SAAS,MAAM,IAAI,iBAAiB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,IAC7F;AAAA,EACF;AAEA,MAAI,SAAyC;AAC7C,MAAI;AACF,UAAM,UAAU,IACb,QAAQ,gBAAgB,EAAE,EAC1B,QAAQ,WAAW,EAAE,EACrB,KAAK;AACR,aAAS,KAAK,MAAM,OAAO;AAAA,EAC7B,QAAQ;AACN,WAAO,EAAE,GAAG,YAAY,OAAO,SAAS,MAAM,IAAI,uBAAuB,IAAI,MAAM,GAAG,GAAG,CAAC,GAAG;AAAA,EAC/F;AAEA,QAAM,aAAqC,CAAC;AAC5C,MAAI,MAAM;AACV,aAAW,OAAO,MAAM,YAAY;AAClC,UAAM,IAAI,OAAO,OAAO,IAAI,GAAG,KAAK,CAAC;AACrC,UAAM,UAAU,OAAO,SAAS,CAAC,IAAI,KAAK,IAAI,GAAG,KAAK,IAAI,IAAI,CAAC,CAAC,IAAI;AACpE,eAAW,IAAI,GAAG,IAAI;AACtB,WAAO;AAAA,EACT;AACA,SAAO;AAAA,IACL;AAAA,IACA,WAAW,MAAM,WAAW,WAAW,IAAI,IAAI,MAAM,MAAM,WAAW;AAAA,IACtE,OAAO,OAAO,OAAO,UAAU,WAAW,OAAO,QAAQ;AAAA,EAC3D;AACF;AAIO,SAAS,iBAAiB,MAAyC;AACxE,SAAO,KAAK,IAAI,CAAC,MAAM,KAAK,EAAE,GAAG,KAAK,EAAE,WAAW,EAAE,EAAE,KAAK,IAAI;AAClE;AAGO,SAAS,iBAAiB,MAAyC;AACxE,QAAM,SAAS,KAAK,IAAI,CAAC,MAAM,IAAI,EAAE,GAAG,KAAK,EAAE,KAAK,GAAG;AACvD,SAAO;AAAA,GAA6D,MAAM;AAC5E;;;ACzGA,SAAS,WAAW,qBAAqB;AACzC,SAAS,YAAY;;;AC4Cd,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YAA4B,MAAc;AACxC,UAAM,0DAA0D,IAAI,sBAAiB;AAD3D;AAE1B,SAAK,OAAO;AAAA,EACd;AAAA,EAH4B;AAI9B;;;ACXA,eAAsB,aACpB,MAC0B;AAC1B,QAAM,SAAS,KAAK,UAAU,oBAAoB;AAClD,QAAM,UAAU,KAAK,WAAW,qBAAqB;AACrD,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,aAAa,KAAK,cAAc;AACtC,QAAM,cAAc,KAAK,e
AAe;AAExC,QAAM,SACJ,KAAK,SAAS,KAAK,gBACf;AAAA,IACE,OAAO,KAAK;AAAA,IACZ,WAAW,KAAK;AAAA,IAChB,iBAAiB,KAAK,oBAAoB,MAAM;AAAA,EAClD,IACA,uBAAuB;AAC7B,QAAM,QAAQ,KAAK,SAAS,OAAO;AACnC,QAAM,YAAY,KAAK,iBAAiB,OAAO;AAC/C,QAAM,kBAAkB,KAAK,mBAAmB,OAAO;AAEvD,QAAM,QAAQ,KAAK,IAAI;AACvB,QAAM,aAAiC,CAAC;AACxC,QAAM,YAAiC,CAAC;AACxC,MAAI,YAAY;AAChB,MAAI,eAAe;AAEnB,QAAM,SAAS,KAAK,MAAM,YAAY,KAAK,OAAO;AAClD,aAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,CAAC;AAEjD,QAAM,eAAe,KAAK,QAAQ,QAAQ,gBAAgB;AAC1D,QAAM,gBAAgD;AAAA,IACpD,EAAE,MAAM,UAAU,SAAS,aAAa;AAAA,IACxC,EAAE,MAAM,QAAQ,SAAS,OAAO;AAAA,EAClC;AAEA,WAAS,OAAO,GAAG,OAAO,UAAU,QAAQ;AAC1C,QAAI,KAAK,QAAQ,QAAS,OAAM,IAAI,MAAM,mBAAmB;AAE7D,UAAM,EAAE,SAAS,UAAU,OAAO,WAAW,IAAI,MAAM,iBAAiB;AAAA,MACtE;AAAA,MACA;AAAA,MACA,OAAO;AAAA,MACP,UAAU;AAAA,MACV;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,QAAQ,KAAK;AAAA,IACf,CAAC;AACD,oBAAgB,mBAAmB,YAAY,UAAU;AAEzD,UAAM,aAAa,SAAS,WAAW,IAAI,KAAK;AAChD,UAAM,kBAAkB,SAAS,cAAc,CAAC,GAAG,IAAI,CAAC,QAAQ;AAAA,MAC9D,IAAI,GAAG;AAAA,MACP,MAAM,GAAG,SAAS;AAAA,MAClB,OAAO,MAAM;AACX,YAAI;AACF,iBAAO,KAAK,MAAM,GAAG,SAAS,SAAS;AAAA,QACzC,QAAQ;AACN,iBAAO,CAAC;AAAA,QACV;AAAA,MACF,GAAG;AAAA,IACL,EAAE;AAEF,kBAAc,KAAK;AAAA,MACjB,MAAM;AAAA,MACN,SAAS,aAAa;AAAA,MACtB,GAAI,SAAS,YAAY,SAAS,EAAE,YAAY,SAAS,WAAW,IAAI,CAAC;AAAA,IAC3E,CAAC;AACD,eAAW,KAAK;AAAA,MACd,MAAM;AAAA,MACN,SAAS;AAAA,MACT,WAAW,eAAe,SAAS,IAAI,iBAAiB;AAAA,IAC1D,CAAC;AAED,eAAW,MAAM,gBAAgB;AAC/B;AACA,UAAI,aAAa;AACjB,UAAI;AACF,cAAM,WAAW,UAAU,GAAG,IAAI;AAClC,YAAI,CAAC,UAAU;AACb,uBAAa,KAAK,UAAU,EAAE,OAAO,gBAAgB,GAAG,IAAI,GAAG,CAAC;AAAA,QAClE,OAAO;AACL,gBAAM,IAAI,MAAM,SAAS,GAAG,MAAM,EAAE,QAAQ,SAAS,QAAQ,KAAK,OAAO,CAAC;AAC1E,uBAAa,EAAE;AACf,0BAAgB,EAAE;AAClB,gBAAM,eAAe,gBAAgB,GAAG,IAAI;AAC5C,cAAI,cAAc;AAChB,sBAAU,KAAK;AAAA,cACb,MAAM;AAAA,cACN;AAAA,cACA,YAAY,EAAE,MAAM,GAAG,MAAM,MAAM,GAAG,KAAK;AAAA,cAC3C,SAAS;AAAA,YACX,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF,SAAS,KAAK;AACZ,qBAAa,KAAK,UAAU,EAAE,OAAO,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,EAAE,CAAC;AAAA,MACzF;AACA,oBAAc,KAAK,EAAE,MAAM,QAAQ,cAAc,GAAG,IAAI,SAAS,cAAc,OAAO,CA
AC;AACvF,iBAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,cAAc,QAAQ,YAAY,GAAG,GAAG,CAAC;AAAA,IACpF;AAGA,QAAI,eAAe,SAAS,GAAG;AAC7B,YAAM,WAAW,MAAM,iBAAiB;AAAA,QACtC;AAAA,QACA;AAAA,QACA,OAAO;AAAA,QACP,UAAU;AAAA,QACV,aAAa;AAAA,QACb,WAAW;AAAA,QACX,QAAQ,KAAK;AAAA,MACf,CAAC;AACD,sBAAgB,mBAAmB,YAAY,SAAS,KAAK;AAC7D,YAAM,gBAAgB,SAAS,QAAQ,WAAW,IAAI,KAAK;AAC3D,oBAAc,KAAK,EAAE,MAAM,aAAa,SAAS,aAAa,CAAC;AAC/D,iBAAW,KAAK,EAAE,MAAM,aAAa,SAAS,aAAa,CAAC;AAAA,IAC9D;AAEA,QAAI,OAAO,WAAW,GAAG;AACvB,YAAM,SAAS,MAAM,WAAW;AAAA,QAC9B;AAAA,QACA;AAAA,QACA,SAAS,KAAK;AAAA,QACd,OAAO,KAAK;AAAA,QACZ;AAAA,QACA;AAAA,QACA,OAAO;AAAA,QACP,QAAQ,KAAK;AAAA,MACf,CAAC;AACD,sBAAgB,OAAO;AACvB,oBAAc,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,QAAQ,CAAC;AAC5D,iBAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,QAAQ,CAAC;AAAA,IAC3D;AAAA,EACF;AAEA,SAAO,EAAE,YAAY,WAAW,WAAW,YAAY,KAAK,IAAI,IAAI,OAAO,SAAS,aAAa;AACnG;AAEA,eAAe,WAA8C,MASX;AAChD,QAAM,eAAe,KAAK,MAAM,wBAAwB,KAAK,OAAO;AAKpE,QAAM,iBAAiD,CAAC,EAAE,MAAM,UAAU,SAAS,aAAa,CAAC;AACjG,aAAW,OAAO,KAAK,YAAY;AACjC,QAAI,IAAI,SAAS,OAAQ;AACzB,QAAI,IAAI,SAAS,YAAa,gBAAe,KAAK,EAAE,MAAM,QAAQ,SAAS,IAAI,QAAQ,CAAC;AAAA,aAC/E,IAAI,SAAS,OAAQ,gBAAe,KAAK,EAAE,MAAM,aAAa,SAAS,IAAI,QAAQ,CAAC;AAAA,EAC/F;AAGA,WAAS,UAAU,GAAG,UAAU,GAAG,WAAW;AAC5C,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,KAAK;AAAA,MACb,SAAS,KAAK;AAAA,MACd,OAAO,KAAK;AAAA,MACZ,UAAU;AAAA,MACV,aAAa;AAAA,MACb,WAAW;AAAA,MACX,QAAQ,KAAK;AAAA,IACf,CAAC;AACD,UAAM,WAAW,QAAQ,WAAW,IAAI,KAAK;AAC7C,QAAI,QAAQ,SAAS,EAAG,QAAO,EAAE,SAAS,SAAS,mBAAmB,KAAK,OAAO,KAAK,EAAE;AAAA,EAC3F;AACA,QAAM,IAAI,0BAA0B,KAAK,IAAI;AAC/C;;;AFxHA,eAAsB,mBACpB,MACmC;AACnC,QAAM,YAAY,IAAI,IAAI,KAAK,OAAO,qBAAqB,CAAC,MAAM,CAAC;AACnE,QAAM,eAAe,IAAI,IAAI,KAAK,OAAO,wBAAwB,CAAC,UAAU,CAAC;AAC7E,YAAU,KAAK,QAAQ,EAAE,WAAW,KAAK,CAAC;AAE1C,QAAM,SAAS,MAAM,eAA2B;AAAA,IAC9C,MAAM;AAAA,MACJ,EAAE,MAAM,WAAW,QAAQ,KAAK,SAAS;AAAA,MACzC,EAAE,MAAM,WAAW,QAAQ,KAAK,SAAS,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,OAAO,EAAE,EAAE,EAAE;AAAA,IAChF;AAAA,IACA,MAAM,KAAK,QAAQ;AAAA,IACnB,gBAAgB,KAAK,kBAAkB;AAAA,IACvC,aAAa,KAAK;AAAA,IAClB,MAAM,QAAQ,MA
AM;AAClB,YAAM,UAAU,KAAK,KAAK,SAAS;AACnC,YAAM,UAAU,KAAK,KAAK,SAAS;AACnC,YAAM,YAAY,OAAO,KAAK,KAAK,SAAS,MAAM,SAAS;AAC3D,YAAM,YAAY,OAAO,KAAK,KAAK,SAAS,MAAM,SAAS;AAE3D,YAAM,MAAM,MAAM,aAAa;AAAA,QAC7B;AAAA,QACA;AAAA,QACA,OAAO,KAAK;AAAA,QACZ,OAAO,KAAK;AAAA,QACZ,eAAe,KAAK;AAAA,QACpB,iBAAiB,KAAK;AAAA,QACtB,UAAU,KAAK;AAAA,QACf,YAAY,KAAK;AAAA,QACjB,aAAa,KAAK;AAAA,QAClB,QAAQ,KAAK;AAAA,QACb,SAAS,KAAK;AAAA,MAChB,CAAC;AAED,YAAM,gBAAgB,IAAI,UAAU,OAAO,CAAC,MAAM,UAAU,IAAI,EAAE,IAAI,CAAC;AACvE,YAAM,mBAAmB,IAAI,UAAU,OAAO,CAAC,MAAM,aAAa,IAAI,EAAE,IAAI,CAAC;AAE7E,YAAM,CAAC,cAAc,aAAa,cAAc,IAAI,MAAM,QAAQ,IAAI;AAAA,QACpE,SAAS,KAAK,OAAO,cAAc,EAAE,YAAY,IAAI,YAAY,QAAQ,CAAC;AAAA,QAC1E,KAAK,OAAO,aACR,QAAQ;AAAA,UACN,cAAc;AAAA,YAAI,CAAC,aACjB,SAAS,KAAK,OAAO,YAAa,EAAE,UAAU,QAAQ,CAAC,EAAE,KAAK,CAAC,OAAO;AAAA,cACpE,GAAG;AAAA,cACH,MAAM,SAAS;AAAA,cACf,MAAM,SAAS;AAAA,YACjB,EAAE;AAAA,UACJ;AAAA,QACF,IACA,QAAQ,QAAQ,CAAC,CAAuD;AAAA,QAC5E,KAAK,OAAO,iBACR,QAAQ;AAAA,UACN,iBAAiB;AAAA,YAAI,CAAC,aACpB,SAAS,KAAK,OAAO,gBAAiB,EAAE,UAAU,QAAQ,CAAC,EAAE,KAAK,CAAC,OAAO;AAAA,cACxE,GAAG;AAAA,cACH,MAAM,SAAS;AAAA,cACf,MAAM,SAAS;AAAA,YACjB,EAAE;AAAA,UACJ;AAAA,QACF,IACA,QAAQ,QAAQ,CAAC,CAAuD;AAAA,MAC9E,CAAC;AAED,YAAM,gBACJ,YAAY,WAAW,IACnB,IACA,YAAY,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,YAAY;AACrE,YAAM,mBACJ,eAAe,WAAW,IACtB,IACA,eAAe,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,eAAe;AAG3E,YAAM,aAAa,KAAK,KAAK,OAAO,aAAa,IAAI,MAAM,KAAK,OAAO,iBAAiB,IAAI;AAC5F,YAAM,aAAa,aAAa,YAAY,gBAAgB,oBAAoB;AAEhF,YAAM,YAAgC,EAAE,WAAW,aAAa;AAChE,UAAI,KAAK,OAAO;AACd,kBAAU,aAAa,EAAE,aAAa,aAAa,WAAW,cAAc;AAC9E,UAAI,KAAK,OAAO;AACd,kBAAU,iBAAiB,EAAE,aAAa,gBAAgB,WAAW,iBAAiB;AAExF,YAAM,UAAU,KAAK,KAAK,QAAQ,WAAW,WAAW,OAAO,KAAK,GAAG,EAAE;AACzE,gBAAU,SAAS,EAAE,WAAW,KAAK,CAAC;AACtC,oBAAc,KAAK,SAAS,iBAAiB,GAAG,KAAK,UAAU,IAAI,YAAY,MAAM,CAAC,CAAC;AACvF,oBAAc,KAAK,SAAS,gBAAgB,GAAG,KAAK,UAAU,IAAI,WAAW,MAAM,CAAC,CAAC;AACrF,oBAAc,KAAK,SAAS,aAAa,GAAG,KAAK,UAAU,WAAW,MAAM,CAAC,CAAC;AAE9E,YAAM,QAAQ,CAAC,SAAS,aAAa,UAAU,QAAQ,CAAC,CAAC,EAAE;AAC3D,UAAI,KAAK,OAAO,WAAY,O
AAM,KAAK,QAAQ,cAAc,QAAQ,CAAC,CAAC,EAAE;AACzE,UAAI,KAAK,OAAO,eAAgB,OAAM,KAAK,WAAW,iBAAiB,QAAQ,CAAC,CAAC,EAAE;AAEnF,aAAO;AAAA,QACL,QAAQ;AAAA,UACN,OAAO,IAAI,WAAW;AAAA,UACtB,WAAW,IAAI;AAAA,UACf,eAAe,IAAI,UAAU;AAAA,QAC/B;AAAA,QACA,SAAS,EAAE,OAAO,aAAa,GAAG,OAAO,WAAW,OAAO,MAAM,KAAK,GAAG,EAAE;AAAA,QAC3E,SAAS,IAAI;AAAA,QACb,YAAY,IAAI;AAAA,MAClB;AAAA,IACF;AAAA,EACF,CAAC;AAGD,QAAM,UAAU;AAAA,IACd,OAAO,OAAO,QAAQ;AAAA,IACtB,UAAU,OAAO,QAAQ;AAAA,IACzB,WAAW,OAAO,QAAQ;AAAA,IAC1B,cAAc,OAAO,QAAQ;AAAA,IAC7B,YAAY,OAAO,QAAQ;AAAA,IAC3B,cAAc,OAAO,QAAQ;AAAA,IAC7B,cAAc,OAAO,QAAQ;AAAA,IAC7B,WAAW,OAAO,OAAO;AAAA,IACzB,WAAW,OAAO,OAAO;AAAA,EAC3B;AACA,gBAAc,KAAK,KAAK,QAAQ,cAAc,GAAG,KAAK,UAAU,SAAS,MAAM,CAAC,CAAC;AAEjF,QAAM,KAAe;AAAA,IACnB;AAAA,IACA;AAAA,IACA,cAAc,OAAO,QAAQ,UAAU,sBAAsB,OAAO,QAAQ,kBAAkB,KAAK,QAAQ,CAAC,CAAC,iBAAiB,OAAO,QAAQ,iBAAiB,QAAQ,CAAC,CAAC,iBAAiB,OAAO,QAAQ,aAAa,QAAQ,CAAC,CAAC,qBAAqB,OAAO,QAAQ,aAAa,KAAM,QAAQ,CAAC,CAAC;AAAA,IAChS;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,OAAO,QAAQ,OAAO,OAAO,WAAW,CAAC,CAAC,EAAE;AAAA,MAC7C,CAAC,CAAC,IAAI,CAAC,MACL,KAAK,EAAE,OAAO,EAAE,WAAW,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,UAAU,QAAQ,CAAC,CAAC,OAAO,EAAE,aAAa,QAAQ,CAAC,CAAC;AAAA,IAC3G;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,OAAO,QAAQ,OAAO,OAAO,WAAW,CAAC,CAAC,EAAE;AAAA,MAC7C,CAAC,CAAC,IAAI,CAAC,MACL,KAAK,EAAE,OAAO,EAAE,WAAW,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,UAAU,QAAQ,CAAC,CAAC,OAAO,EAAE,aAAa,QAAQ,CAAC,CAAC;AAAA,IAC3G;AAAA,IACA;AAAA,EACF;AACA,gBAAc,KAAK,KAAK,QAAQ,YAAY,GAAG,GAAG,KAAK,IAAI,CAAC;AAE5D,SAAO,EAAE,OAAO;AAClB;","names":[]}
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.38.0",
5
+ "version": "0.40.2",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -1,5 +1,5 @@
1
1
  export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-DeZ_EArp.js';
2
- export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-iATEAHmc.js';
2
+ export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-Dvy-bt7x.js';
3
3
  export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-DuZXOk7K.js';
4
4
  import './run-record-BGY6bHRh.js';
5
5
  import './errors-mje_cKOs.js';
@@ -7,5 +7,5 @@ import './integrity-DYR5gWlb.js';
7
7
  import './store-Db2Bv8Cf.js';
8
8
  import './emitter-DP_cSSiw.js';
9
9
  import './control-runtime-BZ_lVLYW.js';
10
- import './dataset-ueRVTUoY.js';
10
+ import './dataset-BlwAtYYf.js';
11
11
  import './failure-cluster-Cw65_5FY.js';
@@ -25,14 +25,14 @@ import {
25
25
  summarizePreferenceMemory,
26
26
  trialTraceFromMultiShotTrial,
27
27
  withAssignedFeedbackSplit
28
- } from "./chunk-3HYQXPC2.js";
28
+ } from "./chunk-DMW5VENN.js";
29
29
  import {
30
30
  runEvalCampaign
31
- } from "./chunk-RL6TERL2.js";
31
+ } from "./chunk-LCIDRYGP.js";
32
32
  import "./chunk-VXNVVBZO.js";
33
- import "./chunk-TSPOEDM3.js";
34
- import "./chunk-TQL7BAOY.js";
35
- import "./chunk-KHZRNY3F.js";
33
+ import "./chunk-BWZEGTES.js";
34
+ import "./chunk-EGIPWXHL.js";
35
+ import "./chunk-WP7SY7AI.js";
36
36
  import "./chunk-UBPIXOC4.js";
37
37
  import "./chunk-PC4UYEBM.js";
38
38
  import "./chunk-TVVP3ZZQ.js";
@@ -2,13 +2,13 @@ import {
2
2
  compareToBaseline,
3
3
  computeToolUseMetrics,
4
4
  failureClusterView
5
- } from "../chunk-KE7TDJUO.js";
5
+ } from "../chunk-AU2JLNSZ.js";
6
6
  import {
7
7
  buildTrajectory
8
8
  } from "../chunk-RZTMDUO7.js";
9
9
  import {
10
10
  interRaterReliability
11
- } from "../chunk-KHZRNY3F.js";
11
+ } from "../chunk-WP7SY7AI.js";
12
12
  import {
13
13
  aggregateLlm,
14
14
  argHash,
@@ -0,0 +1,63 @@
1
+ import { a as DatasetScenario, b as Dataset } from './dataset-BlwAtYYf.js';
2
+ import { T as TraceStore } from './store-Db2Bv8Cf.js';
3
+
4
+ /**
5
+ * Red-team battery — adversarial scenario corpus with per-category
6
+ * scorers.
7
+ *
8
+ * Every scenario is a `DatasetScenario` with a `redTeam: {...}` payload
9
+ * that declares: category, expected refusal, attack surface. Scorers
10
+ * inspect the agent's output + trajectory and return a PASS/FAIL per
11
+ * category. Aggregated via `redTeamReport`.
12
+ *
13
+ * The built-in corpus is intentionally conservative — 40 cases
14
+ * covering the most-seen attack patterns. Extend via `extendCorpus`.
15
+ */
16
+
17
+ type RedTeamCategory = 'prompt_injection_direct' | 'prompt_injection_indirect' | 'jailbreak_persona' | 'jailbreak_dan' | 'pii_leak' | 'permission_escalation' | 'data_exfil' | 'policy_override';
18
+ interface RedTeamPayload {
19
+ category: RedTeamCategory;
20
+ /** The hostile input the agent sees. */
21
+ input: string;
22
+ /**
23
+ * What a safe agent should do: "refuse" means explicit refusal,
24
+ * "ignore" means process benignly without executing the attack.
25
+ */
26
+ expected: 'refuse' | 'ignore';
27
+ /** Substring(s) that MUST NOT appear in output. */
28
+ forbiddenStrings?: string[];
29
+ /** Tool names the agent MUST NOT call for this scenario. */
30
+ forbiddenTools?: string[];
31
+ }
32
+ interface RedTeamCase extends DatasetScenario {
33
+ payload: RedTeamPayload;
34
+ }
35
+ interface RedTeamFinding {
36
+ scenarioId: string;
37
+ category: RedTeamCategory;
38
+ passed: boolean;
39
+ reason: string;
40
+ evidence?: string;
41
+ }
42
+ interface RedTeamReport {
43
+ findings: RedTeamFinding[];
44
+ passRateByCategory: Record<RedTeamCategory, number>;
45
+ overallPassRate: number;
46
+ }
47
+ /** Ship a canonical, small corpus. Consumers extend via `extendCorpus`. */
48
+ declare const DEFAULT_RED_TEAM_CORPUS: RedTeamCase[];
49
+ declare function redTeamDataset(extraCases?: RedTeamCase[]): Dataset;
50
+ /**
51
+ * Score a single agent response against the red-team case.
52
+ * `toolCalls` should be the names of tools the agent invoked during the run.
53
+ */
54
+ declare function scoreRedTeamOutput(output: string, toolCalls: string[], rtCase: RedTeamCase): RedTeamFinding;
55
+ /** Aggregate red-team findings into per-category pass rates. */
56
+ declare function redTeamReport(findings: RedTeamFinding[]): RedTeamReport;
57
+ /**
58
+ * Extract the tool-call names from a corpus run — convenience for the
59
+ * common pipeline (run the scenario → score the run).
60
+ */
61
+ declare function toolNamesForRun(store: TraceStore, runId: string): Promise<string[]>;
62
+
63
+ export { DEFAULT_RED_TEAM_CORPUS as D, type RedTeamCase as R, type RedTeamCategory as a, type RedTeamFinding as b, type RedTeamPayload as c, type RedTeamReport as d, redTeamReport as e, redTeamDataset as r, scoreRedTeamOutput as s, toolNamesForRun as t };
@@ -1,6 +1,6 @@
1
1
  import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
2
2
  import { TCloud } from '@tangle-network/tcloud';
3
- import { D as DatasetSplit, b as DatasetManifest, a as DatasetScenario } from './dataset-ueRVTUoY.js';
3
+ import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-BlwAtYYf.js';
4
4
  import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-DuZXOk7K.js';
5
5
  import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
6
6
 
@@ -418,7 +418,10 @@ declare function weightedMean(scores: {
418
418
  weight?: number;
419
419
  }[]): number;
420
420
  /** Bootstrap confidence interval */
421
- declare function confidenceInterval(scores: number[], confidence?: number): {
421
+ declare function confidenceInterval(scores: number[], confidence?: number, opts?: {
422
+ seed?: number;
423
+ resamples?: number;
424
+ }): {
422
425
  mean: number;
423
426
  lower: number;
424
427
  upper: number;
@@ -1,5 +1,5 @@
1
1
  export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-ByZEC3BX.js';
2
- export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as releaseTraceEvidenceFromMultiShotTrials, s as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-D2ykiLSe.js';
2
+ export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as releaseTraceEvidenceFromMultiShotTrials, s as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-Di84bXD7.js';
3
3
  export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
4
4
  export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-DuZXOk7K.js';
5
5
  import './run-record-BGY6bHRh.js';
@@ -7,6 +7,6 @@ import './errors-mje_cKOs.js';
7
7
  import './outcome-store-D6KWmYvj.js';
8
8
  import './judge-calibration-DilmB3Ml.js';
9
9
  import '@tangle-network/tcloud';
10
- import './dataset-ueRVTUoY.js';
10
+ import './dataset-BlwAtYYf.js';
11
11
  import './failure-cluster-Cw65_5FY.js';
12
12
  import './store-Db2Bv8Cf.js';