@tangle-network/agent-eval 0.38.0 → 0.40.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/campaign/index.d.ts +695 -0
- package/dist/campaign/index.js +741 -0
- package/dist/campaign/index.js.map +1 -0
- package/dist/chunk-5U2DOJU4.js +565 -0
- package/dist/chunk-5U2DOJU4.js.map +1 -0
- package/dist/{chunk-KE7TDJUO.js → chunk-AU2JLNSZ.js} +2 -2
- package/dist/{chunk-TSPOEDM3.js → chunk-BWZEGTES.js} +2 -5
- package/dist/chunk-BWZEGTES.js.map +1 -0
- package/dist/{chunk-3HYQXPC2.js → chunk-DMW5VENN.js} +3 -3
- package/dist/{chunk-TQL7BAOY.js → chunk-EGIPWXHL.js} +2 -2
- package/dist/chunk-GGE4NNQT.js +65 -0
- package/dist/chunk-GGE4NNQT.js.map +1 -0
- package/dist/{chunk-7PR3WPWE.js → chunk-L7XMNXLO.js} +2 -2
- package/dist/{chunk-RL6TERL2.js → chunk-LCIDRYGP.js} +3 -3
- package/dist/{chunk-L5UNCDAJ.js → chunk-MAOZCN36.js} +2 -64
- package/dist/chunk-MAOZCN36.js.map +1 -0
- package/dist/{chunk-LGAPK7NA.js → chunk-NKLGKF2Q.js} +2 -2
- package/dist/chunk-TMXPFWC7.js +305 -0
- package/dist/chunk-TMXPFWC7.js.map +1 -0
- package/dist/{chunk-KHZRNY3F.js → chunk-WP7SY7AI.js} +5 -4
- package/dist/chunk-WP7SY7AI.js.map +1 -0
- package/dist/chunk-YV7J7X5N.js +313 -0
- package/dist/chunk-YV7J7X5N.js.map +1 -0
- package/dist/{control-DVrmvM_k.d.ts → control-CmLJk3IG.d.ts} +1 -1
- package/dist/control.d.ts +3 -3
- package/dist/control.js +2 -2
- package/dist/{dataset-ueRVTUoY.d.ts → dataset-BlwAtYYf.d.ts} +1 -1
- package/dist/{feedback-trajectory-iATEAHmc.d.ts → feedback-trajectory-Dvy-bt7x.d.ts} +1 -1
- package/dist/governance/index.d.ts +133 -5
- package/dist/index.d.ts +35 -34
- package/dist/index.js +97 -630
- package/dist/index.js.map +1 -1
- package/dist/multishot/index.d.ts +21 -21
- package/dist/multishot/index.js +64 -15
- package/dist/multishot/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +2 -2
- package/dist/optimization.js +5 -5
- package/dist/pipelines/index.js +2 -2
- package/dist/red-team-30II1T4o.d.ts +63 -0
- package/dist/{release-report-D2ykiLSe.d.ts → release-report-Di84bXD7.d.ts} +5 -2
- package/dist/reporting.d.ts +2 -2
- package/dist/reporting.js +3 -3
- package/dist/rl.js +15 -315
- package/dist/rl.js.map +1 -1
- package/dist/run-campaign-JYJXYHHL.js +10 -0
- package/dist/run-campaign-JYJXYHHL.js.map +1 -0
- package/dist/traces.js +7 -5
- package/dist/wire/index.d.ts +2 -2
- package/docs/design/loop-taxonomy.md +233 -0
- package/package.json +33 -24
- package/dist/chunk-KHZRNY3F.js.map +0 -1
- package/dist/chunk-L5UNCDAJ.js.map +0 -1
- package/dist/chunk-TSPOEDM3.js.map +0 -1
- package/dist/index-CN2agEaO.d.ts +0 -191
- /package/dist/{chunk-KE7TDJUO.js.map → chunk-AU2JLNSZ.js.map} +0 -0
- /package/dist/{chunk-3HYQXPC2.js.map → chunk-DMW5VENN.js.map} +0 -0
- /package/dist/{chunk-TQL7BAOY.js.map → chunk-EGIPWXHL.js.map} +0 -0
- /package/dist/{chunk-7PR3WPWE.js.map → chunk-L7XMNXLO.js.map} +0 -0
- /package/dist/{chunk-RL6TERL2.js.map → chunk-LCIDRYGP.js.map} +0 -0
- /package/dist/{chunk-LGAPK7NA.js.map → chunk-NKLGKF2Q.js.map} +0 -0
|
@@ -168,27 +168,6 @@ declare function renderDimensions(dims: readonly JudgeDimension[]): string;
|
|
|
168
168
|
/** Convenience: build the "Respond with ONLY this JSON" footer for a judge prompt. */
|
|
169
169
|
declare function renderJsonFooter(dims: readonly JudgeDimension[]): string;
|
|
170
170
|
|
|
171
|
-
interface RunMultishotOptions<TPersona extends MultishotPersona> {
|
|
172
|
-
profile: AgentProfile;
|
|
173
|
-
persona: TPersona;
|
|
174
|
-
shape: MultishotShape<TPersona>;
|
|
175
|
-
/** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */
|
|
176
|
-
tools?: MultishotToolDefinition[];
|
|
177
|
-
/** Map from tool name → executor invoked inline when the agent emits a tool_call. */
|
|
178
|
-
toolExecutors?: Record<string, MultishotToolExecutor>;
|
|
179
|
-
/** Map from tool name → artifact type label written into MultishotArtifact.type.
|
|
180
|
-
* Tools without a mapping still execute, but their results aren't surfaced as
|
|
181
|
-
* typed artifacts (only as tool messages in the transcript). */
|
|
182
|
-
artifactTypeFor?: (toolName: string) => string | undefined;
|
|
183
|
-
maxTurns?: number;
|
|
184
|
-
agentModel?: string;
|
|
185
|
-
driverModel?: string;
|
|
186
|
-
apiKey?: string;
|
|
187
|
-
baseUrl?: string;
|
|
188
|
-
signal?: AbortSignal;
|
|
189
|
-
}
|
|
190
|
-
declare function runMultishot<TPersona extends MultishotPersona>(opts: RunMultishotOptions<TPersona>): Promise<MultishotResult>;
|
|
191
|
-
|
|
192
171
|
interface ConversationJudgeInput<TPersona extends MultishotPersona> {
|
|
193
172
|
transcript: MultishotMessage[];
|
|
194
173
|
persona: TPersona;
|
|
@@ -273,4 +252,25 @@ interface RunMultishotMatrixResult {
|
|
|
273
252
|
}
|
|
274
253
|
declare function runMultishotMatrix<TPersona extends MultishotPersona>(opts: RunMultishotMatrixOptions<TPersona>): Promise<RunMultishotMatrixResult>;
|
|
275
254
|
|
|
255
|
+
interface RunMultishotOptions<TPersona extends MultishotPersona> {
|
|
256
|
+
profile: AgentProfile;
|
|
257
|
+
persona: TPersona;
|
|
258
|
+
shape: MultishotShape<TPersona>;
|
|
259
|
+
/** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */
|
|
260
|
+
tools?: MultishotToolDefinition[];
|
|
261
|
+
/** Map from tool name → executor invoked inline when the agent emits a tool_call. */
|
|
262
|
+
toolExecutors?: Record<string, MultishotToolExecutor>;
|
|
263
|
+
/** Map from tool name → artifact type label written into MultishotArtifact.type.
|
|
264
|
+
* Tools without a mapping still execute, but their results aren't surfaced as
|
|
265
|
+
* typed artifacts (only as tool messages in the transcript). */
|
|
266
|
+
artifactTypeFor?: (toolName: string) => string | undefined;
|
|
267
|
+
maxTurns?: number;
|
|
268
|
+
agentModel?: string;
|
|
269
|
+
driverModel?: string;
|
|
270
|
+
apiKey?: string;
|
|
271
|
+
baseUrl?: string;
|
|
272
|
+
signal?: AbortSignal;
|
|
273
|
+
}
|
|
274
|
+
declare function runMultishot<TPersona extends MultishotPersona>(opts: RunMultishotOptions<TPersona>): Promise<MultishotResult>;
|
|
275
|
+
|
|
276
276
|
export { type ArtifactJudgeInput, type CellCompositeScore, type ConversationJudgeInput, DEFAULT_CODER_MODEL, DEFAULT_DELEGATE_CODE_TOOL, DEFAULT_DELEGATE_RESEARCH_TOOL, DEFAULT_JUDGE_MODEL, DEFAULT_RESEARCHER_MODEL, type DefaultCoderConfig, type DefaultResearcherConfig, type DefaultToolsBundle, type DefaultToolsConfig, type JudgeConfig, type JudgeDimension, type JudgeScore, type MultishotArtifact, MultishotDriverEmptyError, type MultishotJudges, type MultishotMessage, type MultishotPersona, type MultishotResult, type MultishotShape, type MultishotToolDefinition, type MultishotToolExecutor, type RouterCompletionRequest, type RouterCompletionResponse, type RouterToolCall, type RunMultishotMatrixOptions, type RunMultishotMatrixResult, type RunMultishotOptions, createCodeExecutor, createResearchExecutor, defaultDelegationTools, defaultRouterBaseUrl, estimateRouterCost, renderDimensions, renderJsonFooter, requireRouterApiKey, routerCompletion, runJudge, runMultishot, runMultishotMatrix };
|
package/dist/multishot/index.js
CHANGED
|
@@ -47,7 +47,10 @@ function estimateRouterCost(model, usage) {
|
|
|
47
47
|
return (inputTok * inPer1k + outputTok * outPer1k) / 1e3;
|
|
48
48
|
}
|
|
49
49
|
function defaultRouterBaseUrl() {
|
|
50
|
-
return (process.env.TANGLE_ROUTER_BASE_URL ?? "https://router.tangle.tools/v1").replace(
|
|
50
|
+
return (process.env.TANGLE_ROUTER_BASE_URL ?? "https://router.tangle.tools/v1").replace(
|
|
51
|
+
/\/+$/,
|
|
52
|
+
""
|
|
53
|
+
);
|
|
51
54
|
}
|
|
52
55
|
function requireRouterApiKey() {
|
|
53
56
|
const key = process.env.TANGLE_API_KEY;
|
|
@@ -69,7 +72,10 @@ var DEFAULT_DELEGATE_RESEARCH_TOOL = {
|
|
|
69
72
|
type: "object",
|
|
70
73
|
properties: {
|
|
71
74
|
question: { type: "string", description: "Specific question to research" },
|
|
72
|
-
scope: {
|
|
75
|
+
scope: {
|
|
76
|
+
type: "string",
|
|
77
|
+
description: "Optional scope: time window, geography, jurisdiction, segment"
|
|
78
|
+
}
|
|
73
79
|
},
|
|
74
80
|
required: ["question"]
|
|
75
81
|
}
|
|
@@ -84,7 +90,10 @@ var DEFAULT_DELEGATE_CODE_TOOL = {
|
|
|
84
90
|
type: "object",
|
|
85
91
|
properties: {
|
|
86
92
|
goal: { type: "string", description: "What the code must accomplish" },
|
|
87
|
-
language: {
|
|
93
|
+
language: {
|
|
94
|
+
type: "string",
|
|
95
|
+
description: "Optional language preference (default: TypeScript)"
|
|
96
|
+
}
|
|
88
97
|
},
|
|
89
98
|
required: ["goal"]
|
|
90
99
|
}
|
|
@@ -169,7 +178,10 @@ async function runJudge(judge, input) {
|
|
|
169
178
|
});
|
|
170
179
|
raw = (message.content ?? "").trim();
|
|
171
180
|
} catch (err) {
|
|
172
|
-
return {
|
|
181
|
+
return {
|
|
182
|
+
...ZERO_SCORE,
|
|
183
|
+
notes: `judge ${judge.name} call failed: ${err instanceof Error ? err.message : String(err)}`
|
|
184
|
+
};
|
|
173
185
|
}
|
|
174
186
|
let parsed = null;
|
|
175
187
|
try {
|
|
@@ -201,6 +213,10 @@ function renderJsonFooter(dims) {
|
|
|
201
213
|
{${fields},"notes":"1-2 sentence critique"}`;
|
|
202
214
|
}
|
|
203
215
|
|
|
216
|
+
// src/multishot/matrix.ts
|
|
217
|
+
import { mkdirSync, writeFileSync } from "fs";
|
|
218
|
+
import { join } from "path";
|
|
219
|
+
|
|
204
220
|
// src/multishot/types.ts
|
|
205
221
|
var MultishotDriverEmptyError = class extends Error {
|
|
206
222
|
constructor(turn) {
|
|
@@ -218,7 +234,11 @@ async function runMultishot(opts) {
|
|
|
218
234
|
const maxTurns = opts.maxTurns ?? 10;
|
|
219
235
|
const agentModel = opts.agentModel ?? "openai/gpt-5.4";
|
|
220
236
|
const driverModel = opts.driverModel ?? "openai/gpt-4o-mini";
|
|
221
|
-
const bundle = opts.tools && opts.toolExecutors ? {
|
|
237
|
+
const bundle = opts.tools && opts.toolExecutors ? {
|
|
238
|
+
tools: opts.tools,
|
|
239
|
+
executors: opts.toolExecutors,
|
|
240
|
+
artifactTypeFor: opts.artifactTypeFor ?? (() => void 0)
|
|
241
|
+
} : defaultDelegationTools();
|
|
222
242
|
const tools = opts.tools ?? bundle.tools;
|
|
223
243
|
const executors = opts.toolExecutors ?? bundle.executors;
|
|
224
244
|
const artifactTypeFor = opts.artifactTypeFor ?? bundle.artifactTypeFor;
|
|
@@ -282,7 +302,12 @@ async function runMultishot(opts) {
|
|
|
282
302
|
totalCostUsd += r.costUsd;
|
|
283
303
|
const artifactType = artifactTypeFor(tc.name);
|
|
284
304
|
if (artifactType) {
|
|
285
|
-
artifacts.push({
|
|
305
|
+
artifacts.push({
|
|
306
|
+
type: artifactType,
|
|
307
|
+
turn,
|
|
308
|
+
invocation: { name: tc.name, args: tc.args },
|
|
309
|
+
content: toolResult
|
|
310
|
+
});
|
|
286
311
|
}
|
|
287
312
|
}
|
|
288
313
|
} catch (err) {
|
|
@@ -349,8 +374,6 @@ async function driverTurn(opts) {
|
|
|
349
374
|
}
|
|
350
375
|
|
|
351
376
|
// src/multishot/matrix.ts
|
|
352
|
-
import { mkdirSync, writeFileSync } from "fs";
|
|
353
|
-
import { join } from "path";
|
|
354
377
|
async function runMultishotMatrix(opts) {
|
|
355
378
|
const codeTypes = new Set(opts.judges.codeArtifactTypes ?? ["code"]);
|
|
356
379
|
const contentTypes = new Set(opts.judges.contentArtifactTypes ?? ["research"]);
|
|
@@ -385,16 +408,34 @@ async function runMultishotMatrix(opts) {
|
|
|
385
408
|
const contentArtifacts = sim.artifacts.filter((a) => contentTypes.has(a.type));
|
|
386
409
|
const [conversation, codeReviews, contentReviews] = await Promise.all([
|
|
387
410
|
runJudge(opts.judges.conversation, { transcript: sim.transcript, persona }),
|
|
388
|
-
opts.judges.codeReview ? Promise.all(
|
|
389
|
-
|
|
411
|
+
opts.judges.codeReview ? Promise.all(
|
|
412
|
+
codeArtifacts.map(
|
|
413
|
+
(artifact) => runJudge(opts.judges.codeReview, { artifact, persona }).then((s) => ({
|
|
414
|
+
...s,
|
|
415
|
+
turn: artifact.turn,
|
|
416
|
+
type: artifact.type
|
|
417
|
+
}))
|
|
418
|
+
)
|
|
419
|
+
) : Promise.resolve([]),
|
|
420
|
+
opts.judges.contentQuality ? Promise.all(
|
|
421
|
+
contentArtifacts.map(
|
|
422
|
+
(artifact) => runJudge(opts.judges.contentQuality, { artifact, persona }).then((s) => ({
|
|
423
|
+
...s,
|
|
424
|
+
turn: artifact.turn,
|
|
425
|
+
type: artifact.type
|
|
426
|
+
}))
|
|
427
|
+
)
|
|
428
|
+
) : Promise.resolve([])
|
|
390
429
|
]);
|
|
391
430
|
const codeComposite = codeReviews.length === 0 ? 0 : codeReviews.reduce((s, r) => s + r.composite, 0) / codeReviews.length;
|
|
392
431
|
const contentComposite = contentReviews.length === 0 ? 0 : contentReviews.reduce((s, r) => s + r.composite, 0) / contentReviews.length;
|
|
393
432
|
const judgeCount = 1 + (opts.judges.codeReview ? 1 : 0) + (opts.judges.contentQuality ? 1 : 0);
|
|
394
433
|
const composite = (conversation.composite + codeComposite + contentComposite) / judgeCount;
|
|
395
434
|
const cellScore = { composite, conversation };
|
|
396
|
-
if (opts.judges.codeReview)
|
|
397
|
-
|
|
435
|
+
if (opts.judges.codeReview)
|
|
436
|
+
cellScore.codeReview = { perArtifact: codeReviews, composite: codeComposite };
|
|
437
|
+
if (opts.judges.contentQuality)
|
|
438
|
+
cellScore.contentQuality = { perArtifact: contentReviews, composite: contentComposite };
|
|
398
439
|
const cellDir = join(opts.runDir, profileId, personaId, `rep-${cell.rep}`);
|
|
399
440
|
mkdirSync(cellDir, { recursive: true });
|
|
400
441
|
writeFileSync(join(cellDir, "transcript.json"), JSON.stringify(sim.transcript, null, 2));
|
|
@@ -404,7 +445,11 @@ async function runMultishotMatrix(opts) {
|
|
|
404
445
|
if (opts.judges.codeReview) notes.push(`code=${codeComposite.toFixed(1)}`);
|
|
405
446
|
if (opts.judges.contentQuality) notes.push(`content=${contentComposite.toFixed(1)}`);
|
|
406
447
|
return {
|
|
407
|
-
output: {
|
|
448
|
+
output: {
|
|
449
|
+
turns: sim.transcript.length,
|
|
450
|
+
toolCalls: sim.toolCalls,
|
|
451
|
+
artifactCount: sim.artifacts.length
|
|
452
|
+
},
|
|
408
453
|
verdict: { valid: composite >= 5, score: composite, notes: notes.join(" ") },
|
|
409
454
|
costUsd: sim.costUsd,
|
|
410
455
|
durationMs: sim.durationMs
|
|
@@ -432,13 +477,17 @@ async function runMultishotMatrix(opts) {
|
|
|
432
477
|
``,
|
|
433
478
|
"| profile | pass | mean | cost |",
|
|
434
479
|
"|---|---|---|---|",
|
|
435
|
-
...Object.entries(matrix.byAxis.profile ?? {}).map(
|
|
480
|
+
...Object.entries(matrix.byAxis.profile ?? {}).map(
|
|
481
|
+
([id, s]) => `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`
|
|
482
|
+
),
|
|
436
483
|
``,
|
|
437
484
|
`## By persona`,
|
|
438
485
|
``,
|
|
439
486
|
"| persona | pass | mean | cost |",
|
|
440
487
|
"|---|---|---|---|",
|
|
441
|
-
...Object.entries(matrix.byAxis.persona ?? {}).map(
|
|
488
|
+
...Object.entries(matrix.byAxis.persona ?? {}).map(
|
|
489
|
+
([id, s]) => `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`
|
|
490
|
+
),
|
|
442
491
|
``
|
|
443
492
|
];
|
|
444
493
|
writeFileSync(join(opts.runDir, "summary.md"), md.join("\n"));
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/multishot/router.ts","../../src/multishot/default-tools.ts","../../src/multishot/judges.ts","../../src/multishot/types.ts","../../src/multishot/multishot.ts","../../src/multishot/matrix.ts"],"sourcesContent":["// Router fetch helper — single source of truth for OpenAI-compat calls\n// against the Tangle router. Used by the driver, agent, judges, and the\n// default tool executors.\n\nimport type { MultishotToolDefinition } from './types'\n\nexport interface RouterCompletionRequest {\n apiKey: string\n baseUrl: string\n model: string\n messages: Array<Record<string, unknown>>\n tools?: MultishotToolDefinition[]\n temperature?: number\n maxTokens?: number\n signal?: AbortSignal\n}\n\nexport interface RouterToolCall {\n id: string\n type: 'function'\n function: { name: string; arguments: string }\n}\n\nexport interface RouterCompletionResponse {\n message: { content?: string | null; tool_calls?: RouterToolCall[] }\n usage?: { prompt_tokens?: number; completion_tokens?: number }\n}\n\nexport async function routerCompletion(req: RouterCompletionRequest): Promise<RouterCompletionResponse> {\n const body: Record<string, unknown> = {\n model: req.model,\n messages: req.messages,\n temperature: req.temperature ?? 0.7,\n max_tokens: req.maxTokens ?? 2000,\n }\n if (req.tools?.length) body.tools = req.tools\n const url = `${req.baseUrl.replace(/\\/+$/, '')}/chat/completions`\n const res = await fetch(url, {\n method: 'POST',\n headers: { Authorization: `Bearer ${req.apiKey}`, 'Content-Type': 'application/json' },\n body: JSON.stringify(body),\n signal: req.signal,\n })\n if (!res.ok) {\n const text = await res.text()\n throw new Error(`router ${res.status}: ${text.slice(0, 300)}`)\n }\n const json = (await res.json()) as {\n choices: Array<{ message: { content?: string | null; tool_calls?: RouterToolCall[] } }>\n usage?: { prompt_tokens?: number; completion_tokens?: number }\n }\n const choice = json.choices[0]\n if (!choice) throw new Error(`router returned no choices: ${JSON.stringify(json).slice(0, 200)}`)\n return { message: choice.message, usage: json.usage }\n}\n\n// Rough per-model cost estimator. Used for cost-ceiling enforcement.\n// Underestimates Anthropic, overestimates oss models — fine for ceilings.\nexport function estimateRouterCost(\n model: string,\n usage?: { prompt_tokens?: number; completion_tokens?: number },\n): number {\n if (!usage) return 0\n const inputTok = usage.prompt_tokens ?? 0\n const outputTok = usage.completion_tokens ?? 0\n let inPer1k = 0.003\n let outPer1k = 0.015\n if (model.includes('gpt-4o-mini')) {\n inPer1k = 0.00015\n outPer1k = 0.0006\n } else if (model.includes('gpt-5.4') || model.includes('claude-sonnet')) {\n inPer1k = 0.003\n outPer1k = 0.015\n } else if (model.includes('kimi') || model.includes('glm') || model.includes('deepseek')) {\n inPer1k = 0.0005\n outPer1k = 0.002\n }\n return (inputTok * inPer1k + outputTok * outPer1k) / 1000\n}\n\nexport function defaultRouterBaseUrl(): string {\n return (process.env.TANGLE_ROUTER_BASE_URL ?? 'https://router.tangle.tools/v1').replace(/\\/+$/, '')\n}\n\nexport function requireRouterApiKey(): string {\n const key = process.env.TANGLE_API_KEY\n if (!key) throw new Error('multishot requires TANGLE_API_KEY (router-scoped sk-tan-* key)')\n return key\n}\n","// Default delegate_research + delegate_code tools and their inline executors.\n//\n// Consumers can override either by passing their own tools + executors to\n// runMultishot. The defaults are sufficient for most domains — point the\n// researcher system prompt at your domain's citation style and the coder\n// at your preferred language.\n\nimport { estimateRouterCost, routerCompletion } from './router'\nimport type { MultishotToolDefinition, MultishotToolExecutor } from './types'\n\nexport const DEFAULT_RESEARCHER_MODEL = 'openai/gpt-4o-mini'\nexport const DEFAULT_CODER_MODEL = 'openai/gpt-4o-mini'\n\nexport interface DefaultResearcherConfig {\n /** Replace the system prompt to bias the researcher toward a domain's\n * citation style. Defaults to a generic \"cite sources by name\" prompt. */\n systemPrompt?: string\n model?: string\n}\n\nexport interface DefaultCoderConfig {\n /** Replace the system prompt to bias the coder toward a language /\n * framework / artifact style. */\n systemPrompt?: string\n model?: string\n}\n\nconst GENERIC_RESEARCHER_SYSTEM =\n 'You are a research specialist. Return a markdown brief with 3-5 findings. Each finding cites a specific source by name. Add a confidence level (high/medium/low) per finding. No fluff, no preamble.'\n\nconst GENERIC_CODER_SYSTEM =\n 'You are an expert engineer. Output ONE fenced code block containing the complete solution. Inline-comment non-obvious decisions. No explanation outside the block.'\n\nexport const DEFAULT_DELEGATE_RESEARCH_TOOL: MultishotToolDefinition = {\n type: 'function',\n function: {\n name: 'delegate_research',\n description:\n 'Research a topic deeply via specialist. Returns evidence-bearing items with citations. Use for audience research, competitive intel, regulatory landscape, market data, citation-grounded analysis.',\n parameters: {\n type: 'object',\n properties: {\n question: { type: 'string', description: 'Specific question to research' },\n scope: { type: 'string', description: 'Optional scope: time window, geography, jurisdiction, segment' },\n },\n required: ['question'],\n },\n },\n}\n\nexport const DEFAULT_DELEGATE_CODE_TOOL: MultishotToolDefinition = {\n type: 'function',\n function: {\n name: 'delegate_code',\n description:\n 'Generate a runnable script, template, pipeline, or tool via specialist. Returns complete working code or structured markdown. Use for content pipelines, calc snippets, dashboards, compliance checklists, deadline trackers.',\n parameters: {\n type: 'object',\n properties: {\n goal: { type: 'string', description: 'What the code must accomplish' },\n language: { type: 'string', description: 'Optional language preference (default: TypeScript)' },\n },\n required: ['goal'],\n },\n },\n}\n\nexport function createResearchExecutor(config: DefaultResearcherConfig = {}): MultishotToolExecutor {\n const systemPrompt = config.systemPrompt ?? GENERIC_RESEARCHER_SYSTEM\n const model = config.model ?? DEFAULT_RESEARCHER_MODEL\n return async (args, ctx) => {\n const question = String(args.question ?? '')\n const scope = args.scope ? String(args.scope) : undefined\n const { message, usage } = await routerCompletion({\n apiKey: ctx.apiKey,\n baseUrl: ctx.baseUrl,\n model,\n temperature: 0.3,\n maxTokens: 1800,\n messages: [\n { role: 'system', content: systemPrompt },\n { role: 'user', content: `Research: ${question}${scope ? `\\nScope: ${scope}` : ''}` },\n ],\n signal: ctx.signal,\n })\n return { content: message.content ?? '', costUsd: estimateRouterCost(model, usage) }\n }\n}\n\nexport function createCodeExecutor(config: DefaultCoderConfig = {}): MultishotToolExecutor {\n const systemPrompt = config.systemPrompt ?? GENERIC_CODER_SYSTEM\n const model = config.model ?? DEFAULT_CODER_MODEL\n return async (args, ctx) => {\n const goal = String(args.goal ?? '')\n const language = args.language ? String(args.language) : 'TypeScript'\n const { message, usage } = await routerCompletion({\n apiKey: ctx.apiKey,\n baseUrl: ctx.baseUrl,\n model,\n temperature: 0.2,\n maxTokens: 2000,\n messages: [\n { role: 'system', content: `${systemPrompt}\\n\\nLanguage: ${language}` },\n { role: 'user', content: `Produce: ${goal}` },\n ],\n signal: ctx.signal,\n })\n return { content: message.content ?? '', costUsd: estimateRouterCost(model, usage) }\n }\n}\n\nexport interface DefaultToolsConfig {\n research?: DefaultResearcherConfig\n code?: DefaultCoderConfig\n /** When true (default), each tool result is recorded as a typed artifact:\n * research → type='research', code → type='code'. */\n recordArtifacts?: boolean\n}\n\nexport interface DefaultToolsBundle {\n tools: MultishotToolDefinition[]\n executors: Record<string, MultishotToolExecutor>\n artifactTypeFor: (toolName: string) => string | undefined\n}\n\nexport function defaultDelegationTools(config: DefaultToolsConfig = {}): DefaultToolsBundle {\n return {\n tools: [DEFAULT_DELEGATE_RESEARCH_TOOL, DEFAULT_DELEGATE_CODE_TOOL],\n executors: {\n delegate_research: createResearchExecutor(config.research),\n delegate_code: createCodeExecutor(config.code),\n },\n artifactTypeFor: (name) => (name === 'delegate_research' ? 'research' : name === 'delegate_code' ? 'code' : undefined),\n }\n}\n\nexport { defaultRouterBaseUrl } from './router'\n","// Generic judge runner — domain consumers configure dimensions + prompts.\n//\n// Three judge slots are conventional for multishot eval:\n// - conversation (scores the full transcript)\n// - codeReview (scores each code artifact)\n// - contentQuality (scores each non-code artifact)\n//\n// But the runJudge primitive is fully generic — any T → JudgeScore mapping.\n\nimport { defaultRouterBaseUrl, requireRouterApiKey, routerCompletion } from './router'\n\nexport const DEFAULT_JUDGE_MODEL = 'openai/gpt-4o-mini'\n\nexport interface JudgeDimension {\n /** JSON field name + score key. */\n key: string\n /** Description shown in the judge's user prompt. */\n description: string\n}\n\nexport interface JudgeConfig<TInput> {\n /** Display name (for trace + log). */\n name: string\n /** Model used for this judge. */\n model?: string\n /** 0-10 scored dimensions. */\n dimensions: JudgeDimension[]\n /** Judge system prompt — sets persona + JSON-only constraint. */\n systemPrompt: string\n /** Build the user prompt from the typed input. Must include \"Respond with\n * ONLY this JSON: { ... }\" listing each dimension key. */\n buildPrompt: (input: TInput) => string\n /** Optional model + api overrides. */\n apiKey?: string\n baseUrl?: string\n}\n\nexport interface JudgeScore {\n /** Per-dimension 0-10 score. Missing dims default to 0. */\n dimensions: Record<string, number>\n /** Mean across dimensions. */\n composite: number\n /** Free-form 1-2 sentence critique from the judge (when provided). */\n notes: string\n}\n\nconst ZERO_SCORE: JudgeScore = { dimensions: {}, composite: 0, notes: 'parse failed' }\n\nexport async function runJudge<TInput>(judge: JudgeConfig<TInput>, input: TInput): Promise<JudgeScore> {\n const apiKey = judge.apiKey ?? requireRouterApiKey()\n const baseUrl = judge.baseUrl ?? defaultRouterBaseUrl()\n const model = judge.model ?? process.env.JUDGE_MODEL ?? DEFAULT_JUDGE_MODEL\n const prompt = judge.buildPrompt(input)\n let raw = ''\n try {\n const { message } = await routerCompletion({\n apiKey,\n baseUrl,\n model,\n temperature: 0,\n maxTokens: 1500,\n messages: [\n { role: 'system', content: judge.systemPrompt },\n { role: 'user', content: prompt },\n ],\n })\n raw = (message.content ?? '').trim()\n } catch (err) {\n return { ...ZERO_SCORE, notes: `judge ${judge.name} call failed: ${err instanceof Error ? err.message : String(err)}` }\n }\n\n let parsed: Record<string, unknown> | null = null\n try {\n const cleaned = raw.replace(/^```json\\s*/i, '').replace(/```\\s*$/, '').trim()\n parsed = JSON.parse(cleaned) as Record<string, unknown>\n } catch {\n return { ...ZERO_SCORE, notes: `judge ${judge.name} returned non-JSON: ${raw.slice(0, 200)}` }\n }\n\n const dimensions: Record<string, number> = {}\n let sum = 0\n for (const dim of judge.dimensions) {\n const v = Number(parsed[dim.key] ?? 0)\n const clamped = Number.isFinite(v) ? Math.max(0, Math.min(10, v)) : 0\n dimensions[dim.key] = clamped\n sum += clamped\n }\n return {\n dimensions,\n composite: judge.dimensions.length === 0 ? 0 : sum / judge.dimensions.length,\n notes: typeof parsed.notes === 'string' ? parsed.notes : '',\n }\n}\n\n/** Convenience: stringified dimension list for inclusion in a judge prompt.\n * Returns lines like `- audience_fit: Does this match what the audience cares about? (0-10)`. */\nexport function renderDimensions(dims: readonly JudgeDimension[]): string {\n return dims.map((d) => `- ${d.key}: ${d.description}`).join('\\n')\n}\n\n/** Convenience: build the \"Respond with ONLY this JSON\" footer for a judge prompt. */\nexport function renderJsonFooter(dims: readonly JudgeDimension[]): string {\n const fields = dims.map((d) => `\"${d.key}\":N`).join(',')\n return `Respond with ONLY this JSON (no markdown, no preamble):\\n{${fields},\"notes\":\"1-2 sentence critique\"}`\n}\n","// Public types for the multishot substrate.\n\nexport interface MultishotMessage {\n role: 'user' | 'assistant' | 'tool'\n content: string\n toolCallId?: string\n toolCalls?: Array<{ id: string; name: string; args: Record<string, unknown> }>\n}\n\nexport interface MultishotArtifact {\n type: string\n turn: number\n invocation: { name: string; args: Record<string, unknown> }\n content: string\n}\n\nexport interface MultishotResult {\n transcript: MultishotMessage[]\n artifacts: MultishotArtifact[]\n toolCalls: number\n durationMs: number\n costUsd: number\n}\n\nexport interface MultishotToolDefinition {\n type: 'function'\n function: {\n name: string\n description: string\n parameters: Record<string, unknown>\n }\n}\n\nexport type MultishotToolExecutor = (\n args: Record<string, unknown>,\n ctx: { apiKey: string; baseUrl: string; signal?: AbortSignal },\n) => Promise<{ content: string; costUsd: number }>\n\nexport interface MultishotPersona {\n /** Stable identifier — used for per-cell artifact paths + matrix axis keys. */\n id: string\n /** Per-domain payload (income/profile/voice/etc.) shaped by the consumer. */\n [k: string]: unknown\n}\n\nexport interface MultishotShape<TPersona extends MultishotPersona> {\n /** Opening user message (turn 0) — the persona's first ask. */\n buildOpener: (persona: TPersona) => string\n /** System prompt the driver LLM uses to roleplay the persona. Should set\n * voice, goals, constraints, time-pressure, and the \"never go silent\" rule. */\n buildDriverSystemPrompt: (persona: TPersona) => string\n}\n\nexport class MultishotDriverEmptyError extends Error {\n constructor(public readonly turn: number) {\n super(`multishot: driver returned empty content twice at turn ${turn} — failing loud`)\n this.name = 'MultishotDriverEmptyError'\n }\n}\n","// Multi-turn driver-agent simulation with inline tool execution.\n//\n// The driver = LLM acting as the persona (reactive, non-deterministic).\n// The agent = the product agent under test (router call with profile's\n// systemPrompt + the configured tools).\n// Tool calls execute inline via the configured executors and feed back\n// into the agent's message log so the agent integrates the result.\n\nimport type { AgentProfile } from '@tangle-network/sandbox'\nimport { defaultDelegationTools } from './default-tools'\nimport {\n defaultRouterBaseUrl,\n estimateRouterCost,\n requireRouterApiKey,\n routerCompletion,\n} from './router'\nimport {\n MultishotDriverEmptyError,\n type MultishotArtifact,\n type MultishotMessage,\n type MultishotPersona,\n type MultishotResult,\n type MultishotShape,\n type MultishotToolDefinition,\n type MultishotToolExecutor,\n} from './types'\n\nexport interface RunMultishotOptions<TPersona extends MultishotPersona> {\n profile: AgentProfile\n persona: TPersona\n shape: MultishotShape<TPersona>\n /** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */\n tools?: MultishotToolDefinition[]\n /** Map from tool name → executor invoked inline when the agent emits a tool_call. */\n toolExecutors?: Record<string, MultishotToolExecutor>\n /** Map from tool name → artifact type label written into MultishotArtifact.type.\n * Tools without a mapping still execute, but their results aren't surfaced as\n * typed artifacts (only as tool messages in the transcript). */\n artifactTypeFor?: (toolName: string) => string | undefined\n maxTurns?: number\n agentModel?: string\n driverModel?: string\n apiKey?: string\n baseUrl?: string\n signal?: AbortSignal\n}\n\nexport async function runMultishot<TPersona extends MultishotPersona>(\n opts: RunMultishotOptions<TPersona>,\n): Promise<MultishotResult> {\n const apiKey = opts.apiKey ?? requireRouterApiKey()\n const baseUrl = opts.baseUrl ?? defaultRouterBaseUrl()\n const maxTurns = opts.maxTurns ?? 10\n const agentModel = opts.agentModel ?? 'openai/gpt-5.4'\n const driverModel = opts.driverModel ?? 'openai/gpt-4o-mini'\n\n const bundle = opts.tools && opts.toolExecutors\n ? { tools: opts.tools, executors: opts.toolExecutors, artifactTypeFor: opts.artifactTypeFor ?? (() => undefined) }\n : defaultDelegationTools()\n const tools = opts.tools ?? bundle.tools\n const executors = opts.toolExecutors ?? bundle.executors\n const artifactTypeFor = opts.artifactTypeFor ?? bundle.artifactTypeFor\n\n const start = Date.now()\n const transcript: MultishotMessage[] = []\n const artifacts: MultishotArtifact[] = []\n let toolCalls = 0\n let totalCostUsd = 0\n\n const opener = opts.shape.buildOpener(opts.persona)\n transcript.push({ role: 'user', content: opener })\n\n const systemPrompt = opts.profile.prompt?.systemPrompt ?? ''\n const agentMessages: Array<Record<string, unknown>> = [\n { role: 'system', content: systemPrompt },\n { role: 'user', content: opener },\n ]\n\n for (let turn = 0; turn < maxTurns; turn++) {\n if (opts.signal?.aborted) throw new Error('multishot aborted')\n\n const { message: agentMsg, usage: agentUsage } = await routerCompletion({\n apiKey,\n baseUrl,\n model: agentModel,\n messages: agentMessages,\n tools,\n temperature: 0.7,\n maxTokens: 2500,\n signal: opts.signal,\n })\n totalCostUsd += estimateRouterCost(agentModel, agentUsage)\n\n const agentText = (agentMsg.content ?? '').trim()\n const agentToolCalls = (agentMsg.tool_calls ?? []).map((tc) => ({\n id: tc.id,\n name: tc.function.name,\n args: (() => {\n try {\n return JSON.parse(tc.function.arguments) as Record<string, unknown>\n } catch {\n return {} as Record<string, unknown>\n }\n })(),\n }))\n\n agentMessages.push({\n role: 'assistant',\n content: agentText || null,\n ...(agentMsg.tool_calls?.length ? { tool_calls: agentMsg.tool_calls } : {}),\n })\n transcript.push({\n role: 'assistant',\n content: agentText,\n toolCalls: agentToolCalls.length > 0 ? agentToolCalls : undefined,\n })\n\n for (const tc of agentToolCalls) {\n toolCalls++\n let toolResult = ''\n try {\n const executor = executors[tc.name]\n if (!executor) {\n toolResult = JSON.stringify({ error: `unknown tool ${tc.name}` })\n } else {\n const r = await executor(tc.args, { apiKey, baseUrl, signal: opts.signal })\n toolResult = r.content\n totalCostUsd += r.costUsd\n const artifactType = artifactTypeFor(tc.name)\n if (artifactType) {\n artifacts.push({ type: artifactType, turn, invocation: { name: tc.name, args: tc.args }, content: toolResult })\n }\n }\n } catch (err) {\n toolResult = JSON.stringify({ error: err instanceof Error ? err.message : String(err) })\n }\n agentMessages.push({ role: 'tool', tool_call_id: tc.id, content: toolResult || 'done' })\n transcript.push({ role: 'tool', content: toolResult || 'done', toolCallId: tc.id })\n }\n\n // If the agent emitted tool_calls, give it a follow-up turn to integrate the results.\n if (agentToolCalls.length > 0) {\n const followUp = await routerCompletion({\n apiKey,\n baseUrl,\n model: agentModel,\n messages: agentMessages,\n temperature: 0.7,\n maxTokens: 2000,\n signal: opts.signal,\n })\n totalCostUsd += estimateRouterCost(agentModel, followUp.usage)\n const followUpText = (followUp.message.content ?? '').trim()\n agentMessages.push({ role: 'assistant', content: followUpText })\n transcript.push({ role: 'assistant', content: followUpText })\n }\n\n if (turn < maxTurns - 1) {\n const driver = await driverTurn({\n apiKey,\n baseUrl,\n persona: opts.persona,\n shape: opts.shape,\n transcript,\n turn,\n model: driverModel,\n signal: opts.signal,\n })\n totalCostUsd += driver.costUsd\n agentMessages.push({ role: 'user', content: driver.content })\n transcript.push({ role: 'user', content: driver.content })\n }\n }\n\n return { transcript, artifacts, toolCalls, durationMs: Date.now() - start, costUsd: totalCostUsd }\n}\n\nasync function driverTurn<TPersona extends MultishotPersona>(opts: {\n apiKey: string\n baseUrl: string\n persona: TPersona\n shape: MultishotShape<TPersona>\n transcript: MultishotMessage[]\n turn: number\n model: string\n signal?: AbortSignal\n}): Promise<{ content: string; costUsd: number }> {\n const driverSystem = opts.shape.buildDriverSystemPrompt(opts.persona)\n\n // Translate transcript to driver POV: agent's `assistant` messages become\n // `user` (the agent talking TO the driver); the driver's prior `user`\n // messages become `assistant` (the driver's prior responses).\n const driverMessages: Array<Record<string, unknown>> = [{ role: 'system', content: driverSystem }]\n for (const msg of opts.transcript) {\n if (msg.role === 'tool') continue\n if (msg.role === 'assistant') driverMessages.push({ role: 'user', content: msg.content })\n else if (msg.role === 'user') driverMessages.push({ role: 'assistant', content: msg.content })\n }\n\n // Driver must never go silent. Retry once on empty content; then fail loud.\n for (let attempt = 0; attempt < 2; attempt++) {\n const { message, usage } = await routerCompletion({\n apiKey: opts.apiKey,\n baseUrl: opts.baseUrl,\n model: opts.model,\n messages: driverMessages,\n temperature: 0.9,\n maxTokens: 600,\n signal: opts.signal,\n })\n const content = (message.content ?? '').trim()\n if (content.length > 0) return { content, costUsd: estimateRouterCost(opts.model, usage) }\n }\n throw new MultishotDriverEmptyError(opts.turn)\n}\n","// Multishot matrix wrapper — sweeps profiles × personas × reps, runs\n// the driver-agent loop per cell, applies up to three configured judges,\n// persists per-cell artifacts, and aggregates by axis.\n//\n// Uses runAgentMatrix from @tangle-network/agent-eval/matrix under the\n// hood so cell scheduling + concurrency + cost ceiling are unified with\n// other matrix consumers.\n\nimport { mkdirSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type { AgentProfile } from '@tangle-network/sandbox'\nimport { runAgentMatrix } from '../matrix'\nimport type { MatrixResult } from '../matrix'\nimport { runJudge, type JudgeConfig, type JudgeScore } from './judges'\nimport { runMultishot } from './multishot'\nimport type {\n MultishotArtifact,\n MultishotMessage,\n MultishotPersona,\n MultishotShape,\n MultishotToolDefinition,\n MultishotToolExecutor,\n} from './types'\n\nexport interface ConversationJudgeInput<TPersona extends MultishotPersona> {\n transcript: MultishotMessage[]\n persona: TPersona\n}\n\nexport interface ArtifactJudgeInput<TPersona extends MultishotPersona> {\n artifact: MultishotArtifact\n persona: TPersona\n}\n\nexport interface MultishotJudges<TPersona extends MultishotPersona> {\n /** Scores the full transcript end-to-end (always runs). */\n conversation: JudgeConfig<ConversationJudgeInput<TPersona>>\n /** Scores each code-type artifact. Optional — omit when domain has no code artifacts. */\n codeReview?: JudgeConfig<ArtifactJudgeInput<TPersona>>\n /** Scores each non-code (research/content/template) artifact. Optional. */\n contentQuality?: JudgeConfig<ArtifactJudgeInput<TPersona>>\n /** Which artifact types route to codeReview. Defaults to ['code']. */\n codeArtifactTypes?: string[]\n /** Which artifact types route to contentQuality. Defaults to ['research']. */\n contentArtifactTypes?: string[]\n}\n\nexport interface CellCompositeScore {\n composite: number\n conversation: JudgeScore\n codeReview?: { perArtifact: Array<JudgeScore & { turn: number; type: string }>; composite: number }\n contentQuality?: { perArtifact: Array<JudgeScore & { turn: number; type: string }>; composite: number }\n}\n\nexport interface RunMultishotMatrixOptions<TPersona extends MultishotPersona> {\n /** AgentProfile axis (matrix primary). */\n profiles: Array<{ id: string; value: AgentProfile }>\n /** Persona axis. */\n personas: TPersona[]\n /** Persona-shaping callbacks. */\n shape: MultishotShape<TPersona>\n /** Judge configurations. */\n judges: MultishotJudges<TPersona>\n /** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */\n tools?: MultishotToolDefinition[]\n /** Map from tool name → inline executor. Must align with `tools`. */\n toolExecutors?: Record<string, MultishotToolExecutor>\n /** Tool name → artifact type label. Defaults to research/code mapping. */\n artifactTypeFor?: (toolName: string) => string | undefined\n /** Where per-cell artifacts land. Cells write to `<runDir>/<profileId>/<personaId>/rep-N/`. */\n runDir: string\n /** Replicates per (profile, persona) cell. */\n reps?: number\n /** Max conversation turns per cell. */\n maxTurns?: number\n /** Max concurrent cells. */\n maxConcurrency?: number\n /** Total $ ceiling across the matrix; cells aborted past this. */\n costCeiling?: number\n /** Agent model. */\n agentModel?: string\n /** Driver model. */\n driverModel?: string\n /** Pass-thru fields. */\n apiKey?: string\n baseUrl?: string\n}\n\ninterface CellOutput {\n turns: number\n toolCalls: number\n artifactCount: number\n}\n\nexport interface RunMultishotMatrixResult {\n matrix: MatrixResult<CellOutput>\n}\n\nexport async function runMultishotMatrix<TPersona extends MultishotPersona>(\n opts: RunMultishotMatrixOptions<TPersona>,\n): Promise<RunMultishotMatrixResult> {\n const codeTypes = new Set(opts.judges.codeArtifactTypes ?? ['code'])\n const contentTypes = new Set(opts.judges.contentArtifactTypes ?? ['research'])\n mkdirSync(opts.runDir, { recursive: true })\n\n const matrix = await runAgentMatrix<CellOutput>({\n axes: [\n { name: 'profile', values: opts.profiles },\n { name: 'persona', values: opts.personas.map((p) => ({ id: p.id, value: p })) },\n ],\n reps: opts.reps ?? 1,\n maxConcurrency: opts.maxConcurrency ?? 2,\n costCeiling: opts.costCeiling,\n async runCell(cell) {\n const profile = cell.axes.profile?.value as AgentProfile\n const persona = cell.axes.persona?.value as TPersona\n const profileId = String(cell.axes.profile?.id ?? 'unknown')\n const personaId = String(cell.axes.persona?.id ?? 'unknown')\n\n const sim = await runMultishot({\n profile,\n persona,\n shape: opts.shape,\n tools: opts.tools,\n toolExecutors: opts.toolExecutors,\n artifactTypeFor: opts.artifactTypeFor,\n maxTurns: opts.maxTurns,\n agentModel: opts.agentModel,\n driverModel: opts.driverModel,\n apiKey: opts.apiKey,\n baseUrl: opts.baseUrl,\n })\n\n const codeArtifacts = sim.artifacts.filter((a) => codeTypes.has(a.type))\n const contentArtifacts = sim.artifacts.filter((a) => contentTypes.has(a.type))\n\n const [conversation, codeReviews, contentReviews] = await Promise.all([\n runJudge(opts.judges.conversation, { transcript: sim.transcript, persona }),\n opts.judges.codeReview\n ? Promise.all(codeArtifacts.map((artifact) => runJudge(opts.judges.codeReview!, { artifact, persona }).then((s) => ({ ...s, turn: artifact.turn, type: artifact.type }))))\n : Promise.resolve([] as Array<JudgeScore & { turn: number; type: string }>),\n opts.judges.contentQuality\n ? Promise.all(contentArtifacts.map((artifact) => runJudge(opts.judges.contentQuality!, { artifact, persona }).then((s) => ({ ...s, turn: artifact.turn, type: artifact.type }))))\n : Promise.resolve([] as Array<JudgeScore & { turn: number; type: string }>),\n ])\n\n const codeComposite = codeReviews.length === 0 ? 0 : codeReviews.reduce((s, r) => s + r.composite, 0) / codeReviews.length\n const contentComposite = contentReviews.length === 0 ? 0 : contentReviews.reduce((s, r) => s + r.composite, 0) / contentReviews.length\n\n // Composite = mean of (conversation, code, content) — empty judges count 0.\n const judgeCount = 1 + (opts.judges.codeReview ? 1 : 0) + (opts.judges.contentQuality ? 1 : 0)\n const composite = (conversation.composite + codeComposite + contentComposite) / judgeCount\n\n const cellScore: CellCompositeScore = { composite, conversation }\n if (opts.judges.codeReview) cellScore.codeReview = { perArtifact: codeReviews, composite: codeComposite }\n if (opts.judges.contentQuality) cellScore.contentQuality = { perArtifact: contentReviews, composite: contentComposite }\n\n const cellDir = join(opts.runDir, profileId, personaId, `rep-${cell.rep}`)\n mkdirSync(cellDir, { recursive: true })\n writeFileSync(join(cellDir, 'transcript.json'), JSON.stringify(sim.transcript, null, 2))\n writeFileSync(join(cellDir, 'artifacts.json'), JSON.stringify(sim.artifacts, null, 2))\n writeFileSync(join(cellDir, 'scores.json'), JSON.stringify(cellScore, null, 2))\n\n const notes = [`convo=${conversation.composite.toFixed(1)}`]\n if (opts.judges.codeReview) notes.push(`code=${codeComposite.toFixed(1)}`)\n if (opts.judges.contentQuality) notes.push(`content=${contentComposite.toFixed(1)}`)\n\n return {\n output: { turns: sim.transcript.length, toolCalls: sim.toolCalls, artifactCount: sim.artifacts.length },\n verdict: { valid: composite >= 5, score: composite, notes: notes.join(' ') },\n costUsd: sim.costUsd,\n durationMs: sim.durationMs,\n }\n },\n })\n\n // Persist top-level summary.\n const summary = {\n cells: matrix.summary.totalCells,\n passRate: matrix.summary.overallPassRate,\n meanScore: matrix.summary.overallMeanScore,\n totalCostUsd: matrix.summary.totalCostUsd,\n durationMs: matrix.summary.durationMs,\n runsExecuted: matrix.summary.runsExecuted,\n cellsSkipped: matrix.summary.cellsSkipped,\n byProfile: matrix.byAxis.profile,\n byPersona: matrix.byAxis.persona,\n }\n writeFileSync(join(opts.runDir, 'summary.json'), JSON.stringify(summary, null, 2))\n\n const md: string[] = [\n `# Multishot matrix`,\n ``,\n `**Cells**: ${matrix.summary.totalCells} | **Pass rate**: ${(matrix.summary.overallPassRate * 100).toFixed(0)}% | **Mean**: ${matrix.summary.overallMeanScore.toFixed(2)} | **Cost**: $${matrix.summary.totalCostUsd.toFixed(2)} | **Duration**: ${(matrix.summary.durationMs / 1000).toFixed(0)}s`,\n ``,\n `## By profile`,\n ``,\n '| profile | pass | mean | cost |',\n '|---|---|---|---|',\n ...Object.entries(matrix.byAxis.profile ?? {}).map(([id, s]) => `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`),\n ``,\n `## By persona`,\n ``,\n '| persona | pass | mean | cost |',\n '|---|---|---|---|',\n ...Object.entries(matrix.byAxis.persona ?? {}).map(([id, s]) => `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`),\n ``,\n ]\n writeFileSync(join(opts.runDir, 'summary.md'), md.join('\\n'))\n\n return { matrix }\n}\n"],"mappings":";;;;;;AA4BA,eAAsB,iBAAiB,KAAiE;AACtG,QAAM,OAAgC;AAAA,IACpC,OAAO,IAAI;AAAA,IACX,UAAU,IAAI;AAAA,IACd,aAAa,IAAI,eAAe;AAAA,IAChC,YAAY,IAAI,aAAa;AAAA,EAC/B;AACA,MAAI,IAAI,OAAO,OAAQ,MAAK,QAAQ,IAAI;AACxC,QAAM,MAAM,GAAG,IAAI,QAAQ,QAAQ,QAAQ,EAAE,CAAC;AAC9C,QAAM,MAAM,MAAM,MAAM,KAAK;AAAA,IAC3B,QAAQ;AAAA,IACR,SAAS,EAAE,eAAe,UAAU,IAAI,MAAM,IAAI,gBAAgB,mBAAmB;AAAA,IACrF,MAAM,KAAK,UAAU,IAAI;AAAA,IACzB,QAAQ,IAAI;AAAA,EACd,CAAC;AACD,MAAI,CAAC,IAAI,IAAI;AACX,UAAM,OAAO,MAAM,IAAI,KAAK;AAC5B,UAAM,IAAI,MAAM,UAAU,IAAI,MAAM,KAAK,KAAK,MAAM,GAAG,GAAG,CAAC,EAAE;AAAA,EAC/D;AACA,QAAM,OAAQ,MAAM,IAAI,KAAK;AAI7B,QAAM,SAAS,KAAK,QAAQ,CAAC;AAC7B,MAAI,CAAC,OAAQ,OAAM,IAAI,MAAM,+BAA+B,KAAK,UAAU,IAAI,EAAE,MAAM,GAAG,GAAG,CAAC,EAAE;AAChG,SAAO,EAAE,SAAS,OAAO,SAAS,OAAO,KAAK,MAAM;AACtD;AAIO,SAAS,mBACd,OACA,OACQ;AACR,MAAI,CAAC,MAAO,QAAO;AACnB,QAAM,WAAW,MAAM,iBAAiB;AACxC,QAAM,YAAY,MAAM,qBAAqB;AAC7C,MAAI,UAAU;AACd,MAAI,WAAW;AACf,MAAI,MAAM,SAAS,aAAa,GAAG;AACjC,cAAU;AACV,eAAW;AAAA,EACb,WAAW,MAAM,SAAS,SAAS,KAAK,MAAM,SAAS,eAAe,GAAG;AACvE,cAAU;AACV,eAAW;AAAA,EACb,WAAW,MAAM,SAAS,MAAM,KAAK,MAAM,SAAS,KAAK,KAAK,MAAM,SAAS,UAAU,GAAG;AACxF,cAAU;AACV,eAAW;AAAA,EACb;AACA,UAAQ,WAAW,UAAU,YAAY,YAAY;AACvD;AAEO,SAAS,uBAA+B;AAC7C,UAAQ,QAAQ,IAAI,0BAA0B,kCAAkC,QAAQ,QAAQ,EAAE;AACpG;AAEO,SAAS,sBAA8B;AAC5C,QAAM,MAAM,QAAQ,IAAI;AACxB,MAAI,CAAC,IAAK,OAAM,IAAI,MAAM,gEAAgE;AAC1F,SAAO;AACT;;;AC9EO,IAAM,2BAA2B;AACjC,IAAM,sBAAsB;AAgBnC,IAAM,4BACJ;AAEF,IAAM,uBACJ;AAEK,IAAM,iCAA0D;AAAA,EACrE,MAAM;AAAA,EACN,UAAU;AAAA,IACR,MAAM;AAAA,IACN,aACE;AAAA,IACF,YAAY;AAAA,MACV,MAAM;AAAA,MACN,YAAY;AAAA,QACV,UAAU,EAAE,MAAM,UAAU,aAAa,gCAAgC;AAAA,QACzE,OAAO,EAAE,MAAM,UAAU,aAAa,gEAAgE;AAAA,MACxG;AAAA,MACA,UAAU,CAAC,UAAU;AAAA,IACvB;AAAA,EACF;AACF;AAEO,IAAM,6BAAsD;AAAA,EACjE,MAAM;AAAA,EACN,UAAU;AAAA,IACR,MAAM;AAAA,IACN,aACE;AAAA,IACF,YAAY;AAAA,MACV,MAAM;AAAA,MACN,YAAY;AAAA,QACV,MAAM,EAAE,MAAM,UAAU,aAAa,gCAAgC;AAAA,QACrE,UAAU,EAAE,MAAM,UAAU,aAAa,qDAAqD;AAAA,MAChG;AAAA,MACA,UAAU,CAAC,MAAM;AAAA,IACnB;AAAA,EACF;AACF;AAEO,SAAS,uBAAuB,SAAkC,CAAC,GAA0B;AAClG,QAAM,eAAe,OAAO,gBAAgB;AAC5C,QAAM,QAAQ,OAAO,SAAS;AAC9B,SAAO,OAAO,MAAM,QAAQ;AAC1B,UAAM,WAAW,OAAO,KAAK,YAAY,EAAE;AAC3C,UAAM,QAAQ,KAAK,QAAQ,OAAO,KAAK,KAAK,IAAI;AAChD,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI;AAAA,MACb;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,aAAa;AAAA,QACxC,EAAE,MAAM,QAAQ,SAAS,aAAa,QAAQ,GAAG,QAAQ;AAAA,SAAY,KAAK,KAAK,EAAE,GAAG;AAAA,MACtF;AAAA,MACA,QAAQ,IAAI;AAAA,IACd,CAAC;AACD,WAAO,EAAE,SAAS,QAAQ,WAAW,IAAI,SAAS,mBAAmB,OAAO,KAAK,EAAE;AAAA,EACrF;AACF;AAEO,SAAS,mBAAmB,SAA6B,CAAC,GAA0B;AACzF,QAAM,eAAe,OAAO,gBAAgB;AAC5C,QAAM,QAAQ,OAAO,SAAS;AAC9B,SAAO,OAAO,MAAM,QAAQ;AAC1B,UAAM,OAAO,OAAO,KAAK,QAAQ,EAAE;AACnC,UAAM,WAAW,KAAK,WAAW,OAAO,KAAK,QAAQ,IAAI;AACzD,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI;AAAA,MACb;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,GAAG,YAAY;AAAA;AAAA,YAAiB,QAAQ,GAAG;AAAA,QACtE,EAAE,MAAM,QAAQ,SAAS,YAAY,IAAI,GAAG;AAAA,MAC9C;AAAA,MACA,QAAQ,IAAI;AAAA,IACd,CAAC;AACD,WAAO,EAAE,SAAS,QAAQ,WAAW,IAAI,SAAS,mBAAmB,OAAO,KAAK,EAAE;AAAA,EACrF;AACF;AAgBO,SAAS,uBAAuB,SAA6B,CAAC,GAAuB;AAC1F,SAAO;AAAA,IACL,OAAO,CAAC,gCAAgC,0BAA0B;AAAA,IAClE,WAAW;AAAA,MACT,mBAAmB,uBAAuB,OAAO,QAAQ;AAAA,MACzD,eAAe,mBAAmB,OAAO,IAAI;AAAA,IAC/C;AAAA,IACA,iBAAiB,CAAC,SAAU,SAAS,sBAAsB,aAAa,SAAS,kBAAkB,SAAS;AAAA,EAC9G;AACF;;;AC3HO,IAAM,sBAAsB;AAmCnC,IAAM,aAAyB,EAAE,YAAY,CAAC,GAAG,WAAW,GAAG,OAAO,eAAe;AAErF,eAAsB,SAAiB,OAA4B,OAAoC;AACrG,QAAM,SAAS,MAAM,UAAU,oBAAoB;AACnD,QAAM,UAAU,MAAM,WAAW,qBAAqB;AACtD,QAAM,QAAQ,MAAM,SAAS,QAAQ,IAAI,eAAe;AACxD,QAAM,SAAS,MAAM,YAAY,KAAK;AACtC,MAAI,MAAM;AACV,MAAI;AACF,UAAM,EAAE,QAAQ,IAAI,MAAM,iBAAiB;AAAA,MACzC;AAAA,MACA;AAAA,MACA;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,MAAM,aAAa;AAAA,QAC9C,EAAE,MAAM,QAAQ,SAAS,OAAO;AAAA,MAClC;AAAA,IACF,CAAC;AACD,WAAO,QAAQ,WAAW,IAAI,KAAK;AAAA,EACrC,SAAS,KAAK;AACZ,WAAO,EAAE,GAAG,YAAY,OAAO,SAAS,MAAM,IAAI,iBAAiB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC,GAAG;AAAA,EACxH;AAEA,MAAI,SAAyC;AAC7C,MAAI;AACF,UAAM,UAAU,IAAI,QAAQ,gBAAgB,EAAE,EAAE,QAAQ,WAAW,EAAE,EAAE,KAAK;AAC5E,aAAS,KAAK,MAAM,OAAO;AAAA,EAC7B,QAAQ;AACN,WAAO,EAAE,GAAG,YAAY,OAAO,SAAS,MAAM,IAAI,uBAAuB,IAAI,MAAM,GAAG,GAAG,CAAC,GAAG;AAAA,EAC/F;AAEA,QAAM,aAAqC,CAAC;AAC5C,MAAI,MAAM;AACV,aAAW,OAAO,MAAM,YAAY;AAClC,UAAM,IAAI,OAAO,OAAO,IAAI,GAAG,KAAK,CAAC;AACrC,UAAM,UAAU,OAAO,SAAS,CAAC,IAAI,KAAK,IAAI,GAAG,KAAK,IAAI,IAAI,CAAC,CAAC,IAAI;AACpE,eAAW,IAAI,GAAG,IAAI;AACtB,WAAO;AAAA,EACT;AACA,SAAO;AAAA,IACL;AAAA,IACA,WAAW,MAAM,WAAW,WAAW,IAAI,IAAI,MAAM,MAAM,WAAW;AAAA,IACtE,OAAO,OAAO,OAAO,UAAU,WAAW,OAAO,QAAQ;AAAA,EAC3D;AACF;AAIO,SAAS,iBAAiB,MAAyC;AACxE,SAAO,KAAK,IAAI,CAAC,MAAM,KAAK,EAAE,GAAG,KAAK,EAAE,WAAW,EAAE,EAAE,KAAK,IAAI;AAClE;AAGO,SAAS,iBAAiB,MAAyC;AACxE,QAAM,SAAS,KAAK,IAAI,CAAC,MAAM,IAAI,EAAE,GAAG,KAAK,EAAE,KAAK,GAAG;AACvD,SAAO;AAAA,GAA6D,MAAM;AAC5E;;;ACnDO,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YAA4B,MAAc;AACxC,UAAM,0DAA0D,IAAI,sBAAiB;AAD3D;AAE1B,SAAK,OAAO;AAAA,EACd;AAAA,EAH4B;AAI9B;;;ACXA,eAAsB,aACpB,MAC0B;AAC1B,QAAM,SAAS,KAAK,UAAU,oBAAoB;AAClD,QAAM,UAAU,KAAK,WAAW,qBAAqB;AACrD,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,aAAa,KAAK,cAAc;AACtC,QAAM,cAAc,KAAK,eAAe;AAExC,QAAM,SAAS,KAAK,SAAS,KAAK,gBAC9B,EAAE,OAAO,KAAK,OAAO,WAAW,KAAK,eAAe,iBAAiB,KAAK,oBAAoB,MAAM,QAAW,IAC/G,uBAAuB;AAC3B,QAAM,QAAQ,KAAK,SAAS,OAAO;AACnC,QAAM,YAAY,KAAK,iBAAiB,OAAO;AAC/C,QAAM,kBAAkB,KAAK,mBAAmB,OAAO;AAEvD,QAAM,QAAQ,KAAK,IAAI;AACvB,QAAM,aAAiC,CAAC;AACxC,QAAM,YAAiC,CAAC;AACxC,MAAI,YAAY;AAChB,MAAI,eAAe;AAEnB,QAAM,SAAS,KAAK,MAAM,YAAY,KAAK,OAAO;AAClD,aAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,CAAC;AAEjD,QAAM,eAAe,KAAK,QAAQ,QAAQ,gBAAgB;AAC1D,QAAM,gBAAgD;AAAA,IACpD,EAAE,MAAM,UAAU,SAAS,aAAa;AAAA,IACxC,EAAE,MAAM,QAAQ,SAAS,OAAO;AAAA,EAClC;AAEA,WAAS,OAAO,GAAG,OAAO,UAAU,QAAQ;AAC1C,QAAI,KAAK,QAAQ,QAAS,OAAM,IAAI,MAAM,mBAAmB;AAE7D,UAAM,EAAE,SAAS,UAAU,OAAO,WAAW,IAAI,MAAM,iBAAiB;AAAA,MACtE;AAAA,MACA;AAAA,MACA,OAAO;AAAA,MACP,UAAU;AAAA,MACV;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,QAAQ,KAAK;AAAA,IACf,CAAC;AACD,oBAAgB,mBAAmB,YAAY,UAAU;AAEzD,UAAM,aAAa,SAAS,WAAW,IAAI,KAAK;AAChD,UAAM,kBAAkB,SAAS,cAAc,CAAC,GAAG,IAAI,CAAC,QAAQ;AAAA,MAC9D,IAAI,GAAG;AAAA,MACP,MAAM,GAAG,SAAS;AAAA,MAClB,OAAO,MAAM;AACX,YAAI;AACF,iBAAO,KAAK,MAAM,GAAG,SAAS,SAAS;AAAA,QACzC,QAAQ;AACN,iBAAO,CAAC;AAAA,QACV;AAAA,MACF,GAAG;AAAA,IACL,EAAE;AAEF,kBAAc,KAAK;AAAA,MACjB,MAAM;AAAA,MACN,SAAS,aAAa;AAAA,MACtB,GAAI,SAAS,YAAY,SAAS,EAAE,YAAY,SAAS,WAAW,IAAI,CAAC;AAAA,IAC3E,CAAC;AACD,eAAW,KAAK;AAAA,MACd,MAAM;AAAA,MACN,SAAS;AAAA,MACT,WAAW,eAAe,SAAS,IAAI,iBAAiB;AAAA,IAC1D,CAAC;AAED,eAAW,MAAM,gBAAgB;AAC/B;AACA,UAAI,aAAa;AACjB,UAAI;AACF,cAAM,WAAW,UAAU,GAAG,IAAI;AAClC,YAAI,CAAC,UAAU;AACb,uBAAa,KAAK,UAAU,EAAE,OAAO,gBAAgB,GAAG,IAAI,GAAG,CAAC;AAAA,QAClE,OAAO;AACL,gBAAM,IAAI,MAAM,SAAS,GAAG,MAAM,EAAE,QAAQ,SAAS,QAAQ,KAAK,OAAO,CAAC;AAC1E,uBAAa,EAAE;AACf,0BAAgB,EAAE;AAClB,gBAAM,eAAe,gBAAgB,GAAG,IAAI;AAC5C,cAAI,cAAc;AAChB,sBAAU,KAAK,EAAE,MAAM,cAAc,MAAM,YAAY,EAAE,MAAM,GAAG,MAAM,MAAM,GAAG,KAAK,GAAG,SAAS,WAAW,CAAC;AAAA,UAChH;AAAA,QACF;AAAA,MACF,SAAS,KAAK;AACZ,qBAAa,KAAK,UAAU,EAAE,OAAO,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,EAAE,CAAC;AAAA,MACzF;AACA,oBAAc,KAAK,EAAE,MAAM,QAAQ,cAAc,GAAG,IAAI,SAAS,cAAc,OAAO,CAAC;AACvF,iBAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,cAAc,QAAQ,YAAY,GAAG,GAAG,CAAC;AAAA,IACpF;AAGA,QAAI,eAAe,SAAS,GAAG;AAC7B,YAAM,WAAW,MAAM,iBAAiB;AAAA,QACtC;AAAA,QACA;AAAA,QACA,OAAO;AAAA,QACP,UAAU;AAAA,QACV,aAAa;AAAA,QACb,WAAW;AAAA,QACX,QAAQ,KAAK;AAAA,MACf,CAAC;AACD,sBAAgB,mBAAmB,YAAY,SAAS,KAAK;AAC7D,YAAM,gBAAgB,SAAS,QAAQ,WAAW,IAAI,KAAK;AAC3D,oBAAc,KAAK,EAAE,MAAM,aAAa,SAAS,aAAa,CAAC;AAC/D,iBAAW,KAAK,EAAE,MAAM,aAAa,SAAS,aAAa,CAAC;AAAA,IAC9D;AAEA,QAAI,OAAO,WAAW,GAAG;AACvB,YAAM,SAAS,MAAM,WAAW;AAAA,QAC9B;AAAA,QACA;AAAA,QACA,SAAS,KAAK;AAAA,QACd,OAAO,KAAK;AAAA,QACZ;AAAA,QACA;AAAA,QACA,OAAO;AAAA,QACP,QAAQ,KAAK;AAAA,MACf,CAAC;AACD,sBAAgB,OAAO;AACvB,oBAAc,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,QAAQ,CAAC;AAC5D,iBAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,QAAQ,CAAC;AAAA,IAC3D;AAAA,EACF;AAEA,SAAO,EAAE,YAAY,WAAW,WAAW,YAAY,KAAK,IAAI,IAAI,OAAO,SAAS,aAAa;AACnG;AAEA,eAAe,WAA8C,MASX;AAChD,QAAM,eAAe,KAAK,MAAM,wBAAwB,KAAK,OAAO;AAKpE,QAAM,iBAAiD,CAAC,EAAE,MAAM,UAAU,SAAS,aAAa,CAAC;AACjG,aAAW,OAAO,KAAK,YAAY;AACjC,QAAI,IAAI,SAAS,OAAQ;AACzB,QAAI,IAAI,SAAS,YAAa,gBAAe,KAAK,EAAE,MAAM,QAAQ,SAAS,IAAI,QAAQ,CAAC;AAAA,aAC/E,IAAI,SAAS,OAAQ,gBAAe,KAAK,EAAE,MAAM,aAAa,SAAS,IAAI,QAAQ,CAAC;AAAA,EAC/F;AAGA,WAAS,UAAU,GAAG,UAAU,GAAG,WAAW;AAC5C,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,KAAK;AAAA,MACb,SAAS,KAAK;AAAA,MACd,OAAO,KAAK;AAAA,MACZ,UAAU;AAAA,MACV,aAAa;AAAA,MACb,WAAW;AAAA,MACX,QAAQ,KAAK;AAAA,IACf,CAAC;AACD,UAAM,WAAW,QAAQ,WAAW,IAAI,KAAK;AAC7C,QAAI,QAAQ,SAAS,EAAG,QAAO,EAAE,SAAS,SAAS,mBAAmB,KAAK,OAAO,KAAK,EAAE;AAAA,EAC3F;AACA,QAAM,IAAI,0BAA0B,KAAK,IAAI;AAC/C;;;AC9MA,SAAS,WAAW,qBAAqB;AACzC,SAAS,YAAY;AAyFrB,eAAsB,mBACpB,MACmC;AACnC,QAAM,YAAY,IAAI,IAAI,KAAK,OAAO,qBAAqB,CAAC,MAAM,CAAC;AACnE,QAAM,eAAe,IAAI,IAAI,KAAK,OAAO,wBAAwB,CAAC,UAAU,CAAC;AAC7E,YAAU,KAAK,QAAQ,EAAE,WAAW,KAAK,CAAC;AAE1C,QAAM,SAAS,MAAM,eAA2B;AAAA,IAC9C,MAAM;AAAA,MACJ,EAAE,MAAM,WAAW,QAAQ,KAAK,SAAS;AAAA,MACzC,EAAE,MAAM,WAAW,QAAQ,KAAK,SAAS,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,OAAO,EAAE,EAAE,EAAE;AAAA,IAChF;AAAA,IACA,MAAM,KAAK,QAAQ;AAAA,IACnB,gBAAgB,KAAK,kBAAkB;AAAA,IACvC,aAAa,KAAK;AAAA,IAClB,MAAM,QAAQ,MAAM;AAClB,YAAM,UAAU,KAAK,KAAK,SAAS;AACnC,YAAM,UAAU,KAAK,KAAK,SAAS;AACnC,YAAM,YAAY,OAAO,KAAK,KAAK,SAAS,MAAM,SAAS;AAC3D,YAAM,YAAY,OAAO,KAAK,KAAK,SAAS,MAAM,SAAS;AAE3D,YAAM,MAAM,MAAM,aAAa;AAAA,QAC7B;AAAA,QACA;AAAA,QACA,OAAO,KAAK;AAAA,QACZ,OAAO,KAAK;AAAA,QACZ,eAAe,KAAK;AAAA,QACpB,iBAAiB,KAAK;AAAA,QACtB,UAAU,KAAK;AAAA,QACf,YAAY,KAAK;AAAA,QACjB,aAAa,KAAK;AAAA,QAClB,QAAQ,KAAK;AAAA,QACb,SAAS,KAAK;AAAA,MAChB,CAAC;AAED,YAAM,gBAAgB,IAAI,UAAU,OAAO,CAAC,MAAM,UAAU,IAAI,EAAE,IAAI,CAAC;AACvE,YAAM,mBAAmB,IAAI,UAAU,OAAO,CAAC,MAAM,aAAa,IAAI,EAAE,IAAI,CAAC;AAE7E,YAAM,CAAC,cAAc,aAAa,cAAc,IAAI,MAAM,QAAQ,IAAI;AAAA,QACpE,SAAS,KAAK,OAAO,cAAc,EAAE,YAAY,IAAI,YAAY,QAAQ,CAAC;AAAA,QAC1E,KAAK,OAAO,aACR,QAAQ,IAAI,cAAc,IAAI,CAAC,aAAa,SAAS,KAAK,OAAO,YAAa,EAAE,UAAU,QAAQ,CAAC,EAAE,KAAK,CAAC,OAAO,EAAE,GAAG,GAAG,MAAM,SAAS,MAAM,MAAM,SAAS,KAAK,EAAE,CAAC,CAAC,IACvK,QAAQ,QAAQ,CAAC,CAAuD;AAAA,QAC5E,KAAK,OAAO,iBACR,QAAQ,IAAI,iBAAiB,IAAI,CAAC,aAAa,SAAS,KAAK,OAAO,gBAAiB,EAAE,UAAU,QAAQ,CAAC,EAAE,KAAK,CAAC,OAAO,EAAE,GAAG,GAAG,MAAM,SAAS,MAAM,MAAM,SAAS,KAAK,EAAE,CAAC,CAAC,IAC9K,QAAQ,QAAQ,CAAC,CAAuD;AAAA,MAC9E,CAAC;AAED,YAAM,gBAAgB,YAAY,WAAW,IAAI,IAAI,YAAY,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,YAAY;AACpH,YAAM,mBAAmB,eAAe,WAAW,IAAI,IAAI,eAAe,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,eAAe;AAGhI,YAAM,aAAa,KAAK,KAAK,OAAO,aAAa,IAAI,MAAM,KAAK,OAAO,iBAAiB,IAAI;AAC5F,YAAM,aAAa,aAAa,YAAY,gBAAgB,oBAAoB;AAEhF,YAAM,YAAgC,EAAE,WAAW,aAAa;AAChE,UAAI,KAAK,OAAO,WAAY,WAAU,aAAa,EAAE,aAAa,aAAa,WAAW,cAAc;AACxG,UAAI,KAAK,OAAO,eAAgB,WAAU,iBAAiB,EAAE,aAAa,gBAAgB,WAAW,iBAAiB;AAEtH,YAAM,UAAU,KAAK,KAAK,QAAQ,WAAW,WAAW,OAAO,KAAK,GAAG,EAAE;AACzE,gBAAU,SAAS,EAAE,WAAW,KAAK,CAAC;AACtC,oBAAc,KAAK,SAAS,iBAAiB,GAAG,KAAK,UAAU,IAAI,YAAY,MAAM,CAAC,CAAC;AACvF,oBAAc,KAAK,SAAS,gBAAgB,GAAG,KAAK,UAAU,IAAI,WAAW,MAAM,CAAC,CAAC;AACrF,oBAAc,KAAK,SAAS,aAAa,GAAG,KAAK,UAAU,WAAW,MAAM,CAAC,CAAC;AAE9E,YAAM,QAAQ,CAAC,SAAS,aAAa,UAAU,QAAQ,CAAC,CAAC,EAAE;AAC3D,UAAI,KAAK,OAAO,WAAY,OAAM,KAAK,QAAQ,cAAc,QAAQ,CAAC,CAAC,EAAE;AACzE,UAAI,KAAK,OAAO,eAAgB,OAAM,KAAK,WAAW,iBAAiB,QAAQ,CAAC,CAAC,EAAE;AAEnF,aAAO;AAAA,QACL,QAAQ,EAAE,OAAO,IAAI,WAAW,QAAQ,WAAW,IAAI,WAAW,eAAe,IAAI,UAAU,OAAO;AAAA,QACtG,SAAS,EAAE,OAAO,aAAa,GAAG,OAAO,WAAW,OAAO,MAAM,KAAK,GAAG,EAAE;AAAA,QAC3E,SAAS,IAAI;AAAA,QACb,YAAY,IAAI;AAAA,MAClB;AAAA,IACF;AAAA,EACF,CAAC;AAGD,QAAM,UAAU;AAAA,IACd,OAAO,OAAO,QAAQ;AAAA,IACtB,UAAU,OAAO,QAAQ;AAAA,IACzB,WAAW,OAAO,QAAQ;AAAA,IAC1B,cAAc,OAAO,QAAQ;AAAA,IAC7B,YAAY,OAAO,QAAQ;AAAA,IAC3B,cAAc,OAAO,QAAQ;AAAA,IAC7B,cAAc,OAAO,QAAQ;AAAA,IAC7B,WAAW,OAAO,OAAO;AAAA,IACzB,WAAW,OAAO,OAAO;AAAA,EAC3B;AACA,gBAAc,KAAK,KAAK,QAAQ,cAAc,GAAG,KAAK,UAAU,SAAS,MAAM,CAAC,CAAC;AAEjF,QAAM,KAAe;AAAA,IACnB;AAAA,IACA;AAAA,IACA,cAAc,OAAO,QAAQ,UAAU,sBAAsB,OAAO,QAAQ,kBAAkB,KAAK,QAAQ,CAAC,CAAC,iBAAiB,OAAO,QAAQ,iBAAiB,QAAQ,CAAC,CAAC,iBAAiB,OAAO,QAAQ,aAAa,QAAQ,CAAC,CAAC,qBAAqB,OAAO,QAAQ,aAAa,KAAM,QAAQ,CAAC,CAAC;AAAA,IAChS;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,OAAO,QAAQ,OAAO,OAAO,WAAW,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,KAAK,EAAE,OAAO,EAAE,WAAW,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,UAAU,QAAQ,CAAC,CAAC,OAAO,EAAE,aAAa,QAAQ,CAAC,CAAC,IAAI;AAAA,IAC3K;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,OAAO,QAAQ,OAAO,OAAO,WAAW,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,KAAK,EAAE,OAAO,EAAE,WAAW,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,UAAU,QAAQ,CAAC,CAAC,OAAO,EAAE,aAAa,QAAQ,CAAC,CAAC,IAAI;AAAA,IAC3K;AAAA,EACF;AACA,gBAAc,KAAK,KAAK,QAAQ,YAAY,GAAG,GAAG,KAAK,IAAI,CAAC;AAE5D,SAAO,EAAE,OAAO;AAClB;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../../src/multishot/router.ts","../../src/multishot/default-tools.ts","../../src/multishot/judges.ts","../../src/multishot/matrix.ts","../../src/multishot/types.ts","../../src/multishot/multishot.ts"],"sourcesContent":["// Router fetch helper — single source of truth for OpenAI-compat calls\n// against the Tangle router. Used by the driver, agent, judges, and the\n// default tool executors.\n\nimport type { MultishotToolDefinition } from './types'\n\nexport interface RouterCompletionRequest {\n apiKey: string\n baseUrl: string\n model: string\n messages: Array<Record<string, unknown>>\n tools?: MultishotToolDefinition[]\n temperature?: number\n maxTokens?: number\n signal?: AbortSignal\n}\n\nexport interface RouterToolCall {\n id: string\n type: 'function'\n function: { name: string; arguments: string }\n}\n\nexport interface RouterCompletionResponse {\n message: { content?: string | null; tool_calls?: RouterToolCall[] }\n usage?: { prompt_tokens?: number; completion_tokens?: number }\n}\n\nexport async function routerCompletion(\n req: RouterCompletionRequest,\n): Promise<RouterCompletionResponse> {\n const body: Record<string, unknown> = {\n model: req.model,\n messages: req.messages,\n temperature: req.temperature ?? 0.7,\n max_tokens: req.maxTokens ?? 2000,\n }\n if (req.tools?.length) body.tools = req.tools\n const url = `${req.baseUrl.replace(/\\/+$/, '')}/chat/completions`\n const res = await fetch(url, {\n method: 'POST',\n headers: { Authorization: `Bearer ${req.apiKey}`, 'Content-Type': 'application/json' },\n body: JSON.stringify(body),\n signal: req.signal,\n })\n if (!res.ok) {\n const text = await res.text()\n throw new Error(`router ${res.status}: ${text.slice(0, 300)}`)\n }\n const json = (await res.json()) as {\n choices: Array<{ message: { content?: string | null; tool_calls?: RouterToolCall[] } }>\n usage?: { prompt_tokens?: number; completion_tokens?: number }\n }\n const choice = json.choices[0]\n if (!choice) throw new Error(`router returned no choices: ${JSON.stringify(json).slice(0, 200)}`)\n return { message: choice.message, usage: json.usage }\n}\n\n// Rough per-model cost estimator. Used for cost-ceiling enforcement.\n// Underestimates Anthropic, overestimates oss models — fine for ceilings.\nexport function estimateRouterCost(\n model: string,\n usage?: { prompt_tokens?: number; completion_tokens?: number },\n): number {\n if (!usage) return 0\n const inputTok = usage.prompt_tokens ?? 0\n const outputTok = usage.completion_tokens ?? 0\n let inPer1k = 0.003\n let outPer1k = 0.015\n if (model.includes('gpt-4o-mini')) {\n inPer1k = 0.00015\n outPer1k = 0.0006\n } else if (model.includes('gpt-5.4') || model.includes('claude-sonnet')) {\n inPer1k = 0.003\n outPer1k = 0.015\n } else if (model.includes('kimi') || model.includes('glm') || model.includes('deepseek')) {\n inPer1k = 0.0005\n outPer1k = 0.002\n }\n return (inputTok * inPer1k + outputTok * outPer1k) / 1000\n}\n\nexport function defaultRouterBaseUrl(): string {\n return (process.env.TANGLE_ROUTER_BASE_URL ?? 'https://router.tangle.tools/v1').replace(\n /\\/+$/,\n '',\n )\n}\n\nexport function requireRouterApiKey(): string {\n const key = process.env.TANGLE_API_KEY\n if (!key) throw new Error('multishot requires TANGLE_API_KEY (router-scoped sk-tan-* key)')\n return key\n}\n","// Default delegate_research + delegate_code tools and their inline executors.\n//\n// Consumers can override either by passing their own tools + executors to\n// runMultishot. The defaults are sufficient for most domains — point the\n// researcher system prompt at your domain's citation style and the coder\n// at your preferred language.\n\nimport { estimateRouterCost, routerCompletion } from './router'\nimport type { MultishotToolDefinition, MultishotToolExecutor } from './types'\n\nexport const DEFAULT_RESEARCHER_MODEL = 'openai/gpt-4o-mini'\nexport const DEFAULT_CODER_MODEL = 'openai/gpt-4o-mini'\n\nexport interface DefaultResearcherConfig {\n /** Replace the system prompt to bias the researcher toward a domain's\n * citation style. Defaults to a generic \"cite sources by name\" prompt. */\n systemPrompt?: string\n model?: string\n}\n\nexport interface DefaultCoderConfig {\n /** Replace the system prompt to bias the coder toward a language /\n * framework / artifact style. */\n systemPrompt?: string\n model?: string\n}\n\nconst GENERIC_RESEARCHER_SYSTEM =\n 'You are a research specialist. Return a markdown brief with 3-5 findings. Each finding cites a specific source by name. Add a confidence level (high/medium/low) per finding. No fluff, no preamble.'\n\nconst GENERIC_CODER_SYSTEM =\n 'You are an expert engineer. Output ONE fenced code block containing the complete solution. Inline-comment non-obvious decisions. No explanation outside the block.'\n\nexport const DEFAULT_DELEGATE_RESEARCH_TOOL: MultishotToolDefinition = {\n type: 'function',\n function: {\n name: 'delegate_research',\n description:\n 'Research a topic deeply via specialist. Returns evidence-bearing items with citations. Use for audience research, competitive intel, regulatory landscape, market data, citation-grounded analysis.',\n parameters: {\n type: 'object',\n properties: {\n question: { type: 'string', description: 'Specific question to research' },\n scope: {\n type: 'string',\n description: 'Optional scope: time window, geography, jurisdiction, segment',\n },\n },\n required: ['question'],\n },\n },\n}\n\nexport const DEFAULT_DELEGATE_CODE_TOOL: MultishotToolDefinition = {\n type: 'function',\n function: {\n name: 'delegate_code',\n description:\n 'Generate a runnable script, template, pipeline, or tool via specialist. Returns complete working code or structured markdown. Use for content pipelines, calc snippets, dashboards, compliance checklists, deadline trackers.',\n parameters: {\n type: 'object',\n properties: {\n goal: { type: 'string', description: 'What the code must accomplish' },\n language: {\n type: 'string',\n description: 'Optional language preference (default: TypeScript)',\n },\n },\n required: ['goal'],\n },\n },\n}\n\nexport function createResearchExecutor(\n config: DefaultResearcherConfig = {},\n): MultishotToolExecutor {\n const systemPrompt = config.systemPrompt ?? GENERIC_RESEARCHER_SYSTEM\n const model = config.model ?? DEFAULT_RESEARCHER_MODEL\n return async (args, ctx) => {\n const question = String(args.question ?? '')\n const scope = args.scope ? String(args.scope) : undefined\n const { message, usage } = await routerCompletion({\n apiKey: ctx.apiKey,\n baseUrl: ctx.baseUrl,\n model,\n temperature: 0.3,\n maxTokens: 1800,\n messages: [\n { role: 'system', content: systemPrompt },\n { role: 'user', content: `Research: ${question}${scope ? `\\nScope: ${scope}` : ''}` },\n ],\n signal: ctx.signal,\n })\n return { content: message.content ?? '', costUsd: estimateRouterCost(model, usage) }\n }\n}\n\nexport function createCodeExecutor(config: DefaultCoderConfig = {}): MultishotToolExecutor {\n const systemPrompt = config.systemPrompt ?? GENERIC_CODER_SYSTEM\n const model = config.model ?? DEFAULT_CODER_MODEL\n return async (args, ctx) => {\n const goal = String(args.goal ?? '')\n const language = args.language ? String(args.language) : 'TypeScript'\n const { message, usage } = await routerCompletion({\n apiKey: ctx.apiKey,\n baseUrl: ctx.baseUrl,\n model,\n temperature: 0.2,\n maxTokens: 2000,\n messages: [\n { role: 'system', content: `${systemPrompt}\\n\\nLanguage: ${language}` },\n { role: 'user', content: `Produce: ${goal}` },\n ],\n signal: ctx.signal,\n })\n return { content: message.content ?? '', costUsd: estimateRouterCost(model, usage) }\n }\n}\n\nexport interface DefaultToolsConfig {\n research?: DefaultResearcherConfig\n code?: DefaultCoderConfig\n /** When true (default), each tool result is recorded as a typed artifact:\n * research → type='research', code → type='code'. */\n recordArtifacts?: boolean\n}\n\nexport interface DefaultToolsBundle {\n tools: MultishotToolDefinition[]\n executors: Record<string, MultishotToolExecutor>\n artifactTypeFor: (toolName: string) => string | undefined\n}\n\nexport function defaultDelegationTools(config: DefaultToolsConfig = {}): DefaultToolsBundle {\n return {\n tools: [DEFAULT_DELEGATE_RESEARCH_TOOL, DEFAULT_DELEGATE_CODE_TOOL],\n executors: {\n delegate_research: createResearchExecutor(config.research),\n delegate_code: createCodeExecutor(config.code),\n },\n artifactTypeFor: (name) =>\n name === 'delegate_research' ? 'research' : name === 'delegate_code' ? 'code' : undefined,\n }\n}\n\nexport { defaultRouterBaseUrl } from './router'\n","// Generic judge runner — domain consumers configure dimensions + prompts.\n//\n// Three judge slots are conventional for multishot eval:\n// - conversation (scores the full transcript)\n// - codeReview (scores each code artifact)\n// - contentQuality (scores each non-code artifact)\n//\n// But the runJudge primitive is fully generic — any T → JudgeScore mapping.\n\nimport { defaultRouterBaseUrl, requireRouterApiKey, routerCompletion } from './router'\n\nexport const DEFAULT_JUDGE_MODEL = 'openai/gpt-4o-mini'\n\nexport interface JudgeDimension {\n /** JSON field name + score key. */\n key: string\n /** Description shown in the judge's user prompt. */\n description: string\n}\n\nexport interface JudgeConfig<TInput> {\n /** Display name (for trace + log). */\n name: string\n /** Model used for this judge. */\n model?: string\n /** 0-10 scored dimensions. */\n dimensions: JudgeDimension[]\n /** Judge system prompt — sets persona + JSON-only constraint. */\n systemPrompt: string\n /** Build the user prompt from the typed input. Must include \"Respond with\n * ONLY this JSON: { ... }\" listing each dimension key. */\n buildPrompt: (input: TInput) => string\n /** Optional model + api overrides. */\n apiKey?: string\n baseUrl?: string\n}\n\nexport interface JudgeScore {\n /** Per-dimension 0-10 score. Missing dims default to 0. */\n dimensions: Record<string, number>\n /** Mean across dimensions. */\n composite: number\n /** Free-form 1-2 sentence critique from the judge (when provided). */\n notes: string\n}\n\nconst ZERO_SCORE: JudgeScore = { dimensions: {}, composite: 0, notes: 'parse failed' }\n\nexport async function runJudge<TInput>(\n judge: JudgeConfig<TInput>,\n input: TInput,\n): Promise<JudgeScore> {\n const apiKey = judge.apiKey ?? requireRouterApiKey()\n const baseUrl = judge.baseUrl ?? defaultRouterBaseUrl()\n const model = judge.model ?? process.env.JUDGE_MODEL ?? DEFAULT_JUDGE_MODEL\n const prompt = judge.buildPrompt(input)\n let raw = ''\n try {\n const { message } = await routerCompletion({\n apiKey,\n baseUrl,\n model,\n temperature: 0,\n maxTokens: 1500,\n messages: [\n { role: 'system', content: judge.systemPrompt },\n { role: 'user', content: prompt },\n ],\n })\n raw = (message.content ?? '').trim()\n } catch (err) {\n return {\n ...ZERO_SCORE,\n notes: `judge ${judge.name} call failed: ${err instanceof Error ? err.message : String(err)}`,\n }\n }\n\n let parsed: Record<string, unknown> | null = null\n try {\n const cleaned = raw\n .replace(/^```json\\s*/i, '')\n .replace(/```\\s*$/, '')\n .trim()\n parsed = JSON.parse(cleaned) as Record<string, unknown>\n } catch {\n return { ...ZERO_SCORE, notes: `judge ${judge.name} returned non-JSON: ${raw.slice(0, 200)}` }\n }\n\n const dimensions: Record<string, number> = {}\n let sum = 0\n for (const dim of judge.dimensions) {\n const v = Number(parsed[dim.key] ?? 0)\n const clamped = Number.isFinite(v) ? Math.max(0, Math.min(10, v)) : 0\n dimensions[dim.key] = clamped\n sum += clamped\n }\n return {\n dimensions,\n composite: judge.dimensions.length === 0 ? 0 : sum / judge.dimensions.length,\n notes: typeof parsed.notes === 'string' ? parsed.notes : '',\n }\n}\n\n/** Convenience: stringified dimension list for inclusion in a judge prompt.\n * Returns lines like `- audience_fit: Does this match what the audience cares about? (0-10)`. */\nexport function renderDimensions(dims: readonly JudgeDimension[]): string {\n return dims.map((d) => `- ${d.key}: ${d.description}`).join('\\n')\n}\n\n/** Convenience: build the \"Respond with ONLY this JSON\" footer for a judge prompt. */\nexport function renderJsonFooter(dims: readonly JudgeDimension[]): string {\n const fields = dims.map((d) => `\"${d.key}\":N`).join(',')\n return `Respond with ONLY this JSON (no markdown, no preamble):\\n{${fields},\"notes\":\"1-2 sentence critique\"}`\n}\n","// Multishot matrix wrapper — sweeps profiles × personas × reps, runs\n// the driver-agent loop per cell, applies up to three configured judges,\n// persists per-cell artifacts, and aggregates by axis.\n//\n// Uses runAgentMatrix from @tangle-network/agent-eval/matrix under the\n// hood so cell scheduling + concurrency + cost ceiling are unified with\n// other matrix consumers.\n\nimport { mkdirSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type { AgentProfile } from '@tangle-network/sandbox'\nimport type { MatrixResult } from '../matrix'\nimport { runAgentMatrix } from '../matrix'\nimport { type JudgeConfig, type JudgeScore, runJudge } from './judges'\nimport { runMultishot } from './multishot'\nimport type {\n MultishotArtifact,\n MultishotMessage,\n MultishotPersona,\n MultishotShape,\n MultishotToolDefinition,\n MultishotToolExecutor,\n} from './types'\n\nexport interface ConversationJudgeInput<TPersona extends MultishotPersona> {\n transcript: MultishotMessage[]\n persona: TPersona\n}\n\nexport interface ArtifactJudgeInput<TPersona extends MultishotPersona> {\n artifact: MultishotArtifact\n persona: TPersona\n}\n\nexport interface MultishotJudges<TPersona extends MultishotPersona> {\n /** Scores the full transcript end-to-end (always runs). */\n conversation: JudgeConfig<ConversationJudgeInput<TPersona>>\n /** Scores each code-type artifact. Optional — omit when domain has no code artifacts. */\n codeReview?: JudgeConfig<ArtifactJudgeInput<TPersona>>\n /** Scores each non-code (research/content/template) artifact. Optional. */\n contentQuality?: JudgeConfig<ArtifactJudgeInput<TPersona>>\n /** Which artifact types route to codeReview. Defaults to ['code']. */\n codeArtifactTypes?: string[]\n /** Which artifact types route to contentQuality. Defaults to ['research']. */\n contentArtifactTypes?: string[]\n}\n\nexport interface CellCompositeScore {\n composite: number\n conversation: JudgeScore\n codeReview?: {\n perArtifact: Array<JudgeScore & { turn: number; type: string }>\n composite: number\n }\n contentQuality?: {\n perArtifact: Array<JudgeScore & { turn: number; type: string }>\n composite: number\n }\n}\n\nexport interface RunMultishotMatrixOptions<TPersona extends MultishotPersona> {\n /** AgentProfile axis (matrix primary). */\n profiles: Array<{ id: string; value: AgentProfile }>\n /** Persona axis. */\n personas: TPersona[]\n /** Persona-shaping callbacks. */\n shape: MultishotShape<TPersona>\n /** Judge configurations. */\n judges: MultishotJudges<TPersona>\n /** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */\n tools?: MultishotToolDefinition[]\n /** Map from tool name → inline executor. Must align with `tools`. */\n toolExecutors?: Record<string, MultishotToolExecutor>\n /** Tool name → artifact type label. Defaults to research/code mapping. */\n artifactTypeFor?: (toolName: string) => string | undefined\n /** Where per-cell artifacts land. Cells write to `<runDir>/<profileId>/<personaId>/rep-N/`. */\n runDir: string\n /** Replicates per (profile, persona) cell. */\n reps?: number\n /** Max conversation turns per cell. */\n maxTurns?: number\n /** Max concurrent cells. */\n maxConcurrency?: number\n /** Total $ ceiling across the matrix; cells aborted past this. */\n costCeiling?: number\n /** Agent model. */\n agentModel?: string\n /** Driver model. */\n driverModel?: string\n /** Pass-thru fields. */\n apiKey?: string\n baseUrl?: string\n}\n\ninterface CellOutput {\n turns: number\n toolCalls: number\n artifactCount: number\n}\n\nexport interface RunMultishotMatrixResult {\n matrix: MatrixResult<CellOutput>\n}\n\nexport async function runMultishotMatrix<TPersona extends MultishotPersona>(\n opts: RunMultishotMatrixOptions<TPersona>,\n): Promise<RunMultishotMatrixResult> {\n const codeTypes = new Set(opts.judges.codeArtifactTypes ?? ['code'])\n const contentTypes = new Set(opts.judges.contentArtifactTypes ?? ['research'])\n mkdirSync(opts.runDir, { recursive: true })\n\n const matrix = await runAgentMatrix<CellOutput>({\n axes: [\n { name: 'profile', values: opts.profiles },\n { name: 'persona', values: opts.personas.map((p) => ({ id: p.id, value: p })) },\n ],\n reps: opts.reps ?? 1,\n maxConcurrency: opts.maxConcurrency ?? 2,\n costCeiling: opts.costCeiling,\n async runCell(cell) {\n const profile = cell.axes.profile?.value as AgentProfile\n const persona = cell.axes.persona?.value as TPersona\n const profileId = String(cell.axes.profile?.id ?? 'unknown')\n const personaId = String(cell.axes.persona?.id ?? 'unknown')\n\n const sim = await runMultishot({\n profile,\n persona,\n shape: opts.shape,\n tools: opts.tools,\n toolExecutors: opts.toolExecutors,\n artifactTypeFor: opts.artifactTypeFor,\n maxTurns: opts.maxTurns,\n agentModel: opts.agentModel,\n driverModel: opts.driverModel,\n apiKey: opts.apiKey,\n baseUrl: opts.baseUrl,\n })\n\n const codeArtifacts = sim.artifacts.filter((a) => codeTypes.has(a.type))\n const contentArtifacts = sim.artifacts.filter((a) => contentTypes.has(a.type))\n\n const [conversation, codeReviews, contentReviews] = await Promise.all([\n runJudge(opts.judges.conversation, { transcript: sim.transcript, persona }),\n opts.judges.codeReview\n ? Promise.all(\n codeArtifacts.map((artifact) =>\n runJudge(opts.judges.codeReview!, { artifact, persona }).then((s) => ({\n ...s,\n turn: artifact.turn,\n type: artifact.type,\n })),\n ),\n )\n : Promise.resolve([] as Array<JudgeScore & { turn: number; type: string }>),\n opts.judges.contentQuality\n ? Promise.all(\n contentArtifacts.map((artifact) =>\n runJudge(opts.judges.contentQuality!, { artifact, persona }).then((s) => ({\n ...s,\n turn: artifact.turn,\n type: artifact.type,\n })),\n ),\n )\n : Promise.resolve([] as Array<JudgeScore & { turn: number; type: string }>),\n ])\n\n const codeComposite =\n codeReviews.length === 0\n ? 0\n : codeReviews.reduce((s, r) => s + r.composite, 0) / codeReviews.length\n const contentComposite =\n contentReviews.length === 0\n ? 0\n : contentReviews.reduce((s, r) => s + r.composite, 0) / contentReviews.length\n\n // Composite = mean of (conversation, code, content) — empty judges count 0.\n const judgeCount = 1 + (opts.judges.codeReview ? 1 : 0) + (opts.judges.contentQuality ? 1 : 0)\n const composite = (conversation.composite + codeComposite + contentComposite) / judgeCount\n\n const cellScore: CellCompositeScore = { composite, conversation }\n if (opts.judges.codeReview)\n cellScore.codeReview = { perArtifact: codeReviews, composite: codeComposite }\n if (opts.judges.contentQuality)\n cellScore.contentQuality = { perArtifact: contentReviews, composite: contentComposite }\n\n const cellDir = join(opts.runDir, profileId, personaId, `rep-${cell.rep}`)\n mkdirSync(cellDir, { recursive: true })\n writeFileSync(join(cellDir, 'transcript.json'), JSON.stringify(sim.transcript, null, 2))\n writeFileSync(join(cellDir, 'artifacts.json'), JSON.stringify(sim.artifacts, null, 2))\n writeFileSync(join(cellDir, 'scores.json'), JSON.stringify(cellScore, null, 2))\n\n const notes = [`convo=${conversation.composite.toFixed(1)}`]\n if (opts.judges.codeReview) notes.push(`code=${codeComposite.toFixed(1)}`)\n if (opts.judges.contentQuality) notes.push(`content=${contentComposite.toFixed(1)}`)\n\n return {\n output: {\n turns: sim.transcript.length,\n toolCalls: sim.toolCalls,\n artifactCount: sim.artifacts.length,\n },\n verdict: { valid: composite >= 5, score: composite, notes: notes.join(' ') },\n costUsd: sim.costUsd,\n durationMs: sim.durationMs,\n }\n },\n })\n\n // Persist top-level summary.\n const summary = {\n cells: matrix.summary.totalCells,\n passRate: matrix.summary.overallPassRate,\n meanScore: matrix.summary.overallMeanScore,\n totalCostUsd: matrix.summary.totalCostUsd,\n durationMs: matrix.summary.durationMs,\n runsExecuted: matrix.summary.runsExecuted,\n cellsSkipped: matrix.summary.cellsSkipped,\n byProfile: matrix.byAxis.profile,\n byPersona: matrix.byAxis.persona,\n }\n writeFileSync(join(opts.runDir, 'summary.json'), JSON.stringify(summary, null, 2))\n\n const md: string[] = [\n `# Multishot matrix`,\n ``,\n `**Cells**: ${matrix.summary.totalCells} | **Pass rate**: ${(matrix.summary.overallPassRate * 100).toFixed(0)}% | **Mean**: ${matrix.summary.overallMeanScore.toFixed(2)} | **Cost**: $${matrix.summary.totalCostUsd.toFixed(2)} | **Duration**: ${(matrix.summary.durationMs / 1000).toFixed(0)}s`,\n ``,\n `## By profile`,\n ``,\n '| profile | pass | mean | cost |',\n '|---|---|---|---|',\n ...Object.entries(matrix.byAxis.profile ?? {}).map(\n ([id, s]) =>\n `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`,\n ),\n ``,\n `## By persona`,\n ``,\n '| persona | pass | mean | cost |',\n '|---|---|---|---|',\n ...Object.entries(matrix.byAxis.persona ?? {}).map(\n ([id, s]) =>\n `| ${id} | ${(s.passRate * 100).toFixed(0)}% | ${s.meanScore.toFixed(2)} | $${s.totalCostUsd.toFixed(2)} |`,\n ),\n ``,\n ]\n writeFileSync(join(opts.runDir, 'summary.md'), md.join('\\n'))\n\n return { matrix }\n}\n","// Public types for the multishot substrate.\n\nexport interface MultishotMessage {\n role: 'user' | 'assistant' | 'tool'\n content: string\n toolCallId?: string\n toolCalls?: Array<{ id: string; name: string; args: Record<string, unknown> }>\n}\n\nexport interface MultishotArtifact {\n type: string\n turn: number\n invocation: { name: string; args: Record<string, unknown> }\n content: string\n}\n\nexport interface MultishotResult {\n transcript: MultishotMessage[]\n artifacts: MultishotArtifact[]\n toolCalls: number\n durationMs: number\n costUsd: number\n}\n\nexport interface MultishotToolDefinition {\n type: 'function'\n function: {\n name: string\n description: string\n parameters: Record<string, unknown>\n }\n}\n\nexport type MultishotToolExecutor = (\n args: Record<string, unknown>,\n ctx: { apiKey: string; baseUrl: string; signal?: AbortSignal },\n) => Promise<{ content: string; costUsd: number }>\n\nexport interface MultishotPersona {\n /** Stable identifier — used for per-cell artifact paths + matrix axis keys. */\n id: string\n /** Per-domain payload (income/profile/voice/etc.) shaped by the consumer. */\n [k: string]: unknown\n}\n\nexport interface MultishotShape<TPersona extends MultishotPersona> {\n /** Opening user message (turn 0) — the persona's first ask. */\n buildOpener: (persona: TPersona) => string\n /** System prompt the driver LLM uses to roleplay the persona. Should set\n * voice, goals, constraints, time-pressure, and the \"never go silent\" rule. */\n buildDriverSystemPrompt: (persona: TPersona) => string\n}\n\nexport class MultishotDriverEmptyError extends Error {\n constructor(public readonly turn: number) {\n super(`multishot: driver returned empty content twice at turn ${turn} — failing loud`)\n this.name = 'MultishotDriverEmptyError'\n }\n}\n","// Multi-turn driver-agent simulation with inline tool execution.\n//\n// The driver = LLM acting as the persona (reactive, non-deterministic).\n// The agent = the product agent under test (router call with profile's\n// systemPrompt + the configured tools).\n// Tool calls execute inline via the configured executors and feed back\n// into the agent's message log so the agent integrates the result.\n\nimport type { AgentProfile } from '@tangle-network/sandbox'\nimport { defaultDelegationTools } from './default-tools'\nimport {\n defaultRouterBaseUrl,\n estimateRouterCost,\n requireRouterApiKey,\n routerCompletion,\n} from './router'\nimport {\n type MultishotArtifact,\n MultishotDriverEmptyError,\n type MultishotMessage,\n type MultishotPersona,\n type MultishotResult,\n type MultishotShape,\n type MultishotToolDefinition,\n type MultishotToolExecutor,\n} from './types'\n\nexport interface RunMultishotOptions<TPersona extends MultishotPersona> {\n profile: AgentProfile\n persona: TPersona\n shape: MultishotShape<TPersona>\n /** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */\n tools?: MultishotToolDefinition[]\n /** Map from tool name → executor invoked inline when the agent emits a tool_call. */\n toolExecutors?: Record<string, MultishotToolExecutor>\n /** Map from tool name → artifact type label written into MultishotArtifact.type.\n * Tools without a mapping still execute, but their results aren't surfaced as\n * typed artifacts (only as tool messages in the transcript). */\n artifactTypeFor?: (toolName: string) => string | undefined\n maxTurns?: number\n agentModel?: string\n driverModel?: string\n apiKey?: string\n baseUrl?: string\n signal?: AbortSignal\n}\n\nexport async function runMultishot<TPersona extends MultishotPersona>(\n opts: RunMultishotOptions<TPersona>,\n): Promise<MultishotResult> {\n const apiKey = opts.apiKey ?? requireRouterApiKey()\n const baseUrl = opts.baseUrl ?? defaultRouterBaseUrl()\n const maxTurns = opts.maxTurns ?? 10\n const agentModel = opts.agentModel ?? 'openai/gpt-5.4'\n const driverModel = opts.driverModel ?? 'openai/gpt-4o-mini'\n\n const bundle =\n opts.tools && opts.toolExecutors\n ? {\n tools: opts.tools,\n executors: opts.toolExecutors,\n artifactTypeFor: opts.artifactTypeFor ?? (() => undefined),\n }\n : defaultDelegationTools()\n const tools = opts.tools ?? bundle.tools\n const executors = opts.toolExecutors ?? bundle.executors\n const artifactTypeFor = opts.artifactTypeFor ?? bundle.artifactTypeFor\n\n const start = Date.now()\n const transcript: MultishotMessage[] = []\n const artifacts: MultishotArtifact[] = []\n let toolCalls = 0\n let totalCostUsd = 0\n\n const opener = opts.shape.buildOpener(opts.persona)\n transcript.push({ role: 'user', content: opener })\n\n const systemPrompt = opts.profile.prompt?.systemPrompt ?? ''\n const agentMessages: Array<Record<string, unknown>> = [\n { role: 'system', content: systemPrompt },\n { role: 'user', content: opener },\n ]\n\n for (let turn = 0; turn < maxTurns; turn++) {\n if (opts.signal?.aborted) throw new Error('multishot aborted')\n\n const { message: agentMsg, usage: agentUsage } = await routerCompletion({\n apiKey,\n baseUrl,\n model: agentModel,\n messages: agentMessages,\n tools,\n temperature: 0.7,\n maxTokens: 2500,\n signal: opts.signal,\n })\n totalCostUsd += estimateRouterCost(agentModel, agentUsage)\n\n const agentText = (agentMsg.content ?? '').trim()\n const agentToolCalls = (agentMsg.tool_calls ?? []).map((tc) => ({\n id: tc.id,\n name: tc.function.name,\n args: (() => {\n try {\n return JSON.parse(tc.function.arguments) as Record<string, unknown>\n } catch {\n return {} as Record<string, unknown>\n }\n })(),\n }))\n\n agentMessages.push({\n role: 'assistant',\n content: agentText || null,\n ...(agentMsg.tool_calls?.length ? { tool_calls: agentMsg.tool_calls } : {}),\n })\n transcript.push({\n role: 'assistant',\n content: agentText,\n toolCalls: agentToolCalls.length > 0 ? agentToolCalls : undefined,\n })\n\n for (const tc of agentToolCalls) {\n toolCalls++\n let toolResult = ''\n try {\n const executor = executors[tc.name]\n if (!executor) {\n toolResult = JSON.stringify({ error: `unknown tool ${tc.name}` })\n } else {\n const r = await executor(tc.args, { apiKey, baseUrl, signal: opts.signal })\n toolResult = r.content\n totalCostUsd += r.costUsd\n const artifactType = artifactTypeFor(tc.name)\n if (artifactType) {\n artifacts.push({\n type: artifactType,\n turn,\n invocation: { name: tc.name, args: tc.args },\n content: toolResult,\n })\n }\n }\n } catch (err) {\n toolResult = JSON.stringify({ error: err instanceof Error ? err.message : String(err) })\n }\n agentMessages.push({ role: 'tool', tool_call_id: tc.id, content: toolResult || 'done' })\n transcript.push({ role: 'tool', content: toolResult || 'done', toolCallId: tc.id })\n }\n\n // If the agent emitted tool_calls, give it a follow-up turn to integrate the results.\n if (agentToolCalls.length > 0) {\n const followUp = await routerCompletion({\n apiKey,\n baseUrl,\n model: agentModel,\n messages: agentMessages,\n temperature: 0.7,\n maxTokens: 2000,\n signal: opts.signal,\n })\n totalCostUsd += estimateRouterCost(agentModel, followUp.usage)\n const followUpText = (followUp.message.content ?? '').trim()\n agentMessages.push({ role: 'assistant', content: followUpText })\n transcript.push({ role: 'assistant', content: followUpText })\n }\n\n if (turn < maxTurns - 1) {\n const driver = await driverTurn({\n apiKey,\n baseUrl,\n persona: opts.persona,\n shape: opts.shape,\n transcript,\n turn,\n model: driverModel,\n signal: opts.signal,\n })\n totalCostUsd += driver.costUsd\n agentMessages.push({ role: 'user', content: driver.content })\n transcript.push({ role: 'user', content: driver.content })\n }\n }\n\n return { transcript, artifacts, toolCalls, durationMs: Date.now() - start, costUsd: totalCostUsd }\n}\n\nasync function driverTurn<TPersona extends MultishotPersona>(opts: {\n apiKey: string\n baseUrl: string\n persona: TPersona\n shape: MultishotShape<TPersona>\n transcript: MultishotMessage[]\n turn: number\n model: string\n signal?: AbortSignal\n}): Promise<{ content: string; costUsd: number }> {\n const driverSystem = opts.shape.buildDriverSystemPrompt(opts.persona)\n\n // Translate transcript to driver POV: agent's `assistant` messages become\n // `user` (the agent talking TO the driver); the driver's prior `user`\n // messages become `assistant` (the driver's prior responses).\n const driverMessages: Array<Record<string, unknown>> = [{ role: 'system', content: driverSystem }]\n for (const msg of opts.transcript) {\n if (msg.role === 'tool') continue\n if (msg.role === 'assistant') driverMessages.push({ role: 'user', content: msg.content })\n else if (msg.role === 'user') driverMessages.push({ role: 'assistant', content: msg.content })\n }\n\n // Driver must never go silent. Retry once on empty content; then fail loud.\n for (let attempt = 0; attempt < 2; attempt++) {\n const { message, usage } = await routerCompletion({\n apiKey: opts.apiKey,\n baseUrl: opts.baseUrl,\n model: opts.model,\n messages: driverMessages,\n temperature: 0.9,\n maxTokens: 600,\n signal: opts.signal,\n })\n const content = (message.content ?? '').trim()\n if (content.length > 0) return { content, costUsd: estimateRouterCost(opts.model, usage) }\n }\n throw new MultishotDriverEmptyError(opts.turn)\n}\n"],"mappings":";;;;;;AA4BA,eAAsB,iBACpB,KACmC;AACnC,QAAM,OAAgC;AAAA,IACpC,OAAO,IAAI;AAAA,IACX,UAAU,IAAI;AAAA,IACd,aAAa,IAAI,eAAe;AAAA,IAChC,YAAY,IAAI,aAAa;AAAA,EAC/B;AACA,MAAI,IAAI,OAAO,OAAQ,MAAK,QAAQ,IAAI;AACxC,QAAM,MAAM,GAAG,IAAI,QAAQ,QAAQ,QAAQ,EAAE,CAAC;AAC9C,QAAM,MAAM,MAAM,MAAM,KAAK;AAAA,IAC3B,QAAQ;AAAA,IACR,SAAS,EAAE,eAAe,UAAU,IAAI,MAAM,IAAI,gBAAgB,mBAAmB;AAAA,IACrF,MAAM,KAAK,UAAU,IAAI;AAAA,IACzB,QAAQ,IAAI;AAAA,EACd,CAAC;AACD,MAAI,CAAC,IAAI,IAAI;AACX,UAAM,OAAO,MAAM,IAAI,KAAK;AAC5B,UAAM,IAAI,MAAM,UAAU,IAAI,MAAM,KAAK,KAAK,MAAM,GAAG,GAAG,CAAC,EAAE;AAAA,EAC/D;AACA,QAAM,OAAQ,MAAM,IAAI,KAAK;AAI7B,QAAM,SAAS,KAAK,QAAQ,CAAC;AAC7B,MAAI,CAAC,OAAQ,OAAM,IAAI,MAAM,+BAA+B,KAAK,UAAU,IAAI,EAAE,MAAM,GAAG,GAAG,CAAC,EAAE;AAChG,SAAO,EAAE,SAAS,OAAO,SAAS,OAAO,KAAK,MAAM;AACtD;AAIO,SAAS,mBACd,OACA,OACQ;AACR,MAAI,CAAC,MAAO,QAAO;AACnB,QAAM,WAAW,MAAM,iBAAiB;AACxC,QAAM,YAAY,MAAM,qBAAqB;AAC7C,MAAI,UAAU;AACd,MAAI,WAAW;AACf,MAAI,MAAM,SAAS,aAAa,GAAG;AACjC,cAAU;AACV,eAAW;AAAA,EACb,WAAW,MAAM,SAAS,SAAS,KAAK,MAAM,SAAS,eAAe,GAAG;AACvE,cAAU;AACV,eAAW;AAAA,EACb,WAAW,MAAM,SAAS,MAAM,KAAK,MAAM,SAAS,KAAK,KAAK,MAAM,SAAS,UAAU,GAAG;AACxF,cAAU;AACV,eAAW;AAAA,EACb;AACA,UAAQ,WAAW,UAAU,YAAY,YAAY;AACvD;AAEO,SAAS,uBAA+B;AAC7C,UAAQ,QAAQ,IAAI,0BAA0B,kCAAkC;AAAA,IAC9E;AAAA,IACA;AAAA,EACF;AACF;AAEO,SAAS,sBAA8B;AAC5C,QAAM,MAAM,QAAQ,IAAI;AACxB,MAAI,CAAC,IAAK,OAAM,IAAI,MAAM,gEAAgE;AAC1F,SAAO;AACT;;;ACnFO,IAAM,2BAA2B;AACjC,IAAM,sBAAsB;AAgBnC,IAAM,4BACJ;AAEF,IAAM,uBACJ;AAEK,IAAM,iCAA0D;AAAA,EACrE,MAAM;AAAA,EACN,UAAU;AAAA,IACR,MAAM;AAAA,IACN,aACE;AAAA,IACF,YAAY;AAAA,MACV,MAAM;AAAA,MACN,YAAY;AAAA,QACV,UAAU,EAAE,MAAM,UAAU,aAAa,gCAAgC;AAAA,QACzE,OAAO;AAAA,UACL,MAAM;AAAA,UACN,aAAa;AAAA,QACf;AAAA,MACF;AAAA,MACA,UAAU,CAAC,UAAU;AAAA,IACvB;AAAA,EACF;AACF;AAEO,IAAM,6BAAsD;AAAA,EACjE,MAAM;AAAA,EACN,UAAU;AAAA,IACR,MAAM;AAAA,IACN,aACE;AAAA,IACF,YAAY;AAAA,MACV,MAAM;AAAA,MACN,YAAY;AAAA,QACV,MAAM,EAAE,MAAM,UAAU,aAAa,gCAAgC;AAAA,QACrE,UAAU;AAAA,UACR,MAAM;AAAA,UACN,aAAa;AAAA,QACf;AAAA,MACF;AAAA,MACA,UAAU,CAAC,MAAM;AAAA,IACnB;AAAA,EACF;AACF;AAEO,SAAS,uBACd,SAAkC,CAAC,GACZ;AACvB,QAAM,eAAe,OAAO,gBAAgB;AAC5C,QAAM,QAAQ,OAAO,SAAS;AAC9B,SAAO,OAAO,MAAM,QAAQ;AAC1B,UAAM,WAAW,OAAO,KAAK,YAAY,EAAE;AAC3C,UAAM,QAAQ,KAAK,QAAQ,OAAO,KAAK,KAAK,IAAI;AAChD,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI;AAAA,MACb;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,aAAa;AAAA,QACxC,EAAE,MAAM,QAAQ,SAAS,aAAa,QAAQ,GAAG,QAAQ;AAAA,SAAY,KAAK,KAAK,EAAE,GAAG;AAAA,MACtF;AAAA,MACA,QAAQ,IAAI;AAAA,IACd,CAAC;AACD,WAAO,EAAE,SAAS,QAAQ,WAAW,IAAI,SAAS,mBAAmB,OAAO,KAAK,EAAE;AAAA,EACrF;AACF;AAEO,SAAS,mBAAmB,SAA6B,CAAC,GAA0B;AACzF,QAAM,eAAe,OAAO,gBAAgB;AAC5C,QAAM,QAAQ,OAAO,SAAS;AAC9B,SAAO,OAAO,MAAM,QAAQ;AAC1B,UAAM,OAAO,OAAO,KAAK,QAAQ,EAAE;AACnC,UAAM,WAAW,KAAK,WAAW,OAAO,KAAK,QAAQ,IAAI;AACzD,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI;AAAA,MACb;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,GAAG,YAAY;AAAA;AAAA,YAAiB,QAAQ,GAAG;AAAA,QACtE,EAAE,MAAM,QAAQ,SAAS,YAAY,IAAI,GAAG;AAAA,MAC9C;AAAA,MACA,QAAQ,IAAI;AAAA,IACd,CAAC;AACD,WAAO,EAAE,SAAS,QAAQ,WAAW,IAAI,SAAS,mBAAmB,OAAO,KAAK,EAAE;AAAA,EACrF;AACF;AAgBO,SAAS,uBAAuB,SAA6B,CAAC,GAAuB;AAC1F,SAAO;AAAA,IACL,OAAO,CAAC,gCAAgC,0BAA0B;AAAA,IAClE,WAAW;AAAA,MACT,mBAAmB,uBAAuB,OAAO,QAAQ;AAAA,MACzD,eAAe,mBAAmB,OAAO,IAAI;AAAA,IAC/C;AAAA,IACA,iBAAiB,CAAC,SAChB,SAAS,sBAAsB,aAAa,SAAS,kBAAkB,SAAS;AAAA,EACpF;AACF;;;ACpIO,IAAM,sBAAsB;AAmCnC,IAAM,aAAyB,EAAE,YAAY,CAAC,GAAG,WAAW,GAAG,OAAO,eAAe;AAErF,eAAsB,SACpB,OACA,OACqB;AACrB,QAAM,SAAS,MAAM,UAAU,oBAAoB;AACnD,QAAM,UAAU,MAAM,WAAW,qBAAqB;AACtD,QAAM,QAAQ,MAAM,SAAS,QAAQ,IAAI,eAAe;AACxD,QAAM,SAAS,MAAM,YAAY,KAAK;AACtC,MAAI,MAAM;AACV,MAAI;AACF,UAAM,EAAE,QAAQ,IAAI,MAAM,iBAAiB;AAAA,MACzC;AAAA,MACA;AAAA,MACA;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,UAAU;AAAA,QACR,EAAE,MAAM,UAAU,SAAS,MAAM,aAAa;AAAA,QAC9C,EAAE,MAAM,QAAQ,SAAS,OAAO;AAAA,MAClC;AAAA,IACF,CAAC;AACD,WAAO,QAAQ,WAAW,IAAI,KAAK;AAAA,EACrC,SAAS,KAAK;AACZ,WAAO;AAAA,MACL,GAAG;AAAA,MACH,OAAO,SAAS,MAAM,IAAI,iBAAiB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,IAC7F;AAAA,EACF;AAEA,MAAI,SAAyC;AAC7C,MAAI;AACF,UAAM,UAAU,IACb,QAAQ,gBAAgB,EAAE,EAC1B,QAAQ,WAAW,EAAE,EACrB,KAAK;AACR,aAAS,KAAK,MAAM,OAAO;AAAA,EAC7B,QAAQ;AACN,WAAO,EAAE,GAAG,YAAY,OAAO,SAAS,MAAM,IAAI,uBAAuB,IAAI,MAAM,GAAG,GAAG,CAAC,GAAG;AAAA,EAC/F;AAEA,QAAM,aAAqC,CAAC;AAC5C,MAAI,MAAM;AACV,aAAW,OAAO,MAAM,YAAY;AAClC,UAAM,IAAI,OAAO,OAAO,IAAI,GAAG,KAAK,CAAC;AACrC,UAAM,UAAU,OAAO,SAAS,CAAC,IAAI,KAAK,IAAI,GAAG,KAAK,IAAI,IAAI,CAAC,CAAC,IAAI;AACpE,eAAW,IAAI,GAAG,IAAI;AACtB,WAAO;AAAA,EACT;AACA,SAAO;AAAA,IACL;AAAA,IACA,WAAW,MAAM,WAAW,WAAW,IAAI,IAAI,MAAM,MAAM,WAAW;AAAA,IACtE,OAAO,OAAO,OAAO,UAAU,WAAW,OAAO,QAAQ;AAAA,EAC3D;AACF;AAIO,SAAS,iBAAiB,MAAyC;AACxE,SAAO,KAAK,IAAI,CAAC,MAAM,KAAK,EAAE,GAAG,KAAK,EAAE,WAAW,EAAE,EAAE,KAAK,IAAI;AAClE;AAGO,SAAS,iBAAiB,MAAyC;AACxE,QAAM,SAAS,KAAK,IAAI,CAAC,MAAM,IAAI,EAAE,GAAG,KAAK,EAAE,KAAK,GAAG;AACvD,SAAO;AAAA,GAA6D,MAAM;AAC5E;;;ACzGA,SAAS,WAAW,qBAAqB;AACzC,SAAS,YAAY;;;AC4Cd,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YAA4B,MAAc;AACxC,UAAM,0DAA0D,IAAI,sBAAiB;AAD3D;AAE1B,SAAK,OAAO;AAAA,EACd;AAAA,EAH4B;AAI9B;;;ACXA,eAAsB,aACpB,MAC0B;AAC1B,QAAM,SAAS,KAAK,UAAU,oBAAoB;AAClD,QAAM,UAAU,KAAK,WAAW,qBAAqB;AACrD,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,aAAa,KAAK,cAAc;AACtC,QAAM,cAAc,KAAK,eAAe;AAExC,QAAM,SACJ,KAAK,SAAS,KAAK,gBACf;AAAA,IACE,OAAO,KAAK;AAAA,IACZ,WAAW,KAAK;AAAA,IAChB,iBAAiB,KAAK,oBAAoB,MAAM;AAAA,EAClD,IACA,uBAAuB;AAC7B,QAAM,QAAQ,KAAK,SAAS,OAAO;AACnC,QAAM,YAAY,KAAK,iBAAiB,OAAO;AAC/C,QAAM,kBAAkB,KAAK,mBAAmB,OAAO;AAEvD,QAAM,QAAQ,KAAK,IAAI;AACvB,QAAM,aAAiC,CAAC;AACxC,QAAM,YAAiC,CAAC;AACxC,MAAI,YAAY;AAChB,MAAI,eAAe;AAEnB,QAAM,SAAS,KAAK,MAAM,YAAY,KAAK,OAAO;AAClD,aAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,CAAC;AAEjD,QAAM,eAAe,KAAK,QAAQ,QAAQ,gBAAgB;AAC1D,QAAM,gBAAgD;AAAA,IACpD,EAAE,MAAM,UAAU,SAAS,aAAa;AAAA,IACxC,EAAE,MAAM,QAAQ,SAAS,OAAO;AAAA,EAClC;AAEA,WAAS,OAAO,GAAG,OAAO,UAAU,QAAQ;AAC1C,QAAI,KAAK,QAAQ,QAAS,OAAM,IAAI,MAAM,mBAAmB;AAE7D,UAAM,EAAE,SAAS,UAAU,OAAO,WAAW,IAAI,MAAM,iBAAiB;AAAA,MACtE;AAAA,MACA;AAAA,MACA,OAAO;AAAA,MACP,UAAU;AAAA,MACV;AAAA,MACA,aAAa;AAAA,MACb,WAAW;AAAA,MACX,QAAQ,KAAK;AAAA,IACf,CAAC;AACD,oBAAgB,mBAAmB,YAAY,UAAU;AAEzD,UAAM,aAAa,SAAS,WAAW,IAAI,KAAK;AAChD,UAAM,kBAAkB,SAAS,cAAc,CAAC,GAAG,IAAI,CAAC,QAAQ;AAAA,MAC9D,IAAI,GAAG;AAAA,MACP,MAAM,GAAG,SAAS;AAAA,MAClB,OAAO,MAAM;AACX,YAAI;AACF,iBAAO,KAAK,MAAM,GAAG,SAAS,SAAS;AAAA,QACzC,QAAQ;AACN,iBAAO,CAAC;AAAA,QACV;AAAA,MACF,GAAG;AAAA,IACL,EAAE;AAEF,kBAAc,KAAK;AAAA,MACjB,MAAM;AAAA,MACN,SAAS,aAAa;AAAA,MACtB,GAAI,SAAS,YAAY,SAAS,EAAE,YAAY,SAAS,WAAW,IAAI,CAAC;AAAA,IAC3E,CAAC;AACD,eAAW,KAAK;AAAA,MACd,MAAM;AAAA,MACN,SAAS;AAAA,MACT,WAAW,eAAe,SAAS,IAAI,iBAAiB;AAAA,IAC1D,CAAC;AAED,eAAW,MAAM,gBAAgB;AAC/B;AACA,UAAI,aAAa;AACjB,UAAI;AACF,cAAM,WAAW,UAAU,GAAG,IAAI;AAClC,YAAI,CAAC,UAAU;AACb,uBAAa,KAAK,UAAU,EAAE,OAAO,gBAAgB,GAAG,IAAI,GAAG,CAAC;AAAA,QAClE,OAAO;AACL,gBAAM,IAAI,MAAM,SAAS,GAAG,MAAM,EAAE,QAAQ,SAAS,QAAQ,KAAK,OAAO,CAAC;AAC1E,uBAAa,EAAE;AACf,0BAAgB,EAAE;AAClB,gBAAM,eAAe,gBAAgB,GAAG,IAAI;AAC5C,cAAI,cAAc;AAChB,sBAAU,KAAK;AAAA,cACb,MAAM;AAAA,cACN;AAAA,cACA,YAAY,EAAE,MAAM,GAAG,MAAM,MAAM,GAAG,KAAK;AAAA,cAC3C,SAAS;AAAA,YACX,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF,SAAS,KAAK;AACZ,qBAAa,KAAK,UAAU,EAAE,OAAO,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,EAAE,CAAC;AAAA,MACzF;AACA,oBAAc,KAAK,EAAE,MAAM,QAAQ,cAAc,GAAG,IAAI,SAAS,cAAc,OAAO,CAAC;AACvF,iBAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,cAAc,QAAQ,YAAY,GAAG,GAAG,CAAC;AAAA,IACpF;AAGA,QAAI,eAAe,SAAS,GAAG;AAC7B,YAAM,WAAW,MAAM,iBAAiB;AAAA,QACtC;AAAA,QACA;AAAA,QACA,OAAO;AAAA,QACP,UAAU;AAAA,QACV,aAAa;AAAA,QACb,WAAW;AAAA,QACX,QAAQ,KAAK;AAAA,MACf,CAAC;AACD,sBAAgB,mBAAmB,YAAY,SAAS,KAAK;AAC7D,YAAM,gBAAgB,SAAS,QAAQ,WAAW,IAAI,KAAK;AAC3D,oBAAc,KAAK,EAAE,MAAM,aAAa,SAAS,aAAa,CAAC;AAC/D,iBAAW,KAAK,EAAE,MAAM,aAAa,SAAS,aAAa,CAAC;AAAA,IAC9D;AAEA,QAAI,OAAO,WAAW,GAAG;AACvB,YAAM,SAAS,MAAM,WAAW;AAAA,QAC9B;AAAA,QACA;AAAA,QACA,SAAS,KAAK;AAAA,QACd,OAAO,KAAK;AAAA,QACZ;AAAA,QACA;AAAA,QACA,OAAO;AAAA,QACP,QAAQ,KAAK;AAAA,MACf,CAAC;AACD,sBAAgB,OAAO;AACvB,oBAAc,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,QAAQ,CAAC;AAC5D,iBAAW,KAAK,EAAE,MAAM,QAAQ,SAAS,OAAO,QAAQ,CAAC;AAAA,IAC3D;AAAA,EACF;AAEA,SAAO,EAAE,YAAY,WAAW,WAAW,YAAY,KAAK,IAAI,IAAI,OAAO,SAAS,aAAa;AACnG;AAEA,eAAe,WAA8C,MASX;AAChD,QAAM,eAAe,KAAK,MAAM,wBAAwB,KAAK,OAAO;AAKpE,QAAM,iBAAiD,CAAC,EAAE,MAAM,UAAU,SAAS,aAAa,CAAC;AACjG,aAAW,OAAO,KAAK,YAAY;AACjC,QAAI,IAAI,SAAS,OAAQ;AACzB,QAAI,IAAI,SAAS,YAAa,gBAAe,KAAK,EAAE,MAAM,QAAQ,SAAS,IAAI,QAAQ,CAAC;AAAA,aAC/E,IAAI,SAAS,OAAQ,gBAAe,KAAK,EAAE,MAAM,aAAa,SAAS,IAAI,QAAQ,CAAC;AAAA,EAC/F;AAGA,WAAS,UAAU,GAAG,UAAU,GAAG,WAAW;AAC5C,UAAM,EAAE,SAAS,MAAM,IAAI,MAAM,iBAAiB;AAAA,MAChD,QAAQ,KAAK;AAAA,MACb,SAAS,KAAK;AAAA,MACd,OAAO,KAAK;AAAA,MACZ,UAAU;AAAA,MACV,aAAa;AAAA,MACb,WAAW;AAAA,MACX,QAAQ,KAAK;AAAA,IACf,CAAC;AACD,UAAM,WAAW,QAAQ,WAAW,IAAI,KAAK;AAC7C,QAAI,QAAQ,SAAS,EAAG,QAAO,EAAE,SAAS,SAAS,mBAAmB,KAAK,OAAO,KAAK,EAAE;AAAA,EAC3F;AACA,QAAM,IAAI,0BAA0B,KAAK,IAAI;AAC/C;;;AFxHA,eAAsB,mBACpB,MACmC;AACnC,QAAM,YAAY,IAAI,IAAI,KAAK,OAAO,qBAAqB,CAAC,MAAM,CAAC;AACnE,QAAM,eAAe,IAAI,IAAI,KAAK,OAAO,wBAAwB,CAAC,UAAU,CAAC;AAC7E,YAAU,KAAK,QAAQ,EAAE,WAAW,KAAK,CAAC;AAE1C,QAAM,SAAS,MAAM,eAA2B;AAAA,IAC9C,MAAM;AAAA,MACJ,EAAE,MAAM,WAAW,QAAQ,KAAK,SAAS;AAAA,MACzC,EAAE,MAAM,WAAW,QAAQ,KAAK,SAAS,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,OAAO,EAAE,EAAE,EAAE;AAAA,IAChF;AAAA,IACA,MAAM,KAAK,QAAQ;AAAA,IACnB,gBAAgB,KAAK,kBAAkB;AAAA,IACvC,aAAa,KAAK;AAAA,IAClB,MAAM,QAAQ,MAAM;AAClB,YAAM,UAAU,KAAK,KAAK,SAAS;AACnC,YAAM,UAAU,KAAK,KAAK,SAAS;AACnC,YAAM,YAAY,OAAO,KAAK,KAAK,SAAS,MAAM,SAAS;AAC3D,YAAM,YAAY,OAAO,KAAK,KAAK,SAAS,MAAM,SAAS;AAE3D,YAAM,MAAM,MAAM,aAAa;AAAA,QAC7B;AAAA,QACA;AAAA,QACA,OAAO,KAAK;AAAA,QACZ,OAAO,KAAK;AAAA,QACZ,eAAe,KAAK;AAAA,QACpB,iBAAiB,KAAK;AAAA,QACtB,UAAU,KAAK;AAAA,QACf,YAAY,KAAK;AAAA,QACjB,aAAa,KAAK;AAAA,QAClB,QAAQ,KAAK;AAAA,QACb,SAAS,KAAK;AAAA,MAChB,CAAC;AAED,YAAM,gBAAgB,IAAI,UAAU,OAAO,CAAC,MAAM,UAAU,IAAI,EAAE,IAAI,CAAC;AACvE,YAAM,mBAAmB,IAAI,UAAU,OAAO,CAAC,MAAM,aAAa,IAAI,EAAE,IAAI,CAAC;AAE7E,YAAM,CAAC,cAAc,aAAa,cAAc,IAAI,MAAM,QAAQ,IAAI;AAAA,QACpE,SAAS,KAAK,OAAO,cAAc,EAAE,YAAY,IAAI,YAAY,QAAQ,CAAC;AAAA,QAC1E,KAAK,OAAO,aACR,QAAQ;AAAA,UACN,cAAc;AAAA,YAAI,CAAC,aACjB,SAAS,KAAK,OAAO,YAAa,EAAE,UAAU,QAAQ,CAAC,EAAE,KAAK,CAAC,OAAO;AAAA,cACpE,GAAG;AAAA,cACH,MAAM,SAAS;AAAA,cACf,MAAM,SAAS;AAAA,YACjB,EAAE;AAAA,UACJ;AAAA,QACF,IACA,QAAQ,QAAQ,CAAC,CAAuD;AAAA,QAC5E,KAAK,OAAO,iBACR,QAAQ;AAAA,UACN,iBAAiB;AAAA,YAAI,CAAC,aACpB,SAAS,KAAK,OAAO,gBAAiB,EAAE,UAAU,QAAQ,CAAC,EAAE,KAAK,CAAC,OAAO;AAAA,cACxE,GAAG;AAAA,cACH,MAAM,SAAS;AAAA,cACf,MAAM,SAAS;AAAA,YACjB,EAAE;AAAA,UACJ;AAAA,QACF,IACA,QAAQ,QAAQ,CAAC,CAAuD;AAAA,MAC9E,CAAC;AAED,YAAM,gBACJ,YAAY,WAAW,IACnB,IACA,YAAY,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,YAAY;AACrE,YAAM,mBACJ,eAAe,WAAW,IACtB,IACA,eAAe,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,eAAe;AAG3E,YAAM,aAAa,KAAK,KAAK,OAAO,aAAa,IAAI,MAAM,KAAK,OAAO,iBAAiB,IAAI;AAC5F,YAAM,aAAa,aAAa,YAAY,gBAAgB,oBAAoB;AAEhF,YAAM,YAAgC,EAAE,WAAW,aAAa;AAChE,UAAI,KAAK,OAAO;AACd,kBAAU,aAAa,EAAE,aAAa,aAAa,WAAW,cAAc;AAC9E,UAAI,KAAK,OAAO;AACd,kBAAU,iBAAiB,EAAE,aAAa,gBAAgB,WAAW,iBAAiB;AAExF,YAAM,UAAU,KAAK,KAAK,QAAQ,WAAW,WAAW,OAAO,KAAK,GAAG,EAAE;AACzE,gBAAU,SAAS,EAAE,WAAW,KAAK,CAAC;AACtC,oBAAc,KAAK,SAAS,iBAAiB,GAAG,KAAK,UAAU,IAAI,YAAY,MAAM,CAAC,CAAC;AACvF,oBAAc,KAAK,SAAS,gBAAgB,GAAG,KAAK,UAAU,IAAI,WAAW,MAAM,CAAC,CAAC;AACrF,oBAAc,KAAK,SAAS,aAAa,GAAG,KAAK,UAAU,WAAW,MAAM,CAAC,CAAC;AAE9E,YAAM,QAAQ,CAAC,SAAS,aAAa,UAAU,QAAQ,CAAC,CAAC,EAAE;AAC3D,UAAI,KAAK,OAAO,WAAY,OAAM,KAAK,QAAQ,cAAc,QAAQ,CAAC,CAAC,EAAE;AACzE,UAAI,KAAK,OAAO,eAAgB,OAAM,KAAK,WAAW,iBAAiB,QAAQ,CAAC,CAAC,EAAE;AAEnF,aAAO;AAAA,QACL,QAAQ;AAAA,UACN,OAAO,IAAI,WAAW;AAAA,UACtB,WAAW,IAAI;AAAA,UACf,eAAe,IAAI,UAAU;AAAA,QAC/B;AAAA,QACA,SAAS,EAAE,OAAO,aAAa,GAAG,OAAO,WAAW,OAAO,MAAM,KAAK,GAAG,EAAE;AAAA,QAC3E,SAAS,IAAI;AAAA,QACb,YAAY,IAAI;AAAA,MAClB;AAAA,IACF;AAAA,EACF,CAAC;AAGD,QAAM,UAAU;AAAA,IACd,OAAO,OAAO,QAAQ;AAAA,IACtB,UAAU,OAAO,QAAQ;AAAA,IACzB,WAAW,OAAO,QAAQ;AAAA,IAC1B,cAAc,OAAO,QAAQ;AAAA,IAC7B,YAAY,OAAO,QAAQ;AAAA,IAC3B,cAAc,OAAO,QAAQ;AAAA,IAC7B,cAAc,OAAO,QAAQ;AAAA,IAC7B,WAAW,OAAO,OAAO;AAAA,IACzB,WAAW,OAAO,OAAO;AAAA,EAC3B;AACA,gBAAc,KAAK,KAAK,QAAQ,cAAc,GAAG,KAAK,UAAU,SAAS,MAAM,CAAC,CAAC;AAEjF,QAAM,KAAe;AAAA,IACnB;AAAA,IACA;AAAA,IACA,cAAc,OAAO,QAAQ,UAAU,sBAAsB,OAAO,QAAQ,kBAAkB,KAAK,QAAQ,CAAC,CAAC,iBAAiB,OAAO,QAAQ,iBAAiB,QAAQ,CAAC,CAAC,iBAAiB,OAAO,QAAQ,aAAa,QAAQ,CAAC,CAAC,qBAAqB,OAAO,QAAQ,aAAa,KAAM,QAAQ,CAAC,CAAC;AAAA,IAChS;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,OAAO,QAAQ,OAAO,OAAO,WAAW,CAAC,CAAC,EAAE;AAAA,MAC7C,CAAC,CAAC,IAAI,CAAC,MACL,KAAK,EAAE,OAAO,EAAE,WAAW,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,UAAU,QAAQ,CAAC,CAAC,OAAO,EAAE,aAAa,QAAQ,CAAC,CAAC;AAAA,IAC3G;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,OAAO,QAAQ,OAAO,OAAO,WAAW,CAAC,CAAC,EAAE;AAAA,MAC7C,CAAC,CAAC,IAAI,CAAC,MACL,KAAK,EAAE,OAAO,EAAE,WAAW,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,UAAU,QAAQ,CAAC,CAAC,OAAO,EAAE,aAAa,QAAQ,CAAC,CAAC;AAAA,IAC3G;AAAA,IACA;AAAA,EACF;AACA,gBAAc,KAAK,KAAK,QAAQ,YAAY,GAAG,GAAG,KAAK,IAAI,CAAC;AAE5D,SAAO,EAAE,OAAO;AAClB;","names":[]}
|
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.
|
|
5
|
+
"version": "0.40.1",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
package/dist/optimization.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-DeZ_EArp.js';
|
|
2
|
-
export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-
|
|
2
|
+
export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-Dvy-bt7x.js';
|
|
3
3
|
export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-DuZXOk7K.js';
|
|
4
4
|
import './run-record-BGY6bHRh.js';
|
|
5
5
|
import './errors-mje_cKOs.js';
|
|
@@ -7,5 +7,5 @@ import './integrity-DYR5gWlb.js';
|
|
|
7
7
|
import './store-Db2Bv8Cf.js';
|
|
8
8
|
import './emitter-DP_cSSiw.js';
|
|
9
9
|
import './control-runtime-BZ_lVLYW.js';
|
|
10
|
-
import './dataset-
|
|
10
|
+
import './dataset-BlwAtYYf.js';
|
|
11
11
|
import './failure-cluster-Cw65_5FY.js';
|
package/dist/optimization.js
CHANGED
|
@@ -25,14 +25,14 @@ import {
|
|
|
25
25
|
summarizePreferenceMemory,
|
|
26
26
|
trialTraceFromMultiShotTrial,
|
|
27
27
|
withAssignedFeedbackSplit
|
|
28
|
-
} from "./chunk-
|
|
28
|
+
} from "./chunk-DMW5VENN.js";
|
|
29
29
|
import {
|
|
30
30
|
runEvalCampaign
|
|
31
|
-
} from "./chunk-
|
|
31
|
+
} from "./chunk-LCIDRYGP.js";
|
|
32
32
|
import "./chunk-VXNVVBZO.js";
|
|
33
|
-
import "./chunk-
|
|
34
|
-
import "./chunk-
|
|
35
|
-
import "./chunk-
|
|
33
|
+
import "./chunk-BWZEGTES.js";
|
|
34
|
+
import "./chunk-EGIPWXHL.js";
|
|
35
|
+
import "./chunk-WP7SY7AI.js";
|
|
36
36
|
import "./chunk-UBPIXOC4.js";
|
|
37
37
|
import "./chunk-PC4UYEBM.js";
|
|
38
38
|
import "./chunk-TVVP3ZZQ.js";
|
package/dist/pipelines/index.js
CHANGED
|
@@ -2,13 +2,13 @@ import {
|
|
|
2
2
|
compareToBaseline,
|
|
3
3
|
computeToolUseMetrics,
|
|
4
4
|
failureClusterView
|
|
5
|
-
} from "../chunk-
|
|
5
|
+
} from "../chunk-AU2JLNSZ.js";
|
|
6
6
|
import {
|
|
7
7
|
buildTrajectory
|
|
8
8
|
} from "../chunk-RZTMDUO7.js";
|
|
9
9
|
import {
|
|
10
10
|
interRaterReliability
|
|
11
|
-
} from "../chunk-
|
|
11
|
+
} from "../chunk-WP7SY7AI.js";
|
|
12
12
|
import {
|
|
13
13
|
aggregateLlm,
|
|
14
14
|
argHash,
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import { a as DatasetScenario, b as Dataset } from './dataset-BlwAtYYf.js';
|
|
2
|
+
import { T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Red-team battery — adversarial scenario corpus with per-category
|
|
6
|
+
* scorers.
|
|
7
|
+
*
|
|
8
|
+
* Every scenario is a `DatasetScenario` with a `redTeam: {...}` payload
|
|
9
|
+
* that declares: category, expected refusal, attack surface. Scorers
|
|
10
|
+
* inspect the agent's output + trajectory and return a PASS/FAIL per
|
|
11
|
+
* category. Aggregated via `redTeamReport`.
|
|
12
|
+
*
|
|
13
|
+
* The built-in corpus is intentionally conservative — 40 cases
|
|
14
|
+
* covering the most-seen attack patterns. Extend via `extendCorpus`.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
type RedTeamCategory = 'prompt_injection_direct' | 'prompt_injection_indirect' | 'jailbreak_persona' | 'jailbreak_dan' | 'pii_leak' | 'permission_escalation' | 'data_exfil' | 'policy_override';
|
|
18
|
+
interface RedTeamPayload {
|
|
19
|
+
category: RedTeamCategory;
|
|
20
|
+
/** The hostile input the agent sees. */
|
|
21
|
+
input: string;
|
|
22
|
+
/**
|
|
23
|
+
* What a safe agent should do: "refuse" means explicit refusal,
|
|
24
|
+
* "ignore" means process benignly without executing the attack.
|
|
25
|
+
*/
|
|
26
|
+
expected: 'refuse' | 'ignore';
|
|
27
|
+
/** Substring(s) that MUST NOT appear in output. */
|
|
28
|
+
forbiddenStrings?: string[];
|
|
29
|
+
/** Tool names the agent MUST NOT call for this scenario. */
|
|
30
|
+
forbiddenTools?: string[];
|
|
31
|
+
}
|
|
32
|
+
interface RedTeamCase extends DatasetScenario {
|
|
33
|
+
payload: RedTeamPayload;
|
|
34
|
+
}
|
|
35
|
+
interface RedTeamFinding {
|
|
36
|
+
scenarioId: string;
|
|
37
|
+
category: RedTeamCategory;
|
|
38
|
+
passed: boolean;
|
|
39
|
+
reason: string;
|
|
40
|
+
evidence?: string;
|
|
41
|
+
}
|
|
42
|
+
interface RedTeamReport {
|
|
43
|
+
findings: RedTeamFinding[];
|
|
44
|
+
passRateByCategory: Record<RedTeamCategory, number>;
|
|
45
|
+
overallPassRate: number;
|
|
46
|
+
}
|
|
47
|
+
/** Ship a canonical, small corpus. Consumers extend via `extendCorpus`. */
|
|
48
|
+
declare const DEFAULT_RED_TEAM_CORPUS: RedTeamCase[];
|
|
49
|
+
declare function redTeamDataset(extraCases?: RedTeamCase[]): Dataset;
|
|
50
|
+
/**
|
|
51
|
+
* Score a single agent response against the red-team case.
|
|
52
|
+
* `toolCalls` should be the names of tools the agent invoked during the run.
|
|
53
|
+
*/
|
|
54
|
+
declare function scoreRedTeamOutput(output: string, toolCalls: string[], rtCase: RedTeamCase): RedTeamFinding;
|
|
55
|
+
/** Aggregate red-team findings into per-category pass rates. */
|
|
56
|
+
declare function redTeamReport(findings: RedTeamFinding[]): RedTeamReport;
|
|
57
|
+
/**
|
|
58
|
+
* Extract the tool-call names from a corpus run — convenience for the
|
|
59
|
+
* common pipeline (run the scenario → score the run).
|
|
60
|
+
*/
|
|
61
|
+
declare function toolNamesForRun(store: TraceStore, runId: string): Promise<string[]>;
|
|
62
|
+
|
|
63
|
+
export { DEFAULT_RED_TEAM_CORPUS as D, type RedTeamCase as R, type RedTeamCategory as a, type RedTeamFinding as b, type RedTeamPayload as c, type RedTeamReport as d, redTeamReport as e, redTeamDataset as r, scoreRedTeamOutput as s, toolNamesForRun as t };
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
|
|
2
2
|
import { TCloud } from '@tangle-network/tcloud';
|
|
3
|
-
import { D as DatasetSplit,
|
|
3
|
+
import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-BlwAtYYf.js';
|
|
4
4
|
import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-DuZXOk7K.js';
|
|
5
5
|
import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
|
|
6
6
|
|
|
@@ -418,7 +418,10 @@ declare function weightedMean(scores: {
|
|
|
418
418
|
weight?: number;
|
|
419
419
|
}[]): number;
|
|
420
420
|
/** Bootstrap confidence interval */
|
|
421
|
-
declare function confidenceInterval(scores: number[], confidence?: number
|
|
421
|
+
declare function confidenceInterval(scores: number[], confidence?: number, opts?: {
|
|
422
|
+
seed?: number;
|
|
423
|
+
resamples?: number;
|
|
424
|
+
}): {
|
|
422
425
|
mean: number;
|
|
423
426
|
lower: number;
|
|
424
427
|
upper: number;
|
package/dist/reporting.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-ByZEC3BX.js';
|
|
2
|
-
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as releaseTraceEvidenceFromMultiShotTrials, s as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-
|
|
2
|
+
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as releaseTraceEvidenceFromMultiShotTrials, s as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-Di84bXD7.js';
|
|
3
3
|
export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
|
|
4
4
|
export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-DuZXOk7K.js';
|
|
5
5
|
import './run-record-BGY6bHRh.js';
|
|
@@ -7,6 +7,6 @@ import './errors-mje_cKOs.js';
|
|
|
7
7
|
import './outcome-store-D6KWmYvj.js';
|
|
8
8
|
import './judge-calibration-DilmB3Ml.js';
|
|
9
9
|
import '@tangle-network/tcloud';
|
|
10
|
-
import './dataset-
|
|
10
|
+
import './dataset-BlwAtYYf.js';
|
|
11
11
|
import './failure-cluster-Cw65_5FY.js';
|
|
12
12
|
import './store-Db2Bv8Cf.js';
|