@waniwani/sdk 0.6.1-beta.3 → 0.6.1-beta.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/evals/index.d.ts +1 -81
- package/dist/evals/index.js +1 -7
- package/dist/evals/index.js.map +1 -1
- package/dist/evals/scorers.d.ts +92 -0
- package/dist/evals/scorers.js +8 -0
- package/dist/evals/scorers.js.map +1 -0
- package/package.json +6 -1
package/dist/evals/index.d.ts
CHANGED
|
@@ -153,84 +153,4 @@ declare function conversation(url: string, turns: ConversationTurn[]): Promise<C
|
|
|
153
153
|
*/
|
|
154
154
|
declare function replayScenario(url: string, scenario: EvalScenario): Promise<ConversationResult>;
|
|
155
155
|
|
|
156
|
-
|
|
157
|
-
* Create a local Braintrust reporter that writes JSON results to a directory and
|
|
158
|
-
* prints a summary to console.
|
|
159
|
-
*
|
|
160
|
-
* Requires the `braintrust` package: bun add -d braintrust
|
|
161
|
-
*
|
|
162
|
-
* @param outputDir - Directory to write JSON result files (default: "evals/runs")
|
|
163
|
-
*/
|
|
164
|
-
declare function createLocalReporter(outputDir?: string): unknown;
|
|
165
|
-
|
|
166
|
-
/**
|
|
167
|
-
* Parse the JSON-stringified ChatResult from a Braintrust task output.
|
|
168
|
-
*/
|
|
169
|
-
declare function parseTaskOutput(output: unknown): ChatResult;
|
|
170
|
-
/**
|
|
171
|
-
* Checks whether the expected tool was called.
|
|
172
|
-
* Looks for the tool name in `metadata.expectedTool` first (for cases where `expected` is a
|
|
173
|
-
* reference answer), then falls back to `expected` directly.
|
|
174
|
-
*/
|
|
175
|
-
declare function calledExpectedTool({ output, expected, metadata, }: {
|
|
176
|
-
output: unknown;
|
|
177
|
-
expected?: unknown;
|
|
178
|
-
metadata?: Record<string, unknown>;
|
|
179
|
-
}): {
|
|
180
|
-
name: string;
|
|
181
|
-
score: number;
|
|
182
|
-
metadata: {
|
|
183
|
-
expected: string;
|
|
184
|
-
actual: string[];
|
|
185
|
-
};
|
|
186
|
-
};
|
|
187
|
-
/**
|
|
188
|
-
* Checks whether the assistant produced any text output.
|
|
189
|
-
*/
|
|
190
|
-
declare function hasOutput({ output }: {
|
|
191
|
-
output: unknown;
|
|
192
|
-
}): {
|
|
193
|
-
name: string;
|
|
194
|
-
score: number;
|
|
195
|
-
};
|
|
196
|
-
/**
|
|
197
|
-
* Checks specific fields in the first tool call's `stateUpdates` against expected values.
|
|
198
|
-
* Supports nested fields via dot notation (e.g. "mixedBreed.knowsBreeds").
|
|
199
|
-
* Returns partial credit (fraction of matching fields).
|
|
200
|
-
*/
|
|
201
|
-
declare function toolInputFieldsMatch({ output, metadata, }: {
|
|
202
|
-
output: unknown;
|
|
203
|
-
metadata?: Record<string, unknown>;
|
|
204
|
-
}): {
|
|
205
|
-
name: string;
|
|
206
|
-
score: number;
|
|
207
|
-
metadata?: undefined;
|
|
208
|
-
} | {
|
|
209
|
-
name: string;
|
|
210
|
-
score: number;
|
|
211
|
-
metadata: Record<string, {
|
|
212
|
-
expected: unknown;
|
|
213
|
-
actual: unknown;
|
|
214
|
-
match: boolean;
|
|
215
|
-
}>;
|
|
216
|
-
};
|
|
217
|
-
/** ClosedQA — checks if the answer correctly addresses the question given a reference answer. */
|
|
218
|
-
declare const FaqAccuracy: (args: {
|
|
219
|
-
input: unknown;
|
|
220
|
-
output: unknown;
|
|
221
|
-
expected?: unknown;
|
|
222
|
-
}) => Promise<unknown>;
|
|
223
|
-
/** Factuality — checks if the output is factually consistent with the expected output. */
|
|
224
|
-
declare const OutputFactuality: (args: {
|
|
225
|
-
input: unknown;
|
|
226
|
-
output: unknown;
|
|
227
|
-
expected?: unknown;
|
|
228
|
-
}) => Promise<unknown>;
|
|
229
|
-
/** Moderation — flags unsafe or inappropriate content. */
|
|
230
|
-
declare const SafetyCheck: (args: {
|
|
231
|
-
input: unknown;
|
|
232
|
-
output: unknown;
|
|
233
|
-
expected?: unknown;
|
|
234
|
-
}) => Promise<unknown>;
|
|
235
|
-
|
|
236
|
-
export { type ChatResult, type ConversationResult, type ConversationTurn, type ConversationTurnResult, type EvalScenario, type EvalScenarioType, FaqAccuracy, OutputFactuality, SafetyCheck, type Scenario, type SimulationResult, type SimulationTurn, type ToolCallTrace, type TurnAssertion, calledExpectedTool, chat, conversation, createLocalReporter, hasOutput, loadScenarios, parseTaskOutput, replayScenario, saveScenario, toolInputFieldsMatch };
|
|
156
|
+
export { type ChatResult, type ConversationResult, type ConversationTurn, type ConversationTurnResult, type EvalScenario, type EvalScenarioType, type Scenario, type SimulationResult, type SimulationTurn, type ToolCallTrace, type TurnAssertion, chat, conversation, loadScenarios, replayScenario, saveScenario };
|
package/dist/evals/index.js
CHANGED
|
@@ -1,8 +1,2 @@
|
|
|
1
|
-
|
|
2
|
-
\u{1F4CA} ${r} (${o.results.length} cases):
|
|
3
|
-
`);let u=0;for(let c of o.results){let d=c.scores??{},l=d.called_expected_tool===1;l||u++,console.log(` ${l?"\u2705":"\u274C"} ${c.input.slice(0,70)}`);for(let[g,f]of Object.entries(d))console.log(` ${g}: ${f}`)}return console.log(`
|
|
4
|
-
${o.results.length-u}/${o.results.length} passed`),console.log(` \u2192 ${p}
|
|
5
|
-
`),u===0},reportRun(n){let o=n.every(r=>r===!0);return console.log(o?`
|
|
6
|
-
\u2705 All experiments passed`:`
|
|
7
|
-
\u274C Some experiments failed`),o}})}export{D as FaqAccuracy,Q as OutputFactuality,G as SafetyCheck,W as calledExpectedTool,J as chat,_ as conversation,X as createLocalReporter,z as hasOutput,q as loadScenarios,m as parseTaskOutput,L as replayScenario,P as saveScenario,B as toolInputFieldsMatch};
|
|
1
|
+
import{mkdirSync as M,readdirSync as U,readFileSync as I,writeFileSync as R}from"fs";import{join as c}from"path";import{parseJsonEventStream as E,readUIMessageStream as j,uiMessageChunkSchema as b}from"ai";import{z as a}from"zod";var k=a.object({name:a.string(),type:a.enum(["regulatory","functional","tone"]).optional(),mode:a.enum(["synthetic","manual"]).optional(),outcome:a.object({toolsCalled:a.array(a.string())}).optional(),messages:a.array(a.looseObject({id:a.string(),role:a.enum(["user","assistant","system","data"]),parts:a.array(a.record(a.string(),a.unknown()))}))});function h(n){let t=n.parts.filter(e=>e.type==="text").map(e=>e.text).join(""),s=n.parts.filter(e=>e.type.startsWith("tool-")||e.type==="dynamic-tool").map(e=>e),o=s.map(e=>e.toolName),r=s.map(e=>({name:e.toolName,input:e.input??{},output:e.output}));return{output:t,toolsCalled:o,toolCallTraces:r}}function f(n){return n.parts.filter(t=>t.type==="text").map(t=>t.text).join("")}function N(n){return n.parts.filter(t=>t.type==="dynamic-tool"||t.type.startsWith("tool-")).map(t=>t.toolName).filter(Boolean)}async function p(n,t){let s=await fetch(`${n}/api/waniwani`,{method:"POST",headers:{"Content-Type":"application/json"},signal:AbortSignal.timeout(6e4),body:JSON.stringify({messages:t})});if(!s.ok)throw new Error(`Chat returned ${s.status}: ${await s.text()}`);if(!s.body)throw new Error("Chat response has no body");let o=E({stream:s.body,schema:b}).pipeThrough(new TransformStream({transform(e,i){e.success&&i.enqueue(e.value)}})),r;for await(let e of j({stream:o}))r=e;if(!r)throw new Error("No message received from stream");return{result:h(r),message:r}}function P(n,t="evals/scenarios"){let s=c(process.cwd(),t);M(s,{recursive:!0});let o=`${n.name}.json`;return R(c(s,o),JSON.stringify(n,null,2)),o}function A(n="evals/scenarios"){let t=c(process.cwd(),n);return U(t).filter(s=>s.endsWith(".json")).sort().map(s=>{let o=JSON.parse(I(c(t,s),"utf8"));return k.parse(o)})}async function O(n,t){let s={id:crypto.randomUUID(),role:"user",parts:[{type:"text",text:t}]},{result:o}=await p(n,[s]);return o}async function J(n,t){let s=[],o=[];for(let r of t){s.push({id:crypto.randomUUID(),role:"user",parts:[{type:"text",text:r.input}]});let{result:e,message:i}=await p(n,s);s.push(i),o.push({input:r.input,response:e,assertions:[]})}return{turns:o}}async function $(n,t){let s=t.mode??"regenerate",o=[],r=[],e=[];for(let i=0;i<t.messages.length;i++){let l=t.messages[i];if(l.role==="user"){let u=t.messages[i+1];e.push({userMsg:l,assistantMsg:u?.role==="assistant"?u:void 0})}}for(let i=0;i<e.length;i++){let{userMsg:l,assistantMsg:u}=e[i],S=i===e.length-1,m=u?N(u):[];if(o.push(l),s==="inject"&&!S&&u){o.push(u);let d=h(u),w=y(m,d.toolsCalled);r.push({input:f(l),response:d,assertions:w});continue}let{result:g,message:v}=await p(n,o);o.push(v);let C=y(m,g.toolsCalled);r.push({input:f(l),response:g,assertions:C})}return{turns:r}}function y(n,t){if(n.length===0)return[];let s=new Set(t);return[...new Set(n)].map(r=>({passed:s.has(r),expected:[r],actual:t}))}export{O as chat,J as conversation,A as loadScenarios,$ as replayScenario,P as saveScenario};
|
|
8
2
|
//# sourceMappingURL=index.js.map
|
package/dist/evals/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/evals/chat.ts","../../src/evals/reporter.ts","../../src/evals/scorers.ts"],"sourcesContent":["import { mkdirSync, readdirSync, readFileSync, writeFileSync } from \"node:fs\";\nimport { join } from \"node:path\";\nimport {\n\tparseJsonEventStream,\n\treadUIMessageStream,\n\ttype UIMessage,\n\tuiMessageChunkSchema,\n} from \"ai\";\nimport { z } from \"zod\";\nimport type {\n\tChatResult,\n\tConversationResult,\n\tConversationTurn,\n\tConversationTurnResult,\n\tEvalScenario,\n\tToolCallTrace,\n\tTurnAssertion,\n} from \"./types\";\n\n// UIMessage parts are heterogeneous — validate the fields we need, pass extras through\nconst evalScenarioSchema = z.object({\n\tname: z.string(),\n\ttype: z.enum([\"regulatory\", \"functional\", \"tone\"]).optional(),\n\tmode: z.enum([\"synthetic\", \"manual\"]).optional(),\n\toutcome: z.object({ toolsCalled: z.array(z.string()) }).optional(),\n\tmessages: z.array(\n\t\tz.looseObject({\n\t\t\tid: z.string(),\n\t\t\trole: z.enum([\"user\", \"assistant\", \"system\", \"data\"]),\n\t\t\tparts: z.array(z.record(z.string(), z.unknown())),\n\t\t}),\n\t),\n});\n\n// --- Internal helpers ---\n\nfunction parseUIMessage(msg: UIMessage): ChatResult {\n\tconst output = msg.parts\n\t\t.filter((p): p is { type: \"text\"; text: string } => p.type === \"text\")\n\t\t.map((p) => p.text)\n\t\t.join(\"\");\n\n\tconst toolParts = msg.parts\n\t\t.filter((p) => p.type.startsWith(\"tool-\") || p.type === \"dynamic-tool\")\n\t\t.map(\n\t\t\t(p) =>\n\t\t\t\tp as unknown as {\n\t\t\t\t\ttoolName: string;\n\t\t\t\t\tinput?: Record<string, unknown>;\n\t\t\t\t\toutput?: unknown;\n\t\t\t\t},\n\t\t);\n\tconst toolsCalled = toolParts.map((p) => p.toolName);\n\tconst toolCallTraces: ToolCallTrace[] = toolParts.map((p) => ({\n\t\tname: p.toolName,\n\t\tinput: p.input ?? {},\n\t\toutput: p.output,\n\t}));\n\n\treturn { output, toolsCalled, toolCallTraces };\n}\n\nfunction textFromUIMessage(msg: UIMessage): string {\n\treturn msg.parts\n\t\t.filter((p): p is { type: \"text\"; text: string } => p.type === \"text\")\n\t\t.map((p) => p.text)\n\t\t.join(\"\");\n}\n\n/** Extract the tool names called in a recorded assistant UIMessage. */\nfunction extractRecordedTools(msg: UIMessage): string[] {\n\treturn msg.parts\n\t\t.filter((p) => p.type === \"dynamic-tool\" || p.type.startsWith(\"tool-\"))\n\t\t.map((p) => (p as unknown as { toolName: string }).toolName)\n\t\t.filter(Boolean);\n}\n\nasync function sendMessages(\n\turl: string,\n\tmessages: UIMessage[],\n): Promise<{ result: ChatResult; message: UIMessage }> {\n\tconst response = await fetch(`${url}/api/waniwani`, {\n\t\tmethod: \"POST\",\n\t\theaders: { \"Content-Type\": \"application/json\" },\n\t\tsignal: AbortSignal.timeout(60_000),\n\t\tbody: JSON.stringify({ messages }),\n\t});\n\n\tif (!response.ok) {\n\t\tthrow new Error(\n\t\t\t`Chat returned ${response.status}: ${await response.text()}`,\n\t\t);\n\t}\n\n\tif (!response.body) {\n\t\tthrow new Error(\"Chat response has no body\");\n\t}\n\n\tconst chunkStream = parseJsonEventStream({\n\t\tstream: response.body,\n\t\tschema: uiMessageChunkSchema,\n\t}).pipeThrough(\n\t\tnew TransformStream({\n\t\t\ttransform(chunk, controller) {\n\t\t\t\tif (chunk.success) {\n\t\t\t\t\tcontroller.enqueue(chunk.value);\n\t\t\t\t}\n\t\t\t},\n\t\t}),\n\t);\n\n\tlet finalMessage: UIMessage | undefined;\n\tfor await (const msg of readUIMessageStream({ stream: chunkStream })) {\n\t\tfinalMessage = msg;\n\t}\n\n\tif (!finalMessage) {\n\t\tthrow new Error(\"No message received from stream\");\n\t}\n\n\treturn { result: parseUIMessage(finalMessage), message: finalMessage };\n}\n\n// --- Public API ---\n\n/**\n * Load all session replay JSON files from a directory.\n * Drop any exported session JSON there — it just works.\n *\n * @param dir - Path to the sessions directory. Defaults to `evals/sessions`.\n */\n/**\n * Save an eval scenario JSON file to the scenarios directory.\n *\n * @param scenario - The scenario to save.\n * @param dir - Path to the scenarios directory. Defaults to `evals/scenarios`.\n * @returns The filename that was written.\n */\nexport function saveScenario(\n\tscenario: EvalScenario,\n\tdir = \"evals/scenarios\",\n): string {\n\tconst root = join(process.cwd(), dir);\n\tmkdirSync(root, { recursive: true });\n\tconst filename = `${scenario.name}.json`;\n\twriteFileSync(join(root, filename), JSON.stringify(scenario, null, 2));\n\treturn filename;\n}\n\nexport function loadScenarios(dir = \"evals/scenarios\"): EvalScenario[] {\n\tconst root = join(process.cwd(), dir);\n\treturn readdirSync(root)\n\t\t.filter((f) => f.endsWith(\".json\"))\n\t\t.sort()\n\t\t.map((f) => {\n\t\t\tconst raw = JSON.parse(readFileSync(join(root, f), \"utf8\"));\n\t\t\treturn evalScenarioSchema.parse(raw) as unknown as EvalScenario;\n\t\t});\n}\n\n/**\n * Send a single user message to a WaniWani MCP chat endpoint.\n */\nexport async function chat(url: string, message: string): Promise<ChatResult> {\n\tconst userMsg: UIMessage = {\n\t\tid: crypto.randomUUID(),\n\t\trole: \"user\",\n\t\tparts: [{ type: \"text\", text: message }],\n\t};\n\tconst { result } = await sendMessages(url, [userMsg]);\n\treturn result;\n}\n\n/**\n * Run a multi-turn conversation. Returns the result of each turn.\n */\nexport async function conversation(\n\turl: string,\n\tturns: ConversationTurn[],\n): Promise<ConversationResult> {\n\tconst history: UIMessage[] = [];\n\tconst turnResults: ConversationTurnResult[] = [];\n\n\tfor (const turn of turns) {\n\t\thistory.push({\n\t\t\tid: crypto.randomUUID(),\n\t\t\trole: \"user\",\n\t\t\tparts: [{ type: \"text\", text: turn.input }],\n\t\t});\n\n\t\tconst { result, message } = await sendMessages(url, history);\n\t\thistory.push(message);\n\n\t\tturnResults.push({ input: turn.input, response: result, assertions: [] });\n\t}\n\n\treturn { turns: turnResults };\n}\n\n/**\n * Replay a recorded eval scenario (exported from the chatbar debug button).\n * Uses UIMessage[] directly — same format as useChat's messages array.\n *\n * **\"regenerate\" mode** (default):\n * Sends only user messages. The LLM generates fresh responses.\n * Per-turn assertions are auto-derived by comparing actual tool calls\n * to the tool calls recorded in the scenario.\n *\n * **\"inject\" mode**:\n * Injects the recorded conversation as-is, only generates a fresh\n * response for the final user message.\n */\nexport async function replayScenario(\n\turl: string,\n\tscenario: EvalScenario,\n): Promise<ConversationResult> {\n\tconst mode = scenario.mode ?? \"regenerate\";\n\tconst history: UIMessage[] = [];\n\tconst turnResults: ConversationTurnResult[] = [];\n\n\t// Pair user messages with their assistant responses\n\tconst userTurns: { userMsg: UIMessage; assistantMsg?: UIMessage }[] = [];\n\tfor (let i = 0; i < scenario.messages.length; i++) {\n\t\tconst msg = scenario.messages[i];\n\t\tif (msg.role === \"user\") {\n\t\t\tconst next = scenario.messages[i + 1];\n\t\t\tuserTurns.push({\n\t\t\t\tuserMsg: msg,\n\t\t\t\tassistantMsg: next?.role === \"assistant\" ? next : undefined,\n\t\t\t});\n\t\t}\n\t}\n\n\tfor (let turnIdx = 0; turnIdx < userTurns.length; turnIdx++) {\n\t\tconst { userMsg, assistantMsg } = userTurns[turnIdx];\n\t\tconst isLastTurn = turnIdx === userTurns.length - 1;\n\n\t\t// Extract expected tools from the recorded assistant message\n\t\tconst expectedTools = assistantMsg\n\t\t\t? extractRecordedTools(assistantMsg)\n\t\t\t: [];\n\n\t\thistory.push(userMsg);\n\n\t\tif (mode === \"inject\" && !isLastTurn && assistantMsg) {\n\t\t\thistory.push(assistantMsg);\n\t\t\tconst response = parseUIMessage(assistantMsg);\n\t\t\tconst assertions = buildAssertions(expectedTools, response.toolsCalled);\n\t\t\tturnResults.push({\n\t\t\t\tinput: textFromUIMessage(userMsg),\n\t\t\t\tresponse,\n\t\t\t\tassertions,\n\t\t\t});\n\t\t\tcontinue;\n\t\t}\n\n\t\tconst { result, message } = await sendMessages(url, history);\n\t\thistory.push(message);\n\n\t\tconst assertions = buildAssertions(expectedTools, result.toolsCalled);\n\t\tturnResults.push({\n\t\t\tinput: textFromUIMessage(userMsg),\n\t\t\tresponse: result,\n\t\t\tassertions,\n\t\t});\n\t}\n\n\treturn { turns: turnResults };\n}\n\n/** Compare expected vs. actual tool calls and return assertion results. */\nfunction buildAssertions(\n\texpected: string[],\n\tactual: string[],\n): TurnAssertion[] {\n\tif (expected.length === 0) {\n\t\treturn [];\n\t}\n\n\t// Group expected tools and check each against actual calls\n\tconst actualSet = new Set(actual);\n\tconst expectedUnique = [...new Set(expected)];\n\n\treturn expectedUnique.map((tool) => ({\n\t\tpassed: actualSet.has(tool),\n\t\texpected: [tool],\n\t\tactual,\n\t}));\n}\n","import { existsSync, mkdirSync, writeFileSync } from \"node:fs\";\nimport { parseTaskOutput } from \"./scorers\";\n\ntype ReporterFn = (\n\tname: string,\n\thandlers: {\n\t\treportEval(\n\t\t\tevaluator: unknown,\n\t\t\tresult: {\n\t\t\t\tresults: Array<{\n\t\t\t\t\tinput: unknown;\n\t\t\t\t\toutput: unknown;\n\t\t\t\t\tscores?: Record<string, number>;\n\t\t\t\t}>;\n\t\t\t},\n\t\t): boolean;\n\t\treportRun(results: boolean[]): boolean;\n\t},\n) => unknown;\n\n/**\n * Create a local Braintrust reporter that writes JSON results to a directory and\n * prints a summary to console.\n *\n * Requires the `braintrust` package: bun add -d braintrust\n *\n * @param outputDir - Directory to write JSON result files (default: \"evals/runs\")\n */\nexport function createLocalReporter(outputDir = \"evals/runs\") {\n\tlet Reporter: ReporterFn;\n\ttry {\n\t\tReporter = (require(\"braintrust\") as { Reporter: ReporterFn }).Reporter;\n\t} catch {\n\t\tthrow new Error(\n\t\t\t'Local reporter requires the \"braintrust\" package: bun add -d braintrust',\n\t\t);\n\t}\n\n\tif (!existsSync(outputDir)) {\n\t\tmkdirSync(outputDir, { recursive: true });\n\t}\n\n\treturn Reporter(\"local\", {\n\t\treportEval(\n\t\t\tevaluator: unknown,\n\t\t\tresult: {\n\t\t\t\tresults: Array<{\n\t\t\t\t\tinput: unknown;\n\t\t\t\t\toutput: unknown;\n\t\t\t\t\tscores?: Record<string, number>;\n\t\t\t\t}>;\n\t\t\t},\n\t\t) {\n\t\t\tconst name =\n\t\t\t\t(evaluator as { experimentName?: string }).experimentName ?? \"unknown\";\n\t\t\tconst timestamp = new Date().toISOString().replace(/[:.]/g, \"-\");\n\t\t\tconst rows = result.results.map((r) => {\n\t\t\t\tconst scores = r.scores ?? {};\n\t\t\t\tconst parsed = parseTaskOutput(r.output);\n\t\t\t\treturn {\n\t\t\t\t\tinput: r.input,\n\t\t\t\t\toutput: parsed.output,\n\t\t\t\t\ttoolsCalled: parsed.toolsCalled,\n\t\t\t\t\ttoolCallTraces: parsed.toolCallTraces,\n\t\t\t\t\tscores,\n\t\t\t\t};\n\t\t\t});\n\n\t\t\tconst outPath = `${outputDir}/${name}-${timestamp}.json`;\n\t\t\twriteFileSync(outPath, JSON.stringify(rows, null, 2));\n\n\t\t\tconsole.log(`\\n📊 ${name} (${result.results.length} cases):\\n`);\n\n\t\t\tlet failures = 0;\n\t\t\tfor (const r of result.results) {\n\t\t\t\tconst scores = r.scores ?? {};\n\t\t\t\tconst pass = scores.called_expected_tool === 1;\n\t\t\t\tif (!pass) {\n\t\t\t\t\tfailures++;\n\t\t\t\t}\n\n\t\t\t\tconsole.log(\n\t\t\t\t\t` ${pass ? \"✅\" : \"❌\"} ${(r.input as string).slice(0, 70)}`,\n\t\t\t\t);\n\t\t\t\tfor (const [scoreName, value] of Object.entries(scores)) {\n\t\t\t\t\tconsole.log(` ${scoreName}: ${value}`);\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tconsole.log(\n\t\t\t\t`\\n ${result.results.length - failures}/${result.results.length} passed`,\n\t\t\t);\n\t\t\tconsole.log(` → ${outPath}\\n`);\n\t\t\treturn failures === 0;\n\t\t},\n\n\t\treportRun(results: boolean[]) {\n\t\t\tconst allPassed = results.every((r) => r === true);\n\t\t\tconsole.log(\n\t\t\t\tallPassed\n\t\t\t\t\t? \"\\n✅ All experiments passed\"\n\t\t\t\t\t: \"\\n❌ Some experiments failed\",\n\t\t\t);\n\t\t\treturn allPassed;\n\t\t},\n\t});\n}\n","import type { ChatResult } from \"./types\";\n\n/**\n * Parse the JSON-stringified ChatResult from a Braintrust task output.\n */\nexport function parseTaskOutput(output: unknown): ChatResult {\n\ttry {\n\t\treturn JSON.parse(output as string);\n\t} catch {\n\t\treturn { output: \"\", toolsCalled: [], toolCallTraces: [] };\n\t}\n}\n\n/**\n * Checks whether the expected tool was called.\n * Looks for the tool name in `metadata.expectedTool` first (for cases where `expected` is a\n * reference answer), then falls back to `expected` directly.\n */\nexport function calledExpectedTool({\n\toutput,\n\texpected,\n\tmetadata,\n}: {\n\toutput: unknown;\n\texpected?: unknown;\n\tmetadata?: Record<string, unknown>;\n}) {\n\tconst parsed = parseTaskOutput(output);\n\tconst expectedTool =\n\t\t(metadata?.expectedTool as string) ?? (expected as string);\n\tconst found = parsed.toolsCalled.includes(expectedTool);\n\treturn {\n\t\tname: \"called_expected_tool\",\n\t\tscore: found ? 1 : 0,\n\t\tmetadata: { expected: expectedTool, actual: parsed.toolsCalled },\n\t};\n}\n\n/**\n * Checks whether the assistant produced any text output.\n */\nexport function hasOutput({ output }: { output: unknown }) {\n\tconst parsed = parseTaskOutput(output);\n\treturn {\n\t\tname: \"has_output\",\n\t\tscore: parsed.output.length > 0 ? 1 : 0,\n\t};\n}\n\n/**\n * Checks specific fields in the first tool call's `stateUpdates` against expected values.\n * Supports nested fields via dot notation (e.g. \"mixedBreed.knowsBreeds\").\n * Returns partial credit (fraction of matching fields).\n */\nexport function toolInputFieldsMatch({\n\toutput,\n\tmetadata,\n}: {\n\toutput: unknown;\n\tmetadata?: Record<string, unknown>;\n}) {\n\tconst parsed = parseTaskOutput(output);\n\tconst expectedFields = (metadata?.expectedFields ?? {}) as Record<\n\t\tstring,\n\t\tunknown\n\t>;\n\tconst fieldNames = Object.keys(expectedFields);\n\n\tif (fieldNames.length === 0) {\n\t\treturn { name: \"field_extraction\", score: 1 };\n\t}\n\n\tconst trace = parsed.toolCallTraces[0];\n\tconst stateUpdates = (trace?.input?.stateUpdates ?? {}) as Record<\n\t\tstring,\n\t\tunknown\n\t>;\n\n\tlet matches = 0;\n\tconst details: Record<\n\t\tstring,\n\t\t{ expected: unknown; actual: unknown; match: boolean }\n\t> = {};\n\n\tfor (const field of fieldNames) {\n\t\tconst expected = expectedFields[field];\n\t\tlet actual: unknown;\n\n\t\tif (field.includes(\".\")) {\n\t\t\tconst [parent, child] = field.split(\".\");\n\t\t\tactual = (stateUpdates[parent] as Record<string, unknown>)?.[child];\n\t\t} else {\n\t\t\tactual = stateUpdates[field];\n\t\t}\n\n\t\tconst match = JSON.stringify(actual) === JSON.stringify(expected);\n\t\tif (match) {\n\t\t\tmatches++;\n\t\t}\n\t\tdetails[field] = { expected, actual, match };\n\t}\n\n\treturn {\n\t\tname: \"field_extraction\",\n\t\tscore: matches / fieldNames.length,\n\t\tmetadata: details,\n\t};\n}\n\n/**\n * Wraps an autoevals scorer to extract the text output from the JSON-stringified ChatResult.\n * Requires the `autoevals` package: bun add -d autoevals\n */\nfunction wrapAutoeval(\n\tscorer: (args: {\n\t\tinput: unknown;\n\t\toutput: string;\n\t\texpected?: unknown;\n\t}) => unknown,\n) {\n\treturn async (args: {\n\t\tinput: unknown;\n\t\toutput: unknown;\n\t\texpected?: unknown;\n\t}) => {\n\t\tconst parsed = parseTaskOutput(args.output);\n\t\treturn scorer({\n\t\t\tinput: args.input,\n\t\t\toutput: parsed.output,\n\t\t\texpected: args.expected,\n\t\t});\n\t};\n}\n\n// LLM-based scorers — require `autoevals` as a dev dependency.\n// These are dynamically imported so the module loads even if autoevals is not installed.\n// Using LLM scorers without autoevals installed will throw at call time.\n\nasync function getAutoeval(name: string) {\n\tconst mod = await import(\"autoevals\").catch(() => {\n\t\tthrow new Error(\n\t\t\t`LLM scorer \"${name}\" requires the \"autoevals\" package: bun add -d autoevals`,\n\t\t);\n\t});\n\treturn (mod as Record<string, unknown>)[name] as (args: {\n\t\tinput: unknown;\n\t\toutput: string;\n\t\texpected?: unknown;\n\t}) => unknown;\n}\n\n/** ClosedQA — checks if the answer correctly addresses the question given a reference answer. */\nexport const FaqAccuracy = async (args: {\n\tinput: unknown;\n\toutput: unknown;\n\texpected?: unknown;\n}): Promise<unknown> => wrapAutoeval(await getAutoeval(\"ClosedQA\"))(args);\n\n/** Factuality — checks if the output is factually consistent with the expected output. */\nexport const OutputFactuality = async (args: {\n\tinput: unknown;\n\toutput: unknown;\n\texpected?: unknown;\n}): Promise<unknown> => wrapAutoeval(await getAutoeval(\"Factuality\"))(args);\n\n/** Moderation — flags unsafe or inappropriate content. */\nexport const SafetyCheck = async (args: {\n\tinput: unknown;\n\toutput: unknown;\n\texpected?: unknown;\n}): Promise<unknown> => wrapAutoeval(await getAutoeval(\"Moderation\"))(args);\n"],"mappings":"yPAAA,OAAS,aAAAA,EAAW,eAAAC,EAAa,gBAAAC,EAAc,iBAAAC,MAAqB,KACpE,OAAS,QAAAC,MAAY,OACrB,OACC,wBAAAC,EACA,uBAAAC,EAEA,wBAAAC,MACM,KACP,OAAS,KAAAC,MAAS,MAYlB,IAAMC,EAAqBD,EAAE,OAAO,CACnC,KAAMA,EAAE,OAAO,EACf,KAAMA,EAAE,KAAK,CAAC,aAAc,aAAc,MAAM,CAAC,EAAE,SAAS,EAC5D,KAAMA,EAAE,KAAK,CAAC,YAAa,QAAQ,CAAC,EAAE,SAAS,EAC/C,QAASA,EAAE,OAAO,CAAE,YAAaA,EAAE,MAAMA,EAAE,OAAO,CAAC,CAAE,CAAC,EAAE,SAAS,EACjE,SAAUA,EAAE,MACXA,EAAE,YAAY,CACb,GAAIA,EAAE,OAAO,EACb,KAAMA,EAAE,KAAK,CAAC,OAAQ,YAAa,SAAU,MAAM,CAAC,EACpD,MAAOA,EAAE,MAAMA,EAAE,OAAOA,EAAE,OAAO,EAAGA,EAAE,QAAQ,CAAC,CAAC,CACjD,CAAC,CACF,CACD,CAAC,EAID,SAASE,EAAeC,EAA4B,CACnD,IAAMC,EAASD,EAAI,MACjB,OAAQE,GAA2CA,EAAE,OAAS,MAAM,EACpE,IAAKA,GAAMA,EAAE,IAAI,EACjB,KAAK,EAAE,EAEHC,EAAYH,EAAI,MACpB,OAAQE,GAAMA,EAAE,KAAK,WAAW,OAAO,GAAKA,EAAE,OAAS,cAAc,EACrE,IACCA,GACAA,CAKF,EACKE,EAAcD,EAAU,IAAKD,GAAMA,EAAE,QAAQ,EAC7CG,EAAkCF,EAAU,IAAKD,IAAO,CAC7D,KAAMA,EAAE,SACR,MAAOA,EAAE,OAAS,CAAC,EACnB,OAAQA,EAAE,MACX,EAAE,EAEF,MAAO,CAAE,OAAAD,EAAQ,YAAAG,EAAa,eAAAC,CAAe,CAC9C,CAEA,SAASC,EAAkBN,EAAwB,CAClD,OAAOA,EAAI,MACT,OAAQE,GAA2CA,EAAE,OAAS,MAAM,EACpE,IAAKA,GAAMA,EAAE,IAAI,EACjB,KAAK,EAAE,CACV,CAGA,SAASK,EAAqBP,EAA0B,CACvD,OAAOA,EAAI,MACT,OAAQE,GAAMA,EAAE,OAAS,gBAAkBA,EAAE,KAAK,WAAW,OAAO,CAAC,EACrE,IAAKA,GAAOA,EAAsC,QAAQ,EAC1D,OAAO,OAAO,CACjB,CAEA,eAAeM,EACdC,EACAC,EACsD,CACtD,IAAMC,EAAW,MAAM,MAAM,GAAGF,CAAG,gBAAiB,CACnD,OAAQ,OACR,QAAS,CAAE,eAAgB,kBAAmB,EAC9C,OAAQ,YAAY,QAAQ,GAAM,EAClC,KAAM,KAAK,UAAU,CAAE,SAAAC,CAAS,CAAC,CAClC,CAAC,EAED,GAAI,CAACC,EAAS,GACb,MAAM,IAAI,MACT,iBAAiBA,EAAS,MAAM,KAAK,MAAMA,EAAS,KAAK,CAAC,EAC3D,EAGD,GAAI,CAACA,EAAS,KACb,MAAM,IAAI,MAAM,2BAA2B,EAG5C,IAAMC,EAAclB,EAAqB,CACxC,OAAQiB,EAAS,KACjB,OAAQf,CACT,CAAC,EAAE,YACF,IAAI,gBAAgB,CACnB,UAAUiB,EAAOC,EAAY,CACxBD,EAAM,SACTC,EAAW,QAAQD,EAAM,KAAK,CAEhC,CACD,CAAC,CACF,EAEIE,EACJ,cAAiBf,KAAOL,EAAoB,CAAE,OAAQiB,CAAY,CAAC,EAClEG,EAAef,EAGhB,GAAI,CAACe,EACJ,MAAM,IAAI,MAAM,iCAAiC,EAGlD,MAAO,CAAE,OAAQhB,EAAegB,CAAY,EAAG,QAASA,CAAa,CACtE,CAiBO,SAASC,EACfC,EACAC,EAAM,kBACG,CACT,IAAMC,EAAO1B,EAAK,QAAQ,IAAI,EAAGyB,CAAG,EACpC7B,EAAU8B,EAAM,CAAE,UAAW,EAAK,CAAC,EACnC,IAAMC,EAAW,GAAGH,EAAS,IAAI,QACjC,OAAAzB,EAAcC,EAAK0B,EAAMC,CAAQ,EAAG,KAAK,UAAUH,EAAU,KAAM,CAAC,CAAC,EAC9DG,CACR,CAEO,SAASC,EAAcH,EAAM,kBAAmC,CACtE,IAAMC,EAAO1B,EAAK,QAAQ,IAAI,EAAGyB,CAAG,EACpC,OAAO5B,EAAY6B,CAAI,EACrB,OAAQG,GAAMA,EAAE,SAAS,OAAO,CAAC,EACjC,KAAK,EACL,IAAKA,GAAM,CACX,IAAMC,EAAM,KAAK,MAAMhC,EAAaE,EAAK0B,EAAMG,CAAC,EAAG,MAAM,CAAC,EAC1D,OAAOxB,EAAmB,MAAMyB,CAAG,CACpC,CAAC,CACH,CAKA,eAAsBC,EAAKf,EAAagB,EAAsC,CAC7E,IAAMC,EAAqB,CAC1B,GAAI,OAAO,WAAW,EACtB,KAAM,OACN,MAAO,CAAC,CAAE,KAAM,OAAQ,KAAMD,CAAQ,CAAC,CACxC,EACM,CAAE,OAAAE,CAAO,EAAI,MAAMnB,EAAaC,EAAK,CAACiB,CAAO,CAAC,EACpD,OAAOC,CACR,CAKA,eAAsBC,EACrBnB,EACAoB,EAC8B,CAC9B,IAAMC,EAAuB,CAAC,EACxBC,EAAwC,CAAC,EAE/C,QAAWC,KAAQH,EAAO,CACzBC,EAAQ,KAAK,CACZ,GAAI,OAAO,WAAW,EACtB,KAAM,OACN,MAAO,CAAC,CAAE,KAAM,OAAQ,KAAME,EAAK,KAAM,CAAC,CAC3C,CAAC,EAED,GAAM,CAAE,OAAAL,EAAQ,QAAAF,CAAQ,EAAI,MAAMjB,EAAaC,EAAKqB,CAAO,EAC3DA,EAAQ,KAAKL,CAAO,EAEpBM,EAAY,KAAK,CAAE,MAAOC,EAAK,MAAO,SAAUL,EAAQ,WAAY,CAAC,CAAE,CAAC,CACzE,CAEA,MAAO,CAAE,MAAOI,CAAY,CAC7B,CAeA,eAAsBE,EACrBxB,EACAQ,EAC8B,CAC9B,IAAMiB,EAAOjB,EAAS,MAAQ,aACxBa,EAAuB,CAAC,EACxBC,EAAwC,CAAC,EAGzCI,EAAgE,CAAC,EACvE,QAASC,EAAI,EAAGA,EAAInB,EAAS,SAAS,OAAQmB,IAAK,CAClD,IAAMpC,EAAMiB,EAAS,SAASmB,CAAC,EAC/B,GAAIpC,EAAI,OAAS,OAAQ,CACxB,IAAMqC,EAAOpB,EAAS,SAASmB,EAAI,CAAC,EACpCD,EAAU,KAAK,CACd,QAASnC,EACT,aAAcqC,GAAM,OAAS,YAAcA,EAAO,MACnD,CAAC,CACF,CACD,CAEA,QAASC,EAAU,EAAGA,EAAUH,EAAU,OAAQG,IAAW,CAC5D,GAAM,CAAE,QAAAZ,EAAS,aAAAa,CAAa,EAAIJ,EAAUG,CAAO,EAC7CE,EAAaF,IAAYH,EAAU,OAAS,EAG5CM,EAAgBF,EACnBhC,EAAqBgC,CAAY,EACjC,CAAC,EAIJ,GAFAT,EAAQ,KAAKJ,CAAO,EAEhBQ,IAAS,UAAY,CAACM,GAAcD,EAAc,CACrDT,EAAQ,KAAKS,CAAY,EACzB,IAAM5B,EAAWZ,EAAewC,CAAY,EACtCG,EAAaC,EAAgBF,EAAe9B,EAAS,WAAW,EACtEoB,EAAY,KAAK,CAChB,MAAOzB,EAAkBoB,CAAO,EAChC,SAAAf,EACA,WAAA+B,CACD,CAAC,EACD,QACD,CAEA,GAAM,CAAE,OAAAf,EAAQ,QAAAF,CAAQ,EAAI,MAAMjB,EAAaC,EAAKqB,CAAO,EAC3DA,EAAQ,KAAKL,CAAO,EAEpB,IAAMiB,EAAaC,EAAgBF,EAAed,EAAO,WAAW,EACpEI,EAAY,KAAK,CAChB,MAAOzB,EAAkBoB,CAAO,EAChC,SAAUC,EACV,WAAAe,CACD,CAAC,CACF,CAEA,MAAO,CAAE,MAAOX,CAAY,CAC7B,CAGA,SAASY,EACRC,EACAC,EACkB,CAClB,GAAID,EAAS,SAAW,EACvB,MAAO,CAAC,EAIT,IAAME,EAAY,IAAI,IAAID,CAAM,EAGhC,MAFuB,CAAC,GAAG,IAAI,IAAID,CAAQ,CAAC,EAEtB,IAAKG,IAAU,CACpC,OAAQD,EAAU,IAAIC,CAAI,EAC1B,SAAU,CAACA,CAAI,EACf,OAAAF,CACD,EAAE,CACH,CChSA,OAAS,cAAAG,EAAY,aAAAC,EAAW,iBAAAC,MAAqB,KCK9C,SAASC,EAAgBC,EAA6B,CAC5D,GAAI,CACH,OAAO,KAAK,MAAMA,CAAgB,CACnC,MAAQ,CACP,MAAO,CAAE,OAAQ,GAAI,YAAa,CAAC,EAAG,eAAgB,CAAC,CAAE,CAC1D,CACD,CAOO,SAASC,EAAmB,CAClC,OAAAD,EACA,SAAAE,EACA,SAAAC,CACD,EAIG,CACF,IAAMC,EAASL,EAAgBC,CAAM,EAC/BK,EACJF,GAAU,cAA4BD,EAExC,MAAO,CACN,KAAM,uBACN,MAHaE,EAAO,YAAY,SAASC,CAAY,EAGtC,EAAI,EACnB,SAAU,CAAE,SAAUA,EAAc,OAAQD,EAAO,WAAY,CAChE,CACD,CAKO,SAASE,EAAU,CAAE,OAAAN,CAAO,EAAwB,CAE1D,MAAO,CACN,KAAM,aACN,MAHcD,EAAgBC,CAAM,EAGtB,OAAO,OAAS,EAAI,EAAI,CACvC,CACD,CAOO,SAASO,EAAqB,CACpC,OAAAP,EACA,SAAAG,CACD,EAGG,CACF,IAAMC,EAASL,EAAgBC,CAAM,EAC/BQ,EAAkBL,GAAU,gBAAkB,CAAC,EAI/CM,EAAa,OAAO,KAAKD,CAAc,EAE7C,GAAIC,EAAW,SAAW,EACzB,MAAO,CAAE,KAAM,mBAAoB,MAAO,CAAE,EAI7C,IAAMC,EADQN,EAAO,eAAe,CAAC,GACR,OAAO,cAAgB,CAAC,EAKjDO,EAAU,EACRC,EAGF,CAAC,EAEL,QAAWC,KAASJ,EAAY,CAC/B,IAAMP,EAAWM,EAAeK,CAAK,EACjCC,EAEJ,GAAID,EAAM,SAAS,GAAG,EAAG,CACxB,GAAM,CAACE,EAAQC,CAAK,EAAIH,EAAM,MAAM,GAAG,EACvCC,EAAUJ,EAAaK,CAAM,IAAgCC,CAAK,CACnE,MACCF,EAASJ,EAAaG,CAAK,EAG5B,IAAMI,EAAQ,KAAK,UAAUH,CAAM,IAAM,KAAK,UAAUZ,CAAQ,EAC5De,GACHN,IAEDC,EAAQC,CAAK,EAAI,CAAE,SAAAX,EAAU,OAAAY,EAAQ,MAAAG,CAAM,CAC5C,CAEA,MAAO,CACN,KAAM,mBACN,MAAON,EAAUF,EAAW,OAC5B,SAAUG,CACX,CACD,CAMA,SAASM,EACRC,EAKC,CACD,MAAO,OAAOC,GAIR,CACL,IAAMhB,EAASL,EAAgBqB,EAAK,MAAM,EAC1C,OAAOD,EAAO,CACb,MAAOC,EAAK,MACZ,OAAQhB,EAAO,OACf,SAAUgB,EAAK,QAChB,CAAC,CACF,CACD,CAMA,eAAeC,EAAYC,EAAc,CAMxC,OALY,KAAM,QAAO,WAAW,EAAE,MAAM,IAAM,CACjD,MAAM,IAAI,MACT,eAAeA,CAAI,0DACpB,CACD,CAAC,GACuCA,CAAI,CAK7C,CAGO,IAAMC,EAAc,MAAOH,GAIVF,EAAa,MAAMG,EAAY,UAAU,CAAC,EAAED,CAAI,EAG3DI,EAAmB,MAAOJ,GAIfF,EAAa,MAAMG,EAAY,YAAY,CAAC,EAAED,CAAI,EAG7DK,EAAc,MAAOL,GAIVF,EAAa,MAAMG,EAAY,YAAY,CAAC,EAAED,CAAI,ED9InE,SAASM,EAAoBC,EAAY,aAAc,CAC7D,IAAIC,EACJ,GAAI,CACHA,EAAY,EAAQ,YAAY,EAA+B,QAChE,MAAQ,CACP,MAAM,IAAI,MACT,yEACD,CACD,CAEA,OAAKC,EAAWF,CAAS,GACxBG,EAAUH,EAAW,CAAE,UAAW,EAAK,CAAC,EAGlCC,EAAS,QAAS,CACxB,WACCG,EACAC,EAOC,CACD,IAAMC,EACJF,EAA0C,gBAAkB,UACxDG,EAAY,IAAI,KAAK,EAAE,YAAY,EAAE,QAAQ,QAAS,GAAG,EACzDC,EAAOH,EAAO,QAAQ,IAAKI,GAAM,CACtC,IAAMC,EAASD,EAAE,QAAU,CAAC,EACtBE,EAASC,EAAgBH,EAAE,MAAM,EACvC,MAAO,CACN,MAAOA,EAAE,MACT,OAAQE,EAAO,OACf,YAAaA,EAAO,YACpB,eAAgBA,EAAO,eACvB,OAAAD,CACD,CACD,CAAC,EAEKG,EAAU,GAAGb,CAAS,IAAIM,CAAI,IAAIC,CAAS,QACjDO,EAAcD,EAAS,KAAK,UAAUL,EAAM,KAAM,CAAC,CAAC,EAEpD,QAAQ,IAAI;AAAA,YAAQF,CAAI,KAAKD,EAAO,QAAQ,MAAM;AAAA,CAAY,EAE9D,IAAIU,EAAW,EACf,QAAWN,KAAKJ,EAAO,QAAS,CAC/B,IAAMK,EAASD,EAAE,QAAU,CAAC,EACtBO,EAAON,EAAO,uBAAyB,EACxCM,GACJD,IAGD,QAAQ,IACP,KAAKC,EAAO,SAAM,QAAG,IAAKP,EAAE,MAAiB,MAAM,EAAG,EAAE,CAAC,EAC1D,EACA,OAAW,CAACQ,EAAWC,CAAK,IAAK,OAAO,QAAQR,CAAM,EACrD,QAAQ,IAAI,QAAQO,CAAS,KAAKC,CAAK,EAAE,CAE3C,CAEA,eAAQ,IACP;AAAA,IAAOb,EAAO,QAAQ,OAASU,CAAQ,IAAIV,EAAO,QAAQ,MAAM,SACjE,EACA,QAAQ,IAAI,YAAOQ,CAAO;AAAA,CAAI,EACvBE,IAAa,CACrB,EAEA,UAAUI,EAAoB,CAC7B,IAAMC,EAAYD,EAAQ,MAAO,GAAM,IAAM,EAAI,EACjD,eAAQ,IACPC,EACG;AAAA,+BACA;AAAA,+BACJ,EACOA,CACR,CACD,CAAC,CACF","names":["mkdirSync","readdirSync","readFileSync","writeFileSync","join","parseJsonEventStream","readUIMessageStream","uiMessageChunkSchema","z","evalScenarioSchema","parseUIMessage","msg","output","p","toolParts","toolsCalled","toolCallTraces","textFromUIMessage","extractRecordedTools","sendMessages","url","messages","response","chunkStream","chunk","controller","finalMessage","saveScenario","scenario","dir","root","filename","loadScenarios","f","raw","chat","message","userMsg","result","conversation","turns","history","turnResults","turn","replayScenario","mode","userTurns","i","next","turnIdx","assistantMsg","isLastTurn","expectedTools","assertions","buildAssertions","expected","actual","actualSet","tool","existsSync","mkdirSync","writeFileSync","parseTaskOutput","output","calledExpectedTool","expected","metadata","parsed","expectedTool","hasOutput","toolInputFieldsMatch","expectedFields","fieldNames","stateUpdates","matches","details","field","actual","parent","child","match","wrapAutoeval","scorer","args","getAutoeval","name","FaqAccuracy","OutputFactuality","SafetyCheck","createLocalReporter","outputDir","Reporter","existsSync","mkdirSync","evaluator","result","name","timestamp","rows","r","scores","parsed","parseTaskOutput","outPath","writeFileSync","failures","pass","scoreName","value","results","allPassed"]}
|
|
1
|
+
{"version":3,"sources":["../../src/evals/chat.ts"],"sourcesContent":["import { mkdirSync, readdirSync, readFileSync, writeFileSync } from \"node:fs\";\nimport { join } from \"node:path\";\nimport {\n\tparseJsonEventStream,\n\treadUIMessageStream,\n\ttype UIMessage,\n\tuiMessageChunkSchema,\n} from \"ai\";\nimport { z } from \"zod\";\nimport type {\n\tChatResult,\n\tConversationResult,\n\tConversationTurn,\n\tConversationTurnResult,\n\tEvalScenario,\n\tToolCallTrace,\n\tTurnAssertion,\n} from \"./types\";\n\n// UIMessage parts are heterogeneous — validate the fields we need, pass extras through\nconst evalScenarioSchema = z.object({\n\tname: z.string(),\n\ttype: z.enum([\"regulatory\", \"functional\", \"tone\"]).optional(),\n\tmode: z.enum([\"synthetic\", \"manual\"]).optional(),\n\toutcome: z.object({ toolsCalled: z.array(z.string()) }).optional(),\n\tmessages: z.array(\n\t\tz.looseObject({\n\t\t\tid: z.string(),\n\t\t\trole: z.enum([\"user\", \"assistant\", \"system\", \"data\"]),\n\t\t\tparts: z.array(z.record(z.string(), z.unknown())),\n\t\t}),\n\t),\n});\n\n// --- Internal helpers ---\n\nfunction parseUIMessage(msg: UIMessage): ChatResult {\n\tconst output = msg.parts\n\t\t.filter((p): p is { type: \"text\"; text: string } => p.type === \"text\")\n\t\t.map((p) => p.text)\n\t\t.join(\"\");\n\n\tconst toolParts = msg.parts\n\t\t.filter((p) => p.type.startsWith(\"tool-\") || p.type === \"dynamic-tool\")\n\t\t.map(\n\t\t\t(p) =>\n\t\t\t\tp as unknown as {\n\t\t\t\t\ttoolName: string;\n\t\t\t\t\tinput?: Record<string, unknown>;\n\t\t\t\t\toutput?: unknown;\n\t\t\t\t},\n\t\t);\n\tconst toolsCalled = toolParts.map((p) => p.toolName);\n\tconst toolCallTraces: ToolCallTrace[] = toolParts.map((p) => ({\n\t\tname: p.toolName,\n\t\tinput: p.input ?? {},\n\t\toutput: p.output,\n\t}));\n\n\treturn { output, toolsCalled, toolCallTraces };\n}\n\nfunction textFromUIMessage(msg: UIMessage): string {\n\treturn msg.parts\n\t\t.filter((p): p is { type: \"text\"; text: string } => p.type === \"text\")\n\t\t.map((p) => p.text)\n\t\t.join(\"\");\n}\n\n/** Extract the tool names called in a recorded assistant UIMessage. */\nfunction extractRecordedTools(msg: UIMessage): string[] {\n\treturn msg.parts\n\t\t.filter((p) => p.type === \"dynamic-tool\" || p.type.startsWith(\"tool-\"))\n\t\t.map((p) => (p as unknown as { toolName: string }).toolName)\n\t\t.filter(Boolean);\n}\n\nasync function sendMessages(\n\turl: string,\n\tmessages: UIMessage[],\n): Promise<{ result: ChatResult; message: UIMessage }> {\n\tconst response = await fetch(`${url}/api/waniwani`, {\n\t\tmethod: \"POST\",\n\t\theaders: { \"Content-Type\": \"application/json\" },\n\t\tsignal: AbortSignal.timeout(60_000),\n\t\tbody: JSON.stringify({ messages }),\n\t});\n\n\tif (!response.ok) {\n\t\tthrow new Error(\n\t\t\t`Chat returned ${response.status}: ${await response.text()}`,\n\t\t);\n\t}\n\n\tif (!response.body) {\n\t\tthrow new Error(\"Chat response has no body\");\n\t}\n\n\tconst chunkStream = parseJsonEventStream({\n\t\tstream: response.body,\n\t\tschema: uiMessageChunkSchema,\n\t}).pipeThrough(\n\t\tnew TransformStream({\n\t\t\ttransform(chunk, controller) {\n\t\t\t\tif (chunk.success) {\n\t\t\t\t\tcontroller.enqueue(chunk.value);\n\t\t\t\t}\n\t\t\t},\n\t\t}),\n\t);\n\n\tlet finalMessage: UIMessage | undefined;\n\tfor await (const msg of readUIMessageStream({ stream: chunkStream })) {\n\t\tfinalMessage = msg;\n\t}\n\n\tif (!finalMessage) {\n\t\tthrow new Error(\"No message received from stream\");\n\t}\n\n\treturn { result: parseUIMessage(finalMessage), message: finalMessage };\n}\n\n// --- Public API ---\n\n/**\n * Load all session replay JSON files from a directory.\n * Drop any exported session JSON there — it just works.\n *\n * @param dir - Path to the sessions directory. Defaults to `evals/sessions`.\n */\n/**\n * Save an eval scenario JSON file to the scenarios directory.\n *\n * @param scenario - The scenario to save.\n * @param dir - Path to the scenarios directory. Defaults to `evals/scenarios`.\n * @returns The filename that was written.\n */\nexport function saveScenario(\n\tscenario: EvalScenario,\n\tdir = \"evals/scenarios\",\n): string {\n\tconst root = join(process.cwd(), dir);\n\tmkdirSync(root, { recursive: true });\n\tconst filename = `${scenario.name}.json`;\n\twriteFileSync(join(root, filename), JSON.stringify(scenario, null, 2));\n\treturn filename;\n}\n\nexport function loadScenarios(dir = \"evals/scenarios\"): EvalScenario[] {\n\tconst root = join(process.cwd(), dir);\n\treturn readdirSync(root)\n\t\t.filter((f) => f.endsWith(\".json\"))\n\t\t.sort()\n\t\t.map((f) => {\n\t\t\tconst raw = JSON.parse(readFileSync(join(root, f), \"utf8\"));\n\t\t\treturn evalScenarioSchema.parse(raw) as unknown as EvalScenario;\n\t\t});\n}\n\n/**\n * Send a single user message to a WaniWani MCP chat endpoint.\n */\nexport async function chat(url: string, message: string): Promise<ChatResult> {\n\tconst userMsg: UIMessage = {\n\t\tid: crypto.randomUUID(),\n\t\trole: \"user\",\n\t\tparts: [{ type: \"text\", text: message }],\n\t};\n\tconst { result } = await sendMessages(url, [userMsg]);\n\treturn result;\n}\n\n/**\n * Run a multi-turn conversation. Returns the result of each turn.\n */\nexport async function conversation(\n\turl: string,\n\tturns: ConversationTurn[],\n): Promise<ConversationResult> {\n\tconst history: UIMessage[] = [];\n\tconst turnResults: ConversationTurnResult[] = [];\n\n\tfor (const turn of turns) {\n\t\thistory.push({\n\t\t\tid: crypto.randomUUID(),\n\t\t\trole: \"user\",\n\t\t\tparts: [{ type: \"text\", text: turn.input }],\n\t\t});\n\n\t\tconst { result, message } = await sendMessages(url, history);\n\t\thistory.push(message);\n\n\t\tturnResults.push({ input: turn.input, response: result, assertions: [] });\n\t}\n\n\treturn { turns: turnResults };\n}\n\n/**\n * Replay a recorded eval scenario (exported from the chatbar debug button).\n * Uses UIMessage[] directly — same format as useChat's messages array.\n *\n * **\"regenerate\" mode** (default):\n * Sends only user messages. The LLM generates fresh responses.\n * Per-turn assertions are auto-derived by comparing actual tool calls\n * to the tool calls recorded in the scenario.\n *\n * **\"inject\" mode**:\n * Injects the recorded conversation as-is, only generates a fresh\n * response for the final user message.\n */\nexport async function replayScenario(\n\turl: string,\n\tscenario: EvalScenario,\n): Promise<ConversationResult> {\n\tconst mode = scenario.mode ?? \"regenerate\";\n\tconst history: UIMessage[] = [];\n\tconst turnResults: ConversationTurnResult[] = [];\n\n\t// Pair user messages with their assistant responses\n\tconst userTurns: { userMsg: UIMessage; assistantMsg?: UIMessage }[] = [];\n\tfor (let i = 0; i < scenario.messages.length; i++) {\n\t\tconst msg = scenario.messages[i];\n\t\tif (msg.role === \"user\") {\n\t\t\tconst next = scenario.messages[i + 1];\n\t\t\tuserTurns.push({\n\t\t\t\tuserMsg: msg,\n\t\t\t\tassistantMsg: next?.role === \"assistant\" ? next : undefined,\n\t\t\t});\n\t\t}\n\t}\n\n\tfor (let turnIdx = 0; turnIdx < userTurns.length; turnIdx++) {\n\t\tconst { userMsg, assistantMsg } = userTurns[turnIdx];\n\t\tconst isLastTurn = turnIdx === userTurns.length - 1;\n\n\t\t// Extract expected tools from the recorded assistant message\n\t\tconst expectedTools = assistantMsg\n\t\t\t? extractRecordedTools(assistantMsg)\n\t\t\t: [];\n\n\t\thistory.push(userMsg);\n\n\t\tif (mode === \"inject\" && !isLastTurn && assistantMsg) {\n\t\t\thistory.push(assistantMsg);\n\t\t\tconst response = parseUIMessage(assistantMsg);\n\t\t\tconst assertions = buildAssertions(expectedTools, response.toolsCalled);\n\t\t\tturnResults.push({\n\t\t\t\tinput: textFromUIMessage(userMsg),\n\t\t\t\tresponse,\n\t\t\t\tassertions,\n\t\t\t});\n\t\t\tcontinue;\n\t\t}\n\n\t\tconst { result, message } = await sendMessages(url, history);\n\t\thistory.push(message);\n\n\t\tconst assertions = buildAssertions(expectedTools, result.toolsCalled);\n\t\tturnResults.push({\n\t\t\tinput: textFromUIMessage(userMsg),\n\t\t\tresponse: result,\n\t\t\tassertions,\n\t\t});\n\t}\n\n\treturn { turns: turnResults };\n}\n\n/** Compare expected vs. actual tool calls and return assertion results. */\nfunction buildAssertions(\n\texpected: string[],\n\tactual: string[],\n): TurnAssertion[] {\n\tif (expected.length === 0) {\n\t\treturn [];\n\t}\n\n\t// Group expected tools and check each against actual calls\n\tconst actualSet = new Set(actual);\n\tconst expectedUnique = [...new Set(expected)];\n\n\treturn expectedUnique.map((tool) => ({\n\t\tpassed: actualSet.has(tool),\n\t\texpected: [tool],\n\t\tactual,\n\t}));\n}\n"],"mappings":"AAAA,OAAS,aAAAA,EAAW,eAAAC,EAAa,gBAAAC,EAAc,iBAAAC,MAAqB,KACpE,OAAS,QAAAC,MAAY,OACrB,OACC,wBAAAC,EACA,uBAAAC,EAEA,wBAAAC,MACM,KACP,OAAS,KAAAC,MAAS,MAYlB,IAAMC,EAAqBD,EAAE,OAAO,CACnC,KAAMA,EAAE,OAAO,EACf,KAAMA,EAAE,KAAK,CAAC,aAAc,aAAc,MAAM,CAAC,EAAE,SAAS,EAC5D,KAAMA,EAAE,KAAK,CAAC,YAAa,QAAQ,CAAC,EAAE,SAAS,EAC/C,QAASA,EAAE,OAAO,CAAE,YAAaA,EAAE,MAAMA,EAAE,OAAO,CAAC,CAAE,CAAC,EAAE,SAAS,EACjE,SAAUA,EAAE,MACXA,EAAE,YAAY,CACb,GAAIA,EAAE,OAAO,EACb,KAAMA,EAAE,KAAK,CAAC,OAAQ,YAAa,SAAU,MAAM,CAAC,EACpD,MAAOA,EAAE,MAAMA,EAAE,OAAOA,EAAE,OAAO,EAAGA,EAAE,QAAQ,CAAC,CAAC,CACjD,CAAC,CACF,CACD,CAAC,EAID,SAASE,EAAeC,EAA4B,CACnD,IAAMC,EAASD,EAAI,MACjB,OAAQE,GAA2CA,EAAE,OAAS,MAAM,EACpE,IAAKA,GAAMA,EAAE,IAAI,EACjB,KAAK,EAAE,EAEHC,EAAYH,EAAI,MACpB,OAAQE,GAAMA,EAAE,KAAK,WAAW,OAAO,GAAKA,EAAE,OAAS,cAAc,EACrE,IACCA,GACAA,CAKF,EACKE,EAAcD,EAAU,IAAKD,GAAMA,EAAE,QAAQ,EAC7CG,EAAkCF,EAAU,IAAKD,IAAO,CAC7D,KAAMA,EAAE,SACR,MAAOA,EAAE,OAAS,CAAC,EACnB,OAAQA,EAAE,MACX,EAAE,EAEF,MAAO,CAAE,OAAAD,EAAQ,YAAAG,EAAa,eAAAC,CAAe,CAC9C,CAEA,SAASC,EAAkBN,EAAwB,CAClD,OAAOA,EAAI,MACT,OAAQE,GAA2CA,EAAE,OAAS,MAAM,EACpE,IAAKA,GAAMA,EAAE,IAAI,EACjB,KAAK,EAAE,CACV,CAGA,SAASK,EAAqBP,EAA0B,CACvD,OAAOA,EAAI,MACT,OAAQE,GAAMA,EAAE,OAAS,gBAAkBA,EAAE,KAAK,WAAW,OAAO,CAAC,EACrE,IAAKA,GAAOA,EAAsC,QAAQ,EAC1D,OAAO,OAAO,CACjB,CAEA,eAAeM,EACdC,EACAC,EACsD,CACtD,IAAMC,EAAW,MAAM,MAAM,GAAGF,CAAG,gBAAiB,CACnD,OAAQ,OACR,QAAS,CAAE,eAAgB,kBAAmB,EAC9C,OAAQ,YAAY,QAAQ,GAAM,EAClC,KAAM,KAAK,UAAU,CAAE,SAAAC,CAAS,CAAC,CAClC,CAAC,EAED,GAAI,CAACC,EAAS,GACb,MAAM,IAAI,MACT,iBAAiBA,EAAS,MAAM,KAAK,MAAMA,EAAS,KAAK,CAAC,EAC3D,EAGD,GAAI,CAACA,EAAS,KACb,MAAM,IAAI,MAAM,2BAA2B,EAG5C,IAAMC,EAAclB,EAAqB,CACxC,OAAQiB,EAAS,KACjB,OAAQf,CACT,CAAC,EAAE,YACF,IAAI,gBAAgB,CACnB,UAAUiB,EAAOC,EAAY,CACxBD,EAAM,SACTC,EAAW,QAAQD,EAAM,KAAK,CAEhC,CACD,CAAC,CACF,EAEIE,EACJ,cAAiBf,KAAOL,EAAoB,CAAE,OAAQiB,CAAY,CAAC,EAClEG,EAAef,EAGhB,GAAI,CAACe,EACJ,MAAM,IAAI,MAAM,iCAAiC,EAGlD,MAAO,CAAE,OAAQhB,EAAegB,CAAY,EAAG,QAASA,CAAa,CACtE,CAiBO,SAASC,EACfC,EACAC,EAAM,kBACG,CACT,IAAMC,EAAO1B,EAAK,QAAQ,IAAI,EAAGyB,CAAG,EACpC7B,EAAU8B,EAAM,CAAE,UAAW,EAAK,CAAC,EACnC,IAAMC,EAAW,GAAGH,EAAS,IAAI,QACjC,OAAAzB,EAAcC,EAAK0B,EAAMC,CAAQ,EAAG,KAAK,UAAUH,EAAU,KAAM,CAAC,CAAC,EAC9DG,CACR,CAEO,SAASC,EAAcH,EAAM,kBAAmC,CACtE,IAAMC,EAAO1B,EAAK,QAAQ,IAAI,EAAGyB,CAAG,EACpC,OAAO5B,EAAY6B,CAAI,EACrB,OAAQG,GAAMA,EAAE,SAAS,OAAO,CAAC,EACjC,KAAK,EACL,IAAKA,GAAM,CACX,IAAMC,EAAM,KAAK,MAAMhC,EAAaE,EAAK0B,EAAMG,CAAC,EAAG,MAAM,CAAC,EAC1D,OAAOxB,EAAmB,MAAMyB,CAAG,CACpC,CAAC,CACH,CAKA,eAAsBC,EAAKf,EAAagB,EAAsC,CAC7E,IAAMC,EAAqB,CAC1B,GAAI,OAAO,WAAW,EACtB,KAAM,OACN,MAAO,CAAC,CAAE,KAAM,OAAQ,KAAMD,CAAQ,CAAC,CACxC,EACM,CAAE,OAAAE,CAAO,EAAI,MAAMnB,EAAaC,EAAK,CAACiB,CAAO,CAAC,EACpD,OAAOC,CACR,CAKA,eAAsBC,EACrBnB,EACAoB,EAC8B,CAC9B,IAAMC,EAAuB,CAAC,EACxBC,EAAwC,CAAC,EAE/C,QAAWC,KAAQH,EAAO,CACzBC,EAAQ,KAAK,CACZ,GAAI,OAAO,WAAW,EACtB,KAAM,OACN,MAAO,CAAC,CAAE,KAAM,OAAQ,KAAME,EAAK,KAAM,CAAC,CAC3C,CAAC,EAED,GAAM,CAAE,OAAAL,EAAQ,QAAAF,CAAQ,EAAI,MAAMjB,EAAaC,EAAKqB,CAAO,EAC3DA,EAAQ,KAAKL,CAAO,EAEpBM,EAAY,KAAK,CAAE,MAAOC,EAAK,MAAO,SAAUL,EAAQ,WAAY,CAAC,CAAE,CAAC,CACzE,CAEA,MAAO,CAAE,MAAOI,CAAY,CAC7B,CAeA,eAAsBE,EACrBxB,EACAQ,EAC8B,CAC9B,IAAMiB,EAAOjB,EAAS,MAAQ,aACxBa,EAAuB,CAAC,EACxBC,EAAwC,CAAC,EAGzCI,EAAgE,CAAC,EACvE,QAAS,EAAI,EAAG,EAAIlB,EAAS,SAAS,OAAQ,IAAK,CAClD,IAAMjB,EAAMiB,EAAS,SAAS,CAAC,EAC/B,GAAIjB,EAAI,OAAS,OAAQ,CACxB,IAAMoC,EAAOnB,EAAS,SAAS,EAAI,CAAC,EACpCkB,EAAU,KAAK,CACd,QAASnC,EACT,aAAcoC,GAAM,OAAS,YAAcA,EAAO,MACnD,CAAC,CACF,CACD,CAEA,QAASC,EAAU,EAAGA,EAAUF,EAAU,OAAQE,IAAW,CAC5D,GAAM,CAAE,QAAAX,EAAS,aAAAY,CAAa,EAAIH,EAAUE,CAAO,EAC7CE,EAAaF,IAAYF,EAAU,OAAS,EAG5CK,EAAgBF,EACnB/B,EAAqB+B,CAAY,EACjC,CAAC,EAIJ,GAFAR,EAAQ,KAAKJ,CAAO,EAEhBQ,IAAS,UAAY,CAACK,GAAcD,EAAc,CACrDR,EAAQ,KAAKQ,CAAY,EACzB,IAAM3B,EAAWZ,EAAeuC,CAAY,EACtCG,EAAaC,EAAgBF,EAAe7B,EAAS,WAAW,EACtEoB,EAAY,KAAK,CAChB,MAAOzB,EAAkBoB,CAAO,EAChC,SAAAf,EACA,WAAA8B,CACD,CAAC,EACD,QACD,CAEA,GAAM,CAAE,OAAAd,EAAQ,QAAAF,CAAQ,EAAI,MAAMjB,EAAaC,EAAKqB,CAAO,EAC3DA,EAAQ,KAAKL,CAAO,EAEpB,IAAMgB,EAAaC,EAAgBF,EAAeb,EAAO,WAAW,EACpEI,EAAY,KAAK,CAChB,MAAOzB,EAAkBoB,CAAO,EAChC,SAAUC,EACV,WAAAc,CACD,CAAC,CACF,CAEA,MAAO,CAAE,MAAOV,CAAY,CAC7B,CAGA,SAASW,EACRC,EACAC,EACkB,CAClB,GAAID,EAAS,SAAW,EACvB,MAAO,CAAC,EAIT,IAAME,EAAY,IAAI,IAAID,CAAM,EAGhC,MAFuB,CAAC,GAAG,IAAI,IAAID,CAAQ,CAAC,EAEtB,IAAKG,IAAU,CACpC,OAAQD,EAAU,IAAIC,CAAI,EAC1B,SAAU,CAACA,CAAI,EACf,OAAAF,CACD,EAAE,CACH","names":["mkdirSync","readdirSync","readFileSync","writeFileSync","join","parseJsonEventStream","readUIMessageStream","uiMessageChunkSchema","z","evalScenarioSchema","parseUIMessage","msg","output","p","toolParts","toolsCalled","toolCallTraces","textFromUIMessage","extractRecordedTools","sendMessages","url","messages","response","chunkStream","chunk","controller","finalMessage","saveScenario","scenario","dir","root","filename","loadScenarios","f","raw","chat","message","userMsg","result","conversation","turns","history","turnResults","turn","replayScenario","mode","userTurns","next","turnIdx","assistantMsg","isLastTurn","expectedTools","assertions","buildAssertions","expected","actual","actualSet","tool"]}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Create a local Braintrust reporter that writes JSON results to a directory and
|
|
3
|
+
* prints a summary to console.
|
|
4
|
+
*
|
|
5
|
+
* Requires the `braintrust` package: bun add -d braintrust
|
|
6
|
+
*
|
|
7
|
+
* @param outputDir - Directory to write JSON result files (default: "evals/runs")
|
|
8
|
+
*/
|
|
9
|
+
declare function createLocalReporter(outputDir?: string): unknown;
|
|
10
|
+
|
|
11
|
+
interface ToolCallTrace {
|
|
12
|
+
name: string;
|
|
13
|
+
input: Record<string, unknown>;
|
|
14
|
+
output: unknown;
|
|
15
|
+
}
|
|
16
|
+
interface ChatResult {
|
|
17
|
+
output: string;
|
|
18
|
+
toolsCalled: string[];
|
|
19
|
+
toolCallTraces: ToolCallTrace[];
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
/**
|
|
23
|
+
* Parse the JSON-stringified ChatResult from a Braintrust task output.
|
|
24
|
+
*/
|
|
25
|
+
declare function parseTaskOutput(output: unknown): ChatResult;
|
|
26
|
+
/**
|
|
27
|
+
* Checks whether the expected tool was called.
|
|
28
|
+
* Looks for the tool name in `metadata.expectedTool` first (for cases where `expected` is a
|
|
29
|
+
* reference answer), then falls back to `expected` directly.
|
|
30
|
+
*/
|
|
31
|
+
declare function calledExpectedTool({ output, expected, metadata, }: {
|
|
32
|
+
output: unknown;
|
|
33
|
+
expected?: unknown;
|
|
34
|
+
metadata?: Record<string, unknown>;
|
|
35
|
+
}): {
|
|
36
|
+
name: string;
|
|
37
|
+
score: number;
|
|
38
|
+
metadata: {
|
|
39
|
+
expected: string;
|
|
40
|
+
actual: string[];
|
|
41
|
+
};
|
|
42
|
+
};
|
|
43
|
+
/**
|
|
44
|
+
* Checks whether the assistant produced any text output.
|
|
45
|
+
*/
|
|
46
|
+
declare function hasOutput({ output }: {
|
|
47
|
+
output: unknown;
|
|
48
|
+
}): {
|
|
49
|
+
name: string;
|
|
50
|
+
score: number;
|
|
51
|
+
};
|
|
52
|
+
/**
|
|
53
|
+
* Checks specific fields in the first tool call's `stateUpdates` against expected values.
|
|
54
|
+
* Supports nested fields via dot notation (e.g. "mixedBreed.knowsBreeds").
|
|
55
|
+
* Returns partial credit (fraction of matching fields).
|
|
56
|
+
*/
|
|
57
|
+
declare function toolInputFieldsMatch({ output, metadata, }: {
|
|
58
|
+
output: unknown;
|
|
59
|
+
metadata?: Record<string, unknown>;
|
|
60
|
+
}): {
|
|
61
|
+
name: string;
|
|
62
|
+
score: number;
|
|
63
|
+
metadata?: undefined;
|
|
64
|
+
} | {
|
|
65
|
+
name: string;
|
|
66
|
+
score: number;
|
|
67
|
+
metadata: Record<string, {
|
|
68
|
+
expected: unknown;
|
|
69
|
+
actual: unknown;
|
|
70
|
+
match: boolean;
|
|
71
|
+
}>;
|
|
72
|
+
};
|
|
73
|
+
/** ClosedQA — checks if the answer correctly addresses the question given a reference answer. */
|
|
74
|
+
declare const FaqAccuracy: (args: {
|
|
75
|
+
input: unknown;
|
|
76
|
+
output: unknown;
|
|
77
|
+
expected?: unknown;
|
|
78
|
+
}) => Promise<unknown>;
|
|
79
|
+
/** Factuality — checks if the output is factually consistent with the expected output. */
|
|
80
|
+
declare const OutputFactuality: (args: {
|
|
81
|
+
input: unknown;
|
|
82
|
+
output: unknown;
|
|
83
|
+
expected?: unknown;
|
|
84
|
+
}) => Promise<unknown>;
|
|
85
|
+
/** Moderation — flags unsafe or inappropriate content. */
|
|
86
|
+
declare const SafetyCheck: (args: {
|
|
87
|
+
input: unknown;
|
|
88
|
+
output: unknown;
|
|
89
|
+
expected?: unknown;
|
|
90
|
+
}) => Promise<unknown>;
|
|
91
|
+
|
|
92
|
+
export { FaqAccuracy, OutputFactuality, SafetyCheck, calledExpectedTool, createLocalReporter, hasOutput, parseTaskOutput, toolInputFieldsMatch };
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
var y=(t=>typeof require<"u"?require:typeof Proxy<"u"?new Proxy(t,{get:(n,r)=>(typeof require<"u"?require:n)[r]}):t)(function(t){if(typeof require<"u")return require.apply(this,arguments);throw Error('Dynamic require of "'+t+'" is not supported')});import{existsSync as S,mkdirSync as v,writeFileSync as T}from"fs";function c(t){try{return JSON.parse(t)}catch{return{output:"",toolsCalled:[],toolCallTraces:[]}}}function R({output:t,expected:n,metadata:r}){let e=c(t),s=r?.expectedTool??n;return{name:"called_expected_tool",score:e.toolsCalled.includes(s)?1:0,metadata:{expected:s,actual:e.toolsCalled}}}function C({output:t}){return{name:"has_output",score:c(t).output.length>0?1:0}}function O({output:t,metadata:n}){let r=c(t),e=n?.expectedFields??{},s=Object.keys(e);if(s.length===0)return{name:"field_extraction",score:1};let i=r.toolCallTraces[0]?.input?.stateUpdates??{},p=0,l={};for(let o of s){let a=e[o],u;if(o.includes(".")){let[k,h]=o.split(".");u=i[k]?.[h]}else u=i[o];let d=JSON.stringify(u)===JSON.stringify(a);d&&p++,l[o]={expected:a,actual:u,match:d}}return{name:"field_extraction",score:p/s.length,metadata:l}}function m(t){return async n=>{let r=c(n.output);return t({input:n.input,output:r.output,expected:n.expected})}}async function g(t){return(await import("autoevals").catch(()=>{throw new Error(`LLM scorer "${t}" requires the "autoevals" package: bun add -d autoevals`)}))[t]}var b=async t=>m(await g("ClosedQA"))(t),F=async t=>m(await g("Factuality"))(t),$=async t=>m(await g("Moderation"))(t);function A(t="evals/runs"){let n;try{n=y("braintrust").Reporter}catch{throw new Error('Local reporter requires the "braintrust" package: bun add -d braintrust')}return S(t)||v(t,{recursive:!0}),n("local",{reportEval(r,e){let s=r.experimentName??"unknown",w=new Date().toISOString().replace(/[:.]/g,"-"),i=e.results.map(o=>{let a=o.scores??{},u=c(o.output);return{input:o.input,output:u.output,toolsCalled:u.toolsCalled,toolCallTraces:u.toolCallTraces,scores:a}}),p=`${t}/${s}-${w}.json`;T(p,JSON.stringify(i,null,2)),console.log(`
|
|
2
|
+
\u{1F4CA} ${s} (${e.results.length} cases):
|
|
3
|
+
`);let l=0;for(let o of e.results){let a=o.scores??{},u=a.called_expected_tool===1;u||l++,console.log(` ${u?"\u2705":"\u274C"} ${o.input.slice(0,70)}`);for(let[d,k]of Object.entries(a))console.log(` ${d}: ${k}`)}return console.log(`
|
|
4
|
+
${e.results.length-l}/${e.results.length} passed`),console.log(` \u2192 ${p}
|
|
5
|
+
`),l===0},reportRun(r){let e=r.every(s=>s===!0);return console.log(e?`
|
|
6
|
+
\u2705 All experiments passed`:`
|
|
7
|
+
\u274C Some experiments failed`),e}})}export{b as FaqAccuracy,F as OutputFactuality,$ as SafetyCheck,R as calledExpectedTool,A as createLocalReporter,C as hasOutput,c as parseTaskOutput,O as toolInputFieldsMatch};
|
|
8
|
+
//# sourceMappingURL=scorers.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/evals/reporter.ts","../../src/evals/scorers.ts"],"sourcesContent":["import { existsSync, mkdirSync, writeFileSync } from \"node:fs\";\nimport { parseTaskOutput } from \"./scorers\";\n\ntype ReporterFn = (\n\tname: string,\n\thandlers: {\n\t\treportEval(\n\t\t\tevaluator: unknown,\n\t\t\tresult: {\n\t\t\t\tresults: Array<{\n\t\t\t\t\tinput: unknown;\n\t\t\t\t\toutput: unknown;\n\t\t\t\t\tscores?: Record<string, number>;\n\t\t\t\t}>;\n\t\t\t},\n\t\t): boolean;\n\t\treportRun(results: boolean[]): boolean;\n\t},\n) => unknown;\n\n/**\n * Create a local Braintrust reporter that writes JSON results to a directory and\n * prints a summary to console.\n *\n * Requires the `braintrust` package: bun add -d braintrust\n *\n * @param outputDir - Directory to write JSON result files (default: \"evals/runs\")\n */\nexport function createLocalReporter(outputDir = \"evals/runs\") {\n\tlet Reporter: ReporterFn;\n\ttry {\n\t\tReporter = (require(\"braintrust\") as { Reporter: ReporterFn }).Reporter;\n\t} catch {\n\t\tthrow new Error(\n\t\t\t'Local reporter requires the \"braintrust\" package: bun add -d braintrust',\n\t\t);\n\t}\n\n\tif (!existsSync(outputDir)) {\n\t\tmkdirSync(outputDir, { recursive: true });\n\t}\n\n\treturn Reporter(\"local\", {\n\t\treportEval(\n\t\t\tevaluator: unknown,\n\t\t\tresult: {\n\t\t\t\tresults: Array<{\n\t\t\t\t\tinput: unknown;\n\t\t\t\t\toutput: unknown;\n\t\t\t\t\tscores?: Record<string, number>;\n\t\t\t\t}>;\n\t\t\t},\n\t\t) {\n\t\t\tconst name =\n\t\t\t\t(evaluator as { experimentName?: string }).experimentName ?? \"unknown\";\n\t\t\tconst timestamp = new Date().toISOString().replace(/[:.]/g, \"-\");\n\t\t\tconst rows = result.results.map((r) => {\n\t\t\t\tconst scores = r.scores ?? {};\n\t\t\t\tconst parsed = parseTaskOutput(r.output);\n\t\t\t\treturn {\n\t\t\t\t\tinput: r.input,\n\t\t\t\t\toutput: parsed.output,\n\t\t\t\t\ttoolsCalled: parsed.toolsCalled,\n\t\t\t\t\ttoolCallTraces: parsed.toolCallTraces,\n\t\t\t\t\tscores,\n\t\t\t\t};\n\t\t\t});\n\n\t\t\tconst outPath = `${outputDir}/${name}-${timestamp}.json`;\n\t\t\twriteFileSync(outPath, JSON.stringify(rows, null, 2));\n\n\t\t\tconsole.log(`\\n📊 ${name} (${result.results.length} cases):\\n`);\n\n\t\t\tlet failures = 0;\n\t\t\tfor (const r of result.results) {\n\t\t\t\tconst scores = r.scores ?? {};\n\t\t\t\tconst pass = scores.called_expected_tool === 1;\n\t\t\t\tif (!pass) {\n\t\t\t\t\tfailures++;\n\t\t\t\t}\n\n\t\t\t\tconsole.log(\n\t\t\t\t\t` ${pass ? \"✅\" : \"❌\"} ${(r.input as string).slice(0, 70)}`,\n\t\t\t\t);\n\t\t\t\tfor (const [scoreName, value] of Object.entries(scores)) {\n\t\t\t\t\tconsole.log(` ${scoreName}: ${value}`);\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tconsole.log(\n\t\t\t\t`\\n ${result.results.length - failures}/${result.results.length} passed`,\n\t\t\t);\n\t\t\tconsole.log(` → ${outPath}\\n`);\n\t\t\treturn failures === 0;\n\t\t},\n\n\t\treportRun(results: boolean[]) {\n\t\t\tconst allPassed = results.every((r) => r === true);\n\t\t\tconsole.log(\n\t\t\t\tallPassed\n\t\t\t\t\t? \"\\n✅ All experiments passed\"\n\t\t\t\t\t: \"\\n❌ Some experiments failed\",\n\t\t\t);\n\t\t\treturn allPassed;\n\t\t},\n\t});\n}\n","import type { ChatResult } from \"./types\";\n\n/**\n * Parse the JSON-stringified ChatResult from a Braintrust task output.\n */\nexport function parseTaskOutput(output: unknown): ChatResult {\n\ttry {\n\t\treturn JSON.parse(output as string);\n\t} catch {\n\t\treturn { output: \"\", toolsCalled: [], toolCallTraces: [] };\n\t}\n}\n\n/**\n * Checks whether the expected tool was called.\n * Looks for the tool name in `metadata.expectedTool` first (for cases where `expected` is a\n * reference answer), then falls back to `expected` directly.\n */\nexport function calledExpectedTool({\n\toutput,\n\texpected,\n\tmetadata,\n}: {\n\toutput: unknown;\n\texpected?: unknown;\n\tmetadata?: Record<string, unknown>;\n}) {\n\tconst parsed = parseTaskOutput(output);\n\tconst expectedTool =\n\t\t(metadata?.expectedTool as string) ?? (expected as string);\n\tconst found = parsed.toolsCalled.includes(expectedTool);\n\treturn {\n\t\tname: \"called_expected_tool\",\n\t\tscore: found ? 1 : 0,\n\t\tmetadata: { expected: expectedTool, actual: parsed.toolsCalled },\n\t};\n}\n\n/**\n * Checks whether the assistant produced any text output.\n */\nexport function hasOutput({ output }: { output: unknown }) {\n\tconst parsed = parseTaskOutput(output);\n\treturn {\n\t\tname: \"has_output\",\n\t\tscore: parsed.output.length > 0 ? 1 : 0,\n\t};\n}\n\n/**\n * Checks specific fields in the first tool call's `stateUpdates` against expected values.\n * Supports nested fields via dot notation (e.g. \"mixedBreed.knowsBreeds\").\n * Returns partial credit (fraction of matching fields).\n */\nexport function toolInputFieldsMatch({\n\toutput,\n\tmetadata,\n}: {\n\toutput: unknown;\n\tmetadata?: Record<string, unknown>;\n}) {\n\tconst parsed = parseTaskOutput(output);\n\tconst expectedFields = (metadata?.expectedFields ?? {}) as Record<\n\t\tstring,\n\t\tunknown\n\t>;\n\tconst fieldNames = Object.keys(expectedFields);\n\n\tif (fieldNames.length === 0) {\n\t\treturn { name: \"field_extraction\", score: 1 };\n\t}\n\n\tconst trace = parsed.toolCallTraces[0];\n\tconst stateUpdates = (trace?.input?.stateUpdates ?? {}) as Record<\n\t\tstring,\n\t\tunknown\n\t>;\n\n\tlet matches = 0;\n\tconst details: Record<\n\t\tstring,\n\t\t{ expected: unknown; actual: unknown; match: boolean }\n\t> = {};\n\n\tfor (const field of fieldNames) {\n\t\tconst expected = expectedFields[field];\n\t\tlet actual: unknown;\n\n\t\tif (field.includes(\".\")) {\n\t\t\tconst [parent, child] = field.split(\".\");\n\t\t\tactual = (stateUpdates[parent] as Record<string, unknown>)?.[child];\n\t\t} else {\n\t\t\tactual = stateUpdates[field];\n\t\t}\n\n\t\tconst match = JSON.stringify(actual) === JSON.stringify(expected);\n\t\tif (match) {\n\t\t\tmatches++;\n\t\t}\n\t\tdetails[field] = { expected, actual, match };\n\t}\n\n\treturn {\n\t\tname: \"field_extraction\",\n\t\tscore: matches / fieldNames.length,\n\t\tmetadata: details,\n\t};\n}\n\n/**\n * Wraps an autoevals scorer to extract the text output from the JSON-stringified ChatResult.\n * Requires the `autoevals` package: bun add -d autoevals\n */\nfunction wrapAutoeval(\n\tscorer: (args: {\n\t\tinput: unknown;\n\t\toutput: string;\n\t\texpected?: unknown;\n\t}) => unknown,\n) {\n\treturn async (args: {\n\t\tinput: unknown;\n\t\toutput: unknown;\n\t\texpected?: unknown;\n\t}) => {\n\t\tconst parsed = parseTaskOutput(args.output);\n\t\treturn scorer({\n\t\t\tinput: args.input,\n\t\t\toutput: parsed.output,\n\t\t\texpected: args.expected,\n\t\t});\n\t};\n}\n\n// LLM-based scorers — require `autoevals` as a dev dependency.\n// These are dynamically imported so the module loads even if autoevals is not installed.\n// Using LLM scorers without autoevals installed will throw at call time.\n\nasync function getAutoeval(name: string) {\n\tconst mod = await import(\"autoevals\").catch(() => {\n\t\tthrow new Error(\n\t\t\t`LLM scorer \"${name}\" requires the \"autoevals\" package: bun add -d autoevals`,\n\t\t);\n\t});\n\treturn (mod as Record<string, unknown>)[name] as (args: {\n\t\tinput: unknown;\n\t\toutput: string;\n\t\texpected?: unknown;\n\t}) => unknown;\n}\n\n/** ClosedQA — checks if the answer correctly addresses the question given a reference answer. */\nexport const FaqAccuracy = async (args: {\n\tinput: unknown;\n\toutput: unknown;\n\texpected?: unknown;\n}): Promise<unknown> => wrapAutoeval(await getAutoeval(\"ClosedQA\"))(args);\n\n/** Factuality — checks if the output is factually consistent with the expected output. */\nexport const OutputFactuality = async (args: {\n\tinput: unknown;\n\toutput: unknown;\n\texpected?: unknown;\n}): Promise<unknown> => wrapAutoeval(await getAutoeval(\"Factuality\"))(args);\n\n/** Moderation — flags unsafe or inappropriate content. */\nexport const SafetyCheck = async (args: {\n\tinput: unknown;\n\toutput: unknown;\n\texpected?: unknown;\n}): Promise<unknown> => wrapAutoeval(await getAutoeval(\"Moderation\"))(args);\n"],"mappings":"yPAAA,OAAS,cAAAA,EAAY,aAAAC,EAAW,iBAAAC,MAAqB,KCK9C,SAASC,EAAgBC,EAA6B,CAC5D,GAAI,CACH,OAAO,KAAK,MAAMA,CAAgB,CACnC,MAAQ,CACP,MAAO,CAAE,OAAQ,GAAI,YAAa,CAAC,EAAG,eAAgB,CAAC,CAAE,CAC1D,CACD,CAOO,SAASC,EAAmB,CAClC,OAAAD,EACA,SAAAE,EACA,SAAAC,CACD,EAIG,CACF,IAAMC,EAASL,EAAgBC,CAAM,EAC/BK,EACJF,GAAU,cAA4BD,EAExC,MAAO,CACN,KAAM,uBACN,MAHaE,EAAO,YAAY,SAASC,CAAY,EAGtC,EAAI,EACnB,SAAU,CAAE,SAAUA,EAAc,OAAQD,EAAO,WAAY,CAChE,CACD,CAKO,SAASE,EAAU,CAAE,OAAAN,CAAO,EAAwB,CAE1D,MAAO,CACN,KAAM,aACN,MAHcD,EAAgBC,CAAM,EAGtB,OAAO,OAAS,EAAI,EAAI,CACvC,CACD,CAOO,SAASO,EAAqB,CACpC,OAAAP,EACA,SAAAG,CACD,EAGG,CACF,IAAMC,EAASL,EAAgBC,CAAM,EAC/BQ,EAAkBL,GAAU,gBAAkB,CAAC,EAI/CM,EAAa,OAAO,KAAKD,CAAc,EAE7C,GAAIC,EAAW,SAAW,EACzB,MAAO,CAAE,KAAM,mBAAoB,MAAO,CAAE,EAI7C,IAAMC,EADQN,EAAO,eAAe,CAAC,GACR,OAAO,cAAgB,CAAC,EAKjDO,EAAU,EACRC,EAGF,CAAC,EAEL,QAAWC,KAASJ,EAAY,CAC/B,IAAMP,EAAWM,EAAeK,CAAK,EACjCC,EAEJ,GAAID,EAAM,SAAS,GAAG,EAAG,CACxB,GAAM,CAACE,EAAQC,CAAK,EAAIH,EAAM,MAAM,GAAG,EACvCC,EAAUJ,EAAaK,CAAM,IAAgCC,CAAK,CACnE,MACCF,EAASJ,EAAaG,CAAK,EAG5B,IAAMI,EAAQ,KAAK,UAAUH,CAAM,IAAM,KAAK,UAAUZ,CAAQ,EAC5De,GACHN,IAEDC,EAAQC,CAAK,EAAI,CAAE,SAAAX,EAAU,OAAAY,EAAQ,MAAAG,CAAM,CAC5C,CAEA,MAAO,CACN,KAAM,mBACN,MAAON,EAAUF,EAAW,OAC5B,SAAUG,CACX,CACD,CAMA,SAASM,EACRC,EAKC,CACD,MAAO,OAAOC,GAIR,CACL,IAAMhB,EAASL,EAAgBqB,EAAK,MAAM,EAC1C,OAAOD,EAAO,CACb,MAAOC,EAAK,MACZ,OAAQhB,EAAO,OACf,SAAUgB,EAAK,QAChB,CAAC,CACF,CACD,CAMA,eAAeC,EAAYC,EAAc,CAMxC,OALY,KAAM,QAAO,WAAW,EAAE,MAAM,IAAM,CACjD,MAAM,IAAI,MACT,eAAeA,CAAI,0DACpB,CACD,CAAC,GACuCA,CAAI,CAK7C,CAGO,IAAMC,EAAc,MAAOH,GAIVF,EAAa,MAAMG,EAAY,UAAU,CAAC,EAAED,CAAI,EAG3DI,EAAmB,MAAOJ,GAIfF,EAAa,MAAMG,EAAY,YAAY,CAAC,EAAED,CAAI,EAG7DK,EAAc,MAAOL,GAIVF,EAAa,MAAMG,EAAY,YAAY,CAAC,EAAED,CAAI,ED9InE,SAASM,EAAoBC,EAAY,aAAc,CAC7D,IAAIC,EACJ,GAAI,CACHA,EAAY,EAAQ,YAAY,EAA+B,QAChE,MAAQ,CACP,MAAM,IAAI,MACT,yEACD,CACD,CAEA,OAAKC,EAAWF,CAAS,GACxBG,EAAUH,EAAW,CAAE,UAAW,EAAK,CAAC,EAGlCC,EAAS,QAAS,CACxB,WACCG,EACAC,EAOC,CACD,IAAMC,EACJF,EAA0C,gBAAkB,UACxDG,EAAY,IAAI,KAAK,EAAE,YAAY,EAAE,QAAQ,QAAS,GAAG,EACzDC,EAAOH,EAAO,QAAQ,IAAKI,GAAM,CACtC,IAAMC,EAASD,EAAE,QAAU,CAAC,EACtBE,EAASC,EAAgBH,EAAE,MAAM,EACvC,MAAO,CACN,MAAOA,EAAE,MACT,OAAQE,EAAO,OACf,YAAaA,EAAO,YACpB,eAAgBA,EAAO,eACvB,OAAAD,CACD,CACD,CAAC,EAEKG,EAAU,GAAGb,CAAS,IAAIM,CAAI,IAAIC,CAAS,QACjDO,EAAcD,EAAS,KAAK,UAAUL,EAAM,KAAM,CAAC,CAAC,EAEpD,QAAQ,IAAI;AAAA,YAAQF,CAAI,KAAKD,EAAO,QAAQ,MAAM;AAAA,CAAY,EAE9D,IAAIU,EAAW,EACf,QAAWN,KAAKJ,EAAO,QAAS,CAC/B,IAAMK,EAASD,EAAE,QAAU,CAAC,EACtBO,EAAON,EAAO,uBAAyB,EACxCM,GACJD,IAGD,QAAQ,IACP,KAAKC,EAAO,SAAM,QAAG,IAAKP,EAAE,MAAiB,MAAM,EAAG,EAAE,CAAC,EAC1D,EACA,OAAW,CAACQ,EAAWC,CAAK,IAAK,OAAO,QAAQR,CAAM,EACrD,QAAQ,IAAI,QAAQO,CAAS,KAAKC,CAAK,EAAE,CAE3C,CAEA,eAAQ,IACP;AAAA,IAAOb,EAAO,QAAQ,OAASU,CAAQ,IAAIV,EAAO,QAAQ,MAAM,SACjE,EACA,QAAQ,IAAI,YAAOQ,CAAO;AAAA,CAAI,EACvBE,IAAa,CACrB,EAEA,UAAUI,EAAoB,CAC7B,IAAMC,EAAYD,EAAQ,MAAOV,GAAMA,IAAM,EAAI,EACjD,eAAQ,IACPW,EACG;AAAA,+BACA;AAAA,+BACJ,EACOA,CACR,CACD,CAAC,CACF","names":["existsSync","mkdirSync","writeFileSync","parseTaskOutput","output","calledExpectedTool","expected","metadata","parsed","expectedTool","hasOutput","toolInputFieldsMatch","expectedFields","fieldNames","stateUpdates","matches","details","field","actual","parent","child","match","wrapAutoeval","scorer","args","getAutoeval","name","FaqAccuracy","OutputFactuality","SafetyCheck","createLocalReporter","outputDir","Reporter","existsSync","mkdirSync","evaluator","result","name","timestamp","rows","r","scores","parsed","parseTaskOutput","outPath","writeFileSync","failures","pass","scoreName","value","results","allPassed"]}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@waniwani/sdk",
|
|
3
|
-
"version": "0.6.1-beta.
|
|
3
|
+
"version": "0.6.1-beta.4",
|
|
4
4
|
"description": "WaniWani SDK - MCP event tracking, widget framework, and tools",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"exports": {
|
|
@@ -44,6 +44,11 @@
|
|
|
44
44
|
"types": "./dist/evals/index.d.ts",
|
|
45
45
|
"import": "./dist/evals/index.js",
|
|
46
46
|
"default": "./dist/evals/index.js"
|
|
47
|
+
},
|
|
48
|
+
"./evals/scorers": {
|
|
49
|
+
"types": "./dist/evals/scorers.d.ts",
|
|
50
|
+
"import": "./dist/evals/scorers.js",
|
|
51
|
+
"default": "./dist/evals/scorers.js"
|
|
47
52
|
}
|
|
48
53
|
},
|
|
49
54
|
"files": [
|