@waniwani/sdk 0.4.9-beta.1 → 0.4.9-beta.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/evals/index.d.ts +8 -1
- package/dist/evals/index.js +6 -6
- package/dist/evals/index.js.map +1 -1
- package/package.json +1 -1
package/dist/evals/index.d.ts
CHANGED
|
@@ -60,6 +60,13 @@ interface SessionReplay {
|
|
|
60
60
|
};
|
|
61
61
|
}
|
|
62
62
|
|
|
63
|
+
/**
|
|
64
|
+
* Load all session replay JSON files from a directory.
|
|
65
|
+
* Drop any exported session JSON there — it just works.
|
|
66
|
+
*
|
|
67
|
+
* @param dir - Path to the sessions directory. Defaults to `evals/sessions`.
|
|
68
|
+
*/
|
|
69
|
+
declare function loadSessions(dir?: string): SessionReplay[];
|
|
63
70
|
/**
|
|
64
71
|
* Send a single user message to a WaniWani MCP chat endpoint.
|
|
65
72
|
*/
|
|
@@ -163,4 +170,4 @@ declare const SafetyCheck: (args: {
|
|
|
163
170
|
expected?: unknown;
|
|
164
171
|
}) => Promise<unknown>;
|
|
165
172
|
|
|
166
|
-
export { type ChatResult, type ConversationResult, type ConversationTurn, type ConversationTurnResult, FaqAccuracy, OutputFactuality, SafetyCheck, type SessionReplay, type ToolCallTrace, type TurnAssertion, calledExpectedTool, chat, conversation, createLocalReporter, hasOutput, parseTaskOutput, replaySession, toolInputFieldsMatch };
|
|
173
|
+
export { type ChatResult, type ConversationResult, type ConversationTurn, type ConversationTurnResult, FaqAccuracy, OutputFactuality, SafetyCheck, type SessionReplay, type ToolCallTrace, type TurnAssertion, calledExpectedTool, chat, conversation, createLocalReporter, hasOutput, loadSessions, parseTaskOutput, replaySession, toolInputFieldsMatch };
|
package/dist/evals/index.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
var
|
|
2
|
-
\u{1F4CA} ${r} (${
|
|
3
|
-
`);let u=0;for(let
|
|
4
|
-
${
|
|
5
|
-
`),u===0},reportRun(n){let
|
|
1
|
+
var U=(t=>typeof require<"u"?require:typeof Proxy<"u"?new Proxy(t,{get:(e,n)=>(typeof require<"u"?require:e)[n]}):t)(function(t){if(typeof require<"u")return require.apply(this,arguments);throw Error('Dynamic require of "'+t+'" is not supported')});import{readdirSync as b,readFileSync as I}from"fs";import{join as C}from"path";import{parseJsonEventStream as O,readUIMessageStream as N,uiMessageChunkSchema as $}from"ai";import{z as l}from"zod";var F=l.object({name:l.string(),mode:l.enum(["regenerate","inject"]).optional(),outcome:l.object({toolsCalled:l.array(l.string())}).optional(),messages:l.array(l.looseObject({id:l.string(),role:l.enum(["user","assistant","system","data"]),parts:l.array(l.record(l.string(),l.unknown()))}))});function M(t){let e=t.parts.filter(o=>o.type==="text").map(o=>o.text).join(""),n=t.parts.filter(o=>o.type.startsWith("tool-")||o.type==="dynamic-tool").map(o=>o),s=n.map(o=>o.toolName),r=n.map(o=>({name:o.toolName,input:o.input??{},output:o.output}));return{output:e,toolsCalled:s,toolCallTraces:r}}function S(t){return t.parts.filter(e=>e.type==="text").map(e=>e.text).join("")}function j(t){return t.parts.filter(e=>e.type==="dynamic-tool"||e.type.startsWith("tool-")).map(e=>e.toolName).filter(Boolean)}async function x(t,e){let n=await fetch(`${t}/api/waniwani`,{method:"POST",headers:{"Content-Type":"application/json"},signal:AbortSignal.timeout(6e4),body:JSON.stringify({messages:e})});if(!n.ok)throw new Error(`Chat returned ${n.status}: ${await n.text()}`);if(!n.body)throw new Error("Chat response has no body");let s=O({stream:n.body,schema:$}).pipeThrough(new TransformStream({transform(o,a){o.success&&a.enqueue(o.value)}})),r;for await(let o of N({stream:s}))r=o;if(!r)throw new Error("No message received from stream");return{result:M(r),message:r}}function A(t="evals/sessions"){let e=C(process.cwd(),t);return b(e).filter(n=>n.endsWith(".json")).sort().map(n=>{let s=JSON.parse(I(C(e,n),"utf8"));return F.parse(s)})}async function P(t,e){let n={id:crypto.randomUUID(),role:"user",parts:[{type:"text",text:e}]},{result:s}=await x(t,[n]);return s}async function E(t,e){let n=[],s=[];for(let r of e){n.push({id:crypto.randomUUID(),role:"user",parts:[{type:"text",text:r.input}]});let{result:o,message:a}=await x(t,n);n.push(a),s.push({input:r.input,response:o,assertions:[]})}return{turns:s}}async function q(t,e){let n=e.mode??"regenerate",s=[],r=[],o=[];for(let a=0;a<e.messages.length;a++){let p=e.messages[a];if(p.role==="user"){let u=e.messages[a+1];o.push({userMsg:p,assistantMsg:u?.role==="assistant"?u:void 0})}}for(let a=0;a<o.length;a++){let{userMsg:p,assistantMsg:u}=o[a],i=a===o.length-1,d=u?j(u):[];if(s.push(p),n==="inject"&&!i&&u){s.push(u);let w=M(u),v=T(d,w.toolsCalled);r.push({input:S(p),response:w,assertions:v});continue}let{result:c,message:g}=await x(t,s);s.push(g);let f=T(d,c.toolsCalled);r.push({input:S(p),response:c,assertions:f})}return{turns:r}}function T(t,e){if(t.length===0)return[];let n=new Set(e);return[...new Set(t)].map(r=>({passed:n.has(r),expected:[r],actual:e}))}import{existsSync as D,mkdirSync as Q,writeFileSync as G}from"fs";function m(t){try{return JSON.parse(t)}catch{return{output:"",toolsCalled:[],toolCallTraces:[]}}}function _({output:t,expected:e,metadata:n}){let s=m(t),r=n?.expectedTool??e;return{name:"called_expected_tool",score:s.toolsCalled.includes(r)?1:0,metadata:{expected:r,actual:s.toolsCalled}}}function J({output:t}){return{name:"has_output",score:m(t).output.length>0?1:0}}function L({output:t,metadata:e}){let n=m(t),s=e?.expectedFields??{},r=Object.keys(s);if(r.length===0)return{name:"field_extraction",score:1};let a=n.toolCallTraces[0]?.input?.stateUpdates??{},p=0,u={};for(let i of r){let d=s[i],c;if(i.includes(".")){let[f,w]=i.split(".");c=a[f]?.[w]}else c=a[i];let g=JSON.stringify(c)===JSON.stringify(d);g&&p++,u[i]={expected:d,actual:c,match:g}}return{name:"field_extraction",score:p/r.length,metadata:u}}function k(t){return async e=>{let n=m(e.output);return t({input:e.input,output:n.output,expected:e.expected})}}async function R(t){return(await import("autoevals").catch(()=>{throw new Error(`LLM scorer "${t}" requires the "autoevals" package: bun add -d autoevals`)}))[t]}var W=async t=>k(await R("ClosedQA"))(t),z=async t=>k(await R("Factuality"))(t),B=async t=>k(await R("Moderation"))(t);function H(t="evals/runs"){let e;try{e=U("braintrust").Reporter}catch{throw new Error('Local reporter requires the "braintrust" package: bun add -d braintrust')}return D(t)||Q(t,{recursive:!0}),e("local",{reportEval(n,s){let r=n.experimentName??"unknown",o=new Date().toISOString().replace(/[:.]/g,"-"),a=s.results.map(i=>{let d=i.scores??{},c=m(i.output);return{input:i.input,output:c.output,toolsCalled:c.toolsCalled,toolCallTraces:c.toolCallTraces,scores:d}}),p=`${t}/${r}-${o}.json`;G(p,JSON.stringify(a,null,2)),console.log(`
|
|
2
|
+
\u{1F4CA} ${r} (${s.results.length} cases):
|
|
3
|
+
`);let u=0;for(let i of s.results){let d=i.scores??{},c=d.called_expected_tool===1;c||u++,console.log(` ${c?"\u2705":"\u274C"} ${i.input.slice(0,70)}`);for(let[g,f]of Object.entries(d))console.log(` ${g}: ${f}`)}return console.log(`
|
|
4
|
+
${s.results.length-u}/${s.results.length} passed`),console.log(` \u2192 ${p}
|
|
5
|
+
`),u===0},reportRun(n){let s=n.every(r=>r===!0);return console.log(s?`
|
|
6
6
|
\u2705 All experiments passed`:`
|
|
7
|
-
\u274C Some experiments failed`),
|
|
7
|
+
\u274C Some experiments failed`),s}})}export{W as FaqAccuracy,z as OutputFactuality,B as SafetyCheck,_ as calledExpectedTool,P as chat,E as conversation,H as createLocalReporter,J as hasOutput,A as loadSessions,m as parseTaskOutput,q as replaySession,L as toolInputFieldsMatch};
|
|
8
8
|
//# sourceMappingURL=index.js.map
|
package/dist/evals/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/evals/chat.ts","../../src/evals/reporter.ts","../../src/evals/scorers.ts"],"sourcesContent":["import {\n\tparseJsonEventStream,\n\treadUIMessageStream,\n\ttype UIMessage,\n\tuiMessageChunkSchema,\n} from \"ai\";\nimport type {\n\tChatResult,\n\tConversationResult,\n\tConversationTurn,\n\tConversationTurnResult,\n\tSessionReplay,\n\tToolCallTrace,\n\tTurnAssertion,\n} from \"./types\";\n\n// --- Internal helpers ---\n\nfunction parseUIMessage(msg: UIMessage): ChatResult {\n\tconst output = msg.parts\n\t\t.filter((p): p is { type: \"text\"; text: string } => p.type === \"text\")\n\t\t.map((p) => p.text)\n\t\t.join(\"\");\n\n\tconst toolParts = msg.parts\n\t\t.filter((p) => p.type.startsWith(\"tool-\") || p.type === \"dynamic-tool\")\n\t\t.map(\n\t\t\t(p) =>\n\t\t\t\tp as unknown as {\n\t\t\t\t\ttoolName: string;\n\t\t\t\t\tinput?: Record<string, unknown>;\n\t\t\t\t\toutput?: unknown;\n\t\t\t\t},\n\t\t);\n\tconst toolsCalled = toolParts.map((p) => p.toolName);\n\tconst toolCallTraces: ToolCallTrace[] = toolParts.map((p) => ({\n\t\tname: p.toolName,\n\t\tinput: p.input ?? {},\n\t\toutput: p.output,\n\t}));\n\n\treturn { output, toolsCalled, toolCallTraces };\n}\n\nfunction textFromUIMessage(msg: UIMessage): string {\n\treturn msg.parts\n\t\t.filter((p): p is { type: \"text\"; text: string } => p.type === \"text\")\n\t\t.map((p) => p.text)\n\t\t.join(\"\");\n}\n\n/** Extract the tool names called in a recorded assistant UIMessage. */\nfunction extractRecordedTools(msg: UIMessage): string[] {\n\treturn msg.parts\n\t\t.filter((p) => p.type === \"dynamic-tool\" || p.type.startsWith(\"tool-\"))\n\t\t.map((p) => (p as unknown as { toolName: string }).toolName)\n\t\t.filter(Boolean);\n}\n\nasync function sendMessages(\n\turl: string,\n\tmessages: UIMessage[],\n): Promise<{ result: ChatResult; message: UIMessage }> {\n\tconst response = await fetch(`${url}/api/waniwani`, {\n\t\tmethod: \"POST\",\n\t\theaders: { \"Content-Type\": \"application/json\" },\n\t\tsignal: AbortSignal.timeout(60_000),\n\t\tbody: JSON.stringify({ messages }),\n\t});\n\n\tif (!response.ok) {\n\t\tthrow new Error(\n\t\t\t`Chat returned ${response.status}: ${await response.text()}`,\n\t\t);\n\t}\n\n\tif (!response.body) {\n\t\tthrow new Error(\"Chat response has no body\");\n\t}\n\n\tconst chunkStream = parseJsonEventStream({\n\t\tstream: response.body,\n\t\tschema: uiMessageChunkSchema,\n\t}).pipeThrough(\n\t\tnew TransformStream({\n\t\t\ttransform(chunk, controller) {\n\t\t\t\tif (chunk.success) {\n\t\t\t\t\tcontroller.enqueue(chunk.value);\n\t\t\t\t}\n\t\t\t},\n\t\t}),\n\t);\n\n\tlet finalMessage: UIMessage | undefined;\n\tfor await (const msg of readUIMessageStream({ stream: chunkStream })) {\n\t\tfinalMessage = msg;\n\t}\n\n\tif (!finalMessage) {\n\t\tthrow new Error(\"No message received from stream\");\n\t}\n\n\treturn { result: parseUIMessage(finalMessage), message: finalMessage };\n}\n\n// --- Public API ---\n\n/**\n * Send a single user message to a WaniWani MCP chat endpoint.\n */\nexport async function chat(url: string, message: string): Promise<ChatResult> {\n\tconst userMsg: UIMessage = {\n\t\tid: crypto.randomUUID(),\n\t\trole: \"user\",\n\t\tparts: [{ type: \"text\", text: message }],\n\t};\n\tconst { result } = await sendMessages(url, [userMsg]);\n\treturn result;\n}\n\n/**\n * Run a multi-turn conversation. Returns the result of each turn.\n */\nexport async function conversation(\n\turl: string,\n\tturns: ConversationTurn[],\n): Promise<ConversationResult> {\n\tconst history: UIMessage[] = [];\n\tconst turnResults: ConversationTurnResult[] = [];\n\n\tfor (const turn of turns) {\n\t\thistory.push({\n\t\t\tid: crypto.randomUUID(),\n\t\t\trole: \"user\",\n\t\t\tparts: [{ type: \"text\", text: turn.input }],\n\t\t});\n\n\t\tconst { result, message } = await sendMessages(url, history);\n\t\thistory.push(message);\n\n\t\tturnResults.push({ input: turn.input, response: result, assertions: [] });\n\t}\n\n\treturn { turns: turnResults };\n}\n\n/**\n * Replay a recorded conversation session (exported from the chatbar debug button).\n * Uses UIMessage[] directly — same format as useChat's messages array.\n *\n * **\"regenerate\" mode** (default):\n * Sends only user messages. The LLM generates fresh responses.\n * Per-turn assertions are auto-derived by comparing actual tool calls\n * to the tool calls recorded in the session.\n *\n * **\"inject\" mode**:\n * Injects the recorded conversation as-is, only generates a fresh\n * response for the final user message.\n */\nexport async function replaySession(\n\turl: string,\n\tsession: SessionReplay,\n): Promise<ConversationResult> {\n\tconst mode = session.mode ?? \"regenerate\";\n\tconst history: UIMessage[] = [];\n\tconst turnResults: ConversationTurnResult[] = [];\n\n\t// Pair user messages with their assistant responses\n\tconst userTurns: { userMsg: UIMessage; assistantMsg?: UIMessage }[] = [];\n\tfor (let i = 0; i < session.messages.length; i++) {\n\t\tconst msg = session.messages[i];\n\t\tif (msg.role === \"user\") {\n\t\t\tconst next = session.messages[i + 1];\n\t\t\tuserTurns.push({\n\t\t\t\tuserMsg: msg,\n\t\t\t\tassistantMsg: next?.role === \"assistant\" ? next : undefined,\n\t\t\t});\n\t\t}\n\t}\n\n\tfor (let turnIdx = 0; turnIdx < userTurns.length; turnIdx++) {\n\t\tconst { userMsg, assistantMsg } = userTurns[turnIdx];\n\t\tconst isLastTurn = turnIdx === userTurns.length - 1;\n\n\t\t// Extract expected tools from the recorded assistant message\n\t\tconst expectedTools = assistantMsg\n\t\t\t? extractRecordedTools(assistantMsg)\n\t\t\t: [];\n\n\t\thistory.push(userMsg);\n\n\t\tif (mode === \"inject\" && !isLastTurn && assistantMsg) {\n\t\t\thistory.push(assistantMsg);\n\t\t\tconst response = parseUIMessage(assistantMsg);\n\t\t\tconst assertions = buildAssertions(expectedTools, response.toolsCalled);\n\t\t\tturnResults.push({\n\t\t\t\tinput: textFromUIMessage(userMsg),\n\t\t\t\tresponse,\n\t\t\t\tassertions,\n\t\t\t});\n\t\t\tcontinue;\n\t\t}\n\n\t\tconst { result, message } = await sendMessages(url, history);\n\t\thistory.push(message);\n\n\t\tconst assertions = buildAssertions(expectedTools, result.toolsCalled);\n\t\tturnResults.push({\n\t\t\tinput: textFromUIMessage(userMsg),\n\t\t\tresponse: result,\n\t\t\tassertions,\n\t\t});\n\t}\n\n\treturn { turns: turnResults };\n}\n\n/** Compare expected vs. actual tool calls and return assertion results. */\nfunction buildAssertions(\n\texpected: string[],\n\tactual: string[],\n): TurnAssertion[] {\n\tif (expected.length === 0) {\n\t\treturn [];\n\t}\n\n\t// Group expected tools and check each against actual calls\n\tconst actualSet = new Set(actual);\n\tconst expectedUnique = [...new Set(expected)];\n\n\treturn expectedUnique.map((tool) => ({\n\t\tpassed: actualSet.has(tool),\n\t\texpected: [tool],\n\t\tactual,\n\t}));\n}\n","import { existsSync, mkdirSync, writeFileSync } from \"node:fs\";\nimport { parseTaskOutput } from \"./scorers\";\n\ntype ReporterFn = (\n\tname: string,\n\thandlers: {\n\t\treportEval(\n\t\t\tevaluator: unknown,\n\t\t\tresult: {\n\t\t\t\tresults: Array<{\n\t\t\t\t\tinput: unknown;\n\t\t\t\t\toutput: unknown;\n\t\t\t\t\tscores?: Record<string, number>;\n\t\t\t\t}>;\n\t\t\t},\n\t\t): boolean;\n\t\treportRun(results: boolean[]): boolean;\n\t},\n) => unknown;\n\n/**\n * Create a local Braintrust reporter that writes JSON results to a directory and\n * prints a summary to console.\n *\n * Requires the `braintrust` package: bun add -d braintrust\n *\n * @param outputDir - Directory to write JSON result files (default: \"evals/runs\")\n */\nexport function createLocalReporter(outputDir = \"evals/runs\") {\n\tlet Reporter: ReporterFn;\n\ttry {\n\t\tReporter = (require(\"braintrust\") as { Reporter: ReporterFn }).Reporter;\n\t} catch {\n\t\tthrow new Error(\n\t\t\t'Local reporter requires the \"braintrust\" package: bun add -d braintrust',\n\t\t);\n\t}\n\n\tif (!existsSync(outputDir)) {\n\t\tmkdirSync(outputDir, { recursive: true });\n\t}\n\n\treturn Reporter(\"local\", {\n\t\treportEval(\n\t\t\tevaluator: unknown,\n\t\t\tresult: {\n\t\t\t\tresults: Array<{\n\t\t\t\t\tinput: unknown;\n\t\t\t\t\toutput: unknown;\n\t\t\t\t\tscores?: Record<string, number>;\n\t\t\t\t}>;\n\t\t\t},\n\t\t) {\n\t\t\tconst name =\n\t\t\t\t(evaluator as { experimentName?: string }).experimentName ?? \"unknown\";\n\t\t\tconst timestamp = new Date().toISOString().replace(/[:.]/g, \"-\");\n\t\t\tconst rows = result.results.map((r) => {\n\t\t\t\tconst scores = r.scores ?? {};\n\t\t\t\tconst parsed = parseTaskOutput(r.output);\n\t\t\t\treturn {\n\t\t\t\t\tinput: r.input,\n\t\t\t\t\toutput: parsed.output,\n\t\t\t\t\ttoolsCalled: parsed.toolsCalled,\n\t\t\t\t\ttoolCallTraces: parsed.toolCallTraces,\n\t\t\t\t\tscores,\n\t\t\t\t};\n\t\t\t});\n\n\t\t\tconst outPath = `${outputDir}/${name}-${timestamp}.json`;\n\t\t\twriteFileSync(outPath, JSON.stringify(rows, null, 2));\n\n\t\t\tconsole.log(`\\n📊 ${name} (${result.results.length} cases):\\n`);\n\n\t\t\tlet failures = 0;\n\t\t\tfor (const r of result.results) {\n\t\t\t\tconst scores = r.scores ?? {};\n\t\t\t\tconst pass = scores.called_expected_tool === 1;\n\t\t\t\tif (!pass) {\n\t\t\t\t\tfailures++;\n\t\t\t\t}\n\n\t\t\t\tconsole.log(\n\t\t\t\t\t` ${pass ? \"✅\" : \"❌\"} ${(r.input as string).slice(0, 70)}`,\n\t\t\t\t);\n\t\t\t\tfor (const [scoreName, value] of Object.entries(scores)) {\n\t\t\t\t\tconsole.log(` ${scoreName}: ${value}`);\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tconsole.log(\n\t\t\t\t`\\n ${result.results.length - failures}/${result.results.length} passed`,\n\t\t\t);\n\t\t\tconsole.log(` → ${outPath}\\n`);\n\t\t\treturn failures === 0;\n\t\t},\n\n\t\treportRun(results: boolean[]) {\n\t\t\tconst allPassed = results.every((r) => r === true);\n\t\t\tconsole.log(\n\t\t\t\tallPassed\n\t\t\t\t\t? \"\\n✅ All experiments passed\"\n\t\t\t\t\t: \"\\n❌ Some experiments failed\",\n\t\t\t);\n\t\t\treturn allPassed;\n\t\t},\n\t});\n}\n","import type { ChatResult } from \"./types\";\n\n/**\n * Parse the JSON-stringified ChatResult from a Braintrust task output.\n */\nexport function parseTaskOutput(output: unknown): ChatResult {\n\ttry {\n\t\treturn JSON.parse(output as string);\n\t} catch {\n\t\treturn { output: \"\", toolsCalled: [], toolCallTraces: [] };\n\t}\n}\n\n/**\n * Checks whether the expected tool was called.\n * Looks for the tool name in `metadata.expectedTool` first (for cases where `expected` is a\n * reference answer), then falls back to `expected` directly.\n */\nexport function calledExpectedTool({\n\toutput,\n\texpected,\n\tmetadata,\n}: {\n\toutput: unknown;\n\texpected?: unknown;\n\tmetadata?: Record<string, unknown>;\n}) {\n\tconst parsed = parseTaskOutput(output);\n\tconst expectedTool =\n\t\t(metadata?.expectedTool as string) ?? (expected as string);\n\tconst found = parsed.toolsCalled.includes(expectedTool);\n\treturn {\n\t\tname: \"called_expected_tool\",\n\t\tscore: found ? 1 : 0,\n\t\tmetadata: { expected: expectedTool, actual: parsed.toolsCalled },\n\t};\n}\n\n/**\n * Checks whether the assistant produced any text output.\n */\nexport function hasOutput({ output }: { output: unknown }) {\n\tconst parsed = parseTaskOutput(output);\n\treturn {\n\t\tname: \"has_output\",\n\t\tscore: parsed.output.length > 0 ? 1 : 0,\n\t};\n}\n\n/**\n * Checks specific fields in the first tool call's `stateUpdates` against expected values.\n * Supports nested fields via dot notation (e.g. \"mixedBreed.knowsBreeds\").\n * Returns partial credit (fraction of matching fields).\n */\nexport function toolInputFieldsMatch({\n\toutput,\n\tmetadata,\n}: {\n\toutput: unknown;\n\tmetadata?: Record<string, unknown>;\n}) {\n\tconst parsed = parseTaskOutput(output);\n\tconst expectedFields = (metadata?.expectedFields ?? {}) as Record<\n\t\tstring,\n\t\tunknown\n\t>;\n\tconst fieldNames = Object.keys(expectedFields);\n\n\tif (fieldNames.length === 0) {\n\t\treturn { name: \"field_extraction\", score: 1 };\n\t}\n\n\tconst trace = parsed.toolCallTraces[0];\n\tconst stateUpdates = (trace?.input?.stateUpdates ?? {}) as Record<\n\t\tstring,\n\t\tunknown\n\t>;\n\n\tlet matches = 0;\n\tconst details: Record<\n\t\tstring,\n\t\t{ expected: unknown; actual: unknown; match: boolean }\n\t> = {};\n\n\tfor (const field of fieldNames) {\n\t\tconst expected = expectedFields[field];\n\t\tlet actual: unknown;\n\n\t\tif (field.includes(\".\")) {\n\t\t\tconst [parent, child] = field.split(\".\");\n\t\t\tactual = (stateUpdates[parent] as Record<string, unknown>)?.[child];\n\t\t} else {\n\t\t\tactual = stateUpdates[field];\n\t\t}\n\n\t\tconst match = JSON.stringify(actual) === JSON.stringify(expected);\n\t\tif (match) {\n\t\t\tmatches++;\n\t\t}\n\t\tdetails[field] = { expected, actual, match };\n\t}\n\n\treturn {\n\t\tname: \"field_extraction\",\n\t\tscore: matches / fieldNames.length,\n\t\tmetadata: details,\n\t};\n}\n\n/**\n * Wraps an autoevals scorer to extract the text output from the JSON-stringified ChatResult.\n * Requires the `autoevals` package: bun add -d autoevals\n */\nfunction wrapAutoeval(\n\tscorer: (args: {\n\t\tinput: unknown;\n\t\toutput: string;\n\t\texpected?: unknown;\n\t}) => unknown,\n) {\n\treturn async (args: {\n\t\tinput: unknown;\n\t\toutput: unknown;\n\t\texpected?: unknown;\n\t}) => {\n\t\tconst parsed = parseTaskOutput(args.output);\n\t\treturn scorer({\n\t\t\tinput: args.input,\n\t\t\toutput: parsed.output,\n\t\t\texpected: args.expected,\n\t\t});\n\t};\n}\n\n// LLM-based scorers — require `autoevals` as a dev dependency.\n// These are dynamically imported so the module loads even if autoevals is not installed.\n// Using LLM scorers without autoevals installed will throw at call time.\n\nasync function getAutoeval(name: string) {\n\tconst mod = await import(\"autoevals\").catch(() => {\n\t\tthrow new Error(\n\t\t\t`LLM scorer \"${name}\" requires the \"autoevals\" package: bun add -d autoevals`,\n\t\t);\n\t});\n\treturn (mod as Record<string, unknown>)[name] as (args: {\n\t\tinput: unknown;\n\t\toutput: string;\n\t\texpected?: unknown;\n\t}) => unknown;\n}\n\n/** ClosedQA — checks if the answer correctly addresses the question given a reference answer. */\nexport const FaqAccuracy = async (args: {\n\tinput: unknown;\n\toutput: unknown;\n\texpected?: unknown;\n}) => wrapAutoeval(await getAutoeval(\"ClosedQA\"))(args);\n\n/** Factuality — checks if the output is factually consistent with the expected output. */\nexport const OutputFactuality = async (args: {\n\tinput: unknown;\n\toutput: unknown;\n\texpected?: unknown;\n}) => wrapAutoeval(await getAutoeval(\"Factuality\"))(args);\n\n/** Moderation — flags unsafe or inappropriate content. */\nexport const SafetyCheck = async (args: {\n\tinput: unknown;\n\toutput: unknown;\n\texpected?: unknown;\n}) => wrapAutoeval(await getAutoeval(\"Moderation\"))(args);\n"],"mappings":"yPAAA,OACC,wBAAAA,EACA,uBAAAC,EAEA,wBAAAC,MACM,KAaP,SAASC,EAAeC,EAA4B,CACnD,IAAMC,EAASD,EAAI,MACjB,OAAQE,GAA2CA,EAAE,OAAS,MAAM,EACpE,IAAKA,GAAMA,EAAE,IAAI,EACjB,KAAK,EAAE,EAEHC,EAAYH,EAAI,MACpB,OAAQE,GAAMA,EAAE,KAAK,WAAW,OAAO,GAAKA,EAAE,OAAS,cAAc,EACrE,IACCA,GACAA,CAKF,EACKE,EAAcD,EAAU,IAAKD,GAAMA,EAAE,QAAQ,EAC7CG,EAAkCF,EAAU,IAAKD,IAAO,CAC7D,KAAMA,EAAE,SACR,MAAOA,EAAE,OAAS,CAAC,EACnB,OAAQA,EAAE,MACX,EAAE,EAEF,MAAO,CAAE,OAAAD,EAAQ,YAAAG,EAAa,eAAAC,CAAe,CAC9C,CAEA,SAASC,EAAkBN,EAAwB,CAClD,OAAOA,EAAI,MACT,OAAQE,GAA2CA,EAAE,OAAS,MAAM,EACpE,IAAKA,GAAMA,EAAE,IAAI,EACjB,KAAK,EAAE,CACV,CAGA,SAASK,EAAqBP,EAA0B,CACvD,OAAOA,EAAI,MACT,OAAQE,GAAMA,EAAE,OAAS,gBAAkBA,EAAE,KAAK,WAAW,OAAO,CAAC,EACrE,IAAKA,GAAOA,EAAsC,QAAQ,EAC1D,OAAO,OAAO,CACjB,CAEA,eAAeM,EACdC,EACAC,EACsD,CACtD,IAAMC,EAAW,MAAM,MAAM,GAAGF,CAAG,gBAAiB,CACnD,OAAQ,OACR,QAAS,CAAE,eAAgB,kBAAmB,EAC9C,OAAQ,YAAY,QAAQ,GAAM,EAClC,KAAM,KAAK,UAAU,CAAE,SAAAC,CAAS,CAAC,CAClC,CAAC,EAED,GAAI,CAACC,EAAS,GACb,MAAM,IAAI,MACT,iBAAiBA,EAAS,MAAM,KAAK,MAAMA,EAAS,KAAK,CAAC,EAC3D,EAGD,GAAI,CAACA,EAAS,KACb,MAAM,IAAI,MAAM,2BAA2B,EAG5C,IAAMC,EAAchB,EAAqB,CACxC,OAAQe,EAAS,KACjB,OAAQb,CACT,CAAC,EAAE,YACF,IAAI,gBAAgB,CACnB,UAAUe,EAAOC,EAAY,CACxBD,EAAM,SACTC,EAAW,QAAQD,EAAM,KAAK,CAEhC,CACD,CAAC,CACF,EAEIE,EACJ,cAAiBf,KAAOH,EAAoB,CAAE,OAAQe,CAAY,CAAC,EAClEG,EAAef,EAGhB,GAAI,CAACe,EACJ,MAAM,IAAI,MAAM,iCAAiC,EAGlD,MAAO,CAAE,OAAQhB,EAAegB,CAAY,EAAG,QAASA,CAAa,CACtE,CAOA,eAAsBC,EAAKP,EAAaQ,EAAsC,CAC7E,IAAMC,EAAqB,CAC1B,GAAI,OAAO,WAAW,EACtB,KAAM,OACN,MAAO,CAAC,CAAE,KAAM,OAAQ,KAAMD,CAAQ,CAAC,CACxC,EACM,CAAE,OAAAE,CAAO,EAAI,MAAMX,EAAaC,EAAK,CAACS,CAAO,CAAC,EACpD,OAAOC,CACR,CAKA,eAAsBC,EACrBX,EACAY,EAC8B,CAC9B,IAAMC,EAAuB,CAAC,EACxBC,EAAwC,CAAC,EAE/C,QAAWC,KAAQH,EAAO,CACzBC,EAAQ,KAAK,CACZ,GAAI,OAAO,WAAW,EACtB,KAAM,OACN,MAAO,CAAC,CAAE,KAAM,OAAQ,KAAME,EAAK,KAAM,CAAC,CAC3C,CAAC,EAED,GAAM,CAAE,OAAAL,EAAQ,QAAAF,CAAQ,EAAI,MAAMT,EAAaC,EAAKa,CAAO,EAC3DA,EAAQ,KAAKL,CAAO,EAEpBM,EAAY,KAAK,CAAE,MAAOC,EAAK,MAAO,SAAUL,EAAQ,WAAY,CAAC,CAAE,CAAC,CACzE,CAEA,MAAO,CAAE,MAAOI,CAAY,CAC7B,CAeA,eAAsBE,EACrBhB,EACAiB,EAC8B,CAC9B,IAAMC,EAAOD,EAAQ,MAAQ,aACvBJ,EAAuB,CAAC,EACxBC,EAAwC,CAAC,EAGzCK,EAAgE,CAAC,EACvE,QAASC,EAAI,EAAGA,EAAIH,EAAQ,SAAS,OAAQG,IAAK,CACjD,IAAM7B,EAAM0B,EAAQ,SAASG,CAAC,EAC9B,GAAI7B,EAAI,OAAS,OAAQ,CACxB,IAAM8B,EAAOJ,EAAQ,SAASG,EAAI,CAAC,EACnCD,EAAU,KAAK,CACd,QAAS5B,EACT,aAAc8B,GAAM,OAAS,YAAcA,EAAO,MACnD,CAAC,CACF,CACD,CAEA,QAASC,EAAU,EAAGA,EAAUH,EAAU,OAAQG,IAAW,CAC5D,GAAM,CAAE,QAAAb,EAAS,aAAAc,CAAa,EAAIJ,EAAUG,CAAO,EAC7CE,EAAaF,IAAYH,EAAU,OAAS,EAG5CM,EAAgBF,EACnBzB,EAAqByB,CAAY,EACjC,CAAC,EAIJ,GAFAV,EAAQ,KAAKJ,CAAO,EAEhBS,IAAS,UAAY,CAACM,GAAcD,EAAc,CACrDV,EAAQ,KAAKU,CAAY,EACzB,IAAMrB,EAAWZ,EAAeiC,CAAY,EACtCG,EAAaC,EAAgBF,EAAevB,EAAS,WAAW,EACtEY,EAAY,KAAK,CAChB,MAAOjB,EAAkBY,CAAO,EAChC,SAAAP,EACA,WAAAwB,CACD,CAAC,EACD,QACD,CAEA,GAAM,CAAE,OAAAhB,EAAQ,QAAAF,CAAQ,EAAI,MAAMT,EAAaC,EAAKa,CAAO,EAC3DA,EAAQ,KAAKL,CAAO,EAEpB,IAAMkB,EAAaC,EAAgBF,EAAef,EAAO,WAAW,EACpEI,EAAY,KAAK,CAChB,MAAOjB,EAAkBY,CAAO,EAChC,SAAUC,EACV,WAAAgB,CACD,CAAC,CACF,CAEA,MAAO,CAAE,MAAOZ,CAAY,CAC7B,CAGA,SAASa,EACRC,EACAC,EACkB,CAClB,GAAID,EAAS,SAAW,EACvB,MAAO,CAAC,EAIT,IAAME,EAAY,IAAI,IAAID,CAAM,EAGhC,MAFuB,CAAC,GAAG,IAAI,IAAID,CAAQ,CAAC,EAEtB,IAAKG,IAAU,CACpC,OAAQD,EAAU,IAAIC,CAAI,EAC1B,SAAU,CAACA,CAAI,EACf,OAAAF,CACD,EAAE,CACH,CC3OA,OAAS,cAAAG,EAAY,aAAAC,EAAW,iBAAAC,MAAqB,KCK9C,SAASC,EAAgBC,EAA6B,CAC5D,GAAI,CACH,OAAO,KAAK,MAAMA,CAAgB,CACnC,MAAQ,CACP,MAAO,CAAE,OAAQ,GAAI,YAAa,CAAC,EAAG,eAAgB,CAAC,CAAE,CAC1D,CACD,CAOO,SAASC,EAAmB,CAClC,OAAAD,EACA,SAAAE,EACA,SAAAC,CACD,EAIG,CACF,IAAMC,EAASL,EAAgBC,CAAM,EAC/BK,EACJF,GAAU,cAA4BD,EAExC,MAAO,CACN,KAAM,uBACN,MAHaE,EAAO,YAAY,SAASC,CAAY,EAGtC,EAAI,EACnB,SAAU,CAAE,SAAUA,EAAc,OAAQD,EAAO,WAAY,CAChE,CACD,CAKO,SAASE,EAAU,CAAE,OAAAN,CAAO,EAAwB,CAE1D,MAAO,CACN,KAAM,aACN,MAHcD,EAAgBC,CAAM,EAGtB,OAAO,OAAS,EAAI,EAAI,CACvC,CACD,CAOO,SAASO,EAAqB,CACpC,OAAAP,EACA,SAAAG,CACD,EAGG,CACF,IAAMC,EAASL,EAAgBC,CAAM,EAC/BQ,EAAkBL,GAAU,gBAAkB,CAAC,EAI/CM,EAAa,OAAO,KAAKD,CAAc,EAE7C,GAAIC,EAAW,SAAW,EACzB,MAAO,CAAE,KAAM,mBAAoB,MAAO,CAAE,EAI7C,IAAMC,EADQN,EAAO,eAAe,CAAC,GACR,OAAO,cAAgB,CAAC,EAKjDO,EAAU,EACRC,EAGF,CAAC,EAEL,QAAWC,KAASJ,EAAY,CAC/B,IAAMP,EAAWM,EAAeK,CAAK,EACjCC,EAEJ,GAAID,EAAM,SAAS,GAAG,EAAG,CACxB,GAAM,CAACE,EAAQC,CAAK,EAAIH,EAAM,MAAM,GAAG,EACvCC,EAAUJ,EAAaK,CAAM,IAAgCC,CAAK,CACnE,MACCF,EAASJ,EAAaG,CAAK,EAG5B,IAAMI,EAAQ,KAAK,UAAUH,CAAM,IAAM,KAAK,UAAUZ,CAAQ,EAC5De,GACHN,IAEDC,EAAQC,CAAK,EAAI,CAAE,SAAAX,EAAU,OAAAY,EAAQ,MAAAG,CAAM,CAC5C,CAEA,MAAO,CACN,KAAM,mBACN,MAAON,EAAUF,EAAW,OAC5B,SAAUG,CACX,CACD,CAMA,SAASM,EACRC,EAKC,CACD,MAAO,OAAOC,GAIR,CACL,IAAMhB,EAASL,EAAgBqB,EAAK,MAAM,EAC1C,OAAOD,EAAO,CACb,MAAOC,EAAK,MACZ,OAAQhB,EAAO,OACf,SAAUgB,EAAK,QAChB,CAAC,CACF,CACD,CAMA,eAAeC,EAAYC,EAAc,CAMxC,OALY,KAAM,QAAO,WAAW,EAAE,MAAM,IAAM,CACjD,MAAM,IAAI,MACT,eAAeA,CAAI,0DACpB,CACD,CAAC,GACuCA,CAAI,CAK7C,CAGO,IAAMC,EAAc,MAAOH,GAI5BF,EAAa,MAAMG,EAAY,UAAU,CAAC,EAAED,CAAI,EAGzCI,EAAmB,MAAOJ,GAIjCF,EAAa,MAAMG,EAAY,YAAY,CAAC,EAAED,CAAI,EAG3CK,EAAc,MAAOL,GAI5BF,EAAa,MAAMG,EAAY,YAAY,CAAC,EAAED,CAAI,ED9IjD,SAASM,EAAoBC,EAAY,aAAc,CAC7D,IAAIC,EACJ,GAAI,CACHA,EAAY,EAAQ,YAAY,EAA+B,QAChE,MAAQ,CACP,MAAM,IAAI,MACT,yEACD,CACD,CAEA,OAAKC,EAAWF,CAAS,GACxBG,EAAUH,EAAW,CAAE,UAAW,EAAK,CAAC,EAGlCC,EAAS,QAAS,CACxB,WACCG,EACAC,EAOC,CACD,IAAMC,EACJF,EAA0C,gBAAkB,UACxDG,EAAY,IAAI,KAAK,EAAE,YAAY,EAAE,QAAQ,QAAS,GAAG,EACzDC,EAAOH,EAAO,QAAQ,IAAKI,GAAM,CACtC,IAAMC,EAASD,EAAE,QAAU,CAAC,EACtBE,EAASC,EAAgBH,EAAE,MAAM,EACvC,MAAO,CACN,MAAOA,EAAE,MACT,OAAQE,EAAO,OACf,YAAaA,EAAO,YACpB,eAAgBA,EAAO,eACvB,OAAAD,CACD,CACD,CAAC,EAEKG,EAAU,GAAGb,CAAS,IAAIM,CAAI,IAAIC,CAAS,QACjDO,EAAcD,EAAS,KAAK,UAAUL,EAAM,KAAM,CAAC,CAAC,EAEpD,QAAQ,IAAI;AAAA,YAAQF,CAAI,KAAKD,EAAO,QAAQ,MAAM;AAAA,CAAY,EAE9D,IAAIU,EAAW,EACf,QAAWN,KAAKJ,EAAO,QAAS,CAC/B,IAAMK,EAASD,EAAE,QAAU,CAAC,EACtBO,EAAON,EAAO,uBAAyB,EACxCM,GACJD,IAGD,QAAQ,IACP,KAAKC,EAAO,SAAM,QAAG,IAAKP,EAAE,MAAiB,MAAM,EAAG,EAAE,CAAC,EAC1D,EACA,OAAW,CAACQ,EAAWC,CAAK,IAAK,OAAO,QAAQR,CAAM,EACrD,QAAQ,IAAI,QAAQO,CAAS,KAAKC,CAAK,EAAE,CAE3C,CAEA,eAAQ,IACP;AAAA,IAAOb,EAAO,QAAQ,OAASU,CAAQ,IAAIV,EAAO,QAAQ,MAAM,SACjE,EACA,QAAQ,IAAI,YAAOQ,CAAO;AAAA,CAAI,EACvBE,IAAa,CACrB,EAEA,UAAUI,EAAoB,CAC7B,IAAMC,EAAYD,EAAQ,MAAO,GAAM,IAAM,EAAI,EACjD,eAAQ,IACPC,EACG;AAAA,+BACA;AAAA,+BACJ,EACOA,CACR,CACD,CAAC,CACF","names":["parseJsonEventStream","readUIMessageStream","uiMessageChunkSchema","parseUIMessage","msg","output","p","toolParts","toolsCalled","toolCallTraces","textFromUIMessage","extractRecordedTools","sendMessages","url","messages","response","chunkStream","chunk","controller","finalMessage","chat","message","userMsg","result","conversation","turns","history","turnResults","turn","replaySession","session","mode","userTurns","i","next","turnIdx","assistantMsg","isLastTurn","expectedTools","assertions","buildAssertions","expected","actual","actualSet","tool","existsSync","mkdirSync","writeFileSync","parseTaskOutput","output","calledExpectedTool","expected","metadata","parsed","expectedTool","hasOutput","toolInputFieldsMatch","expectedFields","fieldNames","stateUpdates","matches","details","field","actual","parent","child","match","wrapAutoeval","scorer","args","getAutoeval","name","FaqAccuracy","OutputFactuality","SafetyCheck","createLocalReporter","outputDir","Reporter","existsSync","mkdirSync","evaluator","result","name","timestamp","rows","r","scores","parsed","parseTaskOutput","outPath","writeFileSync","failures","pass","scoreName","value","results","allPassed"]}
|
|
1
|
+
{"version":3,"sources":["../../src/evals/chat.ts","../../src/evals/reporter.ts","../../src/evals/scorers.ts"],"sourcesContent":["import { readdirSync, readFileSync } from \"node:fs\";\nimport { join } from \"node:path\";\nimport {\n\tparseJsonEventStream,\n\treadUIMessageStream,\n\ttype UIMessage,\n\tuiMessageChunkSchema,\n} from \"ai\";\nimport { z } from \"zod\";\nimport type {\n\tChatResult,\n\tConversationResult,\n\tConversationTurn,\n\tConversationTurnResult,\n\tSessionReplay,\n\tToolCallTrace,\n\tTurnAssertion,\n} from \"./types\";\n\n// UIMessage parts are heterogeneous — validate the fields we need, pass extras through\nconst sessionReplaySchema = z.object({\n\tname: z.string(),\n\tmode: z.enum([\"regenerate\", \"inject\"]).optional(),\n\toutcome: z.object({ toolsCalled: z.array(z.string()) }).optional(),\n\tmessages: z.array(\n\t\tz.looseObject({\n\t\t\tid: z.string(),\n\t\t\trole: z.enum([\"user\", \"assistant\", \"system\", \"data\"]),\n\t\t\tparts: z.array(z.record(z.string(), z.unknown())),\n\t\t}),\n\t),\n});\n\n// --- Internal helpers ---\n\nfunction parseUIMessage(msg: UIMessage): ChatResult {\n\tconst output = msg.parts\n\t\t.filter((p): p is { type: \"text\"; text: string } => p.type === \"text\")\n\t\t.map((p) => p.text)\n\t\t.join(\"\");\n\n\tconst toolParts = msg.parts\n\t\t.filter((p) => p.type.startsWith(\"tool-\") || p.type === \"dynamic-tool\")\n\t\t.map(\n\t\t\t(p) =>\n\t\t\t\tp as unknown as {\n\t\t\t\t\ttoolName: string;\n\t\t\t\t\tinput?: Record<string, unknown>;\n\t\t\t\t\toutput?: unknown;\n\t\t\t\t},\n\t\t);\n\tconst toolsCalled = toolParts.map((p) => p.toolName);\n\tconst toolCallTraces: ToolCallTrace[] = toolParts.map((p) => ({\n\t\tname: p.toolName,\n\t\tinput: p.input ?? {},\n\t\toutput: p.output,\n\t}));\n\n\treturn { output, toolsCalled, toolCallTraces };\n}\n\nfunction textFromUIMessage(msg: UIMessage): string {\n\treturn msg.parts\n\t\t.filter((p): p is { type: \"text\"; text: string } => p.type === \"text\")\n\t\t.map((p) => p.text)\n\t\t.join(\"\");\n}\n\n/** Extract the tool names called in a recorded assistant UIMessage. */\nfunction extractRecordedTools(msg: UIMessage): string[] {\n\treturn msg.parts\n\t\t.filter((p) => p.type === \"dynamic-tool\" || p.type.startsWith(\"tool-\"))\n\t\t.map((p) => (p as unknown as { toolName: string }).toolName)\n\t\t.filter(Boolean);\n}\n\nasync function sendMessages(\n\turl: string,\n\tmessages: UIMessage[],\n): Promise<{ result: ChatResult; message: UIMessage }> {\n\tconst response = await fetch(`${url}/api/waniwani`, {\n\t\tmethod: \"POST\",\n\t\theaders: { \"Content-Type\": \"application/json\" },\n\t\tsignal: AbortSignal.timeout(60_000),\n\t\tbody: JSON.stringify({ messages }),\n\t});\n\n\tif (!response.ok) {\n\t\tthrow new Error(\n\t\t\t`Chat returned ${response.status}: ${await response.text()}`,\n\t\t);\n\t}\n\n\tif (!response.body) {\n\t\tthrow new Error(\"Chat response has no body\");\n\t}\n\n\tconst chunkStream = parseJsonEventStream({\n\t\tstream: response.body,\n\t\tschema: uiMessageChunkSchema,\n\t}).pipeThrough(\n\t\tnew TransformStream({\n\t\t\ttransform(chunk, controller) {\n\t\t\t\tif (chunk.success) {\n\t\t\t\t\tcontroller.enqueue(chunk.value);\n\t\t\t\t}\n\t\t\t},\n\t\t}),\n\t);\n\n\tlet finalMessage: UIMessage | undefined;\n\tfor await (const msg of readUIMessageStream({ stream: chunkStream })) {\n\t\tfinalMessage = msg;\n\t}\n\n\tif (!finalMessage) {\n\t\tthrow new Error(\"No message received from stream\");\n\t}\n\n\treturn { result: parseUIMessage(finalMessage), message: finalMessage };\n}\n\n// --- Public API ---\n\n/**\n * Load all session replay JSON files from a directory.\n * Drop any exported session JSON there — it just works.\n *\n * @param dir - Path to the sessions directory. Defaults to `evals/sessions`.\n */\nexport function loadSessions(dir = \"evals/sessions\"): SessionReplay[] {\n\tconst root = join(process.cwd(), dir);\n\treturn readdirSync(root)\n\t\t.filter((f) => f.endsWith(\".json\"))\n\t\t.sort()\n\t\t.map((f) => {\n\t\t\tconst raw = JSON.parse(readFileSync(join(root, f), \"utf8\"));\n\t\t\treturn sessionReplaySchema.parse(raw) as unknown as SessionReplay;\n\t\t});\n}\n\n/**\n * Send a single user message to a WaniWani MCP chat endpoint.\n */\nexport async function chat(url: string, message: string): Promise<ChatResult> {\n\tconst userMsg: UIMessage = {\n\t\tid: crypto.randomUUID(),\n\t\trole: \"user\",\n\t\tparts: [{ type: \"text\", text: message }],\n\t};\n\tconst { result } = await sendMessages(url, [userMsg]);\n\treturn result;\n}\n\n/**\n * Run a multi-turn conversation. Returns the result of each turn.\n */\nexport async function conversation(\n\turl: string,\n\tturns: ConversationTurn[],\n): Promise<ConversationResult> {\n\tconst history: UIMessage[] = [];\n\tconst turnResults: ConversationTurnResult[] = [];\n\n\tfor (const turn of turns) {\n\t\thistory.push({\n\t\t\tid: crypto.randomUUID(),\n\t\t\trole: \"user\",\n\t\t\tparts: [{ type: \"text\", text: turn.input }],\n\t\t});\n\n\t\tconst { result, message } = await sendMessages(url, history);\n\t\thistory.push(message);\n\n\t\tturnResults.push({ input: turn.input, response: result, assertions: [] });\n\t}\n\n\treturn { turns: turnResults };\n}\n\n/**\n * Replay a recorded conversation session (exported from the chatbar debug button).\n * Uses UIMessage[] directly — same format as useChat's messages array.\n *\n * **\"regenerate\" mode** (default):\n * Sends only user messages. The LLM generates fresh responses.\n * Per-turn assertions are auto-derived by comparing actual tool calls\n * to the tool calls recorded in the session.\n *\n * **\"inject\" mode**:\n * Injects the recorded conversation as-is, only generates a fresh\n * response for the final user message.\n */\nexport async function replaySession(\n\turl: string,\n\tsession: SessionReplay,\n): Promise<ConversationResult> {\n\tconst mode = session.mode ?? \"regenerate\";\n\tconst history: UIMessage[] = [];\n\tconst turnResults: ConversationTurnResult[] = [];\n\n\t// Pair user messages with their assistant responses\n\tconst userTurns: { userMsg: UIMessage; assistantMsg?: UIMessage }[] = [];\n\tfor (let i = 0; i < session.messages.length; i++) {\n\t\tconst msg = session.messages[i];\n\t\tif (msg.role === \"user\") {\n\t\t\tconst next = session.messages[i + 1];\n\t\t\tuserTurns.push({\n\t\t\t\tuserMsg: msg,\n\t\t\t\tassistantMsg: next?.role === \"assistant\" ? next : undefined,\n\t\t\t});\n\t\t}\n\t}\n\n\tfor (let turnIdx = 0; turnIdx < userTurns.length; turnIdx++) {\n\t\tconst { userMsg, assistantMsg } = userTurns[turnIdx];\n\t\tconst isLastTurn = turnIdx === userTurns.length - 1;\n\n\t\t// Extract expected tools from the recorded assistant message\n\t\tconst expectedTools = assistantMsg\n\t\t\t? extractRecordedTools(assistantMsg)\n\t\t\t: [];\n\n\t\thistory.push(userMsg);\n\n\t\tif (mode === \"inject\" && !isLastTurn && assistantMsg) {\n\t\t\thistory.push(assistantMsg);\n\t\t\tconst response = parseUIMessage(assistantMsg);\n\t\t\tconst assertions = buildAssertions(expectedTools, response.toolsCalled);\n\t\t\tturnResults.push({\n\t\t\t\tinput: textFromUIMessage(userMsg),\n\t\t\t\tresponse,\n\t\t\t\tassertions,\n\t\t\t});\n\t\t\tcontinue;\n\t\t}\n\n\t\tconst { result, message } = await sendMessages(url, history);\n\t\thistory.push(message);\n\n\t\tconst assertions = buildAssertions(expectedTools, result.toolsCalled);\n\t\tturnResults.push({\n\t\t\tinput: textFromUIMessage(userMsg),\n\t\t\tresponse: result,\n\t\t\tassertions,\n\t\t});\n\t}\n\n\treturn { turns: turnResults };\n}\n\n/** Compare expected vs. actual tool calls and return assertion results. */\nfunction buildAssertions(\n\texpected: string[],\n\tactual: string[],\n): TurnAssertion[] {\n\tif (expected.length === 0) {\n\t\treturn [];\n\t}\n\n\t// Group expected tools and check each against actual calls\n\tconst actualSet = new Set(actual);\n\tconst expectedUnique = [...new Set(expected)];\n\n\treturn expectedUnique.map((tool) => ({\n\t\tpassed: actualSet.has(tool),\n\t\texpected: [tool],\n\t\tactual,\n\t}));\n}\n","import { existsSync, mkdirSync, writeFileSync } from \"node:fs\";\nimport { parseTaskOutput } from \"./scorers\";\n\ntype ReporterFn = (\n\tname: string,\n\thandlers: {\n\t\treportEval(\n\t\t\tevaluator: unknown,\n\t\t\tresult: {\n\t\t\t\tresults: Array<{\n\t\t\t\t\tinput: unknown;\n\t\t\t\t\toutput: unknown;\n\t\t\t\t\tscores?: Record<string, number>;\n\t\t\t\t}>;\n\t\t\t},\n\t\t): boolean;\n\t\treportRun(results: boolean[]): boolean;\n\t},\n) => unknown;\n\n/**\n * Create a local Braintrust reporter that writes JSON results to a directory and\n * prints a summary to console.\n *\n * Requires the `braintrust` package: bun add -d braintrust\n *\n * @param outputDir - Directory to write JSON result files (default: \"evals/runs\")\n */\nexport function createLocalReporter(outputDir = \"evals/runs\") {\n\tlet Reporter: ReporterFn;\n\ttry {\n\t\tReporter = (require(\"braintrust\") as { Reporter: ReporterFn }).Reporter;\n\t} catch {\n\t\tthrow new Error(\n\t\t\t'Local reporter requires the \"braintrust\" package: bun add -d braintrust',\n\t\t);\n\t}\n\n\tif (!existsSync(outputDir)) {\n\t\tmkdirSync(outputDir, { recursive: true });\n\t}\n\n\treturn Reporter(\"local\", {\n\t\treportEval(\n\t\t\tevaluator: unknown,\n\t\t\tresult: {\n\t\t\t\tresults: Array<{\n\t\t\t\t\tinput: unknown;\n\t\t\t\t\toutput: unknown;\n\t\t\t\t\tscores?: Record<string, number>;\n\t\t\t\t}>;\n\t\t\t},\n\t\t) {\n\t\t\tconst name =\n\t\t\t\t(evaluator as { experimentName?: string }).experimentName ?? \"unknown\";\n\t\t\tconst timestamp = new Date().toISOString().replace(/[:.]/g, \"-\");\n\t\t\tconst rows = result.results.map((r) => {\n\t\t\t\tconst scores = r.scores ?? {};\n\t\t\t\tconst parsed = parseTaskOutput(r.output);\n\t\t\t\treturn {\n\t\t\t\t\tinput: r.input,\n\t\t\t\t\toutput: parsed.output,\n\t\t\t\t\ttoolsCalled: parsed.toolsCalled,\n\t\t\t\t\ttoolCallTraces: parsed.toolCallTraces,\n\t\t\t\t\tscores,\n\t\t\t\t};\n\t\t\t});\n\n\t\t\tconst outPath = `${outputDir}/${name}-${timestamp}.json`;\n\t\t\twriteFileSync(outPath, JSON.stringify(rows, null, 2));\n\n\t\t\tconsole.log(`\\n📊 ${name} (${result.results.length} cases):\\n`);\n\n\t\t\tlet failures = 0;\n\t\t\tfor (const r of result.results) {\n\t\t\t\tconst scores = r.scores ?? {};\n\t\t\t\tconst pass = scores.called_expected_tool === 1;\n\t\t\t\tif (!pass) {\n\t\t\t\t\tfailures++;\n\t\t\t\t}\n\n\t\t\t\tconsole.log(\n\t\t\t\t\t` ${pass ? \"✅\" : \"❌\"} ${(r.input as string).slice(0, 70)}`,\n\t\t\t\t);\n\t\t\t\tfor (const [scoreName, value] of Object.entries(scores)) {\n\t\t\t\t\tconsole.log(` ${scoreName}: ${value}`);\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tconsole.log(\n\t\t\t\t`\\n ${result.results.length - failures}/${result.results.length} passed`,\n\t\t\t);\n\t\t\tconsole.log(` → ${outPath}\\n`);\n\t\t\treturn failures === 0;\n\t\t},\n\n\t\treportRun(results: boolean[]) {\n\t\t\tconst allPassed = results.every((r) => r === true);\n\t\t\tconsole.log(\n\t\t\t\tallPassed\n\t\t\t\t\t? \"\\n✅ All experiments passed\"\n\t\t\t\t\t: \"\\n❌ Some experiments failed\",\n\t\t\t);\n\t\t\treturn allPassed;\n\t\t},\n\t});\n}\n","import type { ChatResult } from \"./types\";\n\n/**\n * Parse the JSON-stringified ChatResult from a Braintrust task output.\n */\nexport function parseTaskOutput(output: unknown): ChatResult {\n\ttry {\n\t\treturn JSON.parse(output as string);\n\t} catch {\n\t\treturn { output: \"\", toolsCalled: [], toolCallTraces: [] };\n\t}\n}\n\n/**\n * Checks whether the expected tool was called.\n * Looks for the tool name in `metadata.expectedTool` first (for cases where `expected` is a\n * reference answer), then falls back to `expected` directly.\n */\nexport function calledExpectedTool({\n\toutput,\n\texpected,\n\tmetadata,\n}: {\n\toutput: unknown;\n\texpected?: unknown;\n\tmetadata?: Record<string, unknown>;\n}) {\n\tconst parsed = parseTaskOutput(output);\n\tconst expectedTool =\n\t\t(metadata?.expectedTool as string) ?? (expected as string);\n\tconst found = parsed.toolsCalled.includes(expectedTool);\n\treturn {\n\t\tname: \"called_expected_tool\",\n\t\tscore: found ? 1 : 0,\n\t\tmetadata: { expected: expectedTool, actual: parsed.toolsCalled },\n\t};\n}\n\n/**\n * Checks whether the assistant produced any text output.\n */\nexport function hasOutput({ output }: { output: unknown }) {\n\tconst parsed = parseTaskOutput(output);\n\treturn {\n\t\tname: \"has_output\",\n\t\tscore: parsed.output.length > 0 ? 1 : 0,\n\t};\n}\n\n/**\n * Checks specific fields in the first tool call's `stateUpdates` against expected values.\n * Supports nested fields via dot notation (e.g. \"mixedBreed.knowsBreeds\").\n * Returns partial credit (fraction of matching fields).\n */\nexport function toolInputFieldsMatch({\n\toutput,\n\tmetadata,\n}: {\n\toutput: unknown;\n\tmetadata?: Record<string, unknown>;\n}) {\n\tconst parsed = parseTaskOutput(output);\n\tconst expectedFields = (metadata?.expectedFields ?? {}) as Record<\n\t\tstring,\n\t\tunknown\n\t>;\n\tconst fieldNames = Object.keys(expectedFields);\n\n\tif (fieldNames.length === 0) {\n\t\treturn { name: \"field_extraction\", score: 1 };\n\t}\n\n\tconst trace = parsed.toolCallTraces[0];\n\tconst stateUpdates = (trace?.input?.stateUpdates ?? {}) as Record<\n\t\tstring,\n\t\tunknown\n\t>;\n\n\tlet matches = 0;\n\tconst details: Record<\n\t\tstring,\n\t\t{ expected: unknown; actual: unknown; match: boolean }\n\t> = {};\n\n\tfor (const field of fieldNames) {\n\t\tconst expected = expectedFields[field];\n\t\tlet actual: unknown;\n\n\t\tif (field.includes(\".\")) {\n\t\t\tconst [parent, child] = field.split(\".\");\n\t\t\tactual = (stateUpdates[parent] as Record<string, unknown>)?.[child];\n\t\t} else {\n\t\t\tactual = stateUpdates[field];\n\t\t}\n\n\t\tconst match = JSON.stringify(actual) === JSON.stringify(expected);\n\t\tif (match) {\n\t\t\tmatches++;\n\t\t}\n\t\tdetails[field] = { expected, actual, match };\n\t}\n\n\treturn {\n\t\tname: \"field_extraction\",\n\t\tscore: matches / fieldNames.length,\n\t\tmetadata: details,\n\t};\n}\n\n/**\n * Wraps an autoevals scorer to extract the text output from the JSON-stringified ChatResult.\n * Requires the `autoevals` package: bun add -d autoevals\n */\nfunction wrapAutoeval(\n\tscorer: (args: {\n\t\tinput: unknown;\n\t\toutput: string;\n\t\texpected?: unknown;\n\t}) => unknown,\n) {\n\treturn async (args: {\n\t\tinput: unknown;\n\t\toutput: unknown;\n\t\texpected?: unknown;\n\t}) => {\n\t\tconst parsed = parseTaskOutput(args.output);\n\t\treturn scorer({\n\t\t\tinput: args.input,\n\t\t\toutput: parsed.output,\n\t\t\texpected: args.expected,\n\t\t});\n\t};\n}\n\n// LLM-based scorers — require `autoevals` as a dev dependency.\n// These are dynamically imported so the module loads even if autoevals is not installed.\n// Using LLM scorers without autoevals installed will throw at call time.\n\nasync function getAutoeval(name: string) {\n\tconst mod = await import(\"autoevals\").catch(() => {\n\t\tthrow new Error(\n\t\t\t`LLM scorer \"${name}\" requires the \"autoevals\" package: bun add -d autoevals`,\n\t\t);\n\t});\n\treturn (mod as Record<string, unknown>)[name] as (args: {\n\t\tinput: unknown;\n\t\toutput: string;\n\t\texpected?: unknown;\n\t}) => unknown;\n}\n\n/** ClosedQA — checks if the answer correctly addresses the question given a reference answer. */\nexport const FaqAccuracy = async (args: {\n\tinput: unknown;\n\toutput: unknown;\n\texpected?: unknown;\n}): Promise<unknown> => wrapAutoeval(await getAutoeval(\"ClosedQA\"))(args);\n\n/** Factuality — checks if the output is factually consistent with the expected output. */\nexport const OutputFactuality = async (args: {\n\tinput: unknown;\n\toutput: unknown;\n\texpected?: unknown;\n}): Promise<unknown> => wrapAutoeval(await getAutoeval(\"Factuality\"))(args);\n\n/** Moderation — flags unsafe or inappropriate content. */\nexport const SafetyCheck = async (args: {\n\tinput: unknown;\n\toutput: unknown;\n\texpected?: unknown;\n}): Promise<unknown> => wrapAutoeval(await getAutoeval(\"Moderation\"))(args);\n"],"mappings":"yPAAA,OAAS,eAAAA,EAAa,gBAAAC,MAAoB,KAC1C,OAAS,QAAAC,MAAY,OACrB,OACC,wBAAAC,EACA,uBAAAC,EAEA,wBAAAC,MACM,KACP,OAAS,KAAAC,MAAS,MAYlB,IAAMC,EAAsBD,EAAE,OAAO,CACpC,KAAMA,EAAE,OAAO,EACf,KAAMA,EAAE,KAAK,CAAC,aAAc,QAAQ,CAAC,EAAE,SAAS,EAChD,QAASA,EAAE,OAAO,CAAE,YAAaA,EAAE,MAAMA,EAAE,OAAO,CAAC,CAAE,CAAC,EAAE,SAAS,EACjE,SAAUA,EAAE,MACXA,EAAE,YAAY,CACb,GAAIA,EAAE,OAAO,EACb,KAAMA,EAAE,KAAK,CAAC,OAAQ,YAAa,SAAU,MAAM,CAAC,EACpD,MAAOA,EAAE,MAAMA,EAAE,OAAOA,EAAE,OAAO,EAAGA,EAAE,QAAQ,CAAC,CAAC,CACjD,CAAC,CACF,CACD,CAAC,EAID,SAASE,EAAeC,EAA4B,CACnD,IAAMC,EAASD,EAAI,MACjB,OAAQE,GAA2CA,EAAE,OAAS,MAAM,EACpE,IAAKA,GAAMA,EAAE,IAAI,EACjB,KAAK,EAAE,EAEHC,EAAYH,EAAI,MACpB,OAAQE,GAAMA,EAAE,KAAK,WAAW,OAAO,GAAKA,EAAE,OAAS,cAAc,EACrE,IACCA,GACAA,CAKF,EACKE,EAAcD,EAAU,IAAKD,GAAMA,EAAE,QAAQ,EAC7CG,EAAkCF,EAAU,IAAKD,IAAO,CAC7D,KAAMA,EAAE,SACR,MAAOA,EAAE,OAAS,CAAC,EACnB,OAAQA,EAAE,MACX,EAAE,EAEF,MAAO,CAAE,OAAAD,EAAQ,YAAAG,EAAa,eAAAC,CAAe,CAC9C,CAEA,SAASC,EAAkBN,EAAwB,CAClD,OAAOA,EAAI,MACT,OAAQE,GAA2CA,EAAE,OAAS,MAAM,EACpE,IAAKA,GAAMA,EAAE,IAAI,EACjB,KAAK,EAAE,CACV,CAGA,SAASK,EAAqBP,EAA0B,CACvD,OAAOA,EAAI,MACT,OAAQE,GAAMA,EAAE,OAAS,gBAAkBA,EAAE,KAAK,WAAW,OAAO,CAAC,EACrE,IAAKA,GAAOA,EAAsC,QAAQ,EAC1D,OAAO,OAAO,CACjB,CAEA,eAAeM,EACdC,EACAC,EACsD,CACtD,IAAMC,EAAW,MAAM,MAAM,GAAGF,CAAG,gBAAiB,CACnD,OAAQ,OACR,QAAS,CAAE,eAAgB,kBAAmB,EAC9C,OAAQ,YAAY,QAAQ,GAAM,EAClC,KAAM,KAAK,UAAU,CAAE,SAAAC,CAAS,CAAC,CAClC,CAAC,EAED,GAAI,CAACC,EAAS,GACb,MAAM,IAAI,MACT,iBAAiBA,EAAS,MAAM,KAAK,MAAMA,EAAS,KAAK,CAAC,EAC3D,EAGD,GAAI,CAACA,EAAS,KACb,MAAM,IAAI,MAAM,2BAA2B,EAG5C,IAAMC,EAAclB,EAAqB,CACxC,OAAQiB,EAAS,KACjB,OAAQf,CACT,CAAC,EAAE,YACF,IAAI,gBAAgB,CACnB,UAAUiB,EAAOC,EAAY,CACxBD,EAAM,SACTC,EAAW,QAAQD,EAAM,KAAK,CAEhC,CACD,CAAC,CACF,EAEIE,EACJ,cAAiBf,KAAOL,EAAoB,CAAE,OAAQiB,CAAY,CAAC,EAClEG,EAAef,EAGhB,GAAI,CAACe,EACJ,MAAM,IAAI,MAAM,iCAAiC,EAGlD,MAAO,CAAE,OAAQhB,EAAegB,CAAY,EAAG,QAASA,CAAa,CACtE,CAUO,SAASC,EAAaC,EAAM,iBAAmC,CACrE,IAAMC,EAAOzB,EAAK,QAAQ,IAAI,EAAGwB,CAAG,EACpC,OAAO1B,EAAY2B,CAAI,EACrB,OAAQC,GAAMA,EAAE,SAAS,OAAO,CAAC,EACjC,KAAK,EACL,IAAKA,GAAM,CACX,IAAMC,EAAM,KAAK,MAAM5B,EAAaC,EAAKyB,EAAMC,CAAC,EAAG,MAAM,CAAC,EAC1D,OAAOrB,EAAoB,MAAMsB,CAAG,CACrC,CAAC,CACH,CAKA,eAAsBC,EAAKZ,EAAaa,EAAsC,CAC7E,IAAMC,EAAqB,CAC1B,GAAI,OAAO,WAAW,EACtB,KAAM,OACN,MAAO,CAAC,CAAE,KAAM,OAAQ,KAAMD,CAAQ,CAAC,CACxC,EACM,CAAE,OAAAE,CAAO,EAAI,MAAMhB,EAAaC,EAAK,CAACc,CAAO,CAAC,EACpD,OAAOC,CACR,CAKA,eAAsBC,EACrBhB,EACAiB,EAC8B,CAC9B,IAAMC,EAAuB,CAAC,EACxBC,EAAwC,CAAC,EAE/C,QAAWC,KAAQH,EAAO,CACzBC,EAAQ,KAAK,CACZ,GAAI,OAAO,WAAW,EACtB,KAAM,OACN,MAAO,CAAC,CAAE,KAAM,OAAQ,KAAME,EAAK,KAAM,CAAC,CAC3C,CAAC,EAED,GAAM,CAAE,OAAAL,EAAQ,QAAAF,CAAQ,EAAI,MAAMd,EAAaC,EAAKkB,CAAO,EAC3DA,EAAQ,KAAKL,CAAO,EAEpBM,EAAY,KAAK,CAAE,MAAOC,EAAK,MAAO,SAAUL,EAAQ,WAAY,CAAC,CAAE,CAAC,CACzE,CAEA,MAAO,CAAE,MAAOI,CAAY,CAC7B,CAeA,eAAsBE,EACrBrB,EACAsB,EAC8B,CAC9B,IAAMC,EAAOD,EAAQ,MAAQ,aACvBJ,EAAuB,CAAC,EACxBC,EAAwC,CAAC,EAGzCK,EAAgE,CAAC,EACvE,QAASC,EAAI,EAAGA,EAAIH,EAAQ,SAAS,OAAQG,IAAK,CACjD,IAAMlC,EAAM+B,EAAQ,SAASG,CAAC,EAC9B,GAAIlC,EAAI,OAAS,OAAQ,CACxB,IAAMmC,EAAOJ,EAAQ,SAASG,EAAI,CAAC,EACnCD,EAAU,KAAK,CACd,QAASjC,EACT,aAAcmC,GAAM,OAAS,YAAcA,EAAO,MACnD,CAAC,CACF,CACD,CAEA,QAASC,EAAU,EAAGA,EAAUH,EAAU,OAAQG,IAAW,CAC5D,GAAM,CAAE,QAAAb,EAAS,aAAAc,CAAa,EAAIJ,EAAUG,CAAO,EAC7CE,EAAaF,IAAYH,EAAU,OAAS,EAG5CM,EAAgBF,EACnB9B,EAAqB8B,CAAY,EACjC,CAAC,EAIJ,GAFAV,EAAQ,KAAKJ,CAAO,EAEhBS,IAAS,UAAY,CAACM,GAAcD,EAAc,CACrDV,EAAQ,KAAKU,CAAY,EACzB,IAAM1B,EAAWZ,EAAesC,CAAY,EACtCG,EAAaC,EAAgBF,EAAe5B,EAAS,WAAW,EACtEiB,EAAY,KAAK,CAChB,MAAOtB,EAAkBiB,CAAO,EAChC,SAAAZ,EACA,WAAA6B,CACD,CAAC,EACD,QACD,CAEA,GAAM,CAAE,OAAAhB,EAAQ,QAAAF,CAAQ,EAAI,MAAMd,EAAaC,EAAKkB,CAAO,EAC3DA,EAAQ,KAAKL,CAAO,EAEpB,IAAMkB,EAAaC,EAAgBF,EAAef,EAAO,WAAW,EACpEI,EAAY,KAAK,CAChB,MAAOtB,EAAkBiB,CAAO,EAChC,SAAUC,EACV,WAAAgB,CACD,CAAC,CACF,CAEA,MAAO,CAAE,MAAOZ,CAAY,CAC7B,CAGA,SAASa,EACRC,EACAC,EACkB,CAClB,GAAID,EAAS,SAAW,EACvB,MAAO,CAAC,EAIT,IAAME,EAAY,IAAI,IAAID,CAAM,EAGhC,MAFuB,CAAC,GAAG,IAAI,IAAID,CAAQ,CAAC,EAEtB,IAAKG,IAAU,CACpC,OAAQD,EAAU,IAAIC,CAAI,EAC1B,SAAU,CAACA,CAAI,EACf,OAAAF,CACD,EAAE,CACH,CC7QA,OAAS,cAAAG,EAAY,aAAAC,EAAW,iBAAAC,MAAqB,KCK9C,SAASC,EAAgBC,EAA6B,CAC5D,GAAI,CACH,OAAO,KAAK,MAAMA,CAAgB,CACnC,MAAQ,CACP,MAAO,CAAE,OAAQ,GAAI,YAAa,CAAC,EAAG,eAAgB,CAAC,CAAE,CAC1D,CACD,CAOO,SAASC,EAAmB,CAClC,OAAAD,EACA,SAAAE,EACA,SAAAC,CACD,EAIG,CACF,IAAMC,EAASL,EAAgBC,CAAM,EAC/BK,EACJF,GAAU,cAA4BD,EAExC,MAAO,CACN,KAAM,uBACN,MAHaE,EAAO,YAAY,SAASC,CAAY,EAGtC,EAAI,EACnB,SAAU,CAAE,SAAUA,EAAc,OAAQD,EAAO,WAAY,CAChE,CACD,CAKO,SAASE,EAAU,CAAE,OAAAN,CAAO,EAAwB,CAE1D,MAAO,CACN,KAAM,aACN,MAHcD,EAAgBC,CAAM,EAGtB,OAAO,OAAS,EAAI,EAAI,CACvC,CACD,CAOO,SAASO,EAAqB,CACpC,OAAAP,EACA,SAAAG,CACD,EAGG,CACF,IAAMC,EAASL,EAAgBC,CAAM,EAC/BQ,EAAkBL,GAAU,gBAAkB,CAAC,EAI/CM,EAAa,OAAO,KAAKD,CAAc,EAE7C,GAAIC,EAAW,SAAW,EACzB,MAAO,CAAE,KAAM,mBAAoB,MAAO,CAAE,EAI7C,IAAMC,EADQN,EAAO,eAAe,CAAC,GACR,OAAO,cAAgB,CAAC,EAKjDO,EAAU,EACRC,EAGF,CAAC,EAEL,QAAWC,KAASJ,EAAY,CAC/B,IAAMP,EAAWM,EAAeK,CAAK,EACjCC,EAEJ,GAAID,EAAM,SAAS,GAAG,EAAG,CACxB,GAAM,CAACE,EAAQC,CAAK,EAAIH,EAAM,MAAM,GAAG,EACvCC,EAAUJ,EAAaK,CAAM,IAAgCC,CAAK,CACnE,MACCF,EAASJ,EAAaG,CAAK,EAG5B,IAAMI,EAAQ,KAAK,UAAUH,CAAM,IAAM,KAAK,UAAUZ,CAAQ,EAC5De,GACHN,IAEDC,EAAQC,CAAK,EAAI,CAAE,SAAAX,EAAU,OAAAY,EAAQ,MAAAG,CAAM,CAC5C,CAEA,MAAO,CACN,KAAM,mBACN,MAAON,EAAUF,EAAW,OAC5B,SAAUG,CACX,CACD,CAMA,SAASM,EACRC,EAKC,CACD,MAAO,OAAOC,GAIR,CACL,IAAMhB,EAASL,EAAgBqB,EAAK,MAAM,EAC1C,OAAOD,EAAO,CACb,MAAOC,EAAK,MACZ,OAAQhB,EAAO,OACf,SAAUgB,EAAK,QAChB,CAAC,CACF,CACD,CAMA,eAAeC,EAAYC,EAAc,CAMxC,OALY,KAAM,QAAO,WAAW,EAAE,MAAM,IAAM,CACjD,MAAM,IAAI,MACT,eAAeA,CAAI,0DACpB,CACD,CAAC,GACuCA,CAAI,CAK7C,CAGO,IAAMC,EAAc,MAAOH,GAIVF,EAAa,MAAMG,EAAY,UAAU,CAAC,EAAED,CAAI,EAG3DI,EAAmB,MAAOJ,GAIfF,EAAa,MAAMG,EAAY,YAAY,CAAC,EAAED,CAAI,EAG7DK,EAAc,MAAOL,GAIVF,EAAa,MAAMG,EAAY,YAAY,CAAC,EAAED,CAAI,ED9InE,SAASM,EAAoBC,EAAY,aAAc,CAC7D,IAAIC,EACJ,GAAI,CACHA,EAAY,EAAQ,YAAY,EAA+B,QAChE,MAAQ,CACP,MAAM,IAAI,MACT,yEACD,CACD,CAEA,OAAKC,EAAWF,CAAS,GACxBG,EAAUH,EAAW,CAAE,UAAW,EAAK,CAAC,EAGlCC,EAAS,QAAS,CACxB,WACCG,EACAC,EAOC,CACD,IAAMC,EACJF,EAA0C,gBAAkB,UACxDG,EAAY,IAAI,KAAK,EAAE,YAAY,EAAE,QAAQ,QAAS,GAAG,EACzDC,EAAOH,EAAO,QAAQ,IAAKI,GAAM,CACtC,IAAMC,EAASD,EAAE,QAAU,CAAC,EACtBE,EAASC,EAAgBH,EAAE,MAAM,EACvC,MAAO,CACN,MAAOA,EAAE,MACT,OAAQE,EAAO,OACf,YAAaA,EAAO,YACpB,eAAgBA,EAAO,eACvB,OAAAD,CACD,CACD,CAAC,EAEKG,EAAU,GAAGb,CAAS,IAAIM,CAAI,IAAIC,CAAS,QACjDO,EAAcD,EAAS,KAAK,UAAUL,EAAM,KAAM,CAAC,CAAC,EAEpD,QAAQ,IAAI;AAAA,YAAQF,CAAI,KAAKD,EAAO,QAAQ,MAAM;AAAA,CAAY,EAE9D,IAAIU,EAAW,EACf,QAAWN,KAAKJ,EAAO,QAAS,CAC/B,IAAMK,EAASD,EAAE,QAAU,CAAC,EACtBO,EAAON,EAAO,uBAAyB,EACxCM,GACJD,IAGD,QAAQ,IACP,KAAKC,EAAO,SAAM,QAAG,IAAKP,EAAE,MAAiB,MAAM,EAAG,EAAE,CAAC,EAC1D,EACA,OAAW,CAACQ,EAAWC,CAAK,IAAK,OAAO,QAAQR,CAAM,EACrD,QAAQ,IAAI,QAAQO,CAAS,KAAKC,CAAK,EAAE,CAE3C,CAEA,eAAQ,IACP;AAAA,IAAOb,EAAO,QAAQ,OAASU,CAAQ,IAAIV,EAAO,QAAQ,MAAM,SACjE,EACA,QAAQ,IAAI,YAAOQ,CAAO;AAAA,CAAI,EACvBE,IAAa,CACrB,EAEA,UAAUI,EAAoB,CAC7B,IAAMC,EAAYD,EAAQ,MAAO,GAAM,IAAM,EAAI,EACjD,eAAQ,IACPC,EACG;AAAA,+BACA;AAAA,+BACJ,EACOA,CACR,CACD,CAAC,CACF","names":["readdirSync","readFileSync","join","parseJsonEventStream","readUIMessageStream","uiMessageChunkSchema","z","sessionReplaySchema","parseUIMessage","msg","output","p","toolParts","toolsCalled","toolCallTraces","textFromUIMessage","extractRecordedTools","sendMessages","url","messages","response","chunkStream","chunk","controller","finalMessage","loadSessions","dir","root","f","raw","chat","message","userMsg","result","conversation","turns","history","turnResults","turn","replaySession","session","mode","userTurns","i","next","turnIdx","assistantMsg","isLastTurn","expectedTools","assertions","buildAssertions","expected","actual","actualSet","tool","existsSync","mkdirSync","writeFileSync","parseTaskOutput","output","calledExpectedTool","expected","metadata","parsed","expectedTool","hasOutput","toolInputFieldsMatch","expectedFields","fieldNames","stateUpdates","matches","details","field","actual","parent","child","match","wrapAutoeval","scorer","args","getAutoeval","name","FaqAccuracy","OutputFactuality","SafetyCheck","createLocalReporter","outputDir","Reporter","existsSync","mkdirSync","evaluator","result","name","timestamp","rows","r","scores","parsed","parseTaskOutput","outPath","writeFileSync","failures","pass","scoreName","value","results","allPassed"]}
|