@kindlm/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +40 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +3061 -0
- package/dist/index.d.ts +3061 -0
- package/dist/index.js +40 -0
- package/dist/index.js.map +1 -0
- package/package.json +57 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
var g=class extends Error{constructor(s,r,n,o=!1,a){super(r);this.code=s;this.statusCode=n;this.retryable=o;this.raw=a;this.name="ProviderError"}};function M(e){return{success:!0,data:e}}function A(e){return{success:!1,error:e}}function he(e,t){for(let[s,r]of Object.entries(t))if(JSON.stringify(e[s])!==JSON.stringify(r))return!1;return!0}function B(e,t){return{type:"tool_called",evaluate(s){let r=[],n=s.toolCalls.filter(a=>a.name===e),o=s.toolCalls.map(a=>a.name);if(n.length===0)return r.push({assertionType:"tool_called",label:`Tool "${e}" called`,passed:!1,score:0,failureCode:"TOOL_CALL_MISSING",failureMessage:`Expected tool "${e}" to be called, but got: [${o.join(", ")}]`}),Promise.resolve(r);if(r.push({assertionType:"tool_called",label:`Tool "${e}" called`,passed:!0,score:1}),t){let a=n.some(p=>he(p.arguments,t));r.push({assertionType:"tool_called",label:`Tool "${e}" args match`,passed:a,score:a?1:0,failureCode:a?void 0:"TOOL_CALL_ARGS_MISMATCH",failureMessage:a?void 0:`Expected args ${JSON.stringify(t)}, got ${JSON.stringify(n[0]?.arguments)}`})}return Promise.resolve(r)}}}function W(e){return{type:"tool_not_called",evaluate(t){let s=t.toolCalls.some(r=>r.name===e);return Promise.resolve([{assertionType:"tool_not_called",label:`Tool "${e}" not called`,passed:!s,score:s?0:1,failureCode:s?"TOOL_CALL_UNEXPECTED":void 0,failureMessage:s?`Expected tool "${e}" to NOT be called, but it was`:void 0}])}}}function X(e){return{type:"tool_order",evaluate(t){let s=[];for(let r of e){if(r.shouldNotCall){let o=t.toolCalls.some(a=>a.name===r.tool);s.push({assertionType:"tool_order",label:`Tool "${r.tool}" not called`,passed:!o,score:o?0:1,failureCode:o?"TOOL_CALL_UNEXPECTED":void 0,failureMessage:o?`Expected tool "${r.tool}" to NOT be called, but it was`:void 0});continue}let n=t.toolCalls.filter(o=>o.name===r.tool);if(n.length===0){s.push({assertionType:"tool_order",label:`Tool "${r.tool}" called`,passed:!1,score:0,failureCode:"TOOL_CALL_MISSING",failureMessage:`Expected tool 
"${r.tool}" to be called, but it was not`});continue}if(s.push({assertionType:"tool_order",label:`Tool "${r.tool}" called`,passed:!0,score:1}),r.argsMatch){let o=n.some(a=>he(a.arguments,r.argsMatch??{}));s.push({assertionType:"tool_order",label:`Tool "${r.tool}" args match`,passed:o,score:o?1:0,failureCode:o?void 0:"TOOL_CALL_ARGS_MISMATCH",failureMessage:o?void 0:`Expected args ${JSON.stringify(r.argsMatch)}, got ${JSON.stringify(n[0]?.arguments)}`})}if(r.order!==void 0){let o=t.toolCalls.findIndex(p=>p.name===r.tool),a=o===r.order;s.push({assertionType:"tool_order",label:`Tool "${r.tool}" at position ${r.order}`,passed:a,score:a?1:0,failureCode:a?void 0:"TOOL_CALL_ORDER_WRONG",failureMessage:a?void 0:`Expected "${r.tool}" at position ${r.order}, but found at position ${o}`})}}return Promise.resolve(s)}}}var Y;async function Fe(){if(Y)return Y;let e=await import("ajv"),t=await import("ajv-formats"),s=e.default,r=t.default,n=new s({allErrors:!0,strict:!1});return r(n),Y=n,n}var Re=new Map;function Ue(e,t){let s=JSON.stringify(t),r=Re.get(s);if(r)return r;let n=e.compile(t);return Re.set(s,n),n}function G(e){return{type:"schema",async evaluate(t){let s=[],r;if(e.format==="json")try{r=JSON.parse(t.outputText),s.push({assertionType:"schema",label:"Output is valid JSON",passed:!0,score:1})}catch(n){return s.push({assertionType:"schema",label:"Output is valid JSON",passed:!1,score:0,failureCode:"SCHEMA_PARSE_ERROR",failureMessage:`Failed to parse output as JSON: ${n instanceof Error?n.message:String(n)}`}),s}if(e.schemaContent){let n=await Fe(),o=Ue(n,e.schemaContent),a=o(r??t.outputText);s.push({assertionType:"schema",label:"Output matches JSON Schema",passed:a,score:a?1:0,failureCode:a?void 0:"SCHEMA_INVALID",failureMessage:a?void 0:`Schema validation failed: ${n.errorsText(o.errors)}`,metadata:a?void 0:{errors:o.errors}})}if(e.contains){let n=t.outputText.toLowerCase();for(let o of e.contains){let 
a=n.includes(o.toLowerCase());s.push({assertionType:"schema",label:`Output contains "${o}"`,passed:a,score:a?1:0,failureCode:a?void 0:"CONTAINS_FAILED",failureMessage:a?void 0:`Expected output to contain "${o}"`})}}if(e.notContains){let n=t.outputText.toLowerCase();for(let o of e.notContains){let a=n.includes(o.toLowerCase());s.push({assertionType:"schema",label:`Output does not contain "${o}"`,passed:!a,score:a?0:1,failureCode:a?"NOT_CONTAINS_FAILED":void 0,failureMessage:a?`Expected output to NOT contain "${o}"`:void 0})}}if(e.maxLength!==void 0){let n=t.outputText.length<=e.maxLength;s.push({assertionType:"schema",label:`Output length <= ${e.maxLength}`,passed:n,score:n?1:0,failureCode:n?void 0:"MAX_LENGTH_EXCEEDED",failureMessage:n?void 0:`Output length ${t.outputText.length} exceeds max ${e.maxLength}`})}return s}}}function Be(e){return e.length<=4?"*".repeat(e.length):e.slice(0,2)+"*".repeat(e.length-4)+e.slice(-2)}var Ge=/(\+|\*|\{[^}]+\})\)?(\+|\*|\{[^}]+\})/;function Je(e){return Ge.test(e)}function Ve(e,t,s){let r=[],n=Date.now();e.lastIndex=0;let o;for(;(o=e.exec(t))!==null&&(r.push(o[0]),!(r.length>=s||Date.now()-n>100)););return r}function J(e){let t=[],s;for(let r=0;r<e.denyPatterns.length;r++){let n=e.denyPatterns[r];n!==void 0&&t.push({name:`pii-pattern-${r+1}`,regex:new RegExp(n,"gi")})}if(e.customPatterns)for(let r of e.customPatterns){if(Je(r.pattern)){s=[{assertionType:"pii",label:"No PII detected",passed:!1,score:0,failureCode:"INVALID_PATTERN",failureMessage:`Custom pattern "${r.name}" contains nested quantifiers and may cause catastrophic backtracking`}];break}t.push({name:r.name,regex:new RegExp(r.pattern,"gi")})}return{type:"pii",evaluate(r){if(s)return Promise.resolve(s);let n=[],o=0;for(let{name:p,regex:i}of t){if(o>=1e3)break;let l=1e3-o,c=Ve(i,r.outputText,l);for(let f of c)n.push({name:p,redacted:Be(f)});o+=c.length}let a=n.length===0;return Promise.resolve([{assertionType:"pii",label:"No PII 
detected",passed:a,score:a?1:0,failureCode:a?void 0:"PII_DETECTED",failureMessage:a?void 0:`Found ${n.length} PII match(es): ${n.map(p=>`${p.name}=${p.redacted}`).join(", ")}`,metadata:a?void 0:{matches:n}}])}}}function Q(e){return{type:"keywords_present",evaluate(t){let s=t.outputText.toLowerCase(),r=e.some(n=>s.includes(n.toLowerCase()));return Promise.resolve([{assertionType:"keywords_present",label:"Required keyword present",passed:r,score:r?1:0,failureCode:r?void 0:"KEYWORD_MISSING",failureMessage:r?void 0:`Expected at least one of [${e.join(", ")}] in output`}])}}}function Z(e){return{type:"keywords_absent",evaluate(t){let s=t.outputText.toLowerCase(),r=[];for(let n of e){let o=s.includes(n.toLowerCase());r.push({assertionType:"keywords_absent",label:`Keyword "${n}" absent`,passed:!o,score:o?0:1,failureCode:o?"KEYWORD_DENIED":void 0,failureMessage:o?`Denied keyword "${n}" found in output`:void 0})}return Promise.resolve(r)}}}var Ke=`You are an impartial AI judge evaluating an AI assistant's response.
|
|
2
|
+
You will be given:
|
|
3
|
+
- The assistant's response
|
|
4
|
+
- Evaluation criteria
|
|
5
|
+
- An optional rubric
|
|
6
|
+
|
|
7
|
+
Score the response from 0.0 to 1.0 based on how well it meets the criteria.
|
|
8
|
+
|
|
9
|
+
Respond ONLY with a JSON object in this exact format:
|
|
10
|
+
{"score": <number between 0.0 and 1.0>, "reasoning": "<brief explanation>"}`;function He(e,t,s){let r=`## Assistant Response
|
|
11
|
+
${e}
|
|
12
|
+
|
|
13
|
+
## Criteria
|
|
14
|
+
${t}`;return s&&(r+=`
|
|
15
|
+
|
|
16
|
+
## Rubric
|
|
17
|
+
${s}`),r}function ze(e){let t=e.match(/```(?:json)?\s*([\s\S]*?)```/)??e.match(/(\{[\s\S]*\})/);if(!t?.[1])return null;try{let s=JSON.parse(t[1]);return typeof s.score=="number"&&typeof s.reasoning=="string"?{score:s.score,reasoning:s.reasoning}:null}catch{return null}}function V(e){return{type:"judge",async evaluate(t){if(!t.judgeAdapter||!t.judgeModel)return[{assertionType:"judge",label:`Judge: ${e.criteria}`,passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:"Judge assertion requires judgeAdapter and judgeModel in context"}];let s=await t.judgeAdapter.complete({model:t.judgeModel,messages:[{role:"system",content:Ke},{role:"user",content:He(t.outputText,e.criteria,e.rubric)}],params:{temperature:0,maxTokens:512}}),r=ze(s.text);if(!r)return[{assertionType:"judge",label:`Judge: ${e.criteria}`,passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:`Failed to parse judge response: ${s.text.slice(0,200)}`}];let n=r.score>=e.minScore;return[{assertionType:"judge",label:`Judge: ${e.criteria}`,passed:n,score:r.score,failureCode:n?void 0:"JUDGE_BELOW_THRESHOLD",failureMessage:n?void 0:`Score ${r.score} below threshold ${e.minScore}: ${r.reasoning}`,metadata:{reasoning:r.reasoning}}]}}}var We=`You are an impartial AI judge comparing two AI assistant responses.
|
|
18
|
+
You will be given a baseline response and a new response.
|
|
19
|
+
Evaluate how much the new response has drifted from the baseline.
|
|
20
|
+
|
|
21
|
+
Score from 0.0 to 1.0 where:
|
|
22
|
+
- 0.0 = responses are semantically identical
|
|
23
|
+
- 0.5 = moderate differences in tone, detail, or structure
|
|
24
|
+
- 1.0 = completely different meaning or contradictory
|
|
25
|
+
|
|
26
|
+
Respond ONLY with a JSON object in this exact format:
|
|
27
|
+
{"driftScore": <number between 0.0 and 1.0>, "reasoning": "<brief explanation>"}`;function Xe(e){let t=e.match(/```(?:json)?\s*([\s\S]*?)```/)??e.match(/(\{[\s\S]*\})/);if(!t?.[1])return null;try{let s=JSON.parse(t[1]);return typeof s.driftScore=="number"&&typeof s.reasoning=="string"?{driftScore:s.driftScore,reasoning:s.reasoning}:null}catch{return null}}function ye(e,t){let s=t.split("."),r=e;for(let n of s){if(r==null||typeof r!="object")return;r=r[n]}return r}function Ye(e,t,s){let r,n;try{r=JSON.parse(e),n=JSON.parse(t)}catch{return{driftScore:1,mismatched:["(parse error)"]}}let o=[];for(let p of s){let i=ye(r,p),l=ye(n,p);JSON.stringify(i)!==JSON.stringify(l)&&o.push(p)}return{driftScore:s.length>0?o.length/s.length:0,mismatched:o}}function K(e){return{type:"drift",async evaluate(t){if(!t.baselineText)return[{assertionType:"drift",label:"Drift check",passed:!0,score:1,metadata:{reason:"No baseline available"}}];if(e.method==="embedding")return[{assertionType:"drift",label:"Drift check (embedding)",passed:!0,score:1,metadata:{reason:"Embedding method not yet implemented"}}];if(e.method==="field-diff"){let a=e.fields??[],{driftScore:p,mismatched:i}=Ye(t.baselineText,t.outputText,a),l=1-p,c=p<=e.maxScore;return[{assertionType:"drift",label:"Drift check (field-diff)",passed:c,score:l,failureCode:c?void 0:"DRIFT_EXCEEDED",failureMessage:c?void 0:`Drift score ${p.toFixed(2)} exceeds max ${e.maxScore}. Mismatched: [${i.join(", ")}]`,metadata:{driftScore:p,mismatched:i}}]}if(!t.judgeAdapter||!t.judgeModel)return[{assertionType:"drift",label:"Drift check (judge)",passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:"Drift judge method requires judgeAdapter and judgeModel in context"}];let s=await t.judgeAdapter.complete({model:t.judgeModel,messages:[{role:"system",content:We},{role:"user",content:`## Baseline Response
|
|
28
|
+
${t.baselineText}
|
|
29
|
+
|
|
30
|
+
## New Response
|
|
31
|
+
${t.outputText}`}],params:{temperature:0,maxTokens:512}}),r=Xe(s.text);if(!r)return[{assertionType:"drift",label:"Drift check (judge)",passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:`Failed to parse drift judge response: ${s.text.slice(0,200)}`}];let n=1-r.driftScore,o=r.driftScore<=e.maxScore;return[{assertionType:"drift",label:"Drift check (judge)",passed:o,score:n,failureCode:o?void 0:"DRIFT_EXCEEDED",failureMessage:o?void 0:`Drift score ${r.driftScore.toFixed(2)} exceeds max ${e.maxScore}: ${r.reasoning}`,metadata:{driftScore:r.driftScore,reasoning:r.reasoning}}]}}}function q(e){return{type:"latency",evaluate(t){let s=t.latencyMs??0,r=s<=e.maxMs;return Promise.resolve([{assertionType:"latency",label:`Latency <= ${e.maxMs}ms`,passed:r,score:r?1:0,failureCode:r?void 0:"PROVIDER_TIMEOUT",failureMessage:r?void 0:`Latency ${s}ms exceeds max ${e.maxMs}ms`,metadata:{latencyMs:s}}])}}}function ee(e){return{type:"cost",evaluate(t){let s=t.costUsd??0,r=s<=e.maxUsd;return Promise.resolve([{assertionType:"cost",label:`Cost <= $${e.maxUsd}`,passed:r,score:r?1:0,failureCode:r?void 0:"INTERNAL_ERROR",failureMessage:r?void 0:`Cost $${s.toFixed(4)} exceeds max $${e.maxUsd}`,metadata:{costUsd:s}}])}}}function H(e,t){let s=[];if(e.toolCalls)if(e.toolCalls.some(n=>n.order!==void 0))s.push(X(e.toolCalls));else for(let n of e.toolCalls)n.shouldNotCall?s.push(W(n.tool)):s.push(B(n.tool,n.argsMatch??void 0));if(e.output&&s.push(G({format:e.output.format,schemaFile:e.output.schemaFile,schemaContent:t?.schemaContent,contains:e.output.contains,notContains:e.output.notContains,maxLength:e.output.maxLength})),e.guardrails?.pii&&s.push(J({denyPatterns:e.guardrails.pii.denyPatterns,customPatterns:e.guardrails.pii.customPatterns})),e.guardrails?.keywords&&(e.guardrails.keywords.allow&&e.guardrails.keywords.allow.length>0&&s.push(Q(e.guardrails.keywords.allow)),e.guardrails.keywords.deny.length>0&&s.push(Z(e.guardrails.keywords.deny))),e.judge)for(let r of 
e.judge)s.push(V({criteria:r.criteria,minScore:r.minScore,rubric:r.rubric}));return e.baseline?.drift&&s.push(K({maxScore:e.baseline.drift.maxScore,method:e.baseline.drift.method,fields:e.baseline.drift.fields})),s}function Qe(){return new Map([["tool_called",e=>B(e.toolCalls?.[0]?.tool??"")],["schema",e=>G(e.output??{format:"text"})],["pii",e=>J(e.guardrails?.pii??{denyPatterns:[]})],["judge",e=>V(e.judge?.[0]??{criteria:"",minScore:.7})],["drift",e=>K(e.baseline?.drift??{maxScore:.15,method:"judge"})],["latency",()=>q({maxMs:6e4})],["cost",()=>ee({maxUsd:1})]])}var Ze=new Set(["judge","drift"]);function D(e){return Ze.has(e)?"probabilistic":"deterministic"}function qe(e){return D(e)==="deterministic"}function et(e){return D(e)==="probabilistic"}async function _(e,t){let{maxRetries:s,shouldRetry:r,baseDelayMs:n=500}=t,o;for(let a=0;a<=s;a++)try{return await e()}catch(p){if(o=p,a>=s||!r(p))throw p;let i=n*Math.pow(2,a);await tt(i)}throw o}function tt(e){return new Promise(t=>setTimeout(t,e))}var st={"gpt-4o":{input:2.5,output:10},"gpt-4o-mini":{input:.15,output:.6},"gpt-4-turbo":{input:10,output:30},"o3-mini":{input:1.1,output:4.4}};function rt(e){switch(e){case"stop":return"stop";case"length":return"max_tokens";case"tool_calls":return"tool_calls";default:return"unknown"}}function ot(e,t){let s=typeof t=="object"&&t!==null&&"error"in t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 401:return new g("AUTH_FAILED",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new g("CONTEXT_LENGTH",s,e,!1,t):new g("PROVIDER_ERROR",s,e,!1,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function te(e){let t="",s="https://api.openai.com/v1",r,n=6e4,o=2;return{name:"openai",async initialize(a){if(!a.apiKey)throw new g("AUTH_FAILED","API key is 
required");t=a.apiKey,a.baseUrl&&(s=a.baseUrl),r=a.organization,n=a.timeoutMs,o=a.maxRetries},async complete(a){let p={model:a.model,messages:a.messages.map(d=>d.role==="tool"?{role:"tool",content:d.content,tool_call_id:d.toolCallId}:d.role==="assistant"&&d.toolCalls&&d.toolCalls.length>0?{role:"assistant",content:d.content||null,tool_calls:d.toolCalls.map(h=>({id:h.id,type:"function",function:{name:h.name,arguments:JSON.stringify(h.arguments)}}))}:{role:d.role,content:d.content}),temperature:a.params.temperature,max_tokens:a.params.maxTokens};a.params.topP!==void 0&&(p.top_p=a.params.topP),a.params.seed!==void 0&&(p.seed=a.params.seed),a.params.stopSequences&&(p.stop=a.params.stopSequences),a.tools&&a.tools.length>0&&(p.tools=a.tools.map(d=>({type:"function",function:{name:d.name,description:d.description,parameters:d.parameters}})),a.toolChoice&&(p.tool_choice=a.toolChoice));let i={"Content-Type":"application/json",Authorization:`Bearer ${t}`};r&&(i["OpenAI-Organization"]=r);let l=Date.now(),c=await _(()=>e.fetch(`${s}/chat/completions`,{method:"POST",headers:i,body:JSON.stringify(p),timeoutMs:n}),{maxRetries:o,shouldRetry:d=>d instanceof g&&d.retryable}),f;try{f=await c.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from OpenAI API",c.status,c.status>=500)}let y=Date.now()-l;if(!c.ok)throw ot(c.status,f);let m=f,x=m.choices?.[0],C=x?.message,b=(C?.tool_calls??[]).map(d=>{let h;try{h=JSON.parse(d.function.arguments||"{}")}catch{h={_raw:d.function.arguments}}return{id:d.id,name:d.function.name,arguments:h}});return{text:C?.content??"",toolCalls:b,usage:{inputTokens:m.usage?.prompt_tokens??0,outputTokens:m.usage?.completion_tokens??0,totalTokens:m.usage?.total_tokens??0},raw:f,latencyMs:y,modelId:m.model??a.model,finishReason:rt(x?.finish_reason)}},estimateCost(a,p){let i=st[a];return i?p.inputTokens/1e6*i.input+p.outputTokens/1e6*i.output:null},supportsTools(a){return!a.startsWith("o1-")}}}var 
nt={"claude-opus-4-5-20250929":{input:15,output:75},"claude-sonnet-4-5-20250929":{input:3,output:15},"claude-haiku-4-5-20251001":{input:.8,output:4}};function at(e){switch(e){case"end_turn":return"stop";case"max_tokens":return"max_tokens";case"tool_use":return"tool_calls";default:return"unknown"}}function it(e,t){let s=typeof t=="object"&&t!==null&&"error"in t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 401:return new g("AUTH_FAILED",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new g("CONTEXT_LENGTH",s,e,!1,t):new g("PROVIDER_ERROR",s,e,!1,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function se(e){let t="",s="https://api.anthropic.com",r=6e4,n=2;return{name:"anthropic",async initialize(o){if(!o.apiKey)throw new g("AUTH_FAILED","API key is required");t=o.apiKey,o.baseUrl&&(s=o.baseUrl),r=o.timeoutMs,n=o.maxRetries},async complete(o){let a=o.messages.find(h=>h.role==="system"),p=o.messages.filter(h=>h.role!=="system").map(h=>{if(h.role==="tool")return{role:"user",content:[{type:"tool_result",tool_use_id:h.toolCallId,content:h.content}]};if(h.role==="assistant"){if(h.toolCalls&&h.toolCalls.length>0){let R=[];h.content&&R.push({type:"text",text:h.content});for(let P of h.toolCalls)R.push({type:"tool_use",id:P.id,name:P.name,input:P.arguments});return{role:"assistant",content:R}}return{role:"assistant",content:h.content}}return{role:"user",content:h.content}}),i={model:o.model,max_tokens:o.params.maxTokens,messages:p};a&&(i.system=a.content),o.params.temperature!==void 0&&(i.temperature=o.params.temperature),o.params.topP!==void 
0&&(i.top_p=o.params.topP),o.params.stopSequences&&(i.stop_sequences=o.params.stopSequences),o.tools&&o.tools.length>0&&(i.tools=o.tools.map(h=>({name:h.name,description:h.description??"",input_schema:h.parameters??{type:"object",properties:{}}})),o.toolChoice&&(i.tool_choice=o.toolChoice==="required"?{type:"any"}:{type:o.toolChoice}));let l=Date.now(),c=await _(()=>e.fetch(`${s}/v1/messages`,{method:"POST",headers:{"Content-Type":"application/json","x-api-key":t,"anthropic-version":"2023-06-01"},body:JSON.stringify(i),timeoutMs:r}),{maxRetries:n,shouldRetry:h=>h instanceof g&&h.retryable}),f;try{f=await c.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from Anthropic API",c.status,c.status>=500)}let y=Date.now()-l;if(!c.ok)throw it(c.status,f);let m=f,x="",C=[];for(let h of m.content??[])h.type==="text"?x+=h.text:h.type==="tool_use"&&C.push({id:h.id,name:h.name,arguments:h.input??{}});let b=m.usage?.input_tokens??0,d=m.usage?.output_tokens??0;return{text:x,toolCalls:C,usage:{inputTokens:b,outputTokens:d,totalTokens:b+d},raw:f,latencyMs:y,modelId:m.model??o.model,finishReason:at(m.stop_reason)}},estimateCost(o,a){let p=Object.entries(nt).find(([i])=>o.includes(i)||i.includes(o));return p?a.inputTokens/1e6*p[1].input+a.outputTokens/1e6*p[1].output:null},supportsTools(o){return!0}}}function lt(e,t){if(t)return"tool_calls";switch(e){case"stop":return"stop";case"length":return"max_tokens";default:return"unknown"}}function ut(e,t){let s=typeof t=="object"&&t!==null&&"error"in t?String(t.error):"Unknown error";switch(e){case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function re(e){let t="http://localhost:11434",s=6e4,r=2;return{name:"ollama",async initialize(n){n.baseUrl&&(t=n.baseUrl),s=n.timeoutMs,r=n.maxRetries},async complete(n){let 
o=n.messages.map(d=>d.role==="tool"?{role:"tool",content:d.content}:d.role==="assistant"&&d.toolCalls&&d.toolCalls.length>0?{role:"assistant",content:d.content||"",tool_calls:d.toolCalls.map(h=>({function:{name:h.name,arguments:h.arguments}}))}:{role:d.role,content:d.content}),a={model:n.model,messages:o,stream:!1,options:{temperature:n.params.temperature,num_predict:n.params.maxTokens}},p=a.options;n.params.topP!==void 0&&(p.top_p=n.params.topP),n.params.seed!==void 0&&(p.seed=n.params.seed),n.params.stopSequences&&(p.stop=n.params.stopSequences),n.tools&&n.tools.length>0&&(a.tools=n.tools.map(d=>({type:"function",function:{name:d.name,description:d.description,parameters:d.parameters}})));let i={"Content-Type":"application/json"},l=Date.now(),c=await _(()=>e.fetch(`${t}/api/chat`,{method:"POST",headers:i,body:JSON.stringify(a),timeoutMs:s}),{maxRetries:r,shouldRetry:d=>d instanceof g&&d.retryable}),f;try{f=await c.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from Ollama API",c.status,c.status>=500)}let y=Date.now()-l;if(!c.ok)throw ut(c.status,f);let m=f,x=m.message,C=x?.tool_calls??[],b=C.map((d,h)=>({id:`ollama_call_${h}`,name:d.function.name,arguments:d.function.arguments??{}}));return{text:x?.content??"",toolCalls:b,usage:{inputTokens:m.prompt_eval_count??0,outputTokens:m.eval_count??0,totalTokens:(m.prompt_eval_count??0)+(m.eval_count??0)},raw:f,latencyMs:y,modelId:m.model??n.model,finishReason:lt(m.done_reason,C.length>0)}},estimateCost(){return 0},supportsTools(){return!0}}}var pt={"gemini-2.0-flash":{input:.1,output:.4},"gemini-2.0-flash-lite":{input:.075,output:.3},"gemini-1.5-pro":{input:1.25,output:5},"gemini-1.5-flash":{input:.075,output:.3},"gemini-1.5-flash-8b":{input:.0375,output:.15}};function ct(e,t){if(t)return"tool_calls";switch(e){case"STOP":return"stop";case"MAX_TOKENS":return"max_tokens";case"SAFETY":return"stop";default:return"unknown"}}function dt(e,t){let s=typeof t=="object"&&t!==null&&"error"in 
t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 400:return s.toLowerCase().includes("api key")?new g("AUTH_FAILED",s,e,!1,t):new g("PROVIDER_ERROR",s,e,!1,t);case 401:case 403:return new g("AUTH_FAILED",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function oe(e){let t="",s="https://generativelanguage.googleapis.com/v1beta",r=6e4,n=2;return{name:"gemini",async initialize(o){if(!o.apiKey)throw new g("AUTH_FAILED","API key is required");t=o.apiKey,o.baseUrl&&(s=o.baseUrl),r=o.timeoutMs,n=o.maxRetries},async complete(o){let a=[],p;for(let T of o.messages){if(T.role==="system"){p={parts:[{text:T.content}]};continue}if(T.role==="tool"){a.push({role:"function",parts:[{functionResponse:{name:T.toolName??"unknown",response:mt(T.content)}}]});continue}if(T.role==="assistant"&&T.toolCalls&&T.toolCalls.length>0){let S=[];T.content&&S.push({text:T.content});for(let ge of T.toolCalls)S.push({functionCall:{name:ge.name,args:ge.arguments}});a.push({role:"model",parts:S});continue}let v=T.role==="assistant"?"model":"user";a.push({role:v,parts:[{text:T.content}]})}let i={contents:a,generationConfig:{temperature:o.params.temperature,maxOutputTokens:o.params.maxTokens}};p&&(i.systemInstruction=p);let l=i.generationConfig;o.params.topP!==void 0&&(l.topP=o.params.topP),o.params.stopSequences&&(l.stopSequences=o.params.stopSequences),o.tools&&o.tools.length>0&&(i.tools=[{functionDeclarations:o.tools.map(T=>({name:T.name,description:T.description,parameters:T.parameters}))}],o.toolChoice&&(i.toolConfig=ft(o.toolChoice)));let c={"Content-Type":"application/json","x-goog-api-key":t},f=Date.now(),y=`${s}/models/${o.model}:generateContent`,m=await _(()=>e.fetch(y,{method:"POST",headers:c,body:JSON.stringify(i),timeoutMs:r}),{maxRetries:n,shouldRetry:T=>T instanceof g&&T.retryable}),x;try{x=await 
m.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from Gemini API",m.status,m.status>=500)}let C=Date.now()-f;if(!m.ok)throw dt(m.status,x);let b=x,d=b.candidates?.[0],h=d?.content?.parts??[],R="",P=[],E=0;for(let T of h)T.text!==void 0&&(R+=T.text),T.functionCall&&(P.push({id:`gemini_call_${E}`,name:T.functionCall.name,arguments:T.functionCall.args??{}}),E++);return{text:R,toolCalls:P,usage:{inputTokens:b.usageMetadata?.promptTokenCount??0,outputTokens:b.usageMetadata?.candidatesTokenCount??0,totalTokens:b.usageMetadata?.totalTokenCount??0},raw:x,latencyMs:C,modelId:b.modelVersion??o.model,finishReason:ct(d?.finishReason,P.length>0)}},estimateCost(o,a){let p=pt[o];return p?a.inputTokens/1e6*p.input+a.outputTokens/1e6*p.output:null},supportsTools(){return!0}}}function mt(e){try{return JSON.parse(e)}catch{return{result:e}}}function ft(e){switch(e){case"auto":return{functionCallingConfig:{mode:"AUTO"}};case"required":return{functionCallingConfig:{mode:"ANY"}};case"none":return{functionCallingConfig:{mode:"NONE"}}}}function gt(e){switch(e){case"stop":return"stop";case"length":return"max_tokens";case"tool_calls":return"tool_calls";default:return"unknown"}}function ht(e,t){let s=typeof t=="object"&&t!==null&&"message"in t?String(t.message??"Unknown error"):typeof t=="object"&&t!==null&&"error"in t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 401:return new g("AUTH_FAILED",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new g("CONTEXT_LENGTH",s,e,!1,t):new g("PROVIDER_ERROR",s,e,!1,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function ne(e){let t="",s="https://api.mistral.ai/v1",r=6e4,n=2;return{name:"mistral",async initialize(o){if(!o.apiKey)throw new g("AUTH_FAILED","API key is 
required");t=o.apiKey,o.baseUrl&&(s=o.baseUrl),r=o.timeoutMs,n=o.maxRetries},async complete(o){let a={model:o.model,messages:o.messages.map(b=>b.role==="tool"?{role:"tool",content:b.content,tool_call_id:b.toolCallId}:b.role==="assistant"&&b.toolCalls&&b.toolCalls.length>0?{role:"assistant",content:b.content||null,tool_calls:b.toolCalls.map(d=>({id:d.id,type:"function",function:{name:d.name,arguments:JSON.stringify(d.arguments)}}))}:{role:b.role,content:b.content}),temperature:o.params.temperature,max_tokens:o.params.maxTokens};o.params.topP!==void 0&&(a.top_p=o.params.topP),o.params.stopSequences&&(a.stop=o.params.stopSequences),o.tools&&o.tools.length>0&&(a.tools=o.tools.map(b=>({type:"function",function:{name:b.name,description:b.description,parameters:b.parameters}})),o.toolChoice&&(a.tool_choice=o.toolChoice));let p={"Content-Type":"application/json",Authorization:`Bearer ${t}`},i=Date.now(),l=await _(()=>e.fetch(`${s}/chat/completions`,{method:"POST",headers:p,body:JSON.stringify(a),timeoutMs:r}),{maxRetries:n,shouldRetry:b=>b instanceof g&&b.retryable}),c;try{c=await l.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from Mistral API",l.status,l.status>=500)}let f=Date.now()-i;if(!l.ok)throw ht(l.status,c);let y=c,m=y.choices?.[0],x=m?.message,C=(x?.tool_calls??[]).map(b=>{let d;try{d=JSON.parse(b.function.arguments||"{}")}catch{d={_raw:b.function.arguments}}return{id:b.id,name:b.function.name,arguments:d}});return{text:x?.content??"",toolCalls:C,usage:{inputTokens:y.usage?.prompt_tokens??0,outputTokens:y.usage?.completion_tokens??0,totalTokens:y.usage?.total_tokens??0},raw:c,latencyMs:f,modelId:y.model??o.model,finishReason:gt(m?.finish_reason)}},estimateCost(o,a){return null},supportsTools(o){return!0}}}function Rt(e){switch(e){case"COMPLETE":return"stop";case"MAX_TOKENS":return"max_tokens";case"TOOL_CALL":return"tool_calls";default:return"unknown"}}function yt(e,t){let s=typeof t=="object"&&t!==null&&"message"in 
t?String(t.message??"Unknown error"):"Unknown error";switch(e){case 401:return new g("AUTH_FAILED",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new g("CONTEXT_LENGTH",s,e,!1,t):new g("PROVIDER_ERROR",s,e,!1,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function ae(e){let t="",s="https://api.cohere.com",r=6e4,n=2;return{name:"cohere",async initialize(o){if(!o.apiKey)throw new g("AUTH_FAILED","API key is required");t=o.apiKey,o.baseUrl&&(s=o.baseUrl),r=o.timeoutMs,n=o.maxRetries},async complete(o){let a={model:o.model,messages:o.messages.map(d=>d.role==="tool"?{role:"tool",content:d.content,tool_call_id:d.toolCallId}:d.role==="assistant"&&d.toolCalls&&d.toolCalls.length>0?{role:"assistant",content:d.content||null,tool_calls:d.toolCalls.map(h=>({id:h.id,type:"function",function:{name:h.name,arguments:JSON.stringify(h.arguments)}}))}:{role:d.role,content:d.content}),temperature:o.params.temperature,max_tokens:o.params.maxTokens};o.params.topP!==void 0&&(a.p=o.params.topP),o.params.stopSequences&&(a.stop_sequences=o.params.stopSequences),o.tools&&o.tools.length>0&&(a.tools=o.tools.map(d=>({type:"function",function:{name:d.name,description:d.description,parameters:d.parameters}})),o.toolChoice&&(a.tool_choice=o.toolChoice));let p={"Content-Type":"application/json",Authorization:`Bearer ${t}`},i=Date.now(),l=await _(()=>e.fetch(`${s}/v2/chat`,{method:"POST",headers:p,body:JSON.stringify(a),timeoutMs:r}),{maxRetries:n,shouldRetry:d=>d instanceof g&&d.retryable}),c;try{c=await l.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from Cohere API",l.status,l.status>=500)}let f=Date.now()-i;if(!l.ok)throw yt(l.status,c);let y=c,m=y.message?.content?.filter(d=>d.type==="text").map(d=>d.text).join("")??"",x=(y.message?.tool_calls??[]).map(d=>{let 
h;try{h=JSON.parse(d.function.arguments||"{}")}catch{h={_raw:d.function.arguments}}return{id:d.id,name:d.function.name,arguments:h}}),C=y.usage?.tokens?.input_tokens??0,b=y.usage?.tokens?.output_tokens??0;return{text:m,toolCalls:x,usage:{inputTokens:C,outputTokens:b,totalTokens:C+b},raw:c,latencyMs:f,modelId:y.model??o.model,finishReason:Rt(y.finish_reason)}},estimateCost(o,a){return null},supportsTools(o){return!0}}}var be={openai:te,anthropic:se,ollama:re,gemini:oe,mistral:ne,cohere:ae};function bt(e,t){let s=be[e];if(!s){let r=Object.keys(be).join(", ");throw new Error(`Unknown provider: "${e}". Supported providers: ${r}`)}return s(t)}async function ie(e,t,s,r){let n=r?.maxTurns??10,o=[],a=[],p=[...t.messages],i={inputTokens:0,outputTokens:0,totalTokens:0},l=0;for(let y=0;y<n;y++){let m={...t,messages:p},x=await e.complete(m);if(o.push({request:m,response:x}),i.inputTokens+=x.usage.inputTokens,i.outputTokens+=x.usage.outputTokens,i.totalTokens+=x.usage.totalTokens,l+=x.latencyMs,x.toolCalls.length===0)return{turns:o,finalText:x.text,allToolCalls:a,totalUsage:i,totalLatencyMs:l};a.push(...x.toolCalls),p=[...p,{role:"assistant",content:x.text,toolCalls:x.toolCalls}];for(let C of x.toolCalls){let b=s.find(h=>h.name===C.name),d;b?d=xt(b,C.arguments):d={error:`Tool "${C.name}" not simulated`},p.push({role:"tool",content:JSON.stringify(d),toolCallId:C.id,toolName:C.name})}}let f=o[o.length-1]?.response??{text:"",toolCalls:[],usage:{inputTokens:0,outputTokens:0,totalTokens:0},raw:null,latencyMs:0,modelId:t.model,finishReason:"unknown"};return{turns:o,finalText:f.text,allToolCalls:a,totalUsage:i,totalLatencyMs:l}}function xt(e,t){if(e.responses){for(let s of e.responses)if(Tt(s.when,t))return s.then}return e.defaultResponse!==void 0?e.defaultResponse:{error:"No matching simulation response"}}function Tt(e,t){for(let[s,r]of Object.entries(e))if(JSON.stringify(t[s])!==JSON.stringify(r))return!1;return!0}import{z as u}from"zod";import{z as O}from"zod";var 
xe=O.object({outputTextAttr:O.string().default("gen_ai.completion.0.content"),modelAttr:O.string().default("gen_ai.response.model"),systemAttr:O.string().default("gen_ai.system"),inputTokensAttr:O.string().default("gen_ai.usage.input_tokens"),outputTokensAttr:O.string().default("gen_ai.usage.output_tokens")}),Te=O.object({namePattern:O.string().optional().describe("Regex to filter span names"),attributeMatch:O.record(O.string()).optional().describe("Attributes that must match"),minDurationMs:O.number().min(0).optional()}),le=O.object({port:O.number().int().min(1).max(65535).default(4318),timeoutMs:O.number().int().min(1e3).default(3e4),spanMapping:xe.default({}),spanFilter:Te.optional()});var k=u.string().min(1,"Must not be empty"),Ct=u.number().min(0).max(2).default(.2),$=u.number().min(0).max(1),ue=u.string().refine(e=>{try{return new RegExp(e),!0}catch{return!1}},{message:"Must be a valid regex pattern"}),L=u.object({apiKeyEnv:k.describe("Environment variable name containing the API key. Never a raw key."),baseUrl:u.string().url().optional().describe("Custom base URL for API-compatible proxies (e.g., Azure OpenAI, LiteLLM)"),organization:u.string().optional().describe("Organization ID (OpenAI-specific)")}),At=u.object({apiKeyEnv:u.string().min(1).optional().describe("Environment variable name containing the API key. Optional for Ollama (local)."),baseUrl:u.string().url().optional().describe("Ollama server URL. 
Defaults to http://localhost:11434.")}),Pt=u.object({openai:L.optional(),anthropic:L.optional(),ollama:At.optional(),gemini:L.optional(),mistral:L.optional(),cohere:L.optional()}).refine(e=>Object.keys(e).some(t=>e[t]!==void 0),{message:"At least one provider must be configured"}),Et=u.object({temperature:Ct,maxTokens:u.number().int().min(1).max(128e3).default(1024),topP:u.number().min(0).max(1).optional(),stopSequences:u.array(u.string()).optional(),seed:u.number().int().optional().describe("Seed for reproducibility (provider-dependent support)")}),vt=u.object({id:k.describe("Unique identifier for this model config, referenced in reports"),provider:u.enum(["openai","anthropic","ollama","gemini","mistral","cohere"]).describe("Must match a key in the providers section"),model:k.describe("Model name as the provider expects it (e.g., 'gpt-4o', 'claude-sonnet-4-5-20250929')"),params:Et.default({})}),wt=u.object({system:u.string().optional().describe("System prompt template. Supports {{variable}} interpolation."),user:k.describe("User prompt template. Supports {{variable}} interpolation."),assistant:u.string().optional().describe("Prefill for assistant response (Anthropic-specific)")}),St=u.object({format:u.enum(["text","json"]).default("text"),schemaFile:u.string().optional().describe("Path to JSON Schema file (relative to config file). 
Required if format is 'json'."),contains:u.array(u.string()).optional().describe("Output must contain all of these substrings"),notContains:u.array(u.string()).optional().describe("Output must not contain any of these substrings"),maxLength:u.number().int().positive().optional().describe("Maximum character length of the output")}).refine(e=>!(e.format==="json"&&!e.schemaFile),{message:"schemaFile is required when format is 'json'"}),Mt=u.object({enabled:u.boolean().default(!0),denyPatterns:u.array(ue).default(["\\b\\d{3}-\\d{2}-\\d{4}\\b","\\b\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}\\b","\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b"]).describe("Regex patterns that must NOT appear in output. Defaults include SSN, credit card, email."),customPatterns:u.array(u.object({name:k,pattern:ue})).optional().describe("Named custom PII patterns for reporting clarity")}),Ot=u.object({deny:u.array(u.string()).default([]).describe("Words/phrases that must NOT appear in output (case-insensitive)"),allow:u.array(u.string()).optional().describe("If set, output MUST contain at least one of these words/phrases")}),_t=u.object({criteria:k.describe("Natural language description of what to evaluate (e.g., 'Response is empathetic and professional')"),minScore:$.default(.7).describe("Minimum score (0-1) for this criterion to pass"),model:u.string().optional().describe("Override judge model for this criterion. Defaults to first model in models list."),rubric:u.string().optional().describe("Detailed rubric for the judge. 
If omitted, a default rubric is generated from criteria.")}),kt=u.object({tool:k.describe("Expected tool/function name"),shouldNotCall:u.boolean().optional().default(!1).describe("If true, assert this tool was NOT called"),argsMatch:u.record(u.unknown()).optional().describe("Key-value pairs that must be present in the tool call arguments (partial match)"),argsSchema:u.string().optional().describe("Path to JSON Schema file to validate the tool call arguments"),order:u.number().int().min(0).optional().describe("Expected position in the sequence of tool calls (0-indexed)"),responseContains:u.string().optional().describe("Assert the simulated tool response contains this substring")}),It=u.object({maxScore:$.default(.15).describe("Maximum drift score (0-1). Higher = more drift allowed. Fail if exceeded."),method:u.enum(["judge","embedding","field-diff"]).default("judge").describe("Drift detection method. 'judge' uses LLM comparison, 'embedding' uses cosine similarity, 'field-diff' compares JSON fields."),fields:u.array(u.string()).optional().describe("For field-diff method: JSON paths to compare (e.g., ['response.action', 'response.message'])")}),jt=u.object({pii:Mt.optional(),keywords:Ot.optional()}),Nt=u.object({output:St.optional(),guardrails:jt.optional(),judge:u.array(_t).optional().describe("LLM-as-judge evaluations. 
Each criterion is scored independently."),toolCalls:u.array(kt).optional().describe("Expected tool/function calls in the model response"),baseline:u.object({drift:It.optional()}).optional()}),$t=u.object({name:k.describe("Tool/function name as the model sees it"),description:u.string().optional().describe("Tool description for documentation"),parameters:u.record(u.unknown()).optional().describe("JSON Schema for the tool's parameters"),responses:u.array(u.object({when:u.record(u.unknown()).describe("Condition: match tool call arguments (partial match)"),then:u.unknown().describe("Simulated response to return when condition matches")})).optional().describe("Simulated responses based on argument matching"),defaultResponse:u.unknown().optional().describe("Response when no 'when' condition matches")}),Dt=u.object({name:k.describe("Unique test case name within the suite. Used in reports and JUnit output."),prompt:k.optional().describe("Reference to a key in the prompts section. Exactly one of prompt or command must be set."),command:k.optional().describe("Shell command to execute. Stdout is captured and assertions run against it. Exactly one of prompt or command must be set."),vars:u.record(u.string()).default({}).describe("Variables to interpolate into the prompt template or command"),models:u.array(u.string()).optional().describe("Override: run this test only against these model IDs. Defaults to all models. 
Ignored for command tests."),repeat:u.number().int().min(1).optional().describe("Override: number of repeat runs for this specific test case"),tools:u.array($t).optional().describe("Simulated tools available to the model for this test case"),expect:Nt.describe("Assertions to evaluate against the model output"),tags:u.array(u.string()).optional().describe("Tags for filtering test cases in CLI (e.g., --tags regression)"),skip:u.boolean().optional().default(!1).describe("Skip this test case during execution")}).refine(e=>{let t=e.prompt!==void 0,s=e.command!==void 0;return(t||s)&&!(t&&s)},{message:"Exactly one of 'prompt' or 'command' must be set on each test case"}),Lt=u.object({passRateMin:$.default(.95).describe("Minimum overall pass rate (0-1). Computed after repeats and aggregation."),schemaFailuresMax:u.number().int().min(0).default(0).describe("Maximum allowed schema validation failures across entire suite"),judgeAvgMin:$.optional().describe("Minimum average LLM-as-judge score across all criteria and test cases"),driftScoreMax:$.optional().describe("Maximum allowed drift score against active baseline"),piiFailuresMax:u.number().int().min(0).default(0).describe("Maximum allowed PII detection failures"),keywordFailuresMax:u.number().int().min(0).default(0).describe("Maximum allowed keyword guardrail failures"),costMaxUsd:u.number().positive().optional().describe("Maximum total cost in USD for the entire run. Aborts if exceeded mid-run."),latencyMaxMs:u.number().positive().optional().describe("Maximum average latency in ms. 
Fails gate if exceeded."),deterministicPassRate:$.optional().describe("Minimum pass rate for deterministic assertions only (tool_called, schema, pii, keywords, etc.)"),probabilisticPassRate:$.optional().describe("Minimum pass rate for probabilistic assertions only (judge, drift)")}),Ft=u.object({enabled:u.boolean().default(!1),framework:u.enum(["eu-ai-act","custom"]).default("eu-ai-act"),outputDir:u.string().default("./compliance-reports"),metadata:u.object({systemName:u.string().optional().describe("Name of the AI system being tested"),systemVersion:u.string().optional().describe("Version of the AI system"),riskLevel:u.enum(["high","limited","minimal"]).optional(),operator:u.string().optional().describe("Organization operating the AI system"),intendedPurpose:u.string().optional().describe("Documented intended purpose of the AI system"),dataGovernanceNotes:u.string().optional()}).optional()}),Ut=u.object({enabled:u.boolean().default(!1),includeArtifacts:u.boolean().default(!1).describe("Upload raw prompt inputs and model outputs. Disabled by default for privacy."),redactPatterns:u.array(ue).optional().describe("Patterns to redact from artifacts before upload (applied on top of PII guardrails)"),apiUrl:u.string().url().default("https://api.kindlm.com/v1").describe("Cloud API URL. Override for self-hosted deployments.")}),Bt=u.object({name:k,description:u.string().optional(),tags:u.array(u.string()).optional()}),Ce=u.object({kindlm:u.literal(1).describe("Config schema version. 
Must be 1."),project:k.describe("Project identifier for cloud upload and report grouping"),suite:Bt,providers:Pt,models:u.array(vt).min(1,"At least one model must be configured"),prompts:u.record(wt).refine(e=>Object.keys(e).length>0,{message:"At least one prompt must be defined"}),tests:u.array(Dt).min(1,"At least one test case must be defined"),gates:Lt.default({}),compliance:Ft.optional(),trace:le.optional().describe("OpenTelemetry trace ingestion configuration for the 'kindlm trace' command"),upload:Ut.default({}),defaults:u.object({repeat:u.number().int().min(1).default(1).describe("Default repeat count per test case"),concurrency:u.number().int().min(1).max(32).default(4).describe("Default concurrency for test execution"),timeoutMs:u.number().int().min(1e3).default(6e4).describe("Default timeout per provider call in ms"),judgeModel:u.string().optional().describe("Default model ID for LLM-as-judge assertions. Must reference a configured model.")}).default({})});function pe(e){let t=Ce.safeParse(e);return t.success?M(t.data):A({code:"CONFIG_VALIDATION_ERROR",message:"Config validation failed",details:{errors:t.error.issues.map(s=>`${s.path.join(".")}: ${s.message}`)}})}import{parse as Gt}from"yaml";var Jt=1048576,Ae=1e3,Pe=50;function Vt(e,t){if(e.length>Jt)return A({code:"CONFIG_TOO_LARGE",message:`Config exceeds maximum size of 1MB (got ${(e.length/1048576).toFixed(1)}MB)`});let s;try{s=Gt(e)}catch(i){return A({code:"CONFIG_PARSE_ERROR",message:`Failed to parse YAML: ${i.message}`,cause:i})}let r=pe(s);if(!r.success)return r;let n=r.data;if(n.tests.length>Ae)return A({code:"CONFIG_VALIDATION_ERROR",message:`Config exceeds maximum of ${Ae} tests (got ${n.tests.length})`});if(n.models.length>Pe)return A({code:"CONFIG_VALIDATION_ERROR",message:`Config exceeds maximum of ${Pe} models (got ${n.models.length})`});let o=[],a=new Set;for(let i of n.models)a.has(i.id)&&o.push(`Duplicate model ID "${i.id}"`),a.add(i.id);let p=new Set;for(let i of 
n.tests)p.has(i.name)&&o.push(`Duplicate test name "${i.name}"`),p.add(i.name);/* Every prompt named by a test must be a key of the prompts map. */for(let i of n.tests)i.prompt&&!(i.prompt in n.prompts)&&o.push(`Test "${i.name}" references prompt "${i.prompt}" which is not defined`);/* Per-test model overrides must name configured model ids. */for(let i of n.tests)if(i.models)for(let l of i.models)a.has(l)||o.push(`Test "${i.name}" references model "${l}" which is not configured`);for(let i of n.models)n.providers[i.provider]||o.push(`Model "${i.id}" references provider "${i.provider}" which is not configured`);/* Referenced schema files are only checked when a fileReader was supplied. */if(n.defaults.judgeModel&&!a.has(n.defaults.judgeModel)&&o.push(`defaults.judgeModel "${n.defaults.judgeModel}" is not a configured model`),t.fileReader)for(let i of n.tests){if(i.expect.output?.schemaFile){let l=Ee(t.configDir,i.expect.output.schemaFile);l.success?t.fileReader.readFile(l.data).success||o.push(`Test "${i.name}": schemaFile "${i.expect.output.schemaFile}" not found at ${l.data}`):o.push(`Test "${i.name}": schemaFile "${i.expect.output.schemaFile}" \u2014 ${l.error.message}`)}if(i.expect.toolCalls){for(let l of i.expect.toolCalls)if(l.argsSchema){let c=Ee(t.configDir,l.argsSchema);c.success?t.fileReader.readFile(c.data).success||o.push(`Test "${i.name}": argsSchema "${l.argsSchema}" for tool "${l.tool}" not found at ${c.data}`):o.push(`Test "${i.name}": argsSchema "${l.argsSchema}" for tool "${l.tool}" \u2014 ${c.error.message}`)}}}/* All cross-reference problems are collected in o and reported together rather than failing on the first one. */return o.length>0?A({code:"CONFIG_VALIDATION_ERROR",message:"Config cross-reference validation failed",details:{errors:o}}):M(n)}/** Resolves a config-relative reference t against directory e. Returns a PATH_TRAVERSAL error result when t starts with a slash or backslash, when t starts with a drive letter and colon, or when the normalised e/t is neither e itself nor below e. Otherwise returns M(normalisedPath). */function Ee(e,t){if(t.startsWith("/")||t.startsWith("\\"))return A({code:"PATH_TRAVERSAL",message:"Absolute paths are not allowed in config references"});if(/^[a-zA-Z]:/.test(t))return A({code:"PATH_TRAVERSAL",message:"Absolute paths are not allowed in config references"});let s=e.endsWith("/")?e.slice(0,-1):e,r=`${s}/${t}`,n=ve(r),o=ve(s);/* The o+"/" prefix test keeps a sibling directory that merely shares the name prefix from passing. */return!n.startsWith(o+"/")&&n!==o?A({code:"PATH_TRAVERSAL",message:`Path "${t}" escapes the config directory`}):M(n)}/** Normalises a slash-separated path segment by segment. */function ve(e){let t=e.split("/"),s=[];for(let n of 
t)n==="."||n===""||(n===".."?s.pop():s.push(n));return(e.startsWith("/")?"/":"")+s.join("/")}var we=/\{\{(\w+)\}\}/g;function F(e,t){let s=Se(e,t);if(s.length>0)return A({code:"CONFIG_VALIDATION_ERROR",message:`Missing template variables: ${s.join(", ")}`,details:{missing:s}});let r=e.replace(we,(n,o)=>t[o]);return M(r)}function Se(e,t){let s=new Set;for(let r of e.matchAll(we)){let n=r[1];n!==void 0&&!(n in t)&&s.add(n)}return[...s]}function ce(e){let t=e[0];if(!t)throw new Error("aggregateRuns requires at least one run");let{testCaseName:s,modelId:r}=t,o=e.filter(y=>y.assertions.every(m=>m.passed)).length/e.length,a=new Map;for(let y of e)for(let m of y.assertions){let x=a.get(m.assertionType);x||(x=[],a.set(m.assertionType,x)),x.push(m.score)}let p={};for(let[y,m]of a){let x=m.reduce((C,b)=>C+b,0);p[y]={mean:x/m.length,min:Math.min(...m),max:Math.max(...m)}}let i=new Set;for(let y of e)for(let m of y.assertions)!m.passed&&m.failureCode&&i.add(m.failureCode);let l=e.reduce((y,m)=>y+m.latencyMs,0)/e.length,c=e.reduce((y,m)=>y+(m.costEstimateUsd??0),0),f=e.reduce((y,m)=>y+m.tokenUsage.totalTokens,0);return{testCaseName:s,modelId:r,runCount:e.length,passed:o===1,passRate:o,assertionScores:p,failureCodes:[...i],latencyAvgMs:l,totalCostUsd:c,totalTokens:f,runs:e}}function de(e){let t=e.stdout.split(`
|
|
32
|
+
`),s=[],r=[],n,o=0;for(let a of t){let p=a.trimStart();if(!p.startsWith('{"kindlm":')){s.push(a);continue}let i=Kt(p);if(!i){s.push(a);continue}i.kindlm==="tool_call"?r.push({id:i.id??`cmd_tc_${o++}`,name:i.name,arguments:i.arguments}):i.kindlm==="output_json"&&(n=i.data)}return{outputText:s.join(`
|
|
33
|
+
`).trim(),toolCalls:r,outputJson:n,exitCode:e.exitCode,stderr:e.stderr}}function Kt(e){try{let t=JSON.parse(e);return typeof t.kindlm!="string"?null:t.kindlm==="tool_call"?typeof t.name!="string"?null:{kindlm:"tool_call",id:typeof t.id=="string"?t.id:void 0,name:t.name,arguments:typeof t.arguments=="object"&&t.arguments!==null?t.arguments:{}}:t.kindlm==="output_json"?{kindlm:"output_json",data:t.data}:null}catch{return null}}function Ht(e,t){return{async run(){let s=Date.now(),r=new Map;for(let R of e.tests)if(R.expect.output?.schemaFile){let P=Yt(t.configDir,R.expect.output.schemaFile),E=t.fileReader.readFile(P);if(!E.success)return A({code:"SCHEMA_FILE_ERROR",message:`Failed to read schema file "${R.expect.output.schemaFile}": ${E.error.message}`});try{r.set(R.name,JSON.parse(E.data))}catch(T){return A({code:"SCHEMA_FILE_ERROR",message:`Failed to parse schema file "${R.expect.output.schemaFile}" as JSON: ${T instanceof Error?T.message:String(T)}`})}}let n=[];for(let R of e.tests){if(R.skip)continue;let P=R.repeat??e.defaults.repeat;if(R.command)for(let E=0;E<P;E++)n.push({test:R,modelConfig:null,runIndex:E});else{let E=R.models??e.models.map(T=>T.id);for(let T of E){let v=e.models.find(S=>S.id===T);if(v)for(let S=0;S<P;S++)n.push({test:R,modelConfig:v,runIndex:S})}}}let o=await Xt(n.map(R=>()=>zt(e,t,R,r)),e.defaults.concurrency),a=R=>`${R.testCaseName}::${R.modelId}`,p=new Map;for(let R of o){let P=a(R),E=p.get(P);E||(E=[],p.set(P,E)),E.push(R)}let i=[];for(let R of p.values())i.push(ce(R));let 
l=i.map(R=>({name:R.testCaseName,modelId:R.modelId,status:R.passed?"passed":"failed",assertions:R.runs[0]?.assertions??[],latencyMs:R.latencyAvgMs,costUsd:R.totalCostUsd})),c=e.tests.filter(R=>R.skip).map(R=>({name:R.name,modelId:"",status:"skipped",assertions:[],latencyMs:0,costUsd:0})),f=[...l,...c],y=f.filter(R=>R.status==="passed").length,m=f.filter(R=>R.status==="failed").length,x=f.filter(R=>R.status==="errored").length,C=f.filter(R=>R.status==="skipped").length,b=x>0?"errored":m>0?"failed":"passed",h={suites:[{name:e.suite.name,status:b,tests:f}],totalTests:f.length,passed:y,failed:m,errored:x,skipped:C,durationMs:Date.now()-s};/* NOTE(review): the summary entries built above only ever carry the statuses passed, failed or skipped, so the errored count stays 0 on this path. The assertions reported per test come from the first run only. */return M({runResult:h,aggregated:i})}}}/** Executes one job s, an object with test, modelConfig and runIndex. Command tests are delegated to Wt. For prompt tests the user template and the optional system template are rendered with the test vars, the conversation is run through ie with the test tools as simulations, and the assertions built by H are evaluated one after another. A missing adapter, prompt or template variable, and any exception raised inside the try block, is returned as a run result from j instead of being thrown. r maps test names to preloaded schema content. */async function zt(e,t,s,r){let{test:n,modelConfig:o,runIndex:a}=s;if(n.command)return Wt(e,t,n,a,r);if(!o)return j(n.name,"unknown",a,"No model config for prompt-based test");t.onProgress?.({type:"test_start",test:n.name,model:o.id,run:a});try{let p=t.adapters.get(o.provider);if(!p)return j(n.name,o.id,a,`Provider adapter "${o.provider}" not found`);let i=n.prompt?e.prompts[n.prompt]:void 0;if(!i)return j(n.name,o.id,a,`Prompt "${n.prompt}" not defined`);let l=F(i.user,n.vars);if(!l.success)return j(n.name,o.id,a,l.error.message);let c=[];if(i.system){let v=F(i.system,n.vars);if(!v.success)return j(n.name,o.id,a,v.error.message);c.push({role:"system",content:v.data})}c.push({role:"user",content:l.data});/* Only name, description and parameters of each simulated tool are sent to the model; the tools field is omitted when the test defines none. */let f=(n.tools??[]).map(v=>({name:v.name,description:v.description,parameters:v.parameters})),y={model:o.model,messages:c,params:{temperature:o.params.temperature,maxTokens:o.params.maxTokens,topP:o.params.topP,stopSequences:o.params.stopSequences,seed:o.params.seed},tools:f.length>0?f:void 0},m=await ie(p,y,n.tools??[]),x=p.estimateCost(o.model,m.totalUsage),C={};n.expect.output?.schemaFile&&r.has(n.name)&&(C.schemaContent=r.get(n.name));/* The judge model is defaults.judgeModel when set, otherwise the first configured model. */let b=H(n.expect,C),d=e.defaults.judgeModel??e.models[0]?.id,h=e.models.find(v=>v.id===d),R=h?t.adapters.get(h.provider):void 
0,P={outputText:m.finalText,toolCalls:m.allToolCalls,configDir:t.configDir,latencyMs:m.totalLatencyMs,costUsd:x??void 0,judgeAdapter:R,judgeModel:h?.model};if(t.baselineData){let v=`${n.name}::${o.id}`,S=t.baselineData.results[v];S&&(P.baselineText=S.outputText)}let E=[];for(let v of b){let S=await v.evaluate(P);E.push(...S)}let T=E.every(v=>v.passed);return t.onProgress?.({type:"test_complete",test:n.name,model:o.id,run:a,passed:T}),{testCaseName:n.name,modelId:o.id,runIndex:a,outputText:m.finalText,assertions:E,latencyMs:m.totalLatencyMs,tokenUsage:m.totalUsage,costEstimateUsd:x}}catch(p){return t.onProgress?.({type:"test_complete",test:n.name,model:o.id,run:a,passed:!1}),j(n.name,o.id,a,p instanceof Error?p.message:String(p))}}async function Wt(e,t,s,r,n){let o="command";t.onProgress?.({type:"test_start",test:s.name,model:o,run:r});try{if(!t.commandExecutor)return j(s.name,o,r,"Command executor not available");if(!s.command)return j(s.name,o,r,"No command specified");let a=F(s.command,s.vars);if(!a.success)return j(s.name,o,r,a.error.message);let p=Date.now(),i=await t.commandExecutor.execute(a.data,{timeoutMs:e.defaults.timeoutMs,cwd:t.configDir});if(!i.success)return j(s.name,o,r,i.error.message);let l=Date.now()-p,c=de(i.data),f={};s.expect.output?.schemaFile&&n.has(s.name)&&(f.schemaContent=n.get(s.name));let y=H(s.expect,f),m=e.defaults.judgeModel??e.models[0]?.id,x=e.models.find(R=>R.id===m),C=x?t.adapters.get(x.provider):void 0,b={outputText:c.outputText,outputJson:c.outputJson,toolCalls:c.toolCalls,configDir:t.configDir,latencyMs:l,judgeAdapter:C,judgeModel:x?.model};if(t.baselineData){let R=`${s.name}::${o}`,P=t.baselineData.results[R];P&&(b.baselineText=P.outputText)}let d=[];for(let R of y){let P=await R.evaluate(b);d.push(...P)}let h=d.every(R=>R.passed);return 
t.onProgress?.({type:"test_complete",test:s.name,model:o,run:r,passed:h}),{testCaseName:s.name,modelId:o,runIndex:r,outputText:c.outputText,assertions:d,latencyMs:l,tokenUsage:{inputTokens:0,outputTokens:0,totalTokens:0},costEstimateUsd:null}}catch(a){return t.onProgress?.({type:"test_complete",test:s.name,model:o,run:r,passed:!1}),j(s.name,o,r,a instanceof Error?a.message:String(a))}}function j(e,t,s,r){return{testCaseName:e,modelId:t,runIndex:s,outputText:"",assertions:[{assertionType:"internal",label:"Execution error",passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:r}],latencyMs:0,tokenUsage:{inputTokens:0,outputTokens:0,totalTokens:0},costEstimateUsd:null}}async function Xt(e,t){let s=new Array(e.length),r=0;async function n(){for(;r<e.length;){let a=r++,p=e[a];p&&(s[a]=await p())}}let o=Array.from({length:Math.min(t,e.length)},()=>n());return await Promise.all(o),s}function Yt(e,t){return t.startsWith("/")?t:`${e.endsWith("/")?e.slice(0,-1):e}/${t}`}function Qt(e,t){let s=[],r=t.reduce((l,c)=>l+c.runCount,0),n=t.reduce((l,c)=>l+Math.round(c.passRate*c.runCount),0),o=r>0?n/r:0;s.push({gateName:"passRateMin",passed:o>=e.passRateMin,actual:o,threshold:e.passRateMin,message:o>=e.passRateMin?`Pass rate ${w(o)} meets minimum ${w(e.passRateMin)}`:`Pass rate ${w(o)} below minimum ${w(e.passRateMin)}`});let a=me(t,["SCHEMA_INVALID","SCHEMA_PARSE_ERROR"]);if(s.push({gateName:"schemaFailuresMax",passed:a<=e.schemaFailuresMax,actual:a,threshold:e.schemaFailuresMax,message:a<=e.schemaFailuresMax?`Schema failures ${a} within limit ${e.schemaFailuresMax}`:`Schema failures ${a} exceed limit ${e.schemaFailuresMax}`}),e.judgeAvgMin!==void 0){let l=Oe(t,"judge"),c=l.length>0?l.reduce((f,y)=>f+y,0)/l.length:1;s.push({gateName:"judgeAvgMin",passed:c>=e.judgeAvgMin,actual:c,threshold:e.judgeAvgMin,message:c>=e.judgeAvgMin?`Judge average ${w(c)} meets minimum ${w(e.judgeAvgMin)}`:`Judge average ${w(c)} below minimum ${w(e.judgeAvgMin)}`})}if(e.driftScoreMax!==void 
0){let l=Oe(t,"drift"),c=l.length>0?Math.max(...l):0;s.push({gateName:"driftScoreMax",passed:c<=e.driftScoreMax,actual:c,threshold:e.driftScoreMax,message:c<=e.driftScoreMax?`Drift score ${w(c)} within limit ${w(e.driftScoreMax)}`:`Drift score ${w(c)} exceeds limit ${w(e.driftScoreMax)}`})}let p=me(t,["PII_DETECTED"]);s.push({gateName:"piiFailuresMax",passed:p<=e.piiFailuresMax,actual:p,threshold:e.piiFailuresMax,message:p<=e.piiFailuresMax?`PII failures ${p} within limit ${e.piiFailuresMax}`:`PII failures ${p} exceed limit ${e.piiFailuresMax}`});let i=me(t,["KEYWORD_DENIED","KEYWORD_MISSING"]);if(s.push({gateName:"keywordFailuresMax",passed:i<=e.keywordFailuresMax,actual:i,threshold:e.keywordFailuresMax,message:i<=e.keywordFailuresMax?`Keyword failures ${i} within limit ${e.keywordFailuresMax}`:`Keyword failures ${i} exceed limit ${e.keywordFailuresMax}`}),e.costMaxUsd!==void 0){let l=t.reduce((c,f)=>c+f.totalCostUsd,0);s.push({gateName:"costMaxUsd",passed:l<=e.costMaxUsd,actual:l,threshold:e.costMaxUsd,message:l<=e.costMaxUsd?`Total cost $${l.toFixed(4)} within limit $${e.costMaxUsd.toFixed(4)}`:`Total cost $${l.toFixed(4)} exceeds limit $${e.costMaxUsd.toFixed(4)}`})}if(e.latencyMaxMs!==void 0){let l=t.length>0?t.reduce((c,f)=>c+f.latencyAvgMs,0)/t.length:0;s.push({gateName:"latencyMaxMs",passed:l<=e.latencyMaxMs,actual:l,threshold:e.latencyMaxMs,message:l<=e.latencyMaxMs?`Average latency ${Math.round(l)}ms within limit ${e.latencyMaxMs}ms`:`Average latency ${Math.round(l)}ms exceeds limit ${e.latencyMaxMs}ms`})}if(e.deterministicPassRate!==void 0){let l=Me(t,"deterministic");s.push({gateName:"deterministicPassRate",passed:l>=e.deterministicPassRate,actual:l,threshold:e.deterministicPassRate,message:l>=e.deterministicPassRate?`Deterministic pass rate ${w(l)} meets minimum ${w(e.deterministicPassRate)}`:`Deterministic pass rate ${w(l)} below minimum ${w(e.deterministicPassRate)}`})}if(e.probabilisticPassRate!==void 0){let 
l=Me(t,"probabilistic");s.push({gateName:"probabilisticPassRate",passed:l>=e.probabilisticPassRate,actual:l,threshold:e.probabilisticPassRate,message:l>=e.probabilisticPassRate?`Probabilistic pass rate ${w(l)} meets minimum ${w(e.probabilisticPassRate)}`:`Probabilistic pass rate ${w(l)} below minimum ${w(e.probabilisticPassRate)}`})}return{passed:s.every(l=>l.passed),gates:s}}function Me(e,t){let s=0,r=0;for(let n of e)for(let o of n.runs)for(let a of o.assertions)D(a.assertionType)===t&&(s++,a.passed&&r++);return s>0?r/s:1}function me(e,t){let s=0;for(let r of e)for(let n of t)r.failureCodes.includes(n)&&s++;return s}function Oe(e,t){let s=[];for(let r of e){let n=r.assertionScores[t];n&&s.push(n.mean)}return s}function w(e){return(e*100).toFixed(1)+"%"}var N=e=>e,fe={bold:N,red:N,green:N,yellow:N,cyan:N,dim:N,greenBold:N,redBold:N};function Zt(e=fe){return{name:"pretty",generate(t,s){let r=[],n=e;r.push(""),r.push(n.bold(" KindLM Test Results")),r.push("");let o=0;for(let c of t.suites){r.push(qt(c,n));for(let f of c.tests){r.push(es(f,n));let y=ts(f,n);y&&r.push(y);for(let m of f.assertions)r.push(ss(m,n));o+=f.costUsd}r.push("")}r.push(n.bold(" Summary"));let a=n.green(`${t.passed} passed`),p=t.failed>0?n.red(`${t.failed} failed`):`${t.failed} failed`,i=t.errored>0?n.yellow(`${t.errored} errored`):`${t.errored} errored`;if(r.push(` ${a}, ${p}, ${i} (${t.totalTests} total)`),r.push(` Duration: ${ke(t.durationMs)}`),o>0&&r.push(` Cost: ${Ie(o)}`),r.push(""),s.gates.length>0){r.push(n.bold(" Quality Gates"));for(let c of s.gates){let f=c.passed?n.green("\u2713"):n.red("\u2717");r.push(` ${f} ${c.message}`)}r.push("")}return t.failed===0&&t.errored===0&&s.passed?r.push(n.greenBold(" \u2713 All tests passed")):r.push(n.redBold(" \u2717 Some tests failed")),r.push(""),{content:r.join(`
|
|
34
|
+
`),format:"text"}}}}function qt(e,t){return` ${e.status==="passed"?t.green("\u2713"):e.status==="skipped"?t.yellow("\u25CB"):t.red("\u2717")} ${t.bold(e.name)}`}function es(e,t){return` ${e.status==="passed"?t.green("\u2713"):e.status==="skipped"?t.yellow("\u25CB"):t.red("\u2717")} ${e.name}`}function ts(e,t){if(e.status==="skipped")return null;let s=[];return e.modelId&&s.push(e.modelId),e.latencyMs>0&&s.push(ke(e.latencyMs)),e.costUsd>=5e-5&&s.push(Ie(e.costUsd)),s.length===0?null:` ${t.dim(s.join(" \xB7 "))}`}function ss(e,t){if(e.passed){let o=_e(e),a=o?`${e.label} ${t.cyan(o)}`:e.label;return` ${t.green("\u2713")} ${t.dim(a)}`}let s=_e(e),r=e.failureMessage??"failed",n=s?`${e.label} ${t.cyan(s)}`:e.label;return` ${t.red("\u2717")} ${n}: ${r}`}function _e(e){if(e.assertionType==="judge"||e.assertionType==="drift"){let t=rs(e);if(t!==null){let s=e.passed?"\u2265":"<";return`(${e.score.toFixed(2)} ${s} ${t.toFixed(2)})`}return`(${e.score.toFixed(2)})`}return""}function rs(e){if(e.metadata&&typeof e.metadata=="object"&&"threshold"in e.metadata){let t=e.metadata.threshold;if(typeof t=="number")return t}if(e.failureMessage){let t=e.failureMessage.match(/threshold (\d+\.?\d*)/i);if(t?.[1])return parseFloat(t[1]);let s=e.failureMessage.match(/below (\d+\.?\d*)/i);if(s?.[1])return parseFloat(s[1])}return null}function ke(e){return e<1e3?`${e}ms`:`${(e/1e3).toFixed(2)}s`}function Ie(e){return e<.01?`$${e.toFixed(4)}`:`$${e.toFixed(2)}`}function os(){return{name:"json",generate(e,t){let s={kindlm:{version:"1.0.0",timestamp:new Date().toISOString()},summary:{totalTests:e.totalTests,passed:e.passed,failed:e.failed,errored:e.errored,skipped:e.skipped,durationMs:e.durationMs},gates:{passed:t.passed,results:t.gates},suites:e.suites.map(r=>({name:r.name,status:r.status,tests:r.tests.map(n=>({name:n.name,status:n.status,assertions:n.assertions,latencyMs:n.latencyMs,costUsd:n.costUsd}))}))};return{content:JSON.stringify(s,null,2),format:"json"}}}}function 
ns(){return{name:"junit",generate(e,t){let s=e.durationMs/1e3,r=[];r.push('<?xml version="1.0" encoding="UTF-8"?>'),r.push(`<testsuites name="KindLM" tests="${e.totalTests}" failures="${e.failed}" errors="${e.errored}" time="${s.toFixed(3)}">`);for(let n of e.suites){let o=n.tests.filter(i=>i.status==="failed").length,a=n.tests.filter(i=>i.status==="errored").length,p=n.tests.reduce((i,l)=>i+l.latencyMs,0)/1e3;r.push(` <testsuite name="${I(n.name)}" tests="${n.tests.length}" failures="${o}" errors="${a}" time="${p.toFixed(3)}">`);for(let i of n.tests){let l=i.latencyMs/1e3;if(r.push(` <testcase name="${I(i.name)}" classname="${I(n.name)}" time="${l.toFixed(3)}">`),i.status==="skipped")r.push(" <skipped/>");else if(i.status==="errored"&&i.error)r.push(` <error message="${I(i.error.message)}" type="${I(i.error.code)}">${I(i.error.message)}</error>`);else if(i.status==="failed"){let c=i.assertions.filter(f=>!f.passed);for(let f of c)r.push(` <failure message="${I(f.label)}" type="${I(f.failureCode??"ASSERTION_FAILED")}">${I(f.failureMessage??"Assertion failed")}</failure>`)}r.push(" </testcase>")}r.push(" </testsuite>")}if(t.gates.length>0){let n=t.gates.filter(o=>!o.passed).length;r.push(` <testsuite name="Quality Gates" tests="${t.gates.length}" failures="${n}" errors="0" time="0.000">`);for(let o of t.gates)r.push(` <testcase name="${I(o.gateName)}" classname="Quality Gates" time="0.000">`),o.passed||r.push(` <failure message="${I(o.message)}" type="GATE_FAILED">${I(o.message)}</failure>`),r.push(" </testcase>");r.push(" </testsuite>")}return r.push("</testsuites>"),{content:r.join(`
|
|
35
|
+
`),format:"xml"}}}}/**
 * I(e): helper the JUnit reporter applies to every name, code and message
 * it interpolates into its XML output. Runs five chained global
 * replacements over the string e, one each for the ampersand, less-than,
 * greater-than, double-quote and single-quote characters.
 *
 * NOTE(review): as captured here each replacement string is the same raw
 * character that was matched, and the double-quote case is three quote
 * characters in a row, which is not a valid string literal. Presumably the
 * XML entity names were decoded when this text was extracted; confirm
 * against the published file before relying on this helper.
 */function I(e){return e.replace(/&/g,"&").replace(/</g,"<").replace(/>/g,">").replace(/"/g,""").replace(/'/g,"'")}/**
 * createHash from the crypto module, bound to the local name as.
 */import{createHash as as}from"crypto";/**
 * is(): factory exported as createComplianceReporter. Returns a reporter
 * object with name compliance and a generate(e, t) method.
 *
 * generate collects the report as an array of Markdown lines:
 *   - a title, a generation timestamp from new Date().toISOString(), and
 *     fixed framework and tool lines (the tool version text is hard-coded);
 *   - sections for Articles 9, 10, 12, 13 and 15; each one except Article 12
 *     is followed by the gate evidence z(t, [...]) for the gate names in the call;
 *   - under Article 12, a metrics table read from e.totalTests, e.passed,
 *     e.failed, e.errored and e.durationMs, then each suite in e.suites
 *     with one bullet per test;
 *   - a Quality Gate Summary table over t.gates, with actual and threshold
 *     formatted by je, and an overall verdict taken from t.passed.
 *
 * generate parameters:
 *   e - run summary; only the fields listed above are read.
 *   t - gate evaluation; reads t.gates and t.passed.
 * generate returns an object {content, format} where format is markdown.
 *
 * NOTE(review): the separator template literals passed to join span several
 * short lines in this capture; presumably the published file has a single
 * newline there. Confirm before editing them.
 */function is(){return{name:"compliance",generate(e,t){let s=new Date().toISOString(),r=[];r.push("# EU AI Act \u2014 Annex IV Compliance Report"),r.push(""),r.push(`**Generated:** ${s}`),r.push("**Framework:** EU AI Act (Regulation 2024/1689)"),r.push("**Tool:** KindLM v1.0.0"),r.push(""),r.push("## Article 9 \u2014 Risk Management System"),r.push(""),r.push("Testing demonstrates ongoing risk identification and mitigation through automated behavioral regression tests."),r.push(""),r.push(z(t,["passRateMin"])),r.push("## Article 10 \u2014 Data and Data Governance"),r.push(""),r.push("PII detection guardrails verify that personal data is not exposed in AI system outputs."),r.push(""),r.push(z(t,["piiFailuresMax","keywordFailuresMax"])),r.push("## Article 12 \u2014 Record-Keeping"),r.push(""),r.push("### Test Execution Log"),r.push(""),r.push("| Metric | Value |"),r.push("|--------|-------|"),r.push(`| Total Tests | ${e.totalTests} |`),r.push(`| Passed | ${e.passed} |`),r.push(`| Failed | ${e.failed} |`),r.push(`| Errored | ${e.errored} |`),r.push(`| Duration | ${e.durationMs}ms |`),r.push(""),r.push("### Suite Results"),r.push("");for(let p of e.suites){r.push(`**${p.name}** \u2014 ${p.status}`);for(let i of p.tests){/* Only the status passed maps to PASS; every other status, including skipped and errored, is listed as FAIL. */let l=i.status==="passed"?"PASS":"FAIL";r.push(`- [${l}] ${i.name}`)}r.push("")}r.push("## Article 13 \u2014 Transparency and Provision of Information"),r.push(""),r.push("This report provides transparent documentation of AI system testing methodology, results, and quality gate evaluations as required under Article 13."),r.push(""),r.push(z(t,["judgeAvgMin","driftScoreMax"])),r.push("## Article 15 \u2014 Accuracy, Robustness and Cybersecurity"),r.push(""),r.push("Schema validation and behavioral assertions verify output accuracy and 
robustness."),r.push(""),r.push(z(t,["schemaFailuresMax","costMaxUsd","latencyMaxMs"])),r.push("## Quality Gate Summary"),r.push(""),r.push("| Gate | Result | Actual | Threshold |"),r.push("|------|--------|--------|-----------|");for(let p of t.gates){let i=p.passed?"PASS":"FAIL";r.push(`| ${p.gateName} | ${i} | ${je(p.actual)} | ${je(p.threshold)} |`)}r.push("");let n=t.passed?"PASS":"FAIL";r.push(`**Overall Verdict:** ${n}`),r.push("");/* Snapshot of the lines pushed so far, joined; this string is the hash input. */let o=r.join(`
|
|
36
|
+
`),/* SHA-256 hex digest of the snapshot only: the separator, the hash line and the trailing empty entry are pushed afterwards and are not part of the digest. */a=as("sha256").update(o).digest("hex");return r.push("---"),r.push(`**Tamper Evidence Hash (SHA-256):** \`${a}\``),r.push(""),{content:r.join(`
|
|
37
|
+
`),format:"markdown"}}}}/**
 * z(e, t): renders the gate evidence list used by the compliance report.
 *
 * Selects the entries of e.gates whose gateName appears in the array t.
 * Returns the empty string when none match. Otherwise returns a bold
 * Gate Evidence heading, one bullet per selected gate showing PASS or FAIL
 * (from gate.passed) followed by gate.message, and a final empty entry,
 * all joined with the separator template literal.
 *
 * NOTE(review): the separator literal spans several short lines in this
 * capture; presumably it is a single newline in the published file.
 */function z(e,t){let s=e.gates.filter(n=>t.includes(n.gateName));if(s.length===0)return"";let r=[];r.push("**Gate Evidence:**");for(let n of s){let o=n.passed?"PASS":"FAIL";r.push(`- [${o}] ${n.message}`)}return r.push(""),r.join(`
|
|
38
|
+
`)}/**
 * je(e): formats a gate value for the summary table. Integers go through
 * String(e); everything else goes through e.toFixed(4).
 * Because toFixed is called directly, a value that is not a number
 * (for example undefined or a string) throws here.
 */function je(e){return Number.isInteger(e)?String(e):e.toFixed(4)}/**
 * U: baseline format version. Written into new baselines by cs, required
 * to match by $e, and exported as BASELINE_VERSION.
 */var U="1";/**
 * Ne(e): exported as serializeBaseline. Returns e as JSON text with
 * two-space indentation.
 */function Ne(e){return JSON.stringify(e,null,2)}/**
 * $e(e): exported as deserializeBaseline. Parses the JSON text e and
 * validates its shape, returning a result built by A (failure) or M
 * (success).
 *
 * Checks, in order:
 *   1. text parses as JSON, else BASELINE_CORRUPT;
 *   2. parsed value is a non-null object, else BASELINE_CORRUPT;
 *   3. version is a string, else BASELINE_CORRUPT;
 *   4. version equals U, else BASELINE_VERSION_MISMATCH;
 *   5. suiteName is a string, createdAt is a string, results is a non-null
 *      object, each else BASELINE_CORRUPT.
 * On success the parsed value itself is wrapped with M.
 *
 * Note: typeof reports object for arrays too, so an array passes checks 2
 * and 5 as far as the object test is concerned.
 */function $e(e){let t;try{t=JSON.parse(e)}catch{return A({code:"BASELINE_CORRUPT",message:"Baseline file is not valid JSON"})}if(typeof t!="object"||t===null)return A({code:"BASELINE_CORRUPT",message:"Baseline file is not a JSON object"});let s=t;return typeof s.version!="string"?A({code:"BASELINE_CORRUPT",message:"Baseline file missing required field: version"}):s.version!==U?A({code:"BASELINE_VERSION_MISMATCH",message:`Baseline version "${s.version}" does not match expected "${U}". Re-run \`kindlm baseline set\` to update.`}):typeof s.suiteName!="string"?A({code:"BASELINE_CORRUPT",message:"Baseline file missing required field: suiteName"}):typeof s.createdAt!="string"?A({code:"BASELINE_CORRUPT",message:"Baseline file missing required field: createdAt"}):typeof s.results!="object"||s.results===null?A({code:"BASELINE_CORRUPT",message:"Baseline file missing required field: results"}):M(t)}/**
 * ls(e, t): exported as readBaseline. Calls t.read(e); when that result
 * has success set, its data is validated with $e, otherwise the failed
 * read result is returned unchanged.
 */function ls(e,t){let s=t.read(e);return s.success?$e(s.data):s}/**
 * us(e, t): exported as writeBaseline. Serializes e with Ne and passes the
 * text to t.write under the key e.suiteName; returns whatever write returns.
 */function us(e,t){let s=Ne(e);return t.write(e.suiteName,s)}/**
 * ps(e): exported as listBaselines. Returns e.list().
 */function ps(e){return e.list()}/**
 * cs(e, t, s): exported as buildBaselineData. Builds the baseline object
 * {version: U, suiteName: e, createdAt: s, results} from the entries of t.
 *
 * Each entry is stored under the key testCaseName::modelId and keeps
 * passRate, failureCodes, latencyAvgMs, totalCostUsd (as costUsd) and
 * runCount. The stored outputText comes from the first run whose
 * assertions all passed, else from the first run, else the empty string.
 */function cs(e,t,s){let r={};for(let n of t){let o=`${n.testCaseName}::${n.modelId}`,p=n.runs.find(i=>i.assertions.every(l=>l.passed))??n.runs[0];r[o]={passRate:n.passRate,outputText:p?.outputText??"",failureCodes:n.failureCodes,latencyAvgMs:n.latencyAvgMs,costUsd:n.totalCostUsd,runCount:n.runCount}}return{version:U,suiteName:e,createdAt:s,results:r}}/**
 * ds(e, t): exported as compareBaseline. Compares the current results t
 * with the baseline e. t is walked with Object.entries and each key is
 * looked up in e.results.
 *   - key missing from e.results: recorded in newTests;
 *   - passRate lower than baseline by more than 0.001: regression, with
 *     newFailureCodes holding the current codes absent from the baseline;
 *   - passRate higher than baseline by more than 0.001: improvement;
 *   - otherwise: unchanged.
 * Baseline keys that never appear in t are returned as removedTests.
 * hasBaseline is always true in the returned object.
 */function ds(e,t){let s=[],r=[],n=[],o=[],a=[],p=new Set(Object.keys(e.results));for(let[i,l]of Object.entries(t)){let c=e.results[i];if(!c){o.push(i);continue}p.delete(i);let f=l.passRate-c.passRate;if(f<-.001){let y=l.failureCodes.filter(m=>!c.failureCodes.includes(m));s.push({testName:i,baselinePassRate:c.passRate,currentPassRate:l.passRate,newFailureCodes:y})}else 
f>.001?r.push({testName:i,baselinePassRate:c.passRate,currentPassRate:l.passRate}):n.push({testName:i,passRate:l.passRate})}for(let i of p)a.push(i);return{suiteName:e.suiteName,hasBaseline:!0,regressions:s,improvements:r,unchanged:n,newTests:o,removedTests:a}}/**
 * ms(e): exported as parseOtlpPayload. Flattens a payload with a
 * resourceSpans array into a flat list of span records.
 *
 * Returns A with code CONFIG_PARSE_ERROR when e is falsy, not an object,
 * or has no resourceSpans array; otherwise M(list).
 * Resource entries whose scopeSpans is not an array, and scope entries
 * whose spans is not an array, are skipped silently.
 * Each record carries ids, name, kind, start and end times in milliseconds
 * (via De), durationMs as end minus start, span attributes and the
 * attributes of the enclosing resource (both via Le), and status code and
 * message. A falsy parentSpanId is stored as undefined.
 *
 * Note: De throws when BigInt cannot convert a timestamp (for example when
 * it is undefined). There is no try or catch here, so that exception
 * propagates to the caller and is not turned into an A result.
 */function ms(e){if(!e||typeof e!="object")return A({code:"CONFIG_PARSE_ERROR",message:"OTLP payload must be a non-null object"});let t=e;if(!Array.isArray(t.resourceSpans))return A({code:"CONFIG_PARSE_ERROR",message:"OTLP payload missing resourceSpans array"});let s=[];for(let r of t.resourceSpans){let n=Le(r.resource?.attributes??[]);if(Array.isArray(r.scopeSpans)){for(let o of r.scopeSpans)if(Array.isArray(o.spans))for(let a of o.spans){let p=De(a.startTimeUnixNano),i=De(a.endTimeUnixNano);s.push({traceId:a.traceId,spanId:a.spanId,parentSpanId:a.parentSpanId||void 0,name:a.name,kind:a.kind,startTimeMs:p,endTimeMs:i,durationMs:i-p,attributes:Le(a.attributes??[]),resourceAttributes:n,statusCode:a.status?.code,statusMessage:a.status?.message})}}}return M(s)}/**
 * De(e): converts a nanosecond timestamp to milliseconds. e is passed to
 * BigInt, divided by 1000000n (BigInt division drops the remainder) and
 * converted to a Number.
 */function De(e){let t=BigInt(e);return Number(t/1000000n)}/**
 * Le(e): turns a list of attribute entries with key and value fields into
 * a plain object. Each value is unwrapped with fs; entries whose unwrapped
 * value is undefined are left out.
 */function Le(e){let t={};for(let s of e){let r=fs(s.value);r!==void 0&&(t[s.key]=r)}return t}/**
 * fs(e): unwraps a typed attribute value. Returns the first field that is
 * not undefined, tested in this order: stringValue, intValue (parsed with
 * parseInt base 10), doubleValue, boolValue. Returns undefined when none
 * is set. e itself must be an object; an undefined e throws.
 */function fs(e){if(e.stringValue!==void 0)return e.stringValue;if(e.intValue!==void 0)return parseInt(e.intValue,10);if(e.doubleValue!==void 0)return e.doubleValue;if(e.boolValue!==void 0)return e.boolValue}/**
 * gs(e, t): exported as filterSpans. Returns e itself when t is falsy;
 * otherwise a filtered copy keeping the spans that satisfy every
 * condition present in t:
 *   - namePattern: the span name matches new RegExp(namePattern);
 *   - attributeMatch: for each pair, String(span.attributes[key]) is
 *     strictly equal to the expected value;
 *   - minDurationMs: span.durationMs is not below it.
 * The RegExp is constructed inside the filter callback, once per span.
 */function gs(e,t){return t?e.filter(s=>{if(t.namePattern&&!new RegExp(t.namePattern).test(s.name))return!1;if(t.attributeMatch){for(let[r,n]of Object.entries(t.attributeMatch))if(String(s.attributes[r])!==n)return!1}return!(t.minDurationMs!==void 0&&s.durationMs<t.minDurationMs)}):e}/**
 * hs(e, t): exported as mapSpansToResult. Folds the spans e into one
 * result object, reading attribute names from the mapping t
 * (outputTextAttr, modelAttr, systemAttr, inputTokensAttr,
 * outputTokensAttr).
 *
 * For each span the resource attributes and span attributes are merged,
 * span attributes winning on a key clash. Then:
 *   - non-empty string output text is appended to outputText, separated by
 *     the template literal that follows;
 *   - non-empty string model and system values overwrite earlier ones;
 *   - numeric token counts are summed;
 *   - durationMs is added to latencyMs only for spans with a falsy parentSpanId;
 *   - a span with a string gen_ai.tool.name attribute yields a tool call.
 */function hs(e,t){let s="",r=[],n=0,o=0,a=0,p,i;for(let l of e){let c={...l.resourceAttributes,...l.attributes},f=c[t.outputTextAttr];typeof f=="string"&&f&&(s=s?`${s}
|
|
39
|
+
${f}`:f);let y=c[t.modelAttr];typeof y=="string"&&y&&(p=y);let m=c[t.systemAttr];typeof m=="string"&&m&&(i=m);let x=c[t.inputTokensAttr];typeof x=="number"&&(o+=x);let C=c[t.outputTokensAttr];typeof C=="number"&&(a+=C),l.parentSpanId||(n+=l.durationMs);let b=c["gen_ai.tool.name"],d=c["gen_ai.tool.arguments"];if(typeof b=="string"){let h={};if(typeof d=="string")try{h=JSON.parse(d)}catch{/* Unparsable argument JSON is ignored on purpose; arguments stays an empty object. */}r.push({id:l.spanId,name:b,arguments:h})}}return{outputText:s,toolCalls:r,latencyMs:n,inputTokens:o,outputTokens:a,model:p,system:i}}/**
 * Rs(e, t): exported as buildContextFromTrace. Builds a context object
 * from a mapped trace result e and options t.
 *
 * Tool calls are copied into fresh objects holding only id, name and
 * arguments. outputText and latencyMs come from e; configDir,
 * judgeAdapter, judgeModel and baselineText come from t.
 */function Rs(e,t){let s=e.toolCalls.map(r=>({id:r.id,name:r.name,arguments:r.arguments}));return{outputText:e.outputText,toolCalls:s,configDir:t.configDir,latencyMs:e.latencyMs,judgeAdapter:t.judgeAdapter,judgeModel:t.judgeModel,baselineText:t.baselineText}}/**
 * Public exports of the bundle: each minified local binding on the left
 * of as is exported under the descriptive name on the right.
 */export{U as BASELINE_VERSION,Ce as KindLMConfigSchema,g as ProviderError,Te as SpanFilterSchema,xe as SpanMappingSchema,le as TraceConfigSchema,ce as aggregateRuns,cs as buildBaselineData,Rs as buildContextFromTrace,D as classifyAssertion,ds as compareBaseline,se as createAnthropicAdapter,Qe as createAssertionRegistry,H as createAssertionsFromExpect,ae as createCohereAdapter,is as createComplianceReporter,ee as createCostAssertion,K as createDriftAssertion,oe as createGeminiAdapter,os as createJsonReporter,V as createJudgeAssertion,ns as createJunitReporter,Z as createKeywordsAbsentAssertion,Q as createKeywordsPresentAssertion,q as createLatencyAssertion,ne as createMistralAdapter,re as createOllamaAdapter,te as createOpenAIAdapter,J as createPiiAssertion,Zt as createPrettyReporter,bt as createProvider,Ht as createRunner,G as createSchemaAssertion,B as createToolCalledAssertion,W as createToolNotCalledAssertion,X as createToolOrderAssertion,$e as deserializeBaseline,A as err,Qt as evaluateGates,gs as filterSpans,Se as findMissingVars,F as interpolate,qe as isDeterministic,et as isProbabilistic,ps as listBaselines,hs as mapSpansToResult,fe as noColor,M as ok,de as parseCommandOutput,Vt as 
parseConfig,ms as parseOtlpPayload,ls as readBaseline,ie as runConversation,Ne as serializeBaseline,pe as validateConfig,_ as withRetry,us as writeBaseline};
|
|
40
|
+
//# sourceMappingURL=index.js.map
|