@kindlm/core 0.2.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +108 -0
- package/dist/index.cjs +15 -14
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +245 -100
- package/dist/index.d.ts +245 -100
- package/dist/index.js +15 -14
- package/dist/index.js.map +1 -1
- package/package.json +5 -4
package/dist/index.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
var
|
|
1
|
+
var h=class extends Error{constructor(s,n,o,r=!1,a){super(n);this.code=s;this.statusCode=o;this.retryable=r;this.raw=a;this.name="ProviderError"}};function k(e){return{success:!0,data:e}}function C(e){return{success:!1,error:e}}function Pe(e){return typeof e=="object"&&e!==null&&!Array.isArray(e)}function Z(e,t){if(Object.is(e,t))return!0;if(Array.isArray(t)){if(!Array.isArray(e)||e.length!==t.length)return!1;for(let s=0;s<e.length;s++)if(!Z(e[s],t[s]))return!1;return!0}if(Pe(t)){if(!Pe(e))return!1;for(let[s,n]of Object.entries(t))if(!(s in e)||!Z(e[s],n))return!1;return!0}return!1}function we(e,t){for(let[s,n]of Object.entries(t))if(!(s in e)||!Z(e[s],n))return!1;return!0}function Me(e,t,s,n,o,r){if(!s)return;if(!o.validateJsonSchema){r.push({assertionType:t,label:`Tool "${e}" args schema valid`,passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:"argsSchema provided but no JSON Schema validator was injected"});return}let a=typeof s=="string"?JSON.parse(s):s,c=n.some(i=>{let l=o.validateJsonSchema;return l?l(a,i.arguments).valid:!1});r.push({assertionType:t,label:`Tool "${e}" args schema valid`,passed:c,score:c?1:0,failureCode:c?void 0:"TOOL_CALL_ARGS_SCHEMA_INVALID",failureMessage:c?void 0:`Tool "${e}" arguments did not match argsSchema`})}function q(e,t,s){return{type:"tool_called",evaluate(n){let o=[],r=n.toolCalls.filter(c=>c.name===e),a=n.toolCalls.map(c=>c.name);if(r.length===0)return o.push({assertionType:"tool_called",label:`Tool "${e}" called`,passed:!1,score:0,failureCode:"TOOL_CALL_MISSING",failureMessage:`Expected tool "${e}" to be called, but got: [${a.join(", ")}]`}),Promise.resolve(o);if(o.push({assertionType:"tool_called",label:`Tool "${e}" called`,passed:!0,score:1}),t){let c=r.some(i=>we(i.arguments,t));o.push({assertionType:"tool_called",label:`Tool "${e}" args match`,passed:c,score:c?1:0,failureCode:c?void 0:"TOOL_CALL_ARGS_MISMATCH",failureMessage:c?void 0:`Expected args ${JSON.stringify(t)}, got 
${JSON.stringify(r[0]?.arguments)}`})}return Me(e,"tool_called",s,r,n,o),Promise.resolve(o)}}}function ee(e){return{type:"tool_not_called",evaluate(t){let s=t.toolCalls.some(n=>n.name===e);return Promise.resolve([{assertionType:"tool_not_called",label:`Tool "${e}" not called`,passed:!s,score:s?0:1,failureCode:s?"TOOL_CALL_UNEXPECTED":void 0,failureMessage:s?`Expected tool "${e}" to NOT be called, but it was`:void 0}])}}}function te(e){return{type:"tool_order",evaluate(t){let s=[];for(let n of e){if(n.shouldNotCall){let r=t.toolCalls.some(a=>a.name===n.tool);s.push({assertionType:"tool_order",label:`Tool "${n.tool}" not called`,passed:!r,score:r?0:1,failureCode:r?"TOOL_CALL_UNEXPECTED":void 0,failureMessage:r?`Expected tool "${n.tool}" to NOT be called, but it was`:void 0});continue}let o=t.toolCalls.filter(r=>r.name===n.tool);if(o.length===0){s.push({assertionType:"tool_order",label:`Tool "${n.tool}" called`,passed:!1,score:0,failureCode:"TOOL_CALL_MISSING",failureMessage:`Expected tool "${n.tool}" to be called, but it was not`});continue}if(s.push({assertionType:"tool_order",label:`Tool "${n.tool}" called`,passed:!0,score:1}),n.argsMatch){let r=o.some(a=>we(a.arguments,n.argsMatch??{}));s.push({assertionType:"tool_order",label:`Tool "${n.tool}" args match`,passed:r,score:r?1:0,failureCode:r?void 0:"TOOL_CALL_ARGS_MISMATCH",failureMessage:r?void 0:`Expected args ${JSON.stringify(n.argsMatch)}, got ${JSON.stringify(o[0]?.arguments)}`})}if(Me(n.tool,"tool_order",n.argsSchema,o,t,s),n.order!==void 0){let a=t.toolCalls[n.order]?.name===n.tool,c=t.toolCalls.findIndex(i=>i.name===n.tool);s.push({assertionType:"tool_order",label:`Tool "${n.tool}" at position ${n.order}`,passed:a,score:a?1:0,failureCode:a?void 0:"TOOL_CALL_ORDER_WRONG",failureMessage:a?void 0:`Expected "${n.tool}" at position ${n.order}, but found at position ${c}`})}}return Promise.resolve(s)}}}function re(e){let t;async function s(){if(t)return t;let r=await import("ajv"),a=await 
import("ajv-formats"),c=r.default,i=a.default,l=new c({allErrors:!0,strict:!1});return i(l),t=l,l}let n=new Map;function o(r,a){let c=JSON.stringify(a),i=n.get(c);if(i)return i;try{let l=r.compile(a);return n.set(c,l),l}catch(l){return{compileError:l instanceof Error?l.message:String(l)}}}return{type:"schema",async evaluate(r){let a=[],c;if(e.format==="json")try{c=JSON.parse(r.outputText),a.push({assertionType:"schema",label:"Output is valid JSON",passed:!0,score:1})}catch(i){return a.push({assertionType:"schema",label:"Output is valid JSON",passed:!1,score:0,failureCode:"SCHEMA_PARSE_ERROR",failureMessage:`Failed to parse output as JSON: ${i instanceof Error?i.message:String(i)}`}),a}if(e.schemaContent){let i=await s(),l=o(i,e.schemaContent);if("compileError"in l)a.push({assertionType:"schema",label:"Output matches JSON Schema",passed:!1,score:0,failureCode:"SCHEMA_INVALID",failureMessage:`Schema compilation failed: ${l.compileError}`});else{let p=l,f=p(c??r.outputText);a.push({assertionType:"schema",label:"Output matches JSON Schema",passed:f,score:f?1:0,failureCode:f?void 0:"SCHEMA_INVALID",failureMessage:f?void 0:`Schema validation failed: ${i.errorsText(p.errors)}`,metadata:f?void 0:{errors:p.errors}})}}if(e.contains){let i=r.outputText.toLowerCase();for(let l of e.contains){let p=i.includes(l.toLowerCase());a.push({assertionType:"schema",label:`Output contains "${l}"`,passed:p,score:p?1:0,failureCode:p?void 0:"CONTAINS_FAILED",failureMessage:p?void 0:`Expected output to contain "${l}"`})}}if(e.notContains){let i=r.outputText.toLowerCase();for(let l of e.notContains){let p=i.includes(l.toLowerCase());a.push({assertionType:"schema",label:`Output does not contain "${l}"`,passed:!p,score:p?0:1,failureCode:p?"NOT_CONTAINS_FAILED":void 0,failureMessage:p?`Expected output to NOT contain "${l}"`:void 0})}}if(e.maxLength!==void 0){let i=r.outputText.length<=e.maxLength;a.push({assertionType:"schema",label:`Output length <= 
${e.maxLength}`,passed:i,score:i?1:0,failureCode:i?void 0:"MAX_LENGTH_EXCEEDED",failureMessage:i?void 0:`Output length ${r.outputText.length} exceeds max ${e.maxLength}`})}return a}}}function rt(e){return e.length<=4?"*".repeat(e.length):e.slice(0,2)+"*".repeat(e.length-4)+e.slice(-2)}var st=/(\+|\*|\{[^}]+\})\)?(\+|\*|\{[^}]+\})/;function X(e){return st.test(e)}function nt(e,t,s){let n=[],o=Date.now();e.lastIndex=0;let r;for(;(r=e.exec(t))!==null&&(n.push(r[0]),!(n.length>=s||Date.now()-o>100)););return n}function se(e){let t=[],s;for(let n=0;n<e.denyPatterns.length;n++){let o=e.denyPatterns[n];if(o!==void 0){if(X(o)){s=[{assertionType:"pii",label:"No PII detected",passed:!1,score:0,failureCode:"INVALID_PATTERN",failureMessage:`Deny pattern "pii-pattern-${n+1}" contains nested quantifiers and may cause catastrophic backtracking`}];break}t.push({name:`pii-pattern-${n+1}`,regex:new RegExp(o,"gi")})}}if(e.customPatterns)for(let n of e.customPatterns){if(X(n.pattern)){s=[{assertionType:"pii",label:"No PII detected",passed:!1,score:0,failureCode:"INVALID_PATTERN",failureMessage:`Custom pattern "${n.name}" contains nested quantifiers and may cause catastrophic backtracking`}];break}t.push({name:n.name,regex:new RegExp(n.pattern,"gi")})}return{type:"pii",evaluate(n){if(s)return Promise.resolve(s);let o=[],r=0,a=Date.now();for(let{name:i,regex:l}of t){if(r>=1e3||Date.now()-a>500)break;let p=1e3-r,f=nt(l,n.outputText,p);for(let y of f)o.push({name:i,redacted:rt(y)});r+=f.length}let c=o.length===0;return Promise.resolve([{assertionType:"pii",label:"No PII detected",passed:c,score:c?1:0,failureCode:c?void 0:"PII_DETECTED",failureMessage:c?void 0:`Found ${o.length} PII match(es): ${o.map(i=>`${i.name}=${i.redacted}`).join(", ")}`,metadata:c?void 0:{matches:o}}])}}}function ne(e){return{type:"keywords_present",evaluate(t){let s=t.outputText.toLowerCase(),n=e.some(o=>s.includes(o.toLowerCase()));return Promise.resolve([{assertionType:"keywords_present",label:"Required keyword 
present",passed:n,score:n?1:0,failureCode:n?void 0:"KEYWORD_MISSING",failureMessage:n?void 0:`Expected at least one of [${e.join(", ")}] in output`}])}}}function oe(e){return{type:"keywords_absent",evaluate(t){let s=t.outputText.toLowerCase(),n=[];for(let o of e){let r=s.includes(o.toLowerCase());n.push({assertionType:"keywords_absent",label:`Keyword "${o}" absent`,passed:!r,score:r?0:1,failureCode:r?"KEYWORD_DENIED":void 0,failureMessage:r?`Denied keyword "${o}" found in output`:void 0})}return Promise.resolve(n)}}}function K(e,t){return typeof e!="number"?{ok:!1,reason:`${t} must be a number`}:Number.isFinite(e)?e<0||e>1?{ok:!1,reason:`${t} must be between 0 and 1 inclusive`}:{ok:!0,score:e}:{ok:!1,reason:`${t} must be a finite number`}}var ot=`You are an impartial AI judge evaluating an AI assistant's response.
|
|
2
2
|
You will be given:
|
|
3
3
|
- The assistant's response
|
|
4
4
|
- Evaluation criteria
|
|
@@ -7,14 +7,14 @@ You will be given:
|
|
|
7
7
|
Score the response from 0.0 to 1.0 based on how well it meets the criteria.
|
|
8
8
|
|
|
9
9
|
Respond ONLY with a JSON object in this exact format:
|
|
10
|
-
{"score": <number between 0.0 and 1.0>, "reasoning": "<brief explanation>"}`;function
|
|
10
|
+
{"score": <number between 0.0 and 1.0>, "reasoning": "<brief explanation>"}`;function at(e,t,s){let n=`## Assistant Response
|
|
11
11
|
${e}
|
|
12
12
|
|
|
13
13
|
## Criteria
|
|
14
|
-
${t}`;return s&&(
|
|
14
|
+
${t}`;return s&&(n+=`
|
|
15
15
|
|
|
16
16
|
## Rubric
|
|
17
|
-
${s}`),
|
|
17
|
+
${s}`),n}function it(e){let t=e.match(/```(?:json)?\s*([\s\S]*?)```/)??e.match(/(\{[\s\S]*\})/);if(!t?.[1])return{ok:!1,reason:"No JSON object found in judge response"};let s;try{s=JSON.parse(t[1])}catch{return{ok:!1,reason:"Invalid JSON in judge response"}}let n=K(s.score,"score");return n.ok?typeof s.reasoning!="string"?{ok:!1,reason:"reasoning must be a string"}:{ok:!0,score:n.score,reasoning:s.reasoning}:{ok:!1,reason:n.reason}}function ae(e){return{type:"judge",async evaluate(t){if(!t.judgeAdapter||!t.judgeModel)return[{assertionType:"judge",label:`Judge: ${e.criteria}`,passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:"Judge assertion requires judgeAdapter and judgeModel in context"}];let s;try{s=await t.judgeAdapter.complete({model:e.model??t.judgeModel,messages:[{role:"system",content:ot},{role:"user",content:at(t.outputText,e.criteria,e.rubric)}],params:{temperature:0,maxTokens:512}})}catch(r){return[{assertionType:"judge",label:`Judge: ${e.criteria}`,passed:!1,score:0,failureCode:"JUDGE_EVAL_ERROR",failureMessage:`Judge adapter error: ${r instanceof Error?r.message:String(r)}`}]}let n=it(s.text);if(!n.ok)return[{assertionType:"judge",label:`Judge: ${e.criteria}`,passed:!1,score:0,failureCode:"JUDGE_PARSE_ERROR",failureMessage:`Failed to parse judge response: ${n.reason}`}];let o=n.score>=e.minScore;return[{assertionType:"judge",label:`Judge: ${e.criteria}`,passed:o,score:n.score,failureCode:o?void 0:"JUDGE_BELOW_THRESHOLD",failureMessage:o?void 0:`Score ${n.score} below threshold ${e.minScore}: ${n.reasoning}`,metadata:{reasoning:n.reasoning,threshold:e.minScore}}]}}}var lt=`You are an impartial AI judge comparing two AI assistant responses.
|
|
18
18
|
You will be given a baseline response and a new response.
|
|
19
19
|
Evaluate how much the new response has drifted from the baseline.
|
|
20
20
|
|
|
@@ -24,17 +24,18 @@ Score from 0.0 to 1.0 where:
|
|
|
24
24
|
- 1.0 = completely different meaning or contradictory
|
|
25
25
|
|
|
26
26
|
Respond ONLY with a JSON object in this exact format:
|
|
27
|
-
{"driftScore": <number between 0.0 and 1.0>, "reasoning": "<brief explanation>"}`;function
|
|
27
|
+
{"driftScore": <number between 0.0 and 1.0>, "reasoning": "<brief explanation>"}`;function ut(e){let t=e.match(/```(?:json)?\s*([\s\S]*?)```/)??e.match(/(\{[\s\S]*\})/);if(!t?.[1])return{ok:!1,reason:"No JSON object found in drift judge response"};let s;try{s=JSON.parse(t[1])}catch{return{ok:!1,reason:"Invalid JSON in drift judge response"}}let n=K(s.driftScore,"driftScore");return n.ok?typeof s.reasoning!="string"?{ok:!1,reason:"reasoning must be a string"}:{ok:!0,driftScore:n.score,reasoning:s.reasoning}:{ok:!1,reason:n.reason}}function Se(e,t){let s=t.split("."),n=e;for(let o of s){if(n==null||typeof n!="object")return;n=n[o]}return n}function ct(e,t,s){let n,o;try{n=JSON.parse(e),o=JSON.parse(t)}catch{return{driftScore:1,mismatched:["(parse error)"]}}let r=[];for(let c of s){let i=Se(n,c),l=Se(o,c);JSON.stringify(i)!==JSON.stringify(l)&&r.push(c)}return{driftScore:s.length>0?r.length/s.length:0,mismatched:r}}function Oe(e,t){if(e.length!==t.length||e.length===0)return 0;let s=0,n=0,o=0;for(let a=0;a<e.length;a++){let c=e[a]??0,i=t[a]??0;s+=c*i,n+=c*c,o+=i*i}let r=Math.sqrt(n)*Math.sqrt(o);return r===0?0:s/r}function ie(e){return{type:"drift",async evaluate(t){if(!t.baselineText)return[{assertionType:"drift",label:"Drift check",passed:!1,score:0,failureCode:"DRIFT_EXCEEDED",failureMessage:"No baseline available \u2014 run `kindlm baseline set` first",metadata:{reason:"No baseline available"}}];if(e.method==="embedding"){if(!t.getEmbedding)return[{assertionType:"drift",label:"Drift check (embedding)",passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:"Drift embedding method requires getEmbedding in context"}];let a,c;try{[a,c]=await Promise.all([t.getEmbedding(t.baselineText),t.getEmbedding(t.outputText)])}catch(y){return[{assertionType:"drift",label:"Drift check (embedding)",passed:!1,score:0,failureCode:"JUDGE_EVAL_ERROR",failureMessage:`Embedding error: ${y instanceof Error?y.message:String(y)}`}]}let 
i=Oe(a,c),l=1-i,p=i,f=l<=e.maxScore;return[{assertionType:"drift",label:"Drift check (embedding)",passed:f,score:p,failureCode:f?void 0:"DRIFT_EXCEEDED",failureMessage:f?void 0:`Drift score ${l.toFixed(3)} exceeds max ${e.maxScore}`,metadata:{driftScore:l,similarity:i,threshold:e.maxScore}}]}if(e.method==="field-diff"){let a=e.fields??[],{driftScore:c,mismatched:i}=ct(t.baselineText,t.outputText,a),l=1-c,p=c<=e.maxScore;return[{assertionType:"drift",label:"Drift check (field-diff)",passed:p,score:l,failureCode:p?void 0:"DRIFT_EXCEEDED",failureMessage:p?void 0:`Drift score ${c.toFixed(2)} exceeds max ${e.maxScore}. Mismatched: [${i.join(", ")}]`,metadata:{driftScore:c,mismatched:i,threshold:e.maxScore}}]}if(!t.judgeAdapter||!t.judgeModel)return[{assertionType:"drift",label:"Drift check (judge)",passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:"Drift judge method requires judgeAdapter and judgeModel in context"}];let s;try{s=await t.judgeAdapter.complete({model:t.judgeModel,messages:[{role:"system",content:lt},{role:"user",content:`## Baseline Response
|
|
28
28
|
${t.baselineText}
|
|
29
29
|
|
|
30
30
|
## New Response
|
|
31
|
-
${t.outputText}`}],params:{temperature:0,maxTokens:512}}),r=Xe(s.text);if(!r)return[{assertionType:"drift",label:"Drift check (judge)",passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:`Failed to parse drift judge response: ${s.text.slice(0,200)}`}];let n=1-r.driftScore,o=r.driftScore<=e.maxScore;return[{assertionType:"drift",label:"Drift check (judge)",passed:o,score:n,failureCode:o?void 0:"DRIFT_EXCEEDED",failureMessage:o?void 0:`Drift score ${r.driftScore.toFixed(2)} exceeds max ${e.maxScore}: ${r.reasoning}`,metadata:{driftScore:r.driftScore,reasoning:r.reasoning}}]}}}function q(e){return{type:"latency",evaluate(t){let s=t.latencyMs??0,r=s<=e.maxMs;return Promise.resolve([{assertionType:"latency",label:`Latency <= ${e.maxMs}ms`,passed:r,score:r?1:0,failureCode:r?void 0:"PROVIDER_TIMEOUT",failureMessage:r?void 0:`Latency ${s}ms exceeds max ${e.maxMs}ms`,metadata:{latencyMs:s}}])}}}function ee(e){return{type:"cost",evaluate(t){let s=t.costUsd??0,r=s<=e.maxUsd;return Promise.resolve([{assertionType:"cost",label:`Cost <= $${e.maxUsd}`,passed:r,score:r?1:0,failureCode:r?void 0:"INTERNAL_ERROR",failureMessage:r?void 0:`Cost $${s.toFixed(4)} exceeds max $${e.maxUsd}`,metadata:{costUsd:s}}])}}}function H(e,t){let s=[];if(e.toolCalls)if(e.toolCalls.some(n=>n.order!==void 0))s.push(X(e.toolCalls));else for(let n of e.toolCalls)n.shouldNotCall?s.push(W(n.tool)):s.push(B(n.tool,n.argsMatch??void 0));if(e.output&&s.push(G({format:e.output.format,schemaFile:e.output.schemaFile,schemaContent:t?.schemaContent,contains:e.output.contains,notContains:e.output.notContains,maxLength:e.output.maxLength})),e.guardrails?.pii&&s.push(J({denyPatterns:e.guardrails.pii.denyPatterns,customPatterns:e.guardrails.pii.customPatterns})),e.guardrails?.keywords&&(e.guardrails.keywords.allow&&e.guardrails.keywords.allow.length>0&&s.push(Q(e.guardrails.keywords.allow)),e.guardrails.keywords.deny.length>0&&s.push(Z(e.guardrails.keywords.deny))),e.judge)for(let r of 
e.judge)s.push(V({criteria:r.criteria,minScore:r.minScore,rubric:r.rubric}));return e.baseline?.drift&&s.push(K({maxScore:e.baseline.drift.maxScore,method:e.baseline.drift.method,fields:e.baseline.drift.fields})),s}function Qe(){return new Map([["tool_called",e=>B(e.toolCalls?.[0]?.tool??"")],["schema",e=>G(e.output??{format:"text"})],["pii",e=>J(e.guardrails?.pii??{denyPatterns:[]})],["judge",e=>V(e.judge?.[0]??{criteria:"",minScore:.7})],["drift",e=>K(e.baseline?.drift??{maxScore:.15,method:"judge"})],["latency",()=>q({maxMs:6e4})],["cost",()=>ee({maxUsd:1})]])}var Ze=new Set(["judge","drift"]);function D(e){return Ze.has(e)?"probabilistic":"deterministic"}function qe(e){return D(e)==="deterministic"}function et(e){return D(e)==="probabilistic"}async function _(e,t){let{maxRetries:s,shouldRetry:r,baseDelayMs:n=500}=t,o;for(let a=0;a<=s;a++)try{return await e()}catch(p){if(o=p,a>=s||!r(p))throw p;let i=n*Math.pow(2,a);await tt(i)}throw o}function tt(e){return new Promise(t=>setTimeout(t,e))}var st={"gpt-4o":{input:2.5,output:10},"gpt-4o-mini":{input:.15,output:.6},"gpt-4-turbo":{input:10,output:30},"o3-mini":{input:1.1,output:4.4}};function rt(e){switch(e){case"stop":return"stop";case"length":return"max_tokens";case"tool_calls":return"tool_calls";default:return"unknown"}}function ot(e,t){let s=typeof t=="object"&&t!==null&&"error"in t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 401:return new g("AUTH_FAILED",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new g("CONTEXT_LENGTH",s,e,!1,t):new g("PROVIDER_ERROR",s,e,!1,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function te(e){let t="",s="https://api.openai.com/v1",r,n=6e4,o=2;return{name:"openai",async initialize(a){if(!a.apiKey)throw new g("AUTH_FAILED","API key is 
required");t=a.apiKey,a.baseUrl&&(s=a.baseUrl),r=a.organization,n=a.timeoutMs,o=a.maxRetries},async complete(a){let p={model:a.model,messages:a.messages.map(d=>d.role==="tool"?{role:"tool",content:d.content,tool_call_id:d.toolCallId}:d.role==="assistant"&&d.toolCalls&&d.toolCalls.length>0?{role:"assistant",content:d.content||null,tool_calls:d.toolCalls.map(h=>({id:h.id,type:"function",function:{name:h.name,arguments:JSON.stringify(h.arguments)}}))}:{role:d.role,content:d.content}),temperature:a.params.temperature,max_tokens:a.params.maxTokens};a.params.topP!==void 0&&(p.top_p=a.params.topP),a.params.seed!==void 0&&(p.seed=a.params.seed),a.params.stopSequences&&(p.stop=a.params.stopSequences),a.tools&&a.tools.length>0&&(p.tools=a.tools.map(d=>({type:"function",function:{name:d.name,description:d.description,parameters:d.parameters}})),a.toolChoice&&(p.tool_choice=a.toolChoice));let i={"Content-Type":"application/json",Authorization:`Bearer ${t}`};r&&(i["OpenAI-Organization"]=r);let l=Date.now(),c=await _(()=>e.fetch(`${s}/chat/completions`,{method:"POST",headers:i,body:JSON.stringify(p),timeoutMs:n}),{maxRetries:o,shouldRetry:d=>d instanceof g&&d.retryable}),f;try{f=await c.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from OpenAI API",c.status,c.status>=500)}let y=Date.now()-l;if(!c.ok)throw ot(c.status,f);let m=f,x=m.choices?.[0],C=x?.message,b=(C?.tool_calls??[]).map(d=>{let h;try{h=JSON.parse(d.function.arguments||"{}")}catch{h={_raw:d.function.arguments}}return{id:d.id,name:d.function.name,arguments:h}});return{text:C?.content??"",toolCalls:b,usage:{inputTokens:m.usage?.prompt_tokens??0,outputTokens:m.usage?.completion_tokens??0,totalTokens:m.usage?.total_tokens??0},raw:f,latencyMs:y,modelId:m.model??a.model,finishReason:rt(x?.finish_reason)}},estimateCost(a,p){let i=st[a];return i?p.inputTokens/1e6*i.input+p.outputTokens/1e6*i.output:null},supportsTools(a){return!a.startsWith("o1-")}}}var 
nt={"claude-opus-4-5-20250929":{input:15,output:75},"claude-sonnet-4-5-20250929":{input:3,output:15},"claude-haiku-4-5-20251001":{input:.8,output:4}};function at(e){switch(e){case"end_turn":return"stop";case"max_tokens":return"max_tokens";case"tool_use":return"tool_calls";default:return"unknown"}}function it(e,t){let s=typeof t=="object"&&t!==null&&"error"in t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 401:return new g("AUTH_FAILED",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new g("CONTEXT_LENGTH",s,e,!1,t):new g("PROVIDER_ERROR",s,e,!1,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function se(e){let t="",s="https://api.anthropic.com",r=6e4,n=2;return{name:"anthropic",async initialize(o){if(!o.apiKey)throw new g("AUTH_FAILED","API key is required");t=o.apiKey,o.baseUrl&&(s=o.baseUrl),r=o.timeoutMs,n=o.maxRetries},async complete(o){let a=o.messages.find(h=>h.role==="system"),p=o.messages.filter(h=>h.role!=="system").map(h=>{if(h.role==="tool")return{role:"user",content:[{type:"tool_result",tool_use_id:h.toolCallId,content:h.content}]};if(h.role==="assistant"){if(h.toolCalls&&h.toolCalls.length>0){let R=[];h.content&&R.push({type:"text",text:h.content});for(let P of h.toolCalls)R.push({type:"tool_use",id:P.id,name:P.name,input:P.arguments});return{role:"assistant",content:R}}return{role:"assistant",content:h.content}}return{role:"user",content:h.content}}),i={model:o.model,max_tokens:o.params.maxTokens,messages:p};a&&(i.system=a.content),o.params.temperature!==void 0&&(i.temperature=o.params.temperature),o.params.topP!==void 
0&&(i.top_p=o.params.topP),o.params.stopSequences&&(i.stop_sequences=o.params.stopSequences),o.tools&&o.tools.length>0&&(i.tools=o.tools.map(h=>({name:h.name,description:h.description??"",input_schema:h.parameters??{type:"object",properties:{}}})),o.toolChoice&&(i.tool_choice=o.toolChoice==="required"?{type:"any"}:{type:o.toolChoice}));let l=Date.now(),c=await _(()=>e.fetch(`${s}/v1/messages`,{method:"POST",headers:{"Content-Type":"application/json","x-api-key":t,"anthropic-version":"2023-06-01"},body:JSON.stringify(i),timeoutMs:r}),{maxRetries:n,shouldRetry:h=>h instanceof g&&h.retryable}),f;try{f=await c.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from Anthropic API",c.status,c.status>=500)}let y=Date.now()-l;if(!c.ok)throw it(c.status,f);let m=f,x="",C=[];for(let h of m.content??[])h.type==="text"?x+=h.text:h.type==="tool_use"&&C.push({id:h.id,name:h.name,arguments:h.input??{}});let b=m.usage?.input_tokens??0,d=m.usage?.output_tokens??0;return{text:x,toolCalls:C,usage:{inputTokens:b,outputTokens:d,totalTokens:b+d},raw:f,latencyMs:y,modelId:m.model??o.model,finishReason:at(m.stop_reason)}},estimateCost(o,a){let p=Object.entries(nt).find(([i])=>o.includes(i)||i.includes(o));return p?a.inputTokens/1e6*p[1].input+a.outputTokens/1e6*p[1].output:null},supportsTools(o){return!0}}}function lt(e,t){if(t)return"tool_calls";switch(e){case"stop":return"stop";case"length":return"max_tokens";default:return"unknown"}}function ut(e,t){let s=typeof t=="object"&&t!==null&&"error"in t?String(t.error):"Unknown error";switch(e){case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function re(e){let t="http://localhost:11434",s=6e4,r=2;return{name:"ollama",async initialize(n){n.baseUrl&&(t=n.baseUrl),s=n.timeoutMs,r=n.maxRetries},async complete(n){let 
o=n.messages.map(d=>d.role==="tool"?{role:"tool",content:d.content}:d.role==="assistant"&&d.toolCalls&&d.toolCalls.length>0?{role:"assistant",content:d.content||"",tool_calls:d.toolCalls.map(h=>({function:{name:h.name,arguments:h.arguments}}))}:{role:d.role,content:d.content}),a={model:n.model,messages:o,stream:!1,options:{temperature:n.params.temperature,num_predict:n.params.maxTokens}},p=a.options;n.params.topP!==void 0&&(p.top_p=n.params.topP),n.params.seed!==void 0&&(p.seed=n.params.seed),n.params.stopSequences&&(p.stop=n.params.stopSequences),n.tools&&n.tools.length>0&&(a.tools=n.tools.map(d=>({type:"function",function:{name:d.name,description:d.description,parameters:d.parameters}})));let i={"Content-Type":"application/json"},l=Date.now(),c=await _(()=>e.fetch(`${t}/api/chat`,{method:"POST",headers:i,body:JSON.stringify(a),timeoutMs:s}),{maxRetries:r,shouldRetry:d=>d instanceof g&&d.retryable}),f;try{f=await c.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from Ollama API",c.status,c.status>=500)}let y=Date.now()-l;if(!c.ok)throw ut(c.status,f);let m=f,x=m.message,C=x?.tool_calls??[],b=C.map((d,h)=>({id:`ollama_call_${h}`,name:d.function.name,arguments:d.function.arguments??{}}));return{text:x?.content??"",toolCalls:b,usage:{inputTokens:m.prompt_eval_count??0,outputTokens:m.eval_count??0,totalTokens:(m.prompt_eval_count??0)+(m.eval_count??0)},raw:f,latencyMs:y,modelId:m.model??n.model,finishReason:lt(m.done_reason,C.length>0)}},estimateCost(){return 0},supportsTools(){return!0}}}var pt={"gemini-2.0-flash":{input:.1,output:.4},"gemini-2.0-flash-lite":{input:.075,output:.3},"gemini-1.5-pro":{input:1.25,output:5},"gemini-1.5-flash":{input:.075,output:.3},"gemini-1.5-flash-8b":{input:.0375,output:.15}};function ct(e,t){if(t)return"tool_calls";switch(e){case"STOP":return"stop";case"MAX_TOKENS":return"max_tokens";case"SAFETY":return"stop";default:return"unknown"}}function dt(e,t){let s=typeof t=="object"&&t!==null&&"error"in 
t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 400:return s.toLowerCase().includes("api key")?new g("AUTH_FAILED",s,e,!1,t):new g("PROVIDER_ERROR",s,e,!1,t);case 401:case 403:return new g("AUTH_FAILED",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function oe(e){let t="",s="https://generativelanguage.googleapis.com/v1beta",r=6e4,n=2;return{name:"gemini",async initialize(o){if(!o.apiKey)throw new g("AUTH_FAILED","API key is required");t=o.apiKey,o.baseUrl&&(s=o.baseUrl),r=o.timeoutMs,n=o.maxRetries},async complete(o){let a=[],p;for(let T of o.messages){if(T.role==="system"){p={parts:[{text:T.content}]};continue}if(T.role==="tool"){a.push({role:"function",parts:[{functionResponse:{name:T.toolName??"unknown",response:mt(T.content)}}]});continue}if(T.role==="assistant"&&T.toolCalls&&T.toolCalls.length>0){let S=[];T.content&&S.push({text:T.content});for(let ge of T.toolCalls)S.push({functionCall:{name:ge.name,args:ge.arguments}});a.push({role:"model",parts:S});continue}let v=T.role==="assistant"?"model":"user";a.push({role:v,parts:[{text:T.content}]})}let i={contents:a,generationConfig:{temperature:o.params.temperature,maxOutputTokens:o.params.maxTokens}};p&&(i.systemInstruction=p);let l=i.generationConfig;o.params.topP!==void 0&&(l.topP=o.params.topP),o.params.stopSequences&&(l.stopSequences=o.params.stopSequences),o.tools&&o.tools.length>0&&(i.tools=[{functionDeclarations:o.tools.map(T=>({name:T.name,description:T.description,parameters:T.parameters}))}],o.toolChoice&&(i.toolConfig=ft(o.toolChoice)));let c={"Content-Type":"application/json","x-goog-api-key":t},f=Date.now(),y=`${s}/models/${o.model}:generateContent`,m=await _(()=>e.fetch(y,{method:"POST",headers:c,body:JSON.stringify(i),timeoutMs:r}),{maxRetries:n,shouldRetry:T=>T instanceof g&&T.retryable}),x;try{x=await 
m.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from Gemini API",m.status,m.status>=500)}let C=Date.now()-f;if(!m.ok)throw dt(m.status,x);let b=x,d=b.candidates?.[0],h=d?.content?.parts??[],R="",P=[],E=0;for(let T of h)T.text!==void 0&&(R+=T.text),T.functionCall&&(P.push({id:`gemini_call_${E}`,name:T.functionCall.name,arguments:T.functionCall.args??{}}),E++);return{text:R,toolCalls:P,usage:{inputTokens:b.usageMetadata?.promptTokenCount??0,outputTokens:b.usageMetadata?.candidatesTokenCount??0,totalTokens:b.usageMetadata?.totalTokenCount??0},raw:x,latencyMs:C,modelId:b.modelVersion??o.model,finishReason:ct(d?.finishReason,P.length>0)}},estimateCost(o,a){let p=pt[o];return p?a.inputTokens/1e6*p.input+a.outputTokens/1e6*p.output:null},supportsTools(){return!0}}}function mt(e){try{return JSON.parse(e)}catch{return{result:e}}}function ft(e){switch(e){case"auto":return{functionCallingConfig:{mode:"AUTO"}};case"required":return{functionCallingConfig:{mode:"ANY"}};case"none":return{functionCallingConfig:{mode:"NONE"}}}}function gt(e){switch(e){case"stop":return"stop";case"length":return"max_tokens";case"tool_calls":return"tool_calls";default:return"unknown"}}function ht(e,t){let s=typeof t=="object"&&t!==null&&"message"in t?String(t.message??"Unknown error"):typeof t=="object"&&t!==null&&"error"in t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 401:return new g("AUTH_FAILED",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new g("CONTEXT_LENGTH",s,e,!1,t):new g("PROVIDER_ERROR",s,e,!1,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function ne(e){let t="",s="https://api.mistral.ai/v1",r=6e4,n=2;return{name:"mistral",async initialize(o){if(!o.apiKey)throw new g("AUTH_FAILED","API key is 
required");t=o.apiKey,o.baseUrl&&(s=o.baseUrl),r=o.timeoutMs,n=o.maxRetries},async complete(o){let a={model:o.model,messages:o.messages.map(b=>b.role==="tool"?{role:"tool",content:b.content,tool_call_id:b.toolCallId}:b.role==="assistant"&&b.toolCalls&&b.toolCalls.length>0?{role:"assistant",content:b.content||null,tool_calls:b.toolCalls.map(d=>({id:d.id,type:"function",function:{name:d.name,arguments:JSON.stringify(d.arguments)}}))}:{role:b.role,content:b.content}),temperature:o.params.temperature,max_tokens:o.params.maxTokens};o.params.topP!==void 0&&(a.top_p=o.params.topP),o.params.stopSequences&&(a.stop=o.params.stopSequences),o.tools&&o.tools.length>0&&(a.tools=o.tools.map(b=>({type:"function",function:{name:b.name,description:b.description,parameters:b.parameters}})),o.toolChoice&&(a.tool_choice=o.toolChoice));let p={"Content-Type":"application/json",Authorization:`Bearer ${t}`},i=Date.now(),l=await _(()=>e.fetch(`${s}/chat/completions`,{method:"POST",headers:p,body:JSON.stringify(a),timeoutMs:r}),{maxRetries:n,shouldRetry:b=>b instanceof g&&b.retryable}),c;try{c=await l.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from Mistral API",l.status,l.status>=500)}let f=Date.now()-i;if(!l.ok)throw ht(l.status,c);let y=c,m=y.choices?.[0],x=m?.message,C=(x?.tool_calls??[]).map(b=>{let d;try{d=JSON.parse(b.function.arguments||"{}")}catch{d={_raw:b.function.arguments}}return{id:b.id,name:b.function.name,arguments:d}});return{text:x?.content??"",toolCalls:C,usage:{inputTokens:y.usage?.prompt_tokens??0,outputTokens:y.usage?.completion_tokens??0,totalTokens:y.usage?.total_tokens??0},raw:c,latencyMs:f,modelId:y.model??o.model,finishReason:gt(m?.finish_reason)}},estimateCost(o,a){return null},supportsTools(o){return!0}}}function Rt(e){switch(e){case"COMPLETE":return"stop";case"MAX_TOKENS":return"max_tokens";case"TOOL_CALL":return"tool_calls";default:return"unknown"}}function yt(e,t){let s=typeof t=="object"&&t!==null&&"message"in 
t?String(t.message??"Unknown error"):"Unknown error";switch(e){case 401:return new g("AUTH_FAILED",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new g("CONTEXT_LENGTH",s,e,!1,t):new g("PROVIDER_ERROR",s,e,!1,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function ae(e){let t="",s="https://api.cohere.com",r=6e4,n=2;return{name:"cohere",async initialize(o){if(!o.apiKey)throw new g("AUTH_FAILED","API key is required");t=o.apiKey,o.baseUrl&&(s=o.baseUrl),r=o.timeoutMs,n=o.maxRetries},async complete(o){let a={model:o.model,messages:o.messages.map(d=>d.role==="tool"?{role:"tool",content:d.content,tool_call_id:d.toolCallId}:d.role==="assistant"&&d.toolCalls&&d.toolCalls.length>0?{role:"assistant",content:d.content||null,tool_calls:d.toolCalls.map(h=>({id:h.id,type:"function",function:{name:h.name,arguments:JSON.stringify(h.arguments)}}))}:{role:d.role,content:d.content}),temperature:o.params.temperature,max_tokens:o.params.maxTokens};o.params.topP!==void 0&&(a.p=o.params.topP),o.params.stopSequences&&(a.stop_sequences=o.params.stopSequences),o.tools&&o.tools.length>0&&(a.tools=o.tools.map(d=>({type:"function",function:{name:d.name,description:d.description,parameters:d.parameters}})),o.toolChoice&&(a.tool_choice=o.toolChoice));let p={"Content-Type":"application/json",Authorization:`Bearer ${t}`},i=Date.now(),l=await _(()=>e.fetch(`${s}/v2/chat`,{method:"POST",headers:p,body:JSON.stringify(a),timeoutMs:r}),{maxRetries:n,shouldRetry:d=>d instanceof g&&d.retryable}),c;try{c=await l.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from Cohere API",l.status,l.status>=500)}let f=Date.now()-i;if(!l.ok)throw yt(l.status,c);let y=c,m=y.message?.content?.filter(d=>d.type==="text").map(d=>d.text).join("")??"",x=(y.message?.tool_calls??[]).map(d=>{let 
h;try{h=JSON.parse(d.function.arguments||"{}")}catch{h={_raw:d.function.arguments}}return{id:d.id,name:d.function.name,arguments:h}}),C=y.usage?.tokens?.input_tokens??0,b=y.usage?.tokens?.output_tokens??0;return{text:m,toolCalls:x,usage:{inputTokens:C,outputTokens:b,totalTokens:C+b},raw:c,latencyMs:f,modelId:y.model??o.model,finishReason:Rt(y.finish_reason)}},estimateCost(o,a){return null},supportsTools(o){return!0}}}var be={openai:te,anthropic:se,ollama:re,gemini:oe,mistral:ne,cohere:ae};function bt(e,t){let s=be[e];if(!s){let r=Object.keys(be).join(", ");throw new Error(`Unknown provider: "${e}". Supported providers: ${r}`)}return s(t)}async function ie(e,t,s,r){let n=r?.maxTurns??10,o=[],a=[],p=[...t.messages],i={inputTokens:0,outputTokens:0,totalTokens:0},l=0;for(let y=0;y<n;y++){let m={...t,messages:p},x=await e.complete(m);if(o.push({request:m,response:x}),i.inputTokens+=x.usage.inputTokens,i.outputTokens+=x.usage.outputTokens,i.totalTokens+=x.usage.totalTokens,l+=x.latencyMs,x.toolCalls.length===0)return{turns:o,finalText:x.text,allToolCalls:a,totalUsage:i,totalLatencyMs:l};a.push(...x.toolCalls),p=[...p,{role:"assistant",content:x.text,toolCalls:x.toolCalls}];for(let C of x.toolCalls){let b=s.find(h=>h.name===C.name),d;b?d=xt(b,C.arguments):d={error:`Tool "${C.name}" not simulated`},p.push({role:"tool",content:JSON.stringify(d),toolCallId:C.id,toolName:C.name})}}let f=o[o.length-1]?.response??{text:"",toolCalls:[],usage:{inputTokens:0,outputTokens:0,totalTokens:0},raw:null,latencyMs:0,modelId:t.model,finishReason:"unknown"};return{turns:o,finalText:f.text,allToolCalls:a,totalUsage:i,totalLatencyMs:l}}function xt(e,t){if(e.responses){for(let s of e.responses)if(Tt(s.when,t))return s.then}return e.defaultResponse!==void 0?e.defaultResponse:{error:"No matching simulation response"}}function Tt(e,t){for(let[s,r]of Object.entries(e))if(JSON.stringify(t[s])!==JSON.stringify(r))return!1;return!0}import{z as u}from"zod";import{z as O}from"zod";var 
xe=O.object({outputTextAttr:O.string().default("gen_ai.completion.0.content"),modelAttr:O.string().default("gen_ai.response.model"),systemAttr:O.string().default("gen_ai.system"),inputTokensAttr:O.string().default("gen_ai.usage.input_tokens"),outputTokensAttr:O.string().default("gen_ai.usage.output_tokens")}),Te=O.object({namePattern:O.string().optional().describe("Regex to filter span names"),attributeMatch:O.record(O.string()).optional().describe("Attributes that must match"),minDurationMs:O.number().min(0).optional()}),le=O.object({port:O.number().int().min(1).max(65535).default(4318),timeoutMs:O.number().int().min(1e3).default(3e4),spanMapping:xe.default({}),spanFilter:Te.optional()});var k=u.string().min(1,"Must not be empty"),Ct=u.number().min(0).max(2).default(.2),$=u.number().min(0).max(1),ue=u.string().refine(e=>{try{return new RegExp(e),!0}catch{return!1}},{message:"Must be a valid regex pattern"}),L=u.object({apiKeyEnv:k.describe("Environment variable name containing the API key. Never a raw key."),baseUrl:u.string().url().optional().describe("Custom base URL for API-compatible proxies (e.g., Azure OpenAI, LiteLLM)"),organization:u.string().optional().describe("Organization ID (OpenAI-specific)")}),At=u.object({apiKeyEnv:u.string().min(1).optional().describe("Environment variable name containing the API key. Optional for Ollama (local)."),baseUrl:u.string().url().optional().describe("Ollama server URL. 
Defaults to http://localhost:11434.")}),Pt=u.object({openai:L.optional(),anthropic:L.optional(),ollama:At.optional(),gemini:L.optional(),mistral:L.optional(),cohere:L.optional()}).refine(e=>Object.keys(e).some(t=>e[t]!==void 0),{message:"At least one provider must be configured"}),Et=u.object({temperature:Ct,maxTokens:u.number().int().min(1).max(128e3).default(1024),topP:u.number().min(0).max(1).optional(),stopSequences:u.array(u.string()).optional(),seed:u.number().int().optional().describe("Seed for reproducibility (provider-dependent support)")}),vt=u.object({id:k.describe("Unique identifier for this model config, referenced in reports"),provider:u.enum(["openai","anthropic","ollama","gemini","mistral","cohere"]).describe("Must match a key in the providers section"),model:k.describe("Model name as the provider expects it (e.g., 'gpt-4o', 'claude-sonnet-4-5-20250929')"),params:Et.default({})}),wt=u.object({system:u.string().optional().describe("System prompt template. Supports {{variable}} interpolation."),user:k.describe("User prompt template. Supports {{variable}} interpolation."),assistant:u.string().optional().describe("Prefill for assistant response (Anthropic-specific)")}),St=u.object({format:u.enum(["text","json"]).default("text"),schemaFile:u.string().optional().describe("Path to JSON Schema file (relative to config file). 
Required if format is 'json'."),contains:u.array(u.string()).optional().describe("Output must contain all of these substrings"),notContains:u.array(u.string()).optional().describe("Output must not contain any of these substrings"),maxLength:u.number().int().positive().optional().describe("Maximum character length of the output")}).refine(e=>!(e.format==="json"&&!e.schemaFile),{message:"schemaFile is required when format is 'json'"}),Mt=u.object({enabled:u.boolean().default(!0),denyPatterns:u.array(ue).default(["\\b\\d{3}-\\d{2}-\\d{4}\\b","\\b\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}\\b","\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b"]).describe("Regex patterns that must NOT appear in output. Defaults include SSN, credit card, email."),customPatterns:u.array(u.object({name:k,pattern:ue})).optional().describe("Named custom PII patterns for reporting clarity")}),Ot=u.object({deny:u.array(u.string()).default([]).describe("Words/phrases that must NOT appear in output (case-insensitive)"),allow:u.array(u.string()).optional().describe("If set, output MUST contain at least one of these words/phrases")}),_t=u.object({criteria:k.describe("Natural language description of what to evaluate (e.g., 'Response is empathetic and professional')"),minScore:$.default(.7).describe("Minimum score (0-1) for this criterion to pass"),model:u.string().optional().describe("Override judge model for this criterion. Defaults to first model in models list."),rubric:u.string().optional().describe("Detailed rubric for the judge. 
If omitted, a default rubric is generated from criteria.")}),kt=u.object({tool:k.describe("Expected tool/function name"),shouldNotCall:u.boolean().optional().default(!1).describe("If true, assert this tool was NOT called"),argsMatch:u.record(u.unknown()).optional().describe("Key-value pairs that must be present in the tool call arguments (partial match)"),argsSchema:u.string().optional().describe("Path to JSON Schema file to validate the tool call arguments"),order:u.number().int().min(0).optional().describe("Expected position in the sequence of tool calls (0-indexed)"),responseContains:u.string().optional().describe("Assert the simulated tool response contains this substring")}),It=u.object({maxScore:$.default(.15).describe("Maximum drift score (0-1). Higher = more drift allowed. Fail if exceeded."),method:u.enum(["judge","embedding","field-diff"]).default("judge").describe("Drift detection method. 'judge' uses LLM comparison, 'embedding' uses cosine similarity, 'field-diff' compares JSON fields."),fields:u.array(u.string()).optional().describe("For field-diff method: JSON paths to compare (e.g., ['response.action', 'response.message'])")}),jt=u.object({pii:Mt.optional(),keywords:Ot.optional()}),Nt=u.object({output:St.optional(),guardrails:jt.optional(),judge:u.array(_t).optional().describe("LLM-as-judge evaluations. 
Each criterion is scored independently."),toolCalls:u.array(kt).optional().describe("Expected tool/function calls in the model response"),baseline:u.object({drift:It.optional()}).optional()}),$t=u.object({name:k.describe("Tool/function name as the model sees it"),description:u.string().optional().describe("Tool description for documentation"),parameters:u.record(u.unknown()).optional().describe("JSON Schema for the tool's parameters"),responses:u.array(u.object({when:u.record(u.unknown()).describe("Condition: match tool call arguments (partial match)"),then:u.unknown().describe("Simulated response to return when condition matches")})).optional().describe("Simulated responses based on argument matching"),defaultResponse:u.unknown().optional().describe("Response when no 'when' condition matches")}),Dt=u.object({name:k.describe("Unique test case name within the suite. Used in reports and JUnit output."),prompt:k.optional().describe("Reference to a key in the prompts section. Exactly one of prompt or command must be set."),command:k.optional().describe("Shell command to execute. Stdout is captured and assertions run against it. Exactly one of prompt or command must be set."),vars:u.record(u.string()).default({}).describe("Variables to interpolate into the prompt template or command"),models:u.array(u.string()).optional().describe("Override: run this test only against these model IDs. Defaults to all models. 
Ignored for command tests."),repeat:u.number().int().min(1).optional().describe("Override: number of repeat runs for this specific test case"),tools:u.array($t).optional().describe("Simulated tools available to the model for this test case"),expect:Nt.describe("Assertions to evaluate against the model output"),tags:u.array(u.string()).optional().describe("Tags for filtering test cases in CLI (e.g., --tags regression)"),skip:u.boolean().optional().default(!1).describe("Skip this test case during execution")}).refine(e=>{let t=e.prompt!==void 0,s=e.command!==void 0;return(t||s)&&!(t&&s)},{message:"Exactly one of 'prompt' or 'command' must be set on each test case"}),Lt=u.object({passRateMin:$.default(.95).describe("Minimum overall pass rate (0-1). Computed after repeats and aggregation."),schemaFailuresMax:u.number().int().min(0).default(0).describe("Maximum allowed schema validation failures across entire suite"),judgeAvgMin:$.optional().describe("Minimum average LLM-as-judge score across all criteria and test cases"),driftScoreMax:$.optional().describe("Maximum allowed drift score against active baseline"),piiFailuresMax:u.number().int().min(0).default(0).describe("Maximum allowed PII detection failures"),keywordFailuresMax:u.number().int().min(0).default(0).describe("Maximum allowed keyword guardrail failures"),costMaxUsd:u.number().positive().optional().describe("Maximum total cost in USD for the entire run. Aborts if exceeded mid-run."),latencyMaxMs:u.number().positive().optional().describe("Maximum average latency in ms. 
Fails gate if exceeded."),deterministicPassRate:$.optional().describe("Minimum pass rate for deterministic assertions only (tool_called, schema, pii, keywords, etc.)"),probabilisticPassRate:$.optional().describe("Minimum pass rate for probabilistic assertions only (judge, drift)")}),Ft=u.object({enabled:u.boolean().default(!1),framework:u.enum(["eu-ai-act","custom"]).default("eu-ai-act"),outputDir:u.string().default("./compliance-reports"),metadata:u.object({systemName:u.string().optional().describe("Name of the AI system being tested"),systemVersion:u.string().optional().describe("Version of the AI system"),riskLevel:u.enum(["high","limited","minimal"]).optional(),operator:u.string().optional().describe("Organization operating the AI system"),intendedPurpose:u.string().optional().describe("Documented intended purpose of the AI system"),dataGovernanceNotes:u.string().optional()}).optional()}),Ut=u.object({enabled:u.boolean().default(!1),includeArtifacts:u.boolean().default(!1).describe("Upload raw prompt inputs and model outputs. Disabled by default for privacy."),redactPatterns:u.array(ue).optional().describe("Patterns to redact from artifacts before upload (applied on top of PII guardrails)"),apiUrl:u.string().url().default("https://api.kindlm.com/v1").describe("Cloud API URL. Override for self-hosted deployments.")}),Bt=u.object({name:k,description:u.string().optional(),tags:u.array(u.string()).optional()}),Ce=u.object({kindlm:u.literal(1).describe("Config schema version. 
Must be 1."),project:k.describe("Project identifier for cloud upload and report grouping"),suite:Bt,providers:Pt,models:u.array(vt).min(1,"At least one model must be configured"),prompts:u.record(wt).refine(e=>Object.keys(e).length>0,{message:"At least one prompt must be defined"}),tests:u.array(Dt).min(1,"At least one test case must be defined"),gates:Lt.default({}),compliance:Ft.optional(),trace:le.optional().describe("OpenTelemetry trace ingestion configuration for the 'kindlm trace' command"),upload:Ut.default({}),defaults:u.object({repeat:u.number().int().min(1).default(1).describe("Default repeat count per test case"),concurrency:u.number().int().min(1).max(32).default(4).describe("Default concurrency for test execution"),timeoutMs:u.number().int().min(1e3).default(6e4).describe("Default timeout per provider call in ms"),judgeModel:u.string().optional().describe("Default model ID for LLM-as-judge assertions. Must reference a configured model.")}).default({})});function pe(e){let t=Ce.safeParse(e);return t.success?M(t.data):A({code:"CONFIG_VALIDATION_ERROR",message:"Config validation failed",details:{errors:t.error.issues.map(s=>`${s.path.join(".")}: ${s.message}`)}})}import{parse as Gt}from"yaml";var Jt=1048576,Ae=1e3,Pe=50;function Vt(e,t){if(e.length>Jt)return A({code:"CONFIG_TOO_LARGE",message:`Config exceeds maximum size of 1MB (got ${(e.length/1048576).toFixed(1)}MB)`});let s;try{s=Gt(e)}catch(i){return A({code:"CONFIG_PARSE_ERROR",message:`Failed to parse YAML: ${i.message}`,cause:i})}let r=pe(s);if(!r.success)return r;let n=r.data;if(n.tests.length>Ae)return A({code:"CONFIG_VALIDATION_ERROR",message:`Config exceeds maximum of ${Ae} tests (got ${n.tests.length})`});if(n.models.length>Pe)return A({code:"CONFIG_VALIDATION_ERROR",message:`Config exceeds maximum of ${Pe} models (got ${n.models.length})`});let o=[],a=new Set;for(let i of n.models)a.has(i.id)&&o.push(`Duplicate model ID "${i.id}"`),a.add(i.id);let p=new Set;for(let i of 
n.tests)p.has(i.name)&&o.push(`Duplicate test name "${i.name}"`),p.add(i.name);for(let i of n.tests)i.prompt&&!(i.prompt in n.prompts)&&o.push(`Test "${i.name}" references prompt "${i.prompt}" which is not defined`);for(let i of n.tests)if(i.models)for(let l of i.models)a.has(l)||o.push(`Test "${i.name}" references model "${l}" which is not configured`);for(let i of n.models)n.providers[i.provider]||o.push(`Model "${i.id}" references provider "${i.provider}" which is not configured`);if(n.defaults.judgeModel&&!a.has(n.defaults.judgeModel)&&o.push(`defaults.judgeModel "${n.defaults.judgeModel}" is not a configured model`),t.fileReader)for(let i of n.tests){if(i.expect.output?.schemaFile){let l=Ee(t.configDir,i.expect.output.schemaFile);l.success?t.fileReader.readFile(l.data).success||o.push(`Test "${i.name}": schemaFile "${i.expect.output.schemaFile}" not found at ${l.data}`):o.push(`Test "${i.name}": schemaFile "${i.expect.output.schemaFile}" \u2014 ${l.error.message}`)}if(i.expect.toolCalls){for(let l of i.expect.toolCalls)if(l.argsSchema){let c=Ee(t.configDir,l.argsSchema);c.success?t.fileReader.readFile(c.data).success||o.push(`Test "${i.name}": argsSchema "${l.argsSchema}" for tool "${l.tool}" not found at ${c.data}`):o.push(`Test "${i.name}": argsSchema "${l.argsSchema}" for tool "${l.tool}" \u2014 ${c.error.message}`)}}}return o.length>0?A({code:"CONFIG_VALIDATION_ERROR",message:"Config cross-reference validation failed",details:{errors:o}}):M(n)}function Ee(e,t){if(t.startsWith("/")||t.startsWith("\\"))return A({code:"PATH_TRAVERSAL",message:"Absolute paths are not allowed in config references"});if(/^[a-zA-Z]:/.test(t))return A({code:"PATH_TRAVERSAL",message:"Absolute paths are not allowed in config references"});let s=e.endsWith("/")?e.slice(0,-1):e,r=`${s}/${t}`,n=ve(r),o=ve(s);return!n.startsWith(o+"/")&&n!==o?A({code:"PATH_TRAVERSAL",message:`Path "${t}" escapes the config directory`}):M(n)}function ve(e){let t=e.split("/"),s=[];for(let n of 
t)n==="."||n===""||(n===".."?s.pop():s.push(n));return(e.startsWith("/")?"/":"")+s.join("/")}var we=/\{\{(\w+)\}\}/g;function F(e,t){let s=Se(e,t);if(s.length>0)return A({code:"CONFIG_VALIDATION_ERROR",message:`Missing template variables: ${s.join(", ")}`,details:{missing:s}});let r=e.replace(we,(n,o)=>t[o]);return M(r)}function Se(e,t){let s=new Set;for(let r of e.matchAll(we)){let n=r[1];n!==void 0&&!(n in t)&&s.add(n)}return[...s]}function ce(e){let t=e[0];if(!t)throw new Error("aggregateRuns requires at least one run");let{testCaseName:s,modelId:r}=t,o=e.filter(y=>y.assertions.every(m=>m.passed)).length/e.length,a=new Map;for(let y of e)for(let m of y.assertions){let x=a.get(m.assertionType);x||(x=[],a.set(m.assertionType,x)),x.push(m.score)}let p={};for(let[y,m]of a){let x=m.reduce((C,b)=>C+b,0);p[y]={mean:x/m.length,min:Math.min(...m),max:Math.max(...m)}}let i=new Set;for(let y of e)for(let m of y.assertions)!m.passed&&m.failureCode&&i.add(m.failureCode);let l=e.reduce((y,m)=>y+m.latencyMs,0)/e.length,c=e.reduce((y,m)=>y+(m.costEstimateUsd??0),0),f=e.reduce((y,m)=>y+m.tokenUsage.totalTokens,0);return{testCaseName:s,modelId:r,runCount:e.length,passed:o===1,passRate:o,assertionScores:p,failureCodes:[...i],latencyAvgMs:l,totalCostUsd:c,totalTokens:f,runs:e}}function de(e){let t=e.stdout.split(`
|
|
32
|
-
`),s=[],r=[],n,o=0;for(let a of t){let p=a.trimStart();if(!p.startsWith('{"kindlm":')){s.push(a);continue}let i=Kt(p);if(!i){s.push(a);continue}i.kindlm==="tool_call"?r.push({id:i.id??`cmd_tc_${o++}`,name:i.name,arguments:i.arguments}):i.kindlm==="output_json"&&(n=i.data)}return{outputText:s.join(`
|
|
33
|
-
`).trim(),toolCalls:r,outputJson:n,exitCode:e.exitCode,stderr:e.stderr}}function Kt(e){try{let t=JSON.parse(e);return typeof t.kindlm!="string"?null:t.kindlm==="tool_call"?typeof t.name!="string"?null:{kindlm:"tool_call",id:typeof t.id=="string"?t.id:void 0,name:t.name,arguments:typeof t.arguments=="object"&&t.arguments!==null?t.arguments:{}}:t.kindlm==="output_json"?{kindlm:"output_json",data:t.data}:null}catch{return null}}function Ht(e,t){return{async run(){let s=Date.now(),r=new Map;for(let R of e.tests)if(R.expect.output?.schemaFile){let P=Yt(t.configDir,R.expect.output.schemaFile),E=t.fileReader.readFile(P);if(!E.success)return A({code:"SCHEMA_FILE_ERROR",message:`Failed to read schema file "${R.expect.output.schemaFile}": ${E.error.message}`});try{r.set(R.name,JSON.parse(E.data))}catch(T){return A({code:"SCHEMA_FILE_ERROR",message:`Failed to parse schema file "${R.expect.output.schemaFile}" as JSON: ${T instanceof Error?T.message:String(T)}`})}}let n=[];for(let R of e.tests){if(R.skip)continue;let P=R.repeat??e.defaults.repeat;if(R.command)for(let E=0;E<P;E++)n.push({test:R,modelConfig:null,runIndex:E});else{let E=R.models??e.models.map(T=>T.id);for(let T of E){let v=e.models.find(S=>S.id===T);if(v)for(let S=0;S<P;S++)n.push({test:R,modelConfig:v,runIndex:S})}}}let o=await Xt(n.map(R=>()=>zt(e,t,R,r)),e.defaults.concurrency),a=R=>`${R.testCaseName}::${R.modelId}`,p=new Map;for(let R of o){let P=a(R),E=p.get(P);E||(E=[],p.set(P,E)),E.push(R)}let i=[];for(let R of p.values())i.push(ce(R));let 
l=i.map(R=>({name:R.testCaseName,modelId:R.modelId,status:R.passed?"passed":"failed",assertions:R.runs[0]?.assertions??[],latencyMs:R.latencyAvgMs,costUsd:R.totalCostUsd})),c=e.tests.filter(R=>R.skip).map(R=>({name:R.name,modelId:"",status:"skipped",assertions:[],latencyMs:0,costUsd:0})),f=[...l,...c],y=f.filter(R=>R.status==="passed").length,m=f.filter(R=>R.status==="failed").length,x=f.filter(R=>R.status==="errored").length,C=f.filter(R=>R.status==="skipped").length,b=x>0?"errored":m>0?"failed":"passed",h={suites:[{name:e.suite.name,status:b,tests:f}],totalTests:f.length,passed:y,failed:m,errored:x,skipped:C,durationMs:Date.now()-s};return M({runResult:h,aggregated:i})}}}async function zt(e,t,s,r){let{test:n,modelConfig:o,runIndex:a}=s;if(n.command)return Wt(e,t,n,a,r);if(!o)return j(n.name,"unknown",a,"No model config for prompt-based test");t.onProgress?.({type:"test_start",test:n.name,model:o.id,run:a});try{let p=t.adapters.get(o.provider);if(!p)return j(n.name,o.id,a,`Provider adapter "${o.provider}" not found`);let i=n.prompt?e.prompts[n.prompt]:void 0;if(!i)return j(n.name,o.id,a,`Prompt "${n.prompt}" not defined`);let l=F(i.user,n.vars);if(!l.success)return j(n.name,o.id,a,l.error.message);let c=[];if(i.system){let v=F(i.system,n.vars);if(!v.success)return j(n.name,o.id,a,v.error.message);c.push({role:"system",content:v.data})}c.push({role:"user",content:l.data});let f=(n.tools??[]).map(v=>({name:v.name,description:v.description,parameters:v.parameters})),y={model:o.model,messages:c,params:{temperature:o.params.temperature,maxTokens:o.params.maxTokens,topP:o.params.topP,stopSequences:o.params.stopSequences,seed:o.params.seed},tools:f.length>0?f:void 0},m=await ie(p,y,n.tools??[]),x=p.estimateCost(o.model,m.totalUsage),C={};n.expect.output?.schemaFile&&r.has(n.name)&&(C.schemaContent=r.get(n.name));let b=H(n.expect,C),d=e.defaults.judgeModel??e.models[0]?.id,h=e.models.find(v=>v.id===d),R=h?t.adapters.get(h.provider):void 
0,P={outputText:m.finalText,toolCalls:m.allToolCalls,configDir:t.configDir,latencyMs:m.totalLatencyMs,costUsd:x??void 0,judgeAdapter:R,judgeModel:h?.model};if(t.baselineData){let v=`${n.name}::${o.id}`,S=t.baselineData.results[v];S&&(P.baselineText=S.outputText)}let E=[];for(let v of b){let S=await v.evaluate(P);E.push(...S)}let T=E.every(v=>v.passed);return t.onProgress?.({type:"test_complete",test:n.name,model:o.id,run:a,passed:T}),{testCaseName:n.name,modelId:o.id,runIndex:a,outputText:m.finalText,assertions:E,latencyMs:m.totalLatencyMs,tokenUsage:m.totalUsage,costEstimateUsd:x}}catch(p){return t.onProgress?.({type:"test_complete",test:n.name,model:o.id,run:a,passed:!1}),j(n.name,o.id,a,p instanceof Error?p.message:String(p))}}async function Wt(e,t,s,r,n){let o="command";t.onProgress?.({type:"test_start",test:s.name,model:o,run:r});try{if(!t.commandExecutor)return j(s.name,o,r,"Command executor not available");if(!s.command)return j(s.name,o,r,"No command specified");let a=F(s.command,s.vars);if(!a.success)return j(s.name,o,r,a.error.message);let p=Date.now(),i=await t.commandExecutor.execute(a.data,{timeoutMs:e.defaults.timeoutMs,cwd:t.configDir});if(!i.success)return j(s.name,o,r,i.error.message);let l=Date.now()-p,c=de(i.data),f={};s.expect.output?.schemaFile&&n.has(s.name)&&(f.schemaContent=n.get(s.name));let y=H(s.expect,f),m=e.defaults.judgeModel??e.models[0]?.id,x=e.models.find(R=>R.id===m),C=x?t.adapters.get(x.provider):void 0,b={outputText:c.outputText,outputJson:c.outputJson,toolCalls:c.toolCalls,configDir:t.configDir,latencyMs:l,judgeAdapter:C,judgeModel:x?.model};if(t.baselineData){let R=`${s.name}::${o}`,P=t.baselineData.results[R];P&&(b.baselineText=P.outputText)}let d=[];for(let R of y){let P=await R.evaluate(b);d.push(...P)}let h=d.every(R=>R.passed);return 
t.onProgress?.({type:"test_complete",test:s.name,model:o,run:r,passed:h}),{testCaseName:s.name,modelId:o,runIndex:r,outputText:c.outputText,assertions:d,latencyMs:l,tokenUsage:{inputTokens:0,outputTokens:0,totalTokens:0},costEstimateUsd:null}}catch(a){return t.onProgress?.({type:"test_complete",test:s.name,model:o,run:r,passed:!1}),j(s.name,o,r,a instanceof Error?a.message:String(a))}}/* j: build a run record for test e, model id t and run index s that carries message r as a single failed "internal" assertion with failureCode INTERNAL_ERROR; output text is empty, latency and token counts are zero and costEstimateUsd is null. */function j(e,t,s,r){return{testCaseName:e,modelId:t,runIndex:s,outputText:"",assertions:[{assertionType:"internal",label:"Execution error",passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:r}],latencyMs:0,tokenUsage:{inputTokens:0,outputTokens:0,totalTokens:0},costEstimateUsd:null}}/* Xt: run the thunks in e with at most min(t, e.length) workers that pull the next index from a shared counter; each result is stored at its thunk's original index. Falsy entries are skipped and leave their slot unset. Resolves with the result array once every worker has finished. */async function Xt(e,t){let s=new Array(e.length),r=0;async function n(){for(;r<e.length;){let a=r++,p=e[a];p&&(s[a]=await p())}}let o=Array.from({length:Math.min(t,e.length)},()=>n());return await Promise.all(o),s}/* Yt: join path t onto directory e (one trailing "/" of e is dropped); a t that starts with "/" is returned unchanged. No normalization or containment check is done here. */function Yt(e,t){return t.startsWith("/")?t:`${e.endsWith("/")?e.slice(0,-1):e}/${t}`}/* Qt: evaluate the gate thresholds in e against the aggregated results t, pushing one {gateName, passed, actual, threshold, message} entry per evaluated gate. */function Qt(e,t){let s=[],r=t.reduce((l,c)=>l+c.runCount,0),n=t.reduce((l,c)=>l+Math.round(c.passRate*c.runCount),0),o=r>0?n/r:0;s.push({gateName:"passRateMin",passed:o>=e.passRateMin,actual:o,threshold:e.passRateMin,message:o>=e.passRateMin?`Pass rate ${w(o)} meets minimum ${w(e.passRateMin)}`:`Pass rate ${w(o)} below minimum ${w(e.passRateMin)}`});let a=me(t,["SCHEMA_INVALID","SCHEMA_PARSE_ERROR"]);if(s.push({gateName:"schemaFailuresMax",passed:a<=e.schemaFailuresMax,actual:a,threshold:e.schemaFailuresMax,message:a<=e.schemaFailuresMax?`Schema failures ${a} within limit ${e.schemaFailuresMax}`:`Schema failures ${a} exceed limit ${e.schemaFailuresMax}`}),e.judgeAvgMin!==void 0){let l=Oe(t,"judge"),c=l.length>0?l.reduce((f,y)=>f+y,0)/l.length:1;s.push({gateName:"judgeAvgMin",passed:c>=e.judgeAvgMin,actual:c,threshold:e.judgeAvgMin,message:c>=e.judgeAvgMin?`Judge average ${w(c)} meets minimum ${w(e.judgeAvgMin)}`:`Judge average ${w(c)} below minimum ${w(e.judgeAvgMin)}`})}if(e.driftScoreMax!==void 
0){let l=Oe(t,"drift"),c=l.length>0?Math.max(...l):0;s.push({gateName:"driftScoreMax",passed:c<=e.driftScoreMax,actual:c,threshold:e.driftScoreMax,message:c<=e.driftScoreMax?`Drift score ${w(c)} within limit ${w(e.driftScoreMax)}`:`Drift score ${w(c)} exceeds limit ${w(e.driftScoreMax)}`})}let p=me(t,["PII_DETECTED"]);s.push({gateName:"piiFailuresMax",passed:p<=e.piiFailuresMax,actual:p,threshold:e.piiFailuresMax,message:p<=e.piiFailuresMax?`PII failures ${p} within limit ${e.piiFailuresMax}`:`PII failures ${p} exceed limit ${e.piiFailuresMax}`});let i=me(t,["KEYWORD_DENIED","KEYWORD_MISSING"]);if(s.push({gateName:"keywordFailuresMax",passed:i<=e.keywordFailuresMax,actual:i,threshold:e.keywordFailuresMax,message:i<=e.keywordFailuresMax?`Keyword failures ${i} within limit ${e.keywordFailuresMax}`:`Keyword failures ${i} exceed limit ${e.keywordFailuresMax}`}),e.costMaxUsd!==void 0){let l=t.reduce((c,f)=>c+f.totalCostUsd,0);s.push({gateName:"costMaxUsd",passed:l<=e.costMaxUsd,actual:l,threshold:e.costMaxUsd,message:l<=e.costMaxUsd?`Total cost $${l.toFixed(4)} within limit $${e.costMaxUsd.toFixed(4)}`:`Total cost $${l.toFixed(4)} exceeds limit $${e.costMaxUsd.toFixed(4)}`})}if(e.latencyMaxMs!==void 0){let l=t.length>0?t.reduce((c,f)=>c+f.latencyAvgMs,0)/t.length:0;s.push({gateName:"latencyMaxMs",passed:l<=e.latencyMaxMs,actual:l,threshold:e.latencyMaxMs,message:l<=e.latencyMaxMs?`Average latency ${Math.round(l)}ms within limit ${e.latencyMaxMs}ms`:`Average latency ${Math.round(l)}ms exceeds limit ${e.latencyMaxMs}ms`})}if(e.deterministicPassRate!==void 0){let l=Me(t,"deterministic");s.push({gateName:"deterministicPassRate",passed:l>=e.deterministicPassRate,actual:l,threshold:e.deterministicPassRate,message:l>=e.deterministicPassRate?`Deterministic pass rate ${w(l)} meets minimum ${w(e.deterministicPassRate)}`:`Deterministic pass rate ${w(l)} below minimum ${w(e.deterministicPassRate)}`})}if(e.probabilisticPassRate!==void 0){let 
l=Me(t,"probabilistic");s.push({gateName:"probabilisticPassRate",passed:l>=e.probabilisticPassRate,actual:l,threshold:e.probabilisticPassRate,message:l>=e.probabilisticPassRate?`Probabilistic pass rate ${w(l)} meets minimum ${w(e.probabilisticPassRate)}`:`Probabilistic pass rate ${w(l)} below minimum ${w(e.probabilisticPassRate)}`})}return{passed:s.every(l=>l.passed),gates:s}}/* Me: fraction of passed assertions among all assertions of all runs whose D(assertionType) equals t; returns 1 when no assertion matches. D is defined outside this chunk. */function Me(e,t){let s=0,r=0;for(let n of e)for(let o of n.runs)for(let a of o.assertions)D(a.assertionType)===t&&(s++,a.passed&&r++);return s>0?r/s:1}/* me: count the (result, code) pairs for which a result's failureCodes list includes one of the codes in t. */function me(e,t){let s=0;for(let r of e)for(let n of t)r.failureCodes.includes(n)&&s++;return s}/* Oe: collect assertionScores[t].mean from every result in e that has an entry for assertion type t. */function Oe(e,t){let s=[];for(let r of e){let n=r.assertionScores[t];n&&s.push(n.mean)}return s}/* w: format a ratio as a percentage with one decimal place, e.g. 0.95 -> "95.0%". */function w(e){return(e*100).toFixed(1)+"%"}/* N is the identity function; fe is a palette whose style functions all return their input unchanged and serves as the default argument of Zt. */var N=e=>e,fe={bold:N,red:N,green:N,yellow:N,cyan:N,dim:N,greenBold:N,redBold:N};/* Zt: factory for the reporter named "pretty"; e supplies the styling functions applied to headings, status marks and summary lines. */function Zt(e=fe){return{name:"pretty",generate(t,s){let r=[],n=e;r.push(""),r.push(n.bold(" KindLM Test Results")),r.push("");let o=0;for(let c of t.suites){r.push(qt(c,n));for(let f of c.tests){r.push(es(f,n));let y=ts(f,n);y&&r.push(y);for(let m of f.assertions)r.push(ss(m,n));o+=f.costUsd}r.push("")}r.push(n.bold(" Summary"));let a=n.green(`${t.passed} passed`),p=t.failed>0?n.red(`${t.failed} failed`):`${t.failed} failed`,i=t.errored>0?n.yellow(`${t.errored} errored`):`${t.errored} errored`;if(r.push(` ${a}, ${p}, ${i} (${t.totalTests} total)`),r.push(` Duration: ${ke(t.durationMs)}`),o>0&&r.push(` Cost: ${Ie(o)}`),r.push(""),s.gates.length>0){r.push(n.bold(" Quality Gates"));for(let c of s.gates){let f=c.passed?n.green("\u2713"):n.red("\u2717");r.push(` ${f} ${c.message}`)}r.push("")}return t.failed===0&&t.errored===0&&s.passed?r.push(n.greenBold(" \u2713 All tests passed")):r.push(n.redBold(" \u2717 Some tests failed")),r.push(""),{content:r.join(`
|
|
34
|
-
`),format:"text"}}}}function qt(e,t){return` ${e.status==="passed"?t.green("\u2713"):e.status==="skipped"?t.yellow("\u25CB"):t.red("\u2717")} ${t.bold(e.name)}`}function es(e,t){return` ${e.status==="passed"?t.green("\u2713"):e.status==="skipped"?t.yellow("\u25CB"):t.red("\u2717")} ${e.name}`}function ts(e,t){if(e.status==="skipped")return null;let s=[];return e.modelId&&s.push(e.modelId),e.latencyMs>0&&s.push(ke(e.latencyMs)),e.costUsd>=5e-5&&s.push(Ie(e.costUsd)),s.length===0?null:` ${t.dim(s.join(" \xB7 "))}`}function ss(e,t){if(e.passed){let o=_e(e),a=o?`${e.label} ${t.cyan(o)}`:e.label;return` ${t.green("\u2713")} ${t.dim(a)}`}let s=_e(e),r=e.failureMessage??"failed",n=s?`${e.label} ${t.cyan(s)}`:e.label;return` ${t.red("\u2717")} ${n}: ${r}`}function _e(e){if(e.assertionType==="judge"||e.assertionType==="drift"){let t=rs(e);if(t!==null){let s=e.passed?"\u2265":"<";return`(${e.score.toFixed(2)} ${s} ${t.toFixed(2)})`}return`(${e.score.toFixed(2)})`}return""}function rs(e){if(e.metadata&&typeof e.metadata=="object"&&"threshold"in e.metadata){let t=e.metadata.threshold;if(typeof t=="number")return t}if(e.failureMessage){let t=e.failureMessage.match(/threshold (\d+\.?\d*)/i);if(t?.[1])return parseFloat(t[1]);let s=e.failureMessage.match(/below (\d+\.?\d*)/i);if(s?.[1])return parseFloat(s[1])}return null}function ke(e){return e<1e3?`${e}ms`:`${(e/1e3).toFixed(2)}s`}function Ie(e){return e<.01?`$${e.toFixed(4)}`:`$${e.toFixed(2)}`}function os(){return{name:"json",generate(e,t){let s={kindlm:{version:"1.0.0",timestamp:new Date().toISOString()},summary:{totalTests:e.totalTests,passed:e.passed,failed:e.failed,errored:e.errored,skipped:e.skipped,durationMs:e.durationMs},gates:{passed:t.passed,results:t.gates},suites:e.suites.map(r=>({name:r.name,status:r.status,tests:r.tests.map(n=>({name:n.name,status:n.status,assertions:n.assertions,latencyMs:n.latencyMs,costUsd:n.costUsd}))}))};return{content:JSON.stringify(s,null,2),format:"json"}}}}function 
ns(){return{name:"junit",generate(e,t){let s=e.durationMs/1e3,r=[];r.push('<?xml version="1.0" encoding="UTF-8"?>'),r.push(`<testsuites name="KindLM" tests="${e.totalTests}" failures="${e.failed}" errors="${e.errored}" time="${s.toFixed(3)}">`);for(let n of e.suites){let o=n.tests.filter(i=>i.status==="failed").length,a=n.tests.filter(i=>i.status==="errored").length,p=n.tests.reduce((i,l)=>i+l.latencyMs,0)/1e3;r.push(` <testsuite name="${I(n.name)}" tests="${n.tests.length}" failures="${o}" errors="${a}" time="${p.toFixed(3)}">`);for(let i of n.tests){let l=i.latencyMs/1e3;if(r.push(` <testcase name="${I(i.name)}" classname="${I(n.name)}" time="${l.toFixed(3)}">`),i.status==="skipped")r.push(" <skipped/>");else if(i.status==="errored"&&i.error)r.push(` <error message="${I(i.error.message)}" type="${I(i.error.code)}">${I(i.error.message)}</error>`);else if(i.status==="failed"){let c=i.assertions.filter(f=>!f.passed);for(let f of c)r.push(` <failure message="${I(f.label)}" type="${I(f.failureCode??"ASSERTION_FAILED")}">${I(f.failureMessage??"Assertion failed")}</failure>`)}r.push(" </testcase>")}r.push(" </testsuite>")}if(t.gates.length>0){let n=t.gates.filter(o=>!o.passed).length;r.push(` <testsuite name="Quality Gates" tests="${t.gates.length}" failures="${n}" errors="0" time="0.000">`);for(let o of t.gates)r.push(` <testcase name="${I(o.gateName)}" classname="Quality Gates" time="0.000">`),o.passed||r.push(` <failure message="${I(o.message)}" type="GATE_FAILED">${I(o.message)}</failure>`),r.push(" </testcase>");r.push(" </testsuite>")}return r.push("</testsuites>"),{content:r.join(`
|
|
35
|
-
`),format:"
|
|
36
|
-
`),
|
|
37
|
-
`),
|
|
38
|
-
`)
|
|
39
|
-
|
|
31
|
+
${t.outputText}`}],params:{temperature:0,maxTokens:512}})}catch(a){return[{assertionType:"drift",label:"Drift check (judge)",passed:!1,score:0,failureCode:"JUDGE_EVAL_ERROR",failureMessage:`Drift judge adapter error: ${a instanceof Error?a.message:String(a)}`}]}let n=ut(s.text);if(!n.ok)return[{assertionType:"drift",label:"Drift check (judge)",passed:!1,score:0,failureCode:"DRIFT_PARSE_ERROR",failureMessage:`Failed to parse drift judge response: ${n.reason}`}];let o=1-n.driftScore,r=n.driftScore<=e.maxScore;return[{assertionType:"drift",label:"Drift check (judge)",passed:r,score:o,failureCode:r?void 0:"DRIFT_EXCEEDED",failureMessage:r?void 0:`Drift score ${n.driftScore.toFixed(2)} exceeds max ${e.maxScore}: ${n.reasoning}`,metadata:{driftScore:n.driftScore,reasoning:n.reasoning,threshold:e.maxScore}}]}}}function le(e){return{type:"latency",evaluate(t){let s=t.latencyMs??0,n=s<=e.maxMs;return Promise.resolve([{assertionType:"latency",label:`Latency <= ${e.maxMs}ms`,passed:n,score:n?1:0,failureCode:n?void 0:"PROVIDER_TIMEOUT",failureMessage:n?void 0:`Latency ${s}ms exceeds max ${e.maxMs}ms`,metadata:{latencyMs:s}}])}}}function ue(e){return{type:"cost",evaluate(t){let s=t.costUsd??0,n=s<=e.maxUsd;return Promise.resolve([{assertionType:"cost",label:`Cost <= $${e.maxUsd}`,passed:n,score:n?1:0,failureCode:n?void 0:"BUDGET_EXCEEDED",failureMessage:n?void 0:`Cost $${s.toFixed(4)} exceeds max $${e.maxUsd}`,metadata:{costUsd:s}}])}}}function Y(e,t){let s=[];if(e.toolCalls)if(e.toolCalls.some(o=>o.order!==void 0)){let o=e.toolCalls.map(r=>({...r,argsSchema:r.argsSchemaResolved?JSON.stringify(r.argsSchemaResolved):r.argsSchema}));s.push(te(o))}else for(let o of e.toolCalls)if(o.shouldNotCall)s.push(ee(o.tool));else{let r=o.argsSchemaResolved?JSON.stringify(o.argsSchemaResolved):o.argsSchema??void 0;s.push(q(o.tool,o.argsMatch??void 
0,r))}if(e.output&&s.push(re({format:e.output.format,schemaFile:e.output.schemaFile,schemaContent:t?.schemaContent,contains:e.output.contains,notContains:e.output.notContains,maxLength:e.output.maxLength})),e.guardrails?.pii&&s.push(se({denyPatterns:e.guardrails.pii.denyPatterns,customPatterns:e.guardrails.pii.customPatterns})),e.guardrails?.keywords&&(e.guardrails.keywords.allow&&e.guardrails.keywords.allow.length>0&&s.push(ne(e.guardrails.keywords.allow)),e.guardrails.keywords.deny.length>0&&s.push(oe(e.guardrails.keywords.deny))),e.judge)for(let n of e.judge)s.push(ae({criteria:n.criteria,minScore:n.minScore,rubric:n.rubric,model:n.model}));return e.baseline?.drift&&s.push(ie({maxScore:e.baseline.drift.maxScore,method:e.baseline.drift.method,fields:e.baseline.drift.fields})),e.latency&&s.push(le({maxMs:e.latency.maxMs})),e.cost&&s.push(ue({maxUsd:e.cost.maxUsd})),s}var dt=new Set(["judge","drift"]);function H(e){return dt.has(e)?"probabilistic":"deterministic"}function pt(e){return H(e)==="deterministic"}function mt(e){return H(e)==="probabilistic"}function B(e,t){let s=t[e];if(s)return{ok:!0,price:s,matchedModel:e,matchType:"exact"};let n=Object.keys(t).filter(o=>e.startsWith(`${o}-`)||e.startsWith(`${o}:`));if(n.length===1){let o=n[0];if(!o)return{ok:!1};let r=t[o];if(r)return{ok:!0,price:r,matchedModel:o,matchType:"prefix"}}return{ok:!1}}async function v(e,t){let{maxRetries:s,shouldRetry:n,baseDelayMs:o=500,maxDelayMs:r=3e4,getRetryAfterMs:a}=t,c;for(let i=0;i<=s;i++)try{return await e()}catch(l){if(c=l,i>=s||!n(l))throw l;let p=a?.(l),f=ft({attempt:i,baseDelayMs:o,maxDelayMs:r,retryAfterMs:p});await gt(f)}throw c}function ft(e){let{attempt:t,baseDelayMs:s,maxDelayMs:n,retryAfterMs:o}=e;if(o!==void 0&&o>0)return o;let r=Math.min(n,s*Math.pow(2,t));return Math.max(Math.floor(s/2),Math.floor(Math.random()*r))}function gt(e){return new Promise(t=>setTimeout(t,e))}var 
ht={"gpt-4o":{input:2.5,output:10},"gpt-4o-mini":{input:.15,output:.6},"gpt-4-turbo":{input:10,output:30},"o3-mini":{input:1.1,output:4.4}};function Rt(e){switch(e){case"stop":return"stop";case"length":return"max_tokens";case"tool_calls":return"tool_calls";default:return"unknown"}}function ke(e,t){let s=typeof t=="object"&&t!==null&&"error"in t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 401:return new h("AUTH_FAILED",s,e,!1,t);case 429:return new h("RATE_LIMITED",s,e,!0,t);case 404:return new h("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new h("CONTEXT_LENGTH",s,e,!1,t):new h("PROVIDER_ERROR",s,e,!1,t);case 408:return new h("TIMEOUT",s,e,!0,t);default:return new h(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function ce(e){let t="",s="https://api.openai.com/v1",n,o=6e4,r=2;return{name:"openai",async initialize(a){if(!a.apiKey)throw new h("AUTH_FAILED","API key is required");t=a.apiKey,a.baseUrl&&(s=a.baseUrl),n=a.organization,o=a.timeoutMs,r=a.maxRetries},async complete(a){let c={model:a.model,messages:a.messages.map(u=>u.role==="tool"?{role:"tool",content:u.content,tool_call_id:u.toolCallId}:u.role==="assistant"&&u.toolCalls&&u.toolCalls.length>0?{role:"assistant",content:u.content||null,tool_calls:u.toolCalls.map(m=>({id:m.id,type:"function",function:{name:m.name,arguments:JSON.stringify(m.arguments)}}))}:{role:u.role,content:u.content}),temperature:a.params.temperature,max_tokens:a.params.maxTokens};a.params.topP!==void 0&&(c.top_p=a.params.topP),a.params.seed!==void 0&&(c.seed=a.params.seed),a.params.stopSequences&&(c.stop=a.params.stopSequences),a.tools&&a.tools.length>0&&(c.tools=a.tools.map(u=>({type:"function",function:{name:u.name,description:u.description,parameters:u.parameters}})),a.toolChoice&&(c.tool_choice=a.toolChoice));let i={"Content-Type":"application/json",Authorization:`Bearer ${t}`};n&&(i["OpenAI-Organization"]=n);let l=Date.now(),p=await v(async()=>{let u=await 
e.fetch(`${s}/chat/completions`,{method:"POST",headers:i,body:JSON.stringify(c),timeoutMs:o});if(!u.ok){let m;try{m=await u.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from OpenAI API",u.status,u.status>=500)}throw ke(u.status,m)}try{return await u.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from OpenAI API",u.status,u.status>=500)}},{maxRetries:r,shouldRetry:u=>u instanceof h&&u.retryable,getRetryAfterMs:u=>{if(!(u instanceof h)||!u.raw)return;let T=u.raw.headers?.["retry-after"];if(typeof T=="string"){let w=Number(T);if(!Number.isNaN(w)&&w>0)return w*1e3}}}),f=Date.now()-l,y=p,R=y.choices?.[0],x=R?.message,g=(x?.tool_calls??[]).map(u=>{let m;try{m=JSON.parse(u.function.arguments||"{}")}catch{m={_raw:u.function.arguments}}return{id:u.id,name:u.function.name,arguments:m}});return{text:x?.content??"",toolCalls:g,usage:{inputTokens:y.usage?.prompt_tokens??0,outputTokens:y.usage?.completion_tokens??0,totalTokens:y.usage?.total_tokens??0},raw:p,latencyMs:f,modelId:y.model??a.model,finishReason:Rt(R?.finish_reason)}},estimateCost(a,c){let i=B(a,ht);return i.ok?c.inputTokens/1e6*i.price.input+c.outputTokens/1e6*i.price.output:null},supportsTools(a){return!a.startsWith("o1-")},async embed(a,c){let i=c??"text-embedding-3-small",l={"Content-Type":"application/json",Authorization:`Bearer ${t}`};n&&(l["OpenAI-Organization"]=n);let y=(await v(async()=>{let R=await e.fetch(`${s}/embeddings`,{method:"POST",headers:l,body:JSON.stringify({model:i,input:a}),timeoutMs:o});if(!R.ok){let x;try{x=await R.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from OpenAI Embeddings API",R.status,R.status>=500)}throw ke(R.status,x)}try{return await R.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from OpenAI Embeddings API",R.status,R.status>=500)}},{maxRetries:r,shouldRetry:R=>R instanceof h&&R.retryable})).data?.[0]?.embedding;if(!y||!Array.isArray(y))throw new h("PROVIDER_ERROR","No embedding returned from 
OpenAI Embeddings API");return y}}}var yt={"claude-opus-4-5-20250929":{input:15,output:75},"claude-sonnet-4-5-20250929":{input:3,output:15},"claude-haiku-4-5-20251001":{input:.8,output:4}};function bt(e){switch(e){case"end_turn":return"stop";case"max_tokens":return"max_tokens";case"tool_use":return"tool_calls";default:return"unknown"}}function xt(e,t){let s=typeof t=="object"&&t!==null&&"error"in t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 401:return new h("AUTH_FAILED",s,e,!1,t);case 429:return new h("RATE_LIMITED",s,e,!0,t);case 404:return new h("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new h("CONTEXT_LENGTH",s,e,!1,t):new h("PROVIDER_ERROR",s,e,!1,t);case 408:return new h("TIMEOUT",s,e,!0,t);default:return new h(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function de(e){let t="",s="https://api.anthropic.com",n=6e4,o=2;return{name:"anthropic",async initialize(r){if(!r.apiKey)throw new h("AUTH_FAILED","API key is required");t=r.apiKey,r.baseUrl&&(s=r.baseUrl),n=r.timeoutMs,o=r.maxRetries},async complete(r){let a=r.messages.find(m=>m.role==="system"),c=r.messages.filter(m=>m.role!=="system").map(m=>{if(m.role==="tool")return{role:"user",content:[{type:"tool_result",tool_use_id:m.toolCallId,content:m.content}]};if(m.role==="assistant"){if(m.toolCalls&&m.toolCalls.length>0){let E=[];m.content&&E.push({type:"text",text:m.content});for(let T of m.toolCalls)E.push({type:"tool_use",id:T.id,name:T.name,input:T.arguments});return{role:"assistant",content:E}}return{role:"assistant",content:m.content}}return{role:"user",content:m.content}}),i={model:r.model,max_tokens:r.params.maxTokens,messages:c};a&&(i.system=a.content),r.params.temperature!==void 0&&(i.temperature=r.params.temperature),r.params.topP!==void 
0&&(i.top_p=r.params.topP),r.params.stopSequences&&(i.stop_sequences=r.params.stopSequences),r.tools&&r.tools.length>0&&(i.tools=r.tools.map(m=>({name:m.name,description:m.description??"",input_schema:m.parameters??{type:"object",properties:{}}})),r.toolChoice&&(i.tool_choice=r.toolChoice==="required"?{type:"any"}:{type:r.toolChoice}));let l=Date.now(),p=await v(async()=>{let m=await e.fetch(`${s}/v1/messages`,{method:"POST",headers:{"Content-Type":"application/json","x-api-key":t,"anthropic-version":"2023-06-01"},body:JSON.stringify(i),timeoutMs:n});if(!m.ok){let E;try{E=await m.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from Anthropic API",m.status,m.status>=500)}throw xt(m.status,E)}try{return await m.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from Anthropic API",m.status,m.status>=500)}},{maxRetries:o,shouldRetry:m=>m instanceof h&&m.retryable,getRetryAfterMs:m=>{if(!(m instanceof h)||!m.raw)return;let w=m.raw.headers?.["retry-after"];if(typeof w=="string"){let A=Number(w);if(!Number.isNaN(A)&&A>0)return A*1e3}}}),f=Date.now()-l,y=p,R="",x=[];for(let m of y.content??[])m.type==="text"?R+=m.text:m.type==="tool_use"&&x.push({id:m.id,name:m.name,arguments:m.input??{}});let g=y.usage?.input_tokens??0,u=y.usage?.output_tokens??0;return{text:R,toolCalls:x,usage:{inputTokens:g,outputTokens:u,totalTokens:g+u},raw:p,latencyMs:f,modelId:y.model??r.model,finishReason:bt(y.stop_reason)}},estimateCost(r,a){let c=B(r,yt);return c.ok?a.inputTokens/1e6*c.price.input+a.outputTokens/1e6*c.price.output:null},supportsTools(r){return!0}}}function At(e,t){if(t)return"tool_calls";switch(e){case"stop":return"stop";case"length":return"max_tokens";default:return"unknown"}}function Tt(e,t){let s=typeof t=="object"&&t!==null&&"error"in t?String(t.error):"Unknown error";switch(e){case 404:return new h("MODEL_NOT_FOUND",s,e,!1,t);case 429:return new h("RATE_LIMITED",s,e,!0,t);case 408:return new h("TIMEOUT",s,e,!0,t);default:return new 
h(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function pe(e){let t="http://localhost:11434",s=6e4,n=2;return{name:"ollama",async initialize(o){o.baseUrl&&(t=o.baseUrl),s=o.timeoutMs,n=o.maxRetries},async complete(o){let r=o.messages.map(u=>u.role==="tool"?{role:"tool",content:u.content}:u.role==="assistant"&&u.toolCalls&&u.toolCalls.length>0?{role:"assistant",content:u.content||"",tool_calls:u.toolCalls.map(m=>({function:{name:m.name,arguments:m.arguments}}))}:{role:u.role,content:u.content}),a={model:o.model,messages:r,stream:!1,options:{temperature:o.params.temperature,num_predict:o.params.maxTokens}},c=a.options;o.params.topP!==void 0&&(c.top_p=o.params.topP),o.params.seed!==void 0&&(c.seed=o.params.seed),o.params.stopSequences&&(c.stop=o.params.stopSequences),o.tools&&o.tools.length>0&&(a.tools=o.tools.map(u=>({type:"function",function:{name:u.name,description:u.description,parameters:u.parameters}})));let i={"Content-Type":"application/json"},l=Date.now(),p=await v(async()=>{let u=await e.fetch(`${t}/api/chat`,{method:"POST",headers:i,body:JSON.stringify(a),timeoutMs:s});if(!u.ok){let m;try{m=await u.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from Ollama API",u.status,u.status>=500)}throw Tt(u.status,m)}try{return await u.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from Ollama API",u.status,u.status>=500)}},{maxRetries:n,shouldRetry:u=>u instanceof h&&u.retryable}),f=Date.now()-l,y=p,R=y.message,x=R?.tool_calls??[],g=x.map((u,m)=>({id:`ollama_call_${Date.now()}_${m}`,name:u.function.name,arguments:u.function.arguments??{}}));return{text:R?.content??"",toolCalls:g,usage:{inputTokens:y.prompt_eval_count??0,outputTokens:y.eval_count??0,totalTokens:(y.prompt_eval_count??0)+(y.eval_count??0)},raw:p,latencyMs:f,modelId:y.model??o.model,finishReason:At(y.done_reason,x.length>0)}},estimateCost(){return 0},supportsTools(){return!0}}}var 
Ct={"gemini-2.0-flash":{input:.1,output:.4},"gemini-2.0-flash-lite":{input:.075,output:.3},"gemini-1.5-pro":{input:1.25,output:5},"gemini-1.5-flash":{input:.075,output:.3},"gemini-1.5-flash-8b":{input:.0375,output:.15}};function Et(e,t){if(t)return"tool_calls";switch(e){case"STOP":return"stop";case"MAX_TOKENS":return"max_tokens";case"SAFETY":return"stop";default:return"unknown"}}function ve(e,t){let s=typeof t=="object"&&t!==null&&"error"in t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 400:return s.toLowerCase().includes("api key")?new h("AUTH_FAILED",s,e,!1,t):new h("PROVIDER_ERROR",s,e,!1,t);case 401:case 403:return new h("AUTH_FAILED",s,e,!1,t);case 429:return new h("RATE_LIMITED",s,e,!0,t);case 404:return new h("MODEL_NOT_FOUND",s,e,!1,t);case 408:return new h("TIMEOUT",s,e,!0,t);default:return new h(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function me(e){let t="",s="https://generativelanguage.googleapis.com/v1beta",n=6e4,o=2;return{name:"gemini",async initialize(r){if(!r.apiKey)throw new h("AUTH_FAILED","API key is required");t=r.apiKey,r.baseUrl&&(s=r.baseUrl),n=r.timeoutMs,o=r.maxRetries},async complete(r){let a=[],c;for(let A of r.messages){if(A.role==="system"){c={parts:[{text:A.content}]};continue}if(A.role==="tool"){a.push({role:"function",parts:[{functionResponse:{name:A.toolName??"unknown",response:Pt(A.content)}}]});continue}if(A.role==="assistant"&&A.toolCalls&&A.toolCalls.length>0){let P=[];A.content&&P.push({text:A.content});for(let b of A.toolCalls)P.push({functionCall:{name:b.name,args:b.arguments}});a.push({role:"model",parts:P});continue}let F=A.role==="assistant"?"model":"user";a.push({role:F,parts:[{text:A.content}]})}let i={contents:a,generationConfig:{temperature:r.params.temperature,maxOutputTokens:r.params.maxTokens}};c&&(i.systemInstruction=c);let l=i.generationConfig;r.params.topP!==void 
0&&(l.topP=r.params.topP),r.params.stopSequences&&(l.stopSequences=r.params.stopSequences),r.tools&&r.tools.length>0&&(i.tools=[{functionDeclarations:r.tools.map(A=>({name:A.name,description:A.description,parameters:A.parameters}))}],r.toolChoice&&(i.toolConfig=wt(r.toolChoice)));let p={"Content-Type":"application/json","x-goog-api-key":t},f=Date.now(),y=`${s}/models/${r.model}:generateContent`,R=await v(async()=>{let A=await e.fetch(y,{method:"POST",headers:p,body:JSON.stringify(i),timeoutMs:n});if(!A.ok){let F;try{F=await A.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from Gemini API",A.status,A.status>=500)}throw ve(A.status,F)}try{return await A.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from Gemini API",A.status,A.status>=500)}},{maxRetries:o,shouldRetry:A=>A instanceof h&&A.retryable}),x=Date.now()-f,g=R;if(g.error){let A=g.error.code??400;throw ve(A,{error:g.error})}let u=g.candidates?.[0],m=u?.content?.parts??[],E="",T=[],w=0;for(let A of m)A.text!==void 0&&(E+=A.text),A.functionCall&&(T.push({id:`gemini_call_${w}`,name:A.functionCall.name,arguments:A.functionCall.args??{}}),w++);return{text:E,toolCalls:T,usage:{inputTokens:g.usageMetadata?.promptTokenCount??0,outputTokens:g.usageMetadata?.candidatesTokenCount??0,totalTokens:g.usageMetadata?.totalTokenCount??0},raw:R,latencyMs:x,modelId:g.modelVersion??r.model,finishReason:Et(u?.finishReason,T.length>0)}},estimateCost(r,a){let c=B(r,Ct);return c.ok?a.inputTokens/1e6*c.price.input+a.outputTokens/1e6*c.price.output:null},supportsTools(){return!0}}}function Pt(e){try{return JSON.parse(e)}catch{return{result:e}}}function wt(e){switch(e){case"auto":return{functionCallingConfig:{mode:"AUTO"}};case"required":return{functionCallingConfig:{mode:"ANY"}};case"none":return{functionCallingConfig:{mode:"NONE"}}}}function Mt(e){switch(e){case"stop":return"stop";case"length":return"max_tokens";case"tool_calls":return"tool_calls";default:return"unknown"}}function St(e,t){let 
s=typeof t=="object"&&t!==null&&"message"in t?String(t.message??"Unknown error"):typeof t=="object"&&t!==null&&"error"in t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 401:return new h("AUTH_FAILED",s,e,!1,t);case 429:return new h("RATE_LIMITED",s,e,!0,t);case 404:return new h("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new h("CONTEXT_LENGTH",s,e,!1,t):new h("PROVIDER_ERROR",s,e,!1,t);case 408:return new h("TIMEOUT",s,e,!0,t);default:return new h(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function fe(e){let t="",s="https://api.mistral.ai/v1",n=6e4,o=2;return{name:"mistral",async initialize(r){if(!r.apiKey)throw new h("AUTH_FAILED","API key is required");t=r.apiKey,r.baseUrl&&(s=r.baseUrl),n=r.timeoutMs,o=r.maxRetries},async complete(r){let a={model:r.model,messages:r.messages.map(g=>g.role==="tool"?{role:"tool",content:g.content,tool_call_id:g.toolCallId}:g.role==="assistant"&&g.toolCalls&&g.toolCalls.length>0?{role:"assistant",content:g.content||null,tool_calls:g.toolCalls.map(u=>({id:u.id,type:"function",function:{name:u.name,arguments:JSON.stringify(u.arguments)}}))}:{role:g.role,content:g.content}),temperature:r.params.temperature,max_tokens:r.params.maxTokens};r.params.topP!==void 0&&(a.top_p=r.params.topP),r.params.stopSequences&&(a.stop=r.params.stopSequences),r.tools&&r.tools.length>0&&(a.tools=r.tools.map(g=>({type:"function",function:{name:g.name,description:g.description,parameters:g.parameters}})),r.toolChoice&&(a.tool_choice=r.toolChoice));let c={"Content-Type":"application/json",Authorization:`Bearer ${t}`},i=Date.now(),l=await v(async()=>{let g=await e.fetch(`${s}/chat/completions`,{method:"POST",headers:c,body:JSON.stringify(a),timeoutMs:n});if(!g.ok){let u;try{u=await g.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from Mistral API",g.status,g.status>=500)}throw St(g.status,u)}try{return await g.json()}catch{throw new h("PROVIDER_ERROR","Malformed 
response body from Mistral API",g.status,g.status>=500)}},{maxRetries:o,shouldRetry:g=>g instanceof h&&g.retryable}),p=Date.now()-i,f=l,y=f.choices?.[0],R=y?.message,x=(R?.tool_calls??[]).map(g=>{let u;try{u=JSON.parse(g.function.arguments||"{}")}catch{u={_raw:g.function.arguments}}return{id:g.id,name:g.function.name,arguments:u}});return{text:R?.content??"",toolCalls:x,usage:{inputTokens:f.usage?.prompt_tokens??0,outputTokens:f.usage?.completion_tokens??0,totalTokens:f.usage?.total_tokens??0},raw:l,latencyMs:p,modelId:f.model??r.model,finishReason:Mt(y?.finish_reason)}},estimateCost(r,a){return null},supportsTools(r){return!0}}}function Ot(e){switch(e){case"COMPLETE":return"stop";case"MAX_TOKENS":return"max_tokens";case"TOOL_CALL":return"tool_calls";default:return"unknown"}}function kt(e,t){let s=typeof t=="object"&&t!==null&&"message"in t?String(t.message??"Unknown error"):"Unknown error";switch(e){case 401:return new h("AUTH_FAILED",s,e,!1,t);case 429:return new h("RATE_LIMITED",s,e,!0,t);case 404:return new h("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new h("CONTEXT_LENGTH",s,e,!1,t):new h("PROVIDER_ERROR",s,e,!1,t);case 408:return new h("TIMEOUT",s,e,!0,t);default:return new h(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function ge(e){let t="",s="https://api.cohere.com",n=6e4,o=2;return{name:"cohere",async initialize(r){if(!r.apiKey)throw new h("AUTH_FAILED","API key is required");t=r.apiKey,r.baseUrl&&(s=r.baseUrl),n=r.timeoutMs,o=r.maxRetries},async complete(r){let 
a={model:r.model,messages:r.messages.map(u=>u.role==="tool"?{role:"tool",content:u.content,tool_call_id:u.toolCallId}:u.role==="assistant"&&u.toolCalls&&u.toolCalls.length>0?{role:"assistant",content:u.content||null,tool_calls:u.toolCalls.map(m=>({id:m.id,type:"function",function:{name:m.name,arguments:JSON.stringify(m.arguments)}}))}:{role:u.role,content:u.content}),temperature:r.params.temperature,max_tokens:r.params.maxTokens};r.params.topP!==void 0&&(a.p=r.params.topP),r.params.stopSequences&&(a.stop_sequences=r.params.stopSequences),r.tools&&r.tools.length>0&&(a.tools=r.tools.map(u=>({type:"function",function:{name:u.name,description:u.description,parameters:u.parameters}})),r.toolChoice&&(a.tool_choice=r.toolChoice));let c={"Content-Type":"application/json",Authorization:`Bearer ${t}`},i=Date.now(),l=await v(async()=>{let u=await e.fetch(`${s}/v2/chat`,{method:"POST",headers:c,body:JSON.stringify(a),timeoutMs:n});if(!u.ok){let m;try{m=await u.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from Cohere API",u.status,u.status>=500)}throw kt(u.status,m)}try{return await u.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from Cohere API",u.status,u.status>=500)}},{maxRetries:o,shouldRetry:u=>u instanceof h&&u.retryable}),p=Date.now()-i,f=l,y=f.message?.content?.filter(u=>u.type==="text").map(u=>u.text).join("")??"",R=(f.message?.tool_calls??[]).map(u=>{let m;try{m=JSON.parse(u.function.arguments||"{}")}catch{m={_raw:u.function.arguments}}return{id:u.id,name:u.function.name,arguments:m}}),x=f.usage?.tokens?.input_tokens??0,g=f.usage?.tokens?.output_tokens??0;return{text:y,toolCalls:R,usage:{inputTokens:x,outputTokens:g,totalTokens:x+g},raw:l,latencyMs:p,modelId:f.model??r.model,finishReason:Ot(f.finish_reason)}},estimateCost(r,a){return null},supportsTools(r){return!0}}}var _e={openai:ce,anthropic:de,ollama:pe,gemini:me,mistral:fe,cohere:ge};function vt(e,t){let s=_e[e];if(!s){let n=Object.keys(_e).join(", ");throw new 
Error(`Unknown provider: "${e}". Supported providers: ${n}`)}return s(t)}async function he(e,t,s,n){let o=n?.maxTurns??10,r=[],a=[],c=[...t.messages],i={inputTokens:0,outputTokens:0,totalTokens:0},l=0;for(let R=0;R<o;R++){let x={...t,messages:c},g=await e.complete(x);if(r.push({request:x,response:g}),i.inputTokens+=g.usage.inputTokens,i.outputTokens+=g.usage.outputTokens,i.totalTokens+=g.usage.totalTokens,l+=g.latencyMs,g.toolCalls.length===0)return{turns:r,finalText:g.text,allToolCalls:a,totalUsage:i,totalLatencyMs:l,truncated:!1};a.push(...g.toolCalls),c=[...c,{role:"assistant",content:g.text,toolCalls:g.toolCalls}];for(let u of g.toolCalls){let m=s.find(T=>T.name===u.name),E;m?E=_t(m,u.arguments):E={error:`Tool "${u.name}" not simulated`},c.push({role:"tool",content:JSON.stringify(E),toolCallId:u.id,toolName:u.name})}}let f=r[r.length-1]?.response??{text:"",toolCalls:[],usage:{inputTokens:0,outputTokens:0,totalTokens:0},raw:null,latencyMs:0,modelId:t.model,finishReason:"unknown"},y=f.text?`${f.text}
|
|
32
|
+
[Note: Conversation truncated after ${o} turns]`:`[Note: Conversation truncated after ${o} turns]`;return{turns:r,finalText:y,allToolCalls:a,totalUsage:i,totalLatencyMs:l,truncated:!0}}function _t(e,t){if(e.responses){for(let s of e.responses)if(It(s.when,t))return s.then}return e.defaultResponse!==void 0?e.defaultResponse:{error:"No matching simulation response"}}function It(e,t){for(let[s,n]of Object.entries(e))if(JSON.stringify(t[s])!==JSON.stringify(n))return!1;return!0}import{z as d}from"zod";import{z as _}from"zod";var Ie=_.object({outputTextAttr:_.string().default("gen_ai.completion.0.content"),modelAttr:_.string().default("gen_ai.response.model"),systemAttr:_.string().default("gen_ai.system"),inputTokensAttr:_.string().default("gen_ai.usage.input_tokens"),outputTokensAttr:_.string().default("gen_ai.usage.output_tokens")}),Ne=_.object({namePattern:_.string().optional().describe("Regex to filter span names"),attributeMatch:_.record(_.string()).optional().describe("Attributes that must match"),minDurationMs:_.number().min(0).optional()}),Re=_.object({port:_.number().int().min(1).max(65535).default(4318),timeoutMs:_.number().int().min(1e3).default(3e4),spanMapping:Ie.default({}),spanFilter:Ne.optional()});var N=d.string().min(1,"Must not be empty"),Nt=d.number().min(0).max(2).default(.2),V=d.number().min(0).max(1),je=d.string().refine(e=>{try{return new RegExp(e),!0}catch{return!1}},{message:"Must be a valid regex pattern"}),z=d.object({apiKeyEnv:N.describe("Environment variable name containing the API key. Never a raw key."),baseUrl:d.string().url().optional().describe("Custom base URL for API-compatible proxies (e.g., Azure OpenAI, LiteLLM)"),organization:d.string().optional().describe("Organization ID (OpenAI-specific)")}),jt=d.object({apiKeyEnv:d.string().min(1).optional().describe("Environment variable name containing the API key. Optional for Ollama (local)."),baseUrl:d.string().url().optional().describe("Ollama server URL. 
Defaults to http://localhost:11434.")}),$t=d.object({openai:z.optional(),anthropic:z.optional(),ollama:jt.optional(),gemini:z.optional(),mistral:z.optional(),cohere:z.optional()}).refine(e=>Object.keys(e).some(t=>e[t]!==void 0),{message:"At least one provider must be configured"}),Dt=d.object({temperature:Nt,maxTokens:d.number().int().min(1).max(128e3).default(1024),topP:d.number().min(0).max(1).optional(),stopSequences:d.array(d.string()).optional(),seed:d.number().int().optional().describe("Seed for reproducibility (provider-dependent support)")}),Lt=d.object({id:N.describe("Unique identifier for this model config, referenced in reports"),provider:d.enum(["openai","anthropic","ollama","gemini","mistral","cohere"]).describe("Must match a key in the providers section"),model:N.describe("Model name as the provider expects it (e.g., 'gpt-4o', 'claude-sonnet-4-5-20250929')"),params:Dt.default({})}),Ut=d.object({system:d.string().optional().describe("System prompt template. Supports {{variable}} interpolation."),user:N.describe("User prompt template. Supports {{variable}} interpolation."),assistant:d.string().optional().describe("Prefill for assistant response (Anthropic-specific)")}),Ft=d.object({format:d.enum(["text","json"]).default("text"),schemaFile:d.string().optional().describe("Path to JSON Schema file (relative to config file). 
Required if format is 'json'."),contains:d.array(d.string()).optional().describe("Output must contain all of these substrings"),notContains:d.array(d.string()).optional().describe("Output must not contain any of these substrings"),maxLength:d.number().int().positive().optional().describe("Maximum character length of the output")}).refine(e=>!(e.format==="json"&&!e.schemaFile),{message:"schemaFile is required when format is 'json'"}),Bt=d.object({enabled:d.boolean().default(!0),denyPatterns:d.array(je).default(["\\b\\d{3}-\\d{2}-\\d{4}\\b","\\b\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}\\b","\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b"]).describe("Regex patterns that must NOT appear in output. Defaults include SSN, credit card, email."),customPatterns:d.array(d.object({name:N,pattern:je})).optional().describe("Named custom PII patterns for reporting clarity")}),Vt=d.object({deny:d.array(d.string()).default([]).describe("Words/phrases that must NOT appear in output (case-insensitive)"),allow:d.array(d.string()).optional().describe("If set, output MUST contain at least one of these words/phrases")}),Jt=d.object({criteria:N.describe("Natural language description of what to evaluate (e.g., 'Response is empathetic and professional')"),minScore:V.default(.7).describe("Minimum score (0-1) for this criterion to pass"),model:d.string().optional().describe("Override judge model for this criterion. Defaults to first model in models list."),rubric:d.string().optional().describe("Detailed rubric for the judge. 
If omitted, a default rubric is generated from criteria.")}),Gt=d.object({tool:N.describe("Expected tool/function name"),shouldNotCall:d.boolean().optional().default(!1).describe("If true, assert this tool was NOT called"),argsMatch:d.record(d.unknown()).optional().describe("Key-value pairs that must be present in the tool call arguments (partial match)"),argsSchema:d.string().optional().describe("Path to JSON Schema file to validate the tool call arguments"),argsSchemaResolved:d.record(d.unknown()).optional().describe("Resolved JSON Schema content from argsSchema file. Populated during config parsing \u2014 not user-supplied."),order:d.number().int().min(0).optional().describe("Expected position in the sequence of tool calls (0-indexed)")}),Kt=d.object({maxScore:V.default(.15).describe("Maximum drift score (0-1). Higher = more drift allowed. Fail if exceeded."),method:d.enum(["judge","embedding","field-diff"]).default("judge").describe("Drift detection method. 'judge' uses LLM comparison, 'embedding' uses cosine similarity, 'field-diff' compares JSON fields."),fields:d.array(d.string()).optional().describe("For field-diff method: JSON paths to compare (e.g., ['response.action', 'response.message'])")}),Ht=d.object({pii:Bt.optional(),keywords:Vt.optional()}),zt=d.object({output:Ft.optional(),guardrails:Ht.optional(),judge:d.array(Jt).optional().describe("LLM-as-judge evaluations. 
Each criterion is scored independently."),toolCalls:d.array(Gt).optional().describe("Expected tool/function calls in the model response"),baseline:d.object({drift:Kt.optional()}).optional(),latency:d.object({maxMs:d.number().positive().describe("Maximum allowed latency in milliseconds")}).optional().describe("Assert response latency is within threshold"),cost:d.object({maxUsd:d.number().positive().describe("Maximum allowed cost in USD")}).optional().describe("Assert response cost is within budget")}),Wt=d.object({name:N.describe("Tool/function name as the model sees it"),description:d.string().optional().describe("Tool description for documentation"),parameters:d.record(d.unknown()).optional().describe("JSON Schema for the tool's parameters"),responses:d.array(d.object({when:d.record(d.unknown()).describe("Condition: match tool call arguments (partial match)"),then:d.unknown().describe("Simulated response to return when condition matches")})).optional().describe("Simulated responses based on argument matching"),defaultResponse:d.unknown().optional().describe("Response when no 'when' condition matches")}),Xt=d.object({name:N.describe("Unique test case name within the suite. Used in reports and JUnit output."),prompt:N.optional().describe("Reference to a key in the prompts section. Exactly one of prompt or command must be set."),command:N.optional().describe("Shell command to execute. Stdout is captured and assertions run against it. Exactly one of prompt or command must be set."),vars:d.record(d.string()).default({}).describe("Variables to interpolate into the prompt template or command"),models:d.array(d.string()).optional().describe("Override: run this test only against these model IDs. Defaults to all models. 
Ignored for command tests."),repeat:d.number().int().min(1).max(100).optional().describe("Override: number of repeat runs for this specific test case"),tools:d.array(Wt).optional().describe("Simulated tools available to the model for this test case"),expect:zt.describe("Assertions to evaluate against the model output"),tags:d.array(d.string()).optional().describe("Tags for filtering test cases in CLI (e.g., --tags regression)"),skip:d.boolean().optional().default(!1).describe("Skip this test case during execution")}).refine(e=>{let t=e.prompt!==void 0,s=e.command!==void 0;return(t||s)&&!(t&&s)},{message:"Exactly one of 'prompt' or 'command' must be set on each test case"}),Yt=d.object({passRateMin:V.default(.95).describe("Minimum overall pass rate (0-1). Computed after repeats and aggregation."),schemaFailuresMax:d.number().int().min(0).default(0).describe("Maximum allowed schema validation failures across entire suite"),judgeAvgMin:V.optional().describe("Minimum average LLM-as-judge score across all criteria and test cases"),driftScoreMax:V.optional().describe("Maximum allowed drift score against active baseline"),piiFailuresMax:d.number().int().min(0).default(0).describe("Maximum allowed PII detection failures"),keywordFailuresMax:d.number().int().min(0).default(0).describe("Maximum allowed keyword guardrail failures"),costMaxUsd:d.number().positive().optional().describe("Maximum total cost in USD for the entire run. Aborts if exceeded mid-run."),latencyMaxMs:d.number().positive().optional().describe("Maximum average latency in ms. 
Fails gate if exceeded."),deterministicPassRate:V.optional().describe("Minimum pass rate for deterministic assertions only (tool_called, schema, pii, keywords, etc.)"),probabilisticPassRate:V.optional().describe("Minimum pass rate for probabilistic assertions only (judge, drift)")}),Qt=d.object({enabled:d.boolean().default(!1),framework:d.enum(["eu-ai-act","custom"]).default("eu-ai-act"),outputDir:d.string().default("./compliance-reports"),metadata:d.object({systemName:d.string().optional().describe("Name of the AI system being tested"),systemVersion:d.string().optional().describe("Version of the AI system"),riskLevel:d.enum(["high","limited","minimal"]).optional(),operator:d.string().optional().describe("Organization operating the AI system"),intendedPurpose:d.string().optional().describe("Documented intended purpose of the AI system"),dataGovernanceNotes:d.string().optional()}).optional()}),Zt=d.object({enabled:d.boolean().default(!1),apiUrl:d.string().url().default("https://api.kindlm.com/v1").describe("Cloud API URL. Override for self-hosted deployments.")}),qt=d.object({name:N,description:d.string().optional(),tags:d.array(d.string()).optional()}),$e=d.object({kindlm:d.literal(1).describe("Config schema version. 
Must be 1."),project:N.describe("Project identifier for cloud upload and report grouping"),suite:qt,providers:$t,models:d.array(Lt).min(1,"At least one model must be configured"),prompts:d.record(Ut).refine(e=>Object.keys(e).length>0,{message:"At least one prompt must be defined"}),tests:d.array(Xt).min(1,"At least one test case must be defined"),gates:Yt.default({}),compliance:Qt.optional(),trace:Re.optional().describe("OpenTelemetry trace ingestion configuration for the 'kindlm trace' command"),upload:Zt.default({}),defaults:d.object({repeat:d.number().int().min(1).max(100).default(1).describe("Default repeat count per test case"),concurrency:d.number().int().min(1).max(32).default(4).describe("Default concurrency for test execution"),timeoutMs:d.number().int().min(1e3).default(6e4).describe("Default timeout per provider call in ms"),judgeModel:d.string().optional().describe("Default model ID for LLM-as-judge assertions. Must reference a configured model.")}).default({})});function ye(e){let t=$e.safeParse(e);return t.success?k(t.data):C({code:"CONFIG_VALIDATION_ERROR",message:"Config validation failed",details:{errors:t.error.issues.map(s=>`${s.path.join(".")}: ${s.message}`)}})}import{parse as er}from"yaml";var tr=1048576,De=1e3,Le=50;function rr(e,t){if(e.length>tr)return C({code:"CONFIG_TOO_LARGE",message:`Config exceeds maximum size of 1MB (got ${(e.length/1048576).toFixed(1)}MB)`});let s;try{s=er(e)}catch(i){return C({code:"CONFIG_PARSE_ERROR",message:`Failed to parse YAML: ${i.message}`,cause:i})}let n=ye(s);if(!n.success)return n;let o=n.data;if(o.tests.length>De)return C({code:"CONFIG_VALIDATION_ERROR",message:`Config exceeds maximum of ${De} tests (got ${o.tests.length})`});if(o.models.length>Le)return C({code:"CONFIG_VALIDATION_ERROR",message:`Config exceeds maximum of ${Le} models (got ${o.models.length})`});let r=[],a=new Set;for(let i of o.models)a.has(i.id)&&r.push(`Duplicate model ID "${i.id}"`),a.add(i.id);let c=new Set;for(let i of 
o.tests)c.has(i.name)&&r.push(`Duplicate test name "${i.name}"`),c.add(i.name);for(let i of o.tests)i.prompt&&!(i.prompt in o.prompts)&&r.push(`Test "${i.name}" references prompt "${i.prompt}" which is not defined`);for(let i of o.tests)if(i.models)for(let l of i.models)a.has(l)||r.push(`Test "${i.name}" references model "${l}" which is not configured`);for(let i of o.models)o.providers[i.provider]||r.push(`Model "${i.id}" references provider "${i.provider}" which is not configured`);if(o.defaults.judgeModel&&!a.has(o.defaults.judgeModel)&&r.push(`defaults.judgeModel "${o.defaults.judgeModel}" is not a configured model`),t.fileReader)for(let i of o.tests){if(i.expect.output?.schemaFile){let l=Ue(t.configDir,i.expect.output.schemaFile);l.success?t.fileReader.readFile(l.data).success||r.push(`Test "${i.name}": schemaFile "${i.expect.output.schemaFile}" not found at ${l.data}`):r.push(`Test "${i.name}": schemaFile "${i.expect.output.schemaFile}" \u2014 ${l.error.message}`)}if(i.expect.toolCalls){for(let l of i.expect.toolCalls)if(l.argsSchema){let p=Ue(t.configDir,l.argsSchema);if(!p.success)r.push(`Test "${i.name}": argsSchema "${l.argsSchema}" for tool "${l.tool}" \u2014 ${p.error.message}`);else{let f=t.fileReader.readFile(p.data);if(!f.success)r.push(`Test "${i.name}": argsSchema "${l.argsSchema}" for tool "${l.tool}" not found at ${p.data}`);else try{l.argsSchemaResolved=JSON.parse(f.data)}catch{r.push(`Test "${i.name}": argsSchema "${l.argsSchema}" for tool "${l.tool}" is not valid JSON`)}}}}}return r.length>0?C({code:"CONFIG_VALIDATION_ERROR",message:"Config cross-reference validation failed",details:{errors:r}}):k(o)}function Ue(e,t){if(t.startsWith("/")||t.startsWith("\\"))return C({code:"PATH_TRAVERSAL",message:"Absolute paths are not allowed in config references"});if(/^[a-zA-Z]:/.test(t))return C({code:"PATH_TRAVERSAL",message:"Absolute paths are not allowed in config references"});let 
s=e.endsWith("/")?e.slice(0,-1):e,n=`${s}/${t}`,o=Fe(n),r=Fe(s);return!o.startsWith(r+"/")&&o!==r?C({code:"PATH_TRAVERSAL",message:`Path "${t}" escapes the config directory`}):k(o)}function Fe(e){let t=e.split("/"),s=[];for(let o of t)o==="."||o===""||(o===".."?s.pop():s.push(o));return(e.startsWith("/")?"/":"")+s.join("/")}var Be=/\{\{([\w.]+)\}\}/g;function J(e,t,s){let n=Ve(e,t,s);if(n.length>0)return C({code:"CONFIG_VALIDATION_ERROR",message:`Missing template variables: ${n.join(", ")}`,details:{missing:n}});let o=e.replace(Be,(r,a)=>{if(a.startsWith("env.")){let c=a.slice(4),i=s?.[c];return i===void 0?r:i}return t[a]});return k(o)}function Ve(e,t,s){let n=new Set;for(let o of e.matchAll(Be)){let r=o[1];r!==void 0&&(r.startsWith("env.")||r in t||n.add(r))}return[...n]}function be(e){let t=e[0];if(!t)return C("aggregateRuns requires at least one run");let{testCaseName:s,modelId:n}=t,r=e.filter(R=>R.assertions.every(x=>x.passed)).length/e.length,a=new Map;for(let R of e)for(let x of R.assertions){let g=x.label?`${x.assertionType}:${x.label}`:x.assertionType,u=a.get(g);u||(u=[],a.set(g,u)),u.push(x.score)}let c={};for(let[R,x]of a){let g=x.reduce((u,m)=>u+m,0);c[R]={mean:g/x.length,min:Math.min(...x),max:Math.max(...x)}}let i=new Set;for(let R of e)for(let x of R.assertions)!x.passed&&x.failureCode&&i.add(x.failureCode);let l=e.reduce((R,x)=>R+x.latencyMs,0)/e.length,p=e.reduce((R,x)=>R+(x.costEstimateUsd??0),0),f=e.reduce((R,x)=>R+x.tokenUsage.totalTokens,0),y=e.some(R=>R.errored===!0);return k({testCaseName:s,modelId:n,runCount:e.length,passed:r===1,errored:y,passRate:r,assertionScores:c,failureCodes:[...i],latencyAvgMs:l,totalCostUsd:p,totalTokens:f,runs:e})}function xe(e){let t=e.stdout.split(`
|
|
33
|
+
`),s=[],n=[],o,r=0;for(let a of t){let c=a.trimStart();if(!c.startsWith('{"kindlm":')){s.push(a);continue}let i=sr(c);if(!i){s.push(a);continue}i.kindlm==="tool_call"?n.push({id:i.id??`cmd_tc_${r++}`,name:i.name,arguments:i.arguments}):i.kindlm==="output_json"&&(o=i.data)}return{outputText:s.join(`
|
|
34
|
+
`).trim(),toolCalls:n,outputJson:o,exitCode:e.exitCode,stderr:e.stderr}}function sr(e){try{let t=JSON.parse(e);return typeof t.kindlm!="string"?null:t.kindlm==="tool_call"?typeof t.name!="string"?null:{kindlm:"tool_call",id:typeof t.id=="string"?t.id:void 0,name:t.name,arguments:typeof t.arguments=="object"&&t.arguments!==null?t.arguments:{}}:t.kindlm==="output_json"?{kindlm:"output_json",data:t.data}:null}catch{return null}}function nr(e,t,s){return{async run(){let n=Date.now(),o=new Map;for(let b of e.tests)if(b.expect.output?.schemaFile){let S=ur(t.configDir,b.expect.output.schemaFile),O=t.fileReader.readFile(S);if(!O.success)return C({code:"SCHEMA_FILE_ERROR",message:`Failed to read schema file "${b.expect.output.schemaFile}": ${O.error.message}`});try{o.set(b.name,JSON.parse(O.data))}catch(j){return C({code:"SCHEMA_FILE_ERROR",message:`Failed to parse schema file "${b.expect.output.schemaFile}" as JSON: ${j instanceof Error?j.message:String(j)}`})}}let r=s?.tags,a=[];for(let b of e.tests){if(b.skip)continue;if(r&&r.length>0){let O=b.tags??[];if(!r.some(L=>O.includes(L)))continue}let S=b.repeat??e.defaults.repeat;if(b.command)for(let O=0;O<S;O++)a.push({test:b,modelConfig:null,runIndex:O});else{let O=b.models??e.models.map(j=>j.id);for(let j of O){let L=e.models.find(I=>I.id===j);if(L)for(let I=0;I<S;I++)a.push({test:b,modelConfig:L,runIndex:I})}}}let c=0,i=!1,l=e.gates?.costMaxUsd,p=await lr(a.map(b=>async()=>{if(i)return ir(b.test.name,b.modelConfig?.id??"command",b.runIndex,c,l??0);let S=await or(e,t,b,o);return S.costEstimateUsd!==null&&S.costEstimateUsd!==void 0&&(c+=S.costEstimateUsd,l!==void 0&&c>l&&(i=!0)),S}),e.defaults.concurrency),f=b=>`${b.testCaseName}::${b.modelId}`,y=new Map;for(let b of p){let S=f(b),O=y.get(S);O||(O=[],y.set(S,O)),O.push(b)}let R=[];for(let b of y.values()){let S=be(b);if(!S.success)return C({code:"UNKNOWN_ERROR",message:S.error});R.push(S.data)}let x=R.map(b=>{let 
S=b.errored?"errored":b.passed?"passed":"failed",O=b.passed?b.runs[0]:b.runs.find(L=>!L.assertions.every(I=>I.passed))??b.runs[0],j;if(S==="errored"){let L=b.runs.find(I=>I.errored&&I.error);if(L?.error)j={code:"UNKNOWN_ERROR",message:L.error.message};else{let I=b.runs.flatMap(W=>W.assertions).find(W=>!W.passed&&W.failureMessage);I?.failureMessage&&(j={code:"UNKNOWN_ERROR",message:I.failureMessage})}}return{name:b.testCaseName,modelId:b.modelId,status:S,assertions:O?.assertions??[],error:j,latencyMs:b.latencyAvgMs,costUsd:b.totalCostUsd}}),g=e.tests.filter(b=>b.skip).map(b=>({name:b.name,modelId:"",status:"skipped",assertions:[],latencyMs:0,costUsd:0})),u=[...x,...g],m=u.filter(b=>b.status==="passed").length,E=u.filter(b=>b.status==="failed").length,T=u.filter(b=>b.status==="errored").length,w=u.filter(b=>b.status==="skipped").length,A=T>0?"errored":E>0?"failed":"passed",P={suites:[{name:e.suite.name,status:A,tests:u}],totalTests:u.length,passed:m,failed:E,errored:T,skipped:w,durationMs:Date.now()-n};return k({runResult:P,aggregated:R})}}}async function or(e,t,s,n){let{test:o,modelConfig:r,runIndex:a}=s;if(o.command)return ar(e,t,o,a,n);if(!r)return $(o.name,"unknown",a,"No model config for prompt-based test");try{t.onProgress?.({type:"test_start",test:o.name,model:r.id,run:a})}catch{}try{let c=t.adapters.get(r.provider);if(!c)return $(o.name,r.id,a,`Provider adapter "${r.provider}" not found`);let i=o.prompt?e.prompts[o.prompt]:void 0;if(!i)return $(o.name,r.id,a,`Prompt "${o.prompt}" not defined`);let l=J(i.user,o.vars);if(!l.success)return $(o.name,r.id,a,l.error.message);let p=[];if(i.system){let P=J(i.system,o.vars);if(!P.success)return $(o.name,r.id,a,P.error.message);p.push({role:"system",content:P.data})}if(p.push({role:"user",content:l.data}),i.assistant){let P=J(i.assistant,o.vars);if(!P.success)return $(o.name,r.id,a,P.error.message);p.push({role:"assistant",content:P.data})}let 
f=(o.tools??[]).map(P=>({name:P.name,description:P.description,parameters:P.parameters})),y={model:r.model,messages:p,params:{temperature:r.params.temperature,maxTokens:r.params.maxTokens,topP:r.params.topP,stopSequences:r.params.stopSequences,seed:r.params.seed},tools:f.length>0?f:void 0},R=await he(c,y,o.tools??[]),x=c.estimateCost(r.model,R.totalUsage),g={};o.expect.output?.schemaFile&&n.has(o.name)&&(g.schemaContent=n.get(o.name));let u=Y(o.expect,g),m=e.defaults.judgeModel??e.models[0]?.id,E=e.models.find(P=>P.id===m),T=E?t.adapters.get(E.provider):void 0,w={outputText:R.finalText,toolCalls:R.allToolCalls,configDir:t.configDir,latencyMs:R.totalLatencyMs,costUsd:x??void 0,judgeAdapter:T,judgeModel:E?.model,getEmbedding:c.embed?(P=>b=>P(b))(c.embed):void 0};if(t.baselineData){let P=`${o.name}::${r.id}`,b=t.baselineData.results[P];b&&(w.baselineText=b.outputText)}let A=[];for(let P of u){let b=await P.evaluate(w);A.push(...b)}let F=A.every(P=>P.passed);try{t.onProgress?.({type:"test_complete",test:o.name,model:r.id,run:a,passed:F})}catch{}return{testCaseName:o.name,modelId:r.id,runIndex:a,outputText:R.finalText,assertions:A,latencyMs:R.totalLatencyMs,tokenUsage:R.totalUsage,costEstimateUsd:x}}catch(c){try{t.onProgress?.({type:"test_complete",test:o.name,model:r.id,run:a,passed:!1})}catch{}return $(o.name,r.id,a,c instanceof Error?c.message:String(c))}}async function ar(e,t,s,n,o){let r="command";try{t.onProgress?.({type:"test_start",test:s.name,model:r,run:n})}catch{}try{if(!t.commandExecutor)return $(s.name,r,n,"Command executor not available");if(!s.command)return $(s.name,r,n,"No command specified");let a=J(s.command,s.vars);if(!a.success)return $(s.name,r,n,a.error.message);let c=Date.now(),i=await t.commandExecutor.execute(a.data,{timeoutMs:e.defaults.timeoutMs,cwd:t.configDir});if(!i.success)return $(s.name,r,n,i.error.message);let l=Date.now()-c,p=xe(i.data),f={};s.expect.output?.schemaFile&&o.has(s.name)&&(f.schemaContent=o.get(s.name));let 
y=Y(s.expect,f),R=e.defaults.judgeModel??e.models[0]?.id,x=e.models.find(T=>T.id===R),g=x?t.adapters.get(x.provider):void 0,u={outputText:p.outputText,outputJson:p.outputJson,toolCalls:p.toolCalls,configDir:t.configDir,latencyMs:l,judgeAdapter:g,judgeModel:x?.model,getEmbedding:g?.embed?(T=>w=>T(w))(g.embed):void 0};if(t.baselineData){let T=`${s.name}::${r}`,w=t.baselineData.results[T];w&&(u.baselineText=w.outputText)}let m=[];for(let T of y){let w=await T.evaluate(u);m.push(...w)}let E=m.every(T=>T.passed);try{t.onProgress?.({type:"test_complete",test:s.name,model:r,run:n,passed:E})}catch{}return{testCaseName:s.name,modelId:r,runIndex:n,outputText:p.outputText,assertions:m,latencyMs:l,tokenUsage:{inputTokens:0,outputTokens:0,totalTokens:0},costEstimateUsd:null}}catch(a){try{t.onProgress?.({type:"test_complete",test:s.name,model:r,run:n,passed:!1})}catch{}return $(s.name,r,n,a instanceof Error?a.message:String(a))}}function ir(e,t,s,n,o){return{testCaseName:e,modelId:t,runIndex:s,outputText:"",assertions:[{assertionType:"cost",label:"Budget exceeded",passed:!1,score:0,failureCode:"BUDGET_EXCEEDED",failureMessage:`Run budget exceeded: $${n.toFixed(4)} > $${o.toFixed(4)}`}],latencyMs:0,tokenUsage:{inputTokens:0,outputTokens:0,totalTokens:0},costEstimateUsd:0}}function $(e,t,s,n){return{testCaseName:e,modelId:t,runIndex:s,outputText:"",assertions:[{assertionType:"internal",label:"Execution error",passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:n}],latencyMs:0,tokenUsage:{inputTokens:0,outputTokens:0,totalTokens:0},costEstimateUsd:null,errored:!0,error:{code:"UNKNOWN_ERROR",message:n}}}async function lr(e,t){let s=Array.from({length:e.length}),n=0;async function o(){for(;n<e.length;){let a=n++,c=e[a];if(c===void 0)throw new Error(`Task at index ${a} is undefined`);s[a]=await c()}}let r=Array.from({length:Math.min(t,e.length)},()=>o());return await Promise.all(r),s}function ur(e,t){return 
t.startsWith("/")?t:`${e.endsWith("/")?e.slice(0,-1):e}/${t}`}function cr(e,t){let s=[],n=t.reduce((l,p)=>l+p.runCount,0),o=t.reduce((l,p)=>l+Math.round(p.passRate*p.runCount),0),r=n>0?o/n:0;s.push({gateName:"passRateMin",passed:r>=e.passRateMin,actual:r,threshold:e.passRateMin,message:r>=e.passRateMin?`Pass rate ${M(r)} meets minimum ${M(e.passRateMin)}`:`Pass rate ${M(r)} below minimum ${M(e.passRateMin)}`});let a=Ae(t,["SCHEMA_INVALID","SCHEMA_PARSE_ERROR"]);if(s.push({gateName:"schemaFailuresMax",passed:a<=e.schemaFailuresMax,actual:a,threshold:e.schemaFailuresMax,message:a<=e.schemaFailuresMax?`Schema failures ${a} within limit ${e.schemaFailuresMax}`:`Schema failures ${a} exceed limit ${e.schemaFailuresMax}`}),e.judgeAvgMin!==void 0){let l=Ge(t,"judge"),p=l.length>0?l.reduce((f,y)=>f+y,0)/l.length:1;s.push({gateName:"judgeAvgMin",passed:p>=e.judgeAvgMin,actual:p,threshold:e.judgeAvgMin,message:p>=e.judgeAvgMin?`Judge average ${M(p)} meets minimum ${M(e.judgeAvgMin)}`:`Judge average ${M(p)} below minimum ${M(e.judgeAvgMin)}`})}if(e.driftScoreMax!==void 0){let p=Ge(t,"drift").map(y=>1-y),f=p.length>0?Math.max(...p):0;s.push({gateName:"driftScoreMax",passed:f<=e.driftScoreMax,actual:f,threshold:e.driftScoreMax,message:f<=e.driftScoreMax?`Drift score ${M(f)} within limit ${M(e.driftScoreMax)}`:`Drift score ${M(f)} exceeds limit ${M(e.driftScoreMax)}`})}let c=Ae(t,["PII_DETECTED"]);s.push({gateName:"piiFailuresMax",passed:c<=e.piiFailuresMax,actual:c,threshold:e.piiFailuresMax,message:c<=e.piiFailuresMax?`PII failures ${c} within limit ${e.piiFailuresMax}`:`PII failures ${c} exceed limit ${e.piiFailuresMax}`});let i=Ae(t,["KEYWORD_DENIED","KEYWORD_MISSING"]);if(s.push({gateName:"keywordFailuresMax",passed:i<=e.keywordFailuresMax,actual:i,threshold:e.keywordFailuresMax,message:i<=e.keywordFailuresMax?`Keyword failures ${i} within limit ${e.keywordFailuresMax}`:`Keyword failures ${i} exceed limit ${e.keywordFailuresMax}`}),e.costMaxUsd!==void 0){let 
l=t.reduce((p,f)=>p+f.totalCostUsd,0);s.push({gateName:"costMaxUsd",passed:l<=e.costMaxUsd,actual:l,threshold:e.costMaxUsd,message:l<=e.costMaxUsd?`Total cost $${l.toFixed(4)} within limit $${e.costMaxUsd.toFixed(4)}`:`Total cost $${l.toFixed(4)} exceeds limit $${e.costMaxUsd.toFixed(4)}`})}if(e.latencyMaxMs!==void 0){let l=t.length>0?t.reduce((p,f)=>p+f.latencyAvgMs,0)/t.length:0;s.push({gateName:"latencyMaxMs",passed:l<=e.latencyMaxMs,actual:l,threshold:e.latencyMaxMs,message:l<=e.latencyMaxMs?`Average latency ${Math.round(l)}ms within limit ${e.latencyMaxMs}ms`:`Average latency ${Math.round(l)}ms exceeds limit ${e.latencyMaxMs}ms`})}if(e.deterministicPassRate!==void 0){let l=Je(t,"deterministic");s.push({gateName:"deterministicPassRate",passed:l>=e.deterministicPassRate,actual:l,threshold:e.deterministicPassRate,message:l>=e.deterministicPassRate?`Deterministic pass rate ${M(l)} meets minimum ${M(e.deterministicPassRate)}`:`Deterministic pass rate ${M(l)} below minimum ${M(e.deterministicPassRate)}`})}if(e.probabilisticPassRate!==void 0){let l=Je(t,"probabilistic");s.push({gateName:"probabilisticPassRate",passed:l>=e.probabilisticPassRate,actual:l,threshold:e.probabilisticPassRate,message:l>=e.probabilisticPassRate?`Probabilistic pass rate ${M(l)} meets minimum ${M(e.probabilisticPassRate)}`:`Probabilistic pass rate ${M(l)} below minimum ${M(e.probabilisticPassRate)}`})}return{passed:s.every(l=>l.passed),gates:s}}function Je(e,t){let s=0,n=0;for(let o of e)for(let r of o.runs)for(let a of r.assertions)H(a.assertionType)===t&&(s++,a.passed&&n++);return s>0?n/s:1}function Ae(e,t){let s=0;for(let n of e)for(let o of n.runs)for(let r of o.assertions)!r.passed&&r.failureCode&&t.includes(r.failureCode)&&s++;return s}function Ge(e,t){let s=[];for(let n of e)for(let[o,r]of Object.entries(n.assertionScores))(o===t||o.startsWith(`${t}:`))&&s.push(r.mean);return s}function M(e){return(e*100).toFixed(1)+"%"}var 
U=e=>e,Te={bold:U,red:U,green:U,yellow:U,cyan:U,dim:U,greenBold:U,redBold:U};function dr(e=Te){return{name:"pretty",async generate(t,s){let n=[],o=e;n.push(""),n.push(o.bold(" KindLM Test Results")),n.push("");let r=0;for(let p of t.suites){n.push(pr(p,o));for(let f of p.tests){n.push(mr(f,o));let y=fr(f,o);y&&n.push(y);for(let R of f.assertions)n.push(gr(R,o));r+=f.costUsd}n.push("")}n.push(o.bold(" Summary"));let a=o.green(`${t.passed} passed`),c=t.failed>0?o.red(`${t.failed} failed`):`${t.failed} failed`,i=t.errored>0?o.yellow(`${t.errored} errored`):`${t.errored} errored`;if(n.push(` ${a}, ${c}, ${i} (${t.totalTests} total)`),n.push(` Duration: ${He(t.durationMs)}`),r>0&&n.push(` Cost: ${ze(r)}`),n.push(""),s.gates.length>0){n.push(o.bold(" Quality Gates"));for(let p of s.gates){let f=p.passed?o.green("\u2713"):o.red("\u2717");n.push(` ${f} ${p.message}`)}n.push("")}return t.failed===0&&t.errored===0&&s.passed?n.push(o.greenBold(" \u2713 All tests passed")):n.push(o.redBold(" \u2717 Some tests failed")),n.push(""),{content:n.join(`
|
|
35
|
+
`),format:"text"}}}}function pr(e,t){return` ${e.status==="passed"?t.green("\u2713"):e.status==="skipped"?t.yellow("\u25CB"):t.red("\u2717")} ${t.bold(e.name)}`}function mr(e,t){return` ${e.status==="passed"?t.green("\u2713"):e.status==="skipped"?t.yellow("\u25CB"):t.red("\u2717")} ${e.name}`}function fr(e,t){if(e.status==="skipped")return null;let s=[];return e.modelId&&s.push(e.modelId),e.latencyMs>0&&s.push(He(e.latencyMs)),e.costUsd>=5e-5&&s.push(ze(e.costUsd)),s.length===0?null:` ${t.dim(s.join(" \xB7 "))}`}function gr(e,t){if(e.passed){let r=Ke(e),a=r?`${e.label} ${t.cyan(r)}`:e.label;return` ${t.green("\u2713")} ${t.dim(a)}`}let s=Ke(e),n=e.failureMessage??"failed",o=s?`${e.label} ${t.cyan(s)}`:e.label;return` ${t.red("\u2717")} ${o}: ${n}`}function Ke(e){if(e.assertionType==="judge"||e.assertionType==="drift"){let t=hr(e);if(t!==null){let s=e.passed?"\u2265":"<";return`(${e.score.toFixed(2)} ${s} ${t.toFixed(2)})`}return`(${e.score.toFixed(2)})`}return""}function hr(e){if(e.metadata&&typeof e.metadata=="object"&&"threshold"in e.metadata){let t=e.metadata.threshold;if(typeof t=="number")return t}return null}function He(e){return e<1e3?`${e}ms`:`${(e/1e3).toFixed(2)}s`}function ze(e){return e<.01?`$${e.toFixed(4)}`:`$${e.toFixed(2)}`}function Rr(e){return{name:"json",async generate(t,s){let n={kindlm:{version:e??"0.0.0",timestamp:new Date().toISOString()},summary:{totalTests:t.totalTests,passed:t.passed,failed:t.failed,errored:t.errored,skipped:t.skipped,durationMs:t.durationMs},gates:{passed:s.passed,results:s.gates},suites:t.suites.map(o=>({name:o.name,status:o.status,tests:o.tests.map(r=>({name:r.name,status:r.status,assertions:r.assertions,latencyMs:r.latencyMs,costUsd:r.costUsd}))}))};return{content:JSON.stringify(n,null,2),format:"json"}}}}function yr(){return{name:"junit",async generate(e,t){let s=e.durationMs/1e3,n=[];n.push('<?xml version="1.0" encoding="UTF-8"?>'),n.push(`<testsuites name="KindLM" tests="${e.totalTests}" failures="${e.failed}" 
errors="${e.errored}" time="${s.toFixed(3)}">`);for(let o of e.suites){let r=o.tests.filter(i=>i.status==="failed").length,a=o.tests.filter(i=>i.status==="errored").length,c=o.tests.reduce((i,l)=>i+l.latencyMs,0)/1e3;n.push(` <testsuite name="${D(o.name)}" tests="${o.tests.length}" failures="${r}" errors="${a}" time="${c.toFixed(3)}">`);for(let i of o.tests){let l=i.latencyMs/1e3;if(n.push(` <testcase name="${D(i.name)}" classname="${D(o.name)}" time="${l.toFixed(3)}">`),i.status==="skipped")n.push(" <skipped/>");else if(i.status==="errored"&&i.error)n.push(` <error message="${D(i.error.message)}" type="${D(i.error.code)}">${D(i.error.message)}</error>`);else if(i.status==="failed"){let p=i.assertions.filter(f=>!f.passed);for(let f of p)n.push(` <failure message="${D(f.label)}" type="${D(f.failureCode??"ASSERTION_FAILED")}">${D(f.failureMessage??"Assertion failed")}</failure>`)}n.push(" </testcase>")}n.push(" </testsuite>")}if(t.gates.length>0){let o=t.gates.filter(r=>!r.passed).length;n.push(` <testsuite name="Quality Gates" tests="${t.gates.length}" failures="${o}" errors="0" time="0.000">`);for(let r of t.gates)n.push(` <testcase name="${D(r.gateName)}" classname="Quality Gates" time="0.000">`),r.passed||n.push(` <failure message="${D(r.message)}" type="GATE_FAILED">${D(r.message)}</failure>`),n.push(" </testcase>");n.push(" </testsuite>")}return n.push("</testsuites>"),{content:n.join(`
|
|
36
|
+
`),format:"xml"}}}}function D(e){return e=e.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\uFFFE\uFFFF]/g,""),e.replace(/&/g,"&").replace(/</g,"<").replace(/>/g,">").replace(/"/g,""").replace(/'/g,"'")}async function We(e){let t=new TextEncoder().encode(e),s=await globalThis.crypto.subtle.digest("SHA-256",t),n=new Uint8Array(s);return Array.from(n).map(o=>o.toString(16).padStart(2,"0")).join("")}function Ce(e){if(Array.isArray(e))return e.map(Ce);if(e&&typeof e=="object"){let t=Object.entries(e).sort(([s],[n])=>s.localeCompare(n)).map(([s,n])=>[s,Ce(n)]);return Object.fromEntries(t)}return e}function br(e){return JSON.stringify(Ce(e))}function xr(e){return{name:"compliance",async generate(t,s){let n=new Date().toISOString(),o=e?.kindlmVersion??"unknown",r=[];r.push("# EU AI Act \u2014 Annex IV Compliance Report"),r.push(""),r.push(`**Generated:** ${n}`),r.push("**Framework:** EU AI Act (Regulation 2024/1689)"),r.push(`**Tool:** KindLM v${o}`),e?.systemName&&r.push(`**System Name:** ${e.systemName}`),e?.operator&&r.push(`**Operator:** ${e.operator}`),e?.riskLevel&&r.push(`**Risk Level:** ${e.riskLevel}`),e?.intendedPurpose&&r.push(`**Intended Purpose:** ${e.intendedPurpose}`),r.push(""),r.push("## Article 9 \u2014 Risk Management System"),r.push(""),r.push("Testing demonstrates ongoing risk identification and mitigation through automated behavioral regression tests."),r.push(""),r.push(Q(s,["passRateMin"])),r.push("## Article 10 \u2014 Data and Data Governance"),r.push(""),r.push("PII detection guardrails verify that personal data is not exposed in AI system outputs."),r.push(""),r.push(Q(s,["piiFailuresMax","keywordFailuresMax"])),r.push("## Article 12 \u2014 Record-Keeping"),r.push(""),r.push("### Test Execution Log"),r.push(""),r.push("| Metric | Value |"),r.push("|--------|-------|"),r.push(`| Total Tests | ${t.totalTests} |`),r.push(`| Passed | ${t.passed} |`),r.push(`| Failed | ${t.failed} |`),r.push(`| Errored | ${t.errored} |`),r.push(`| Duration | ${t.durationMs}ms 
|`),r.push(""),r.push("### Suite Results"),r.push("");for(let p of t.suites){r.push(`**${p.name}** \u2014 ${p.status}`);for(let f of p.tests){let y=f.status==="passed"?"PASS":"FAIL";r.push(`- [${y}] ${f.name}`)}r.push("")}r.push("## Article 13 \u2014 Transparency and Provision of Information"),r.push(""),r.push("This report provides transparent documentation of AI system testing methodology, results, and quality gate evaluations as required under Article 13."),r.push(""),r.push(Q(s,["judgeAvgMin","driftScoreMax"])),r.push("## Article 15 \u2014 Accuracy, Robustness and Cybersecurity"),r.push(""),r.push("Schema validation and behavioral assertions verify output accuracy and robustness."),r.push(""),r.push(Q(s,["schemaFailuresMax","costMaxUsd","latencyMaxMs"])),r.push("## Quality Gate Summary"),r.push(""),r.push("| Gate | Result | Actual | Threshold |"),r.push("|------|--------|--------|-----------|");for(let p of s.gates){let f=p.passed?"PASS":"FAIL";r.push(`| ${p.gateName} | ${f} | ${Xe(p.actual)} | ${Xe(p.threshold)} |`)}r.push("");let a=s.passed?"PASS":"FAIL";r.push(`**Overall Verdict:** ${a}`),r.push("");let c=r.join(`
|
|
37
|
+
`),i=await We(c),l=e?await We(br({content:c,metadata:e})):i;return r.push("---"),r.push(`**Tamper Evidence Hash (SHA-256):** \`${i}\``),r.push(`**Run Identity Hash (SHA-256):** \`${l}\``),r.push(`**Run ID:** ${e?.runId??"N/A"}`),r.push(`**KindLM Version:** ${e?.kindlmVersion??"N/A"}`),r.push(`**Git Commit:** ${e?.gitCommitSha??"N/A"}`),r.push(`**Models:** ${e?.modelIds?.join(", ")??"N/A"}`),r.push(""),{content:r.join(`
|
|
38
|
+
`),format:"markdown"}}}}function Q(e,t){let s=e.gates.filter(o=>t.includes(o.gateName));if(s.length===0)return"";let n=[];n.push("**Gate Evidence:**");for(let o of s){let r=o.passed?"PASS":"FAIL";n.push(`- [${r}] ${o.message}`)}return n.push(""),n.join(`
|
|
39
|
+
`)}function Xe(e){return Number.isInteger(e)?String(e):e.toFixed(4)}var G="1",Ee=["1"],Ar={};function Qe(e){if(typeof e!="object"||e===null)return{ok:!1,error:"Baseline data is not an object"};let t=e,s=t.version;if(typeof s!="string")return{ok:!1,error:"Baseline data missing version field"};if(s===G){let a=Ye(t);return a.ok?{ok:!0,baseline:t,migrated:!1}:a}if(!Ee.includes(s))return{ok:!1,error:`Unsupported baseline version: "${s}". Known versions: ${Ee.join(", ")}`};let n=s,o=!1;for(;n!==G;){let a=Ar[n];if(!a)return{ok:!1,error:`No migration path from version "${n}" to "${G}"`};t=a(t),n=t.version,o=!0}let r=Ye(t);return r.ok?{ok:!0,baseline:t,migrated:o}:r}function Ye(e){return typeof e.suiteName!="string"?{ok:!1,error:"Baseline file missing required field: suiteName"}:typeof e.createdAt!="string"?{ok:!1,error:"Baseline file missing required field: createdAt"}:typeof e.results!="object"||e.results===null?{ok:!1,error:"Baseline file missing required field: results"}:{ok:!0}}function Ze(e){return JSON.stringify(e,null,2)}function qe(e){let t;try{t=JSON.parse(e)}catch{return C({code:"BASELINE_CORRUPT",message:"Baseline file is not valid JSON"})}if(typeof t!="object"||t===null)return C({code:"BASELINE_CORRUPT",message:"Baseline file is not a JSON object"});let s=t;if(typeof s.version!="string")return C({code:"BASELINE_CORRUPT",message:"Baseline file missing required field: version"});let n=Qe(t);if(!n.ok){let o=Ee.includes(s.version);return C({code:o?"BASELINE_CORRUPT":"BASELINE_VERSION_MISMATCH",message:n.error})}return k(n.baseline)}function Tr(e,t){let s=t.read(e);return s.success?qe(s.data):s}function Cr(e,t){let s=Ze(e);return t.write(e.suiteName,s)}function Er(e){return e.list()}function Pr(e,t,s){let n={};for(let o of t){let 
r=`${o.testCaseName}::${o.modelId}`,c=o.runs.find(i=>i.assertions.every(l=>l.passed))??o.runs[0];n[r]={passRate:o.passRate,outputText:c?.outputText??"",failureCodes:o.failureCodes,latencyAvgMs:o.latencyAvgMs,costUsd:o.totalCostUsd,runCount:o.runCount}}return{version:G,suiteName:e,createdAt:s,results:n}}function wr(e,t){let s=[],n=[],o=[],r=[],a=[],c=new Set(Object.keys(e.results));for(let[i,l]of Object.entries(t)){let p=e.results[i];if(!p){r.push(i);continue}c.delete(i);let f=l.passRate-p.passRate;if(f<-.001){let y=l.failureCodes.filter(R=>!p.failureCodes.includes(R));s.push({testName:i,baselinePassRate:p.passRate,currentPassRate:l.passRate,newFailureCodes:y})}else f>.001?n.push({testName:i,baselinePassRate:p.passRate,currentPassRate:l.passRate}):o.push({testName:i,passRate:l.passRate})}for(let i of c)a.push(i);return{suiteName:e.suiteName,hasBaseline:!0,regressions:s,improvements:n,unchanged:o,newTests:r,removedTests:a}}function Mr(e){if(!e||typeof e!="object")return C({code:"CONFIG_PARSE_ERROR",message:"OTLP payload must be a non-null object"});let t=e;if(!Array.isArray(t.resourceSpans))return C({code:"CONFIG_PARSE_ERROR",message:"OTLP payload missing resourceSpans array"});let s=[];for(let n of t.resourceSpans){let o=tt(n.resource?.attributes??[]);if(Array.isArray(n.scopeSpans)){for(let r of n.scopeSpans)if(Array.isArray(r.spans))for(let a of r.spans){let c=et(a.startTimeUnixNano),i=et(a.endTimeUnixNano);s.push({traceId:a.traceId,spanId:a.spanId,parentSpanId:a.parentSpanId||void 0,name:a.name,kind:a.kind,startTimeMs:c,endTimeMs:i,durationMs:i-c,attributes:tt(a.attributes??[]),resourceAttributes:o,statusCode:a.status?.code,statusMessage:a.status?.message})}}}return k(s)}function et(e){try{let t=BigInt(e);return Number(t/1000000n)}catch{return 0}}function tt(e){let t={};for(let s of e){let n=Sr(s.value);n!==void 0&&(t[s.key]=n)}return t}function Sr(e){if(e.stringValue!==void 0)return e.stringValue;if(e.intValue!==void 0)return 
parseInt(e.intValue,10);if(e.doubleValue!==void 0)return e.doubleValue;if(e.boolValue!==void 0)return e.boolValue}function Or(e,t){return t?e.filter(s=>{if(t.namePattern){if(X(t.namePattern))return!0;if(!new RegExp(t.namePattern).test(s.name))return!1}if(t.attributeMatch){for(let[n,o]of Object.entries(t.attributeMatch))if(String(s.attributes[n])!==o)return!1}return!(t.minDurationMs!==void 0&&s.durationMs<t.minDurationMs)}):e}function kr(e,t){let s="",n=[],o=0,r=0,a=0,c,i;for(let l of e){let p={...l.resourceAttributes,...l.attributes},f=p[t.outputTextAttr];typeof f=="string"&&f&&(s=s?`${s}
|
|
40
|
+
${f}`:f);let y=p[t.modelAttr];typeof y=="string"&&y&&(c=y);let R=p[t.systemAttr];typeof R=="string"&&R&&(i=R);let x=p[t.inputTokensAttr];typeof x=="number"&&(r+=x);let g=p[t.outputTokensAttr];typeof g=="number"&&(a+=g),l.parentSpanId||(o+=l.durationMs);let u=p["gen_ai.tool.name"],m=p["gen_ai.tool.arguments"];if(typeof u=="string"){let E={};if(typeof m=="string")try{E=JSON.parse(m)}catch{}n.push({id:l.spanId,name:u,arguments:E})}}return{outputText:s,toolCalls:n,latencyMs:o,inputTokens:r,outputTokens:a,model:c,system:i}}function vr(e,t){let s=e.toolCalls.map(n=>({id:n.id,name:n.name,arguments:n.arguments}));return{outputText:e.outputText,toolCalls:s,configDir:t.configDir,latencyMs:e.latencyMs,judgeAdapter:t.judgeAdapter,judgeModel:t.judgeModel,baselineText:t.baselineText}}export{G as BASELINE_VERSION,$e as KindLMConfigSchema,h as ProviderError,Ne as SpanFilterSchema,Ie as SpanMappingSchema,Re as TraceConfigSchema,be as aggregateRuns,Pr as buildBaselineData,vr as buildContextFromTrace,H as classifyAssertion,wr as compareBaseline,Oe as cosineSimilarity,de as createAnthropicAdapter,Y as createAssertionsFromExpect,ge as createCohereAdapter,xr as createComplianceReporter,ue as createCostAssertion,ie as createDriftAssertion,me as createGeminiAdapter,Rr as createJsonReporter,ae as createJudgeAssertion,yr as createJunitReporter,oe as createKeywordsAbsentAssertion,ne as createKeywordsPresentAssertion,le as createLatencyAssertion,fe as createMistralAdapter,pe as createOllamaAdapter,ce as createOpenAIAdapter,se as createPiiAssertion,dr as createPrettyReporter,vt as createProvider,nr as createRunner,re as createSchemaAssertion,q as createToolCalledAssertion,ee as createToolNotCalledAssertion,te as createToolOrderAssertion,qe as deserializeBaseline,C as err,cr as evaluateGates,Or as filterSpans,Ve as findMissingVars,J as interpolate,pt as isDeterministic,mt as isProbabilistic,Er as listBaselines,B as lookupModelPricing,kr as mapSpansToResult,Qe as migrateBaseline,Te as noColor,k 
as ok,xe as parseCommandOutput,rr as parseConfig,Mr as parseOtlpPayload,Tr as readBaseline,he as runConversation,Ze as serializeBaseline,ye as validateConfig,K as validateUnitIntervalScore,v as withRetry,Cr as writeBaseline};
|
|
40
41
|
//# sourceMappingURL=index.js.map
|