@kindlm/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +40 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +3061 -0
- package/dist/index.d.ts +3061 -0
- package/dist/index.js +40 -0
- package/dist/index.js.map +1 -0
- package/package.json +57 -0
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"use strict";var pt=Object.create;var z=Object.defineProperty;var ct=Object.getOwnPropertyDescriptor;var dt=Object.getOwnPropertyNames;var mt=Object.getPrototypeOf,ft=Object.prototype.hasOwnProperty;var gt=(e,t)=>{for(var s in t)z(e,s,{get:t[s],enumerable:!0})},Ae=(e,t,s,r)=>{if(t&&typeof t=="object"||typeof t=="function")for(let n of dt(t))!ft.call(e,n)&&n!==s&&z(e,n,{get:()=>t[n],enumerable:!(r=ct(t,n))||r.enumerable});return e};var Pe=(e,t,s)=>(s=e!=null?pt(mt(e)):{},Ae(t||!e||!e.__esModule?z(s,"default",{value:e,enumerable:!0}):s,e)),ht=e=>Ae(z({},"__esModule",{value:!0}),e);var Ss={};gt(Ss,{BASELINE_VERSION:()=>F,KindLMConfigSchema:()=>Re,ProviderError:()=>g,SpanFilterSchema:()=>ge,SpanMappingSchema:()=>fe,TraceConfigSchema:()=>ie,aggregateRuns:()=>ue,buildBaselineData:()=>st,buildContextFromTrace:()=>ut,classifyAssertion:()=>D,compareBaseline:()=>rt,createAnthropicAdapter:()=>te,createAssertionRegistry:()=>Se,createAssertionsFromExpect:()=>K,createCohereAdapter:()=>ne,createComplianceReporter:()=>Ze,createCostAssertion:()=>q,createDriftAssertion:()=>V,createGeminiAdapter:()=>re,createJsonReporter:()=>We,createJudgeAssertion:()=>J,createJunitReporter:()=>Xe,createKeywordsAbsentAssertion:()=>Q,createKeywordsPresentAssertion:()=>Y,createLatencyAssertion:()=>Z,createMistralAdapter:()=>oe,createOllamaAdapter:()=>se,createOpenAIAdapter:()=>ee,createPiiAssertion:()=>G,createPrettyReporter:()=>Ke,createProvider:()=>ke,createRunner:()=>Ue,createSchemaAssertion:()=>B,createToolCalledAssertion:()=>U,createToolNotCalledAssertion:()=>W,createToolOrderAssertion:()=>X,deserializeBaseline:()=>Te,err:()=>A,evaluateGates:()=>Je,filterSpans:()=>it,findMissingVars:()=>ye,interpolate:()=>L,isDeterministic:()=>Me,isProbabilistic:()=>Oe,listBaselines:()=>tt,mapSpansToResult:()=>lt,noColor:()=>ce,ok:()=>S,parseCommandOutput:()=>pe,parseConfig:()=>Le,parseOtlpPayload:()=>at,readBaseline:()=>qe,runConversation:()=>ae,serializeBaseline:()=>xe,validateConfig:()=>le,withRetry:()=>_,writeBaseline:()=>et});module.exports=ht(Ss);var g=class extends Error{constructor(s,r,n,o=!1,a){super(r);this.code=s;this.statusCode=n;this.retryable=o;this.raw=a;this.name="ProviderError"}};function S(e){return{success:!0,data:e}}function A(e){return{success:!1,error:e}}function Ee(e,t){for(let[s,r]of Object.entries(t))if(JSON.stringify(e[s])!==JSON.stringify(r))return!1;return!0}function U(e,t){return{type:"tool_called",evaluate(s){let r=[],n=s.toolCalls.filter(a=>a.name===e),o=s.toolCalls.map(a=>a.name);if(n.length===0)return r.push({assertionType:"tool_called",label:`Tool "${e}" called`,passed:!1,score:0,failureCode:"TOOL_CALL_MISSING",failureMessage:`Expected tool "${e}" to be called, but got: [${o.join(", ")}]`}),Promise.resolve(r);if(r.push({assertionType:"tool_called",label:`Tool "${e}" called`,passed:!0,score:1}),t){let a=n.some(p=>Ee(p.arguments,t));r.push({assertionType:"tool_called",label:`Tool "${e}" args match`,passed:a,score:a?1:0,failureCode:a?void 0:"TOOL_CALL_ARGS_MISMATCH",failureMessage:a?void 0:`Expected args ${JSON.stringify(t)}, got ${JSON.stringify(n[0]?.arguments)}`})}return Promise.resolve(r)}}}function W(e){return{type:"tool_not_called",evaluate(t){let s=t.toolCalls.some(r=>r.name===e);return Promise.resolve([{assertionType:"tool_not_called",label:`Tool "${e}" not called`,passed:!s,score:s?0:1,failureCode:s?"TOOL_CALL_UNEXPECTED":void 0,failureMessage:s?`Expected tool "${e}" to NOT be called, but it was`:void 0}])}}}function X(e){return{type:"tool_order",evaluate(t){let s=[];for(let r of e){if(r.shouldNotCall){let o=t.toolCalls.some(a=>a.name===r.tool);s.push({assertionType:"tool_order",label:`Tool "${r.tool}" not called`,passed:!o,score:o?0:1,failureCode:o?"TOOL_CALL_UNEXPECTED":void 0,failureMessage:o?`Expected tool "${r.tool}" to NOT be called, but it was`:void 0});continue}let n=t.toolCalls.filter(o=>o.name===r.tool);if(n.length===0){s.push({assertionType:"tool_order",label:`Tool "${r.tool}" called`,passed:!1,score:0,failureCode:"TOOL_CALL_MISSING",failureMessage:`Expected tool "${r.tool}" to be called, but it was not`});continue}if(s.push({assertionType:"tool_order",label:`Tool "${r.tool}" called`,passed:!0,score:1}),r.argsMatch){let o=n.some(a=>Ee(a.arguments,r.argsMatch??{}));s.push({assertionType:"tool_order",label:`Tool "${r.tool}" args match`,passed:o,score:o?1:0,failureCode:o?void 0:"TOOL_CALL_ARGS_MISMATCH",failureMessage:o?void 0:`Expected args ${JSON.stringify(r.argsMatch)}, got ${JSON.stringify(n[0]?.arguments)}`})}if(r.order!==void 0){let o=t.toolCalls.findIndex(p=>p.name===r.tool),a=o===r.order;s.push({assertionType:"tool_order",label:`Tool "${r.tool}" at position ${r.order}`,passed:a,score:a?1:0,failureCode:a?void 0:"TOOL_CALL_ORDER_WRONG",failureMessage:a?void 0:`Expected "${r.tool}" at position ${r.order}, but found at position ${o}`})}}return Promise.resolve(s)}}}var me;async function Rt(){if(me)return me;let e=await import("ajv"),t=await import("ajv-formats"),s=e.default,r=t.default,n=new s({allErrors:!0,strict:!1});return r(n),me=n,n}var ve=new Map;function yt(e,t){let s=JSON.stringify(t),r=ve.get(s);if(r)return r;let n=e.compile(t);return ve.set(s,n),n}function B(e){return{type:"schema",async evaluate(t){let s=[],r;if(e.format==="json")try{r=JSON.parse(t.outputText),s.push({assertionType:"schema",label:"Output is valid JSON",passed:!0,score:1})}catch(n){return s.push({assertionType:"schema",label:"Output is valid JSON",passed:!1,score:0,failureCode:"SCHEMA_PARSE_ERROR",failureMessage:`Failed to parse output as JSON: ${n instanceof Error?n.message:String(n)}`}),s}if(e.schemaContent){let n=await Rt(),o=yt(n,e.schemaContent),a=o(r??t.outputText);s.push({assertionType:"schema",label:"Output matches JSON Schema",passed:a,score:a?1:0,failureCode:a?void 0:"SCHEMA_INVALID",failureMessage:a?void 0:`Schema validation failed: ${n.errorsText(o.errors)}`,metadata:a?void 0:{errors:o.errors}})}if(e.contains){let n=t.outputText.toLowerCase();for(let o of e.contains){let a=n.includes(o.toLowerCase());s.push({assertionType:"schema",label:`Output contains "${o}"`,passed:a,score:a?1:0,failureCode:a?void 0:"CONTAINS_FAILED",failureMessage:a?void 0:`Expected output to contain "${o}"`})}}if(e.notContains){let n=t.outputText.toLowerCase();for(let o of e.notContains){let a=n.includes(o.toLowerCase());s.push({assertionType:"schema",label:`Output does not contain "${o}"`,passed:!a,score:a?0:1,failureCode:a?"NOT_CONTAINS_FAILED":void 0,failureMessage:a?`Expected output to NOT contain "${o}"`:void 0})}}if(e.maxLength!==void 0){let n=t.outputText.length<=e.maxLength;s.push({assertionType:"schema",label:`Output length <= ${e.maxLength}`,passed:n,score:n?1:0,failureCode:n?void 0:"MAX_LENGTH_EXCEEDED",failureMessage:n?void 0:`Output length ${t.outputText.length} exceeds max ${e.maxLength}`})}return s}}}function bt(e){return e.length<=4?"*".repeat(e.length):e.slice(0,2)+"*".repeat(e.length-4)+e.slice(-2)}var xt=/(\+|\*|\{[^}]+\})\)?(\+|\*|\{[^}]+\})/;function Tt(e){return xt.test(e)}function Ct(e,t,s){let r=[],n=Date.now();e.lastIndex=0;let o;for(;(o=e.exec(t))!==null&&(r.push(o[0]),!(r.length>=s||Date.now()-n>100)););return r}function G(e){let t=[],s;for(let r=0;r<e.denyPatterns.length;r++){let n=e.denyPatterns[r];n!==void 0&&t.push({name:`pii-pattern-${r+1}`,regex:new RegExp(n,"gi")})}if(e.customPatterns)for(let r of e.customPatterns){if(Tt(r.pattern)){s=[{assertionType:"pii",label:"No PII detected",passed:!1,score:0,failureCode:"INVALID_PATTERN",failureMessage:`Custom pattern "${r.name}" contains nested quantifiers and may cause catastrophic backtracking`}];break}t.push({name:r.name,regex:new RegExp(r.pattern,"gi")})}return{type:"pii",evaluate(r){if(s)return Promise.resolve(s);let n=[],o=0;for(let{name:p,regex:i}of t){if(o>=1e3)break;let u=1e3-o,c=Ct(i,r.outputText,u);for(let f of c)n.push({name:p,redacted:bt(f)});o+=c.length}let a=n.length===0;return Promise.resolve([{assertionType:"pii",label:"No PII detected",passed:a,score:a?1:0,failureCode:a?void 0:"PII_DETECTED",failureMessage:a?void 0:`Found ${n.length} PII match(es): ${n.map(p=>`${p.name}=${p.redacted}`).join(", ")}`,metadata:a?void 0:{matches:n}}])}}}function Y(e){return{type:"keywords_present",evaluate(t){let s=t.outputText.toLowerCase(),r=e.some(n=>s.includes(n.toLowerCase()));return Promise.resolve([{assertionType:"keywords_present",label:"Required keyword present",passed:r,score:r?1:0,failureCode:r?void 0:"KEYWORD_MISSING",failureMessage:r?void 0:`Expected at least one of [${e.join(", ")}] in output`}])}}}function Q(e){return{type:"keywords_absent",evaluate(t){let s=t.outputText.toLowerCase(),r=[];for(let n of e){let o=s.includes(n.toLowerCase());r.push({assertionType:"keywords_absent",label:`Keyword "${n}" absent`,passed:!o,score:o?0:1,failureCode:o?"KEYWORD_DENIED":void 0,failureMessage:o?`Denied keyword "${n}" found in output`:void 0})}return Promise.resolve(r)}}}var At=`You are an impartial AI judge evaluating an AI assistant's response.
|
|
2
|
+
You will be given:
|
|
3
|
+
- The assistant's response
|
|
4
|
+
- Evaluation criteria
|
|
5
|
+
- An optional rubric
|
|
6
|
+
|
|
7
|
+
Score the response from 0.0 to 1.0 based on how well it meets the criteria.
|
|
8
|
+
|
|
9
|
+
Respond ONLY with a JSON object in this exact format:
|
|
10
|
+
{"score": <number between 0.0 and 1.0>, "reasoning": "<brief explanation>"}`;function Pt(e,t,s){let r=`## Assistant Response
|
|
11
|
+
${e}
|
|
12
|
+
|
|
13
|
+
## Criteria
|
|
14
|
+
${t}`;return s&&(r+=`
|
|
15
|
+
|
|
16
|
+
## Rubric
|
|
17
|
+
${s}`),r}function Et(e){let t=e.match(/```(?:json)?\s*([\s\S]*?)```/)??e.match(/(\{[\s\S]*\})/);if(!t?.[1])return null;try{let s=JSON.parse(t[1]);return typeof s.score=="number"&&typeof s.reasoning=="string"?{score:s.score,reasoning:s.reasoning}:null}catch{return null}}function J(e){return{type:"judge",async evaluate(t){if(!t.judgeAdapter||!t.judgeModel)return[{assertionType:"judge",label:`Judge: ${e.criteria}`,passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:"Judge assertion requires judgeAdapter and judgeModel in context"}];let s=await t.judgeAdapter.complete({model:t.judgeModel,messages:[{role:"system",content:At},{role:"user",content:Pt(t.outputText,e.criteria,e.rubric)}],params:{temperature:0,maxTokens:512}}),r=Et(s.text);if(!r)return[{assertionType:"judge",label:`Judge: ${e.criteria}`,passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:`Failed to parse judge response: ${s.text.slice(0,200)}`}];let n=r.score>=e.minScore;return[{assertionType:"judge",label:`Judge: ${e.criteria}`,passed:n,score:r.score,failureCode:n?void 0:"JUDGE_BELOW_THRESHOLD",failureMessage:n?void 0:`Score ${r.score} below threshold ${e.minScore}: ${r.reasoning}`,metadata:{reasoning:r.reasoning}}]}}}var vt=`You are an impartial AI judge comparing two AI assistant responses.
|
|
18
|
+
You will be given a baseline response and a new response.
|
|
19
|
+
Evaluate how much the new response has drifted from the baseline.
|
|
20
|
+
|
|
21
|
+
Score from 0.0 to 1.0 where:
|
|
22
|
+
- 0.0 = responses are semantically identical
|
|
23
|
+
- 0.5 = moderate differences in tone, detail, or structure
|
|
24
|
+
- 1.0 = completely different meaning or contradictory
|
|
25
|
+
|
|
26
|
+
Respond ONLY with a JSON object in this exact format:
|
|
27
|
+
{"driftScore": <number between 0.0 and 1.0>, "reasoning": "<brief explanation>"}`;function wt(e){let t=e.match(/```(?:json)?\s*([\s\S]*?)```/)??e.match(/(\{[\s\S]*\})/);if(!t?.[1])return null;try{let s=JSON.parse(t[1]);return typeof s.driftScore=="number"&&typeof s.reasoning=="string"?{driftScore:s.driftScore,reasoning:s.reasoning}:null}catch{return null}}function we(e,t){let s=t.split("."),r=e;for(let n of s){if(r==null||typeof r!="object")return;r=r[n]}return r}function St(e,t,s){let r,n;try{r=JSON.parse(e),n=JSON.parse(t)}catch{return{driftScore:1,mismatched:["(parse error)"]}}let o=[];for(let p of s){let i=we(r,p),u=we(n,p);JSON.stringify(i)!==JSON.stringify(u)&&o.push(p)}return{driftScore:s.length>0?o.length/s.length:0,mismatched:o}}function V(e){return{type:"drift",async evaluate(t){if(!t.baselineText)return[{assertionType:"drift",label:"Drift check",passed:!0,score:1,metadata:{reason:"No baseline available"}}];if(e.method==="embedding")return[{assertionType:"drift",label:"Drift check (embedding)",passed:!0,score:1,metadata:{reason:"Embedding method not yet implemented"}}];if(e.method==="field-diff"){let a=e.fields??[],{driftScore:p,mismatched:i}=St(t.baselineText,t.outputText,a),u=1-p,c=p<=e.maxScore;return[{assertionType:"drift",label:"Drift check (field-diff)",passed:c,score:u,failureCode:c?void 0:"DRIFT_EXCEEDED",failureMessage:c?void 0:`Drift score ${p.toFixed(2)} exceeds max ${e.maxScore}. Mismatched: [${i.join(", ")}]`,metadata:{driftScore:p,mismatched:i}}]}if(!t.judgeAdapter||!t.judgeModel)return[{assertionType:"drift",label:"Drift check (judge)",passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:"Drift judge method requires judgeAdapter and judgeModel in context"}];let s=await t.judgeAdapter.complete({model:t.judgeModel,messages:[{role:"system",content:vt},{role:"user",content:`## Baseline Response
|
|
28
|
+
${t.baselineText}
|
|
29
|
+
|
|
30
|
+
## New Response
|
|
31
|
+
${t.outputText}`}],params:{temperature:0,maxTokens:512}}),r=wt(s.text);if(!r)return[{assertionType:"drift",label:"Drift check (judge)",passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:`Failed to parse drift judge response: ${s.text.slice(0,200)}`}];let n=1-r.driftScore,o=r.driftScore<=e.maxScore;return[{assertionType:"drift",label:"Drift check (judge)",passed:o,score:n,failureCode:o?void 0:"DRIFT_EXCEEDED",failureMessage:o?void 0:`Drift score ${r.driftScore.toFixed(2)} exceeds max ${e.maxScore}: ${r.reasoning}`,metadata:{driftScore:r.driftScore,reasoning:r.reasoning}}]}}}function Z(e){return{type:"latency",evaluate(t){let s=t.latencyMs??0,r=s<=e.maxMs;return Promise.resolve([{assertionType:"latency",label:`Latency <= ${e.maxMs}ms`,passed:r,score:r?1:0,failureCode:r?void 0:"PROVIDER_TIMEOUT",failureMessage:r?void 0:`Latency ${s}ms exceeds max ${e.maxMs}ms`,metadata:{latencyMs:s}}])}}}function q(e){return{type:"cost",evaluate(t){let s=t.costUsd??0,r=s<=e.maxUsd;return Promise.resolve([{assertionType:"cost",label:`Cost <= $${e.maxUsd}`,passed:r,score:r?1:0,failureCode:r?void 0:"INTERNAL_ERROR",failureMessage:r?void 0:`Cost $${s.toFixed(4)} exceeds max $${e.maxUsd}`,metadata:{costUsd:s}}])}}}function K(e,t){let s=[];if(e.toolCalls)if(e.toolCalls.some(n=>n.order!==void 0))s.push(X(e.toolCalls));else for(let n of e.toolCalls)n.shouldNotCall?s.push(W(n.tool)):s.push(U(n.tool,n.argsMatch??void 0));if(e.output&&s.push(B({format:e.output.format,schemaFile:e.output.schemaFile,schemaContent:t?.schemaContent,contains:e.output.contains,notContains:e.output.notContains,maxLength:e.output.maxLength})),e.guardrails?.pii&&s.push(G({denyPatterns:e.guardrails.pii.denyPatterns,customPatterns:e.guardrails.pii.customPatterns})),e.guardrails?.keywords&&(e.guardrails.keywords.allow&&e.guardrails.keywords.allow.length>0&&s.push(Y(e.guardrails.keywords.allow)),e.guardrails.keywords.deny.length>0&&s.push(Q(e.guardrails.keywords.deny))),e.judge)for(let r of e.judge)s.push(J({criteria:r.criteria,minScore:r.minScore,rubric:r.rubric}));return e.baseline?.drift&&s.push(V({maxScore:e.baseline.drift.maxScore,method:e.baseline.drift.method,fields:e.baseline.drift.fields})),s}function Se(){return new Map([["tool_called",e=>U(e.toolCalls?.[0]?.tool??"")],["schema",e=>B(e.output??{format:"text"})],["pii",e=>G(e.guardrails?.pii??{denyPatterns:[]})],["judge",e=>J(e.judge?.[0]??{criteria:"",minScore:.7})],["drift",e=>V(e.baseline?.drift??{maxScore:.15,method:"judge"})],["latency",()=>Z({maxMs:6e4})],["cost",()=>q({maxUsd:1})]])}var Mt=new Set(["judge","drift"]);function D(e){return Mt.has(e)?"probabilistic":"deterministic"}function Me(e){return D(e)==="deterministic"}function Oe(e){return D(e)==="probabilistic"}async function _(e,t){let{maxRetries:s,shouldRetry:r,baseDelayMs:n=500}=t,o;for(let a=0;a<=s;a++)try{return await e()}catch(p){if(o=p,a>=s||!r(p))throw p;let i=n*Math.pow(2,a);await Ot(i)}throw o}function Ot(e){return new Promise(t=>setTimeout(t,e))}var _t={"gpt-4o":{input:2.5,output:10},"gpt-4o-mini":{input:.15,output:.6},"gpt-4-turbo":{input:10,output:30},"o3-mini":{input:1.1,output:4.4}};function kt(e){switch(e){case"stop":return"stop";case"length":return"max_tokens";case"tool_calls":return"tool_calls";default:return"unknown"}}function It(e,t){let s=typeof t=="object"&&t!==null&&"error"in t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 401:return new g("AUTH_FAILED",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new g("CONTEXT_LENGTH",s,e,!1,t):new g("PROVIDER_ERROR",s,e,!1,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function ee(e){let t="",s="https://api.openai.com/v1",r,n=6e4,o=2;return{name:"openai",async initialize(a){if(!a.apiKey)throw new g("AUTH_FAILED","API key is required");t=a.apiKey,a.baseUrl&&(s=a.baseUrl),r=a.organization,n=a.timeoutMs,o=a.maxRetries},async complete(a){let p={model:a.model,messages:a.messages.map(d=>d.role==="tool"?{role:"tool",content:d.content,tool_call_id:d.toolCallId}:d.role==="assistant"&&d.toolCalls&&d.toolCalls.length>0?{role:"assistant",content:d.content||null,tool_calls:d.toolCalls.map(h=>({id:h.id,type:"function",function:{name:h.name,arguments:JSON.stringify(h.arguments)}}))}:{role:d.role,content:d.content}),temperature:a.params.temperature,max_tokens:a.params.maxTokens};a.params.topP!==void 0&&(p.top_p=a.params.topP),a.params.seed!==void 0&&(p.seed=a.params.seed),a.params.stopSequences&&(p.stop=a.params.stopSequences),a.tools&&a.tools.length>0&&(p.tools=a.tools.map(d=>({type:"function",function:{name:d.name,description:d.description,parameters:d.parameters}})),a.toolChoice&&(p.tool_choice=a.toolChoice));let i={"Content-Type":"application/json",Authorization:`Bearer ${t}`};r&&(i["OpenAI-Organization"]=r);let u=Date.now(),c=await _(()=>e.fetch(`${s}/chat/completions`,{method:"POST",headers:i,body:JSON.stringify(p),timeoutMs:n}),{maxRetries:o,shouldRetry:d=>d instanceof g&&d.retryable}),f;try{f=await c.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from OpenAI API",c.status,c.status>=500)}let y=Date.now()-u;if(!c.ok)throw It(c.status,f);let m=f,x=m.choices?.[0],C=x?.message,b=(C?.tool_calls??[]).map(d=>{let h;try{h=JSON.parse(d.function.arguments||"{}")}catch{h={_raw:d.function.arguments}}return{id:d.id,name:d.function.name,arguments:h}});return{text:C?.content??"",toolCalls:b,usage:{inputTokens:m.usage?.prompt_tokens??0,outputTokens:m.usage?.completion_tokens??0,totalTokens:m.usage?.total_tokens??0},raw:f,latencyMs:y,modelId:m.model??a.model,finishReason:kt(x?.finish_reason)}},estimateCost(a,p){let i=_t[a];return i?p.inputTokens/1e6*i.input+p.outputTokens/1e6*i.output:null},supportsTools(a){return!a.startsWith("o1-")}}}var jt={"claude-opus-4-5-20250929":{input:15,output:75},"claude-sonnet-4-5-20250929":{input:3,output:15},"claude-haiku-4-5-20251001":{input:.8,output:4}};function Nt(e){switch(e){case"end_turn":return"stop";case"max_tokens":return"max_tokens";case"tool_use":return"tool_calls";default:return"unknown"}}function $t(e,t){let s=typeof t=="object"&&t!==null&&"error"in t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 401:return new g("AUTH_FAILED",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new g("CONTEXT_LENGTH",s,e,!1,t):new g("PROVIDER_ERROR",s,e,!1,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function te(e){let t="",s="https://api.anthropic.com",r=6e4,n=2;return{name:"anthropic",async initialize(o){if(!o.apiKey)throw new g("AUTH_FAILED","API key is required");t=o.apiKey,o.baseUrl&&(s=o.baseUrl),r=o.timeoutMs,n=o.maxRetries},async complete(o){let a=o.messages.find(h=>h.role==="system"),p=o.messages.filter(h=>h.role!=="system").map(h=>{if(h.role==="tool")return{role:"user",content:[{type:"tool_result",tool_use_id:h.toolCallId,content:h.content}]};if(h.role==="assistant"){if(h.toolCalls&&h.toolCalls.length>0){let R=[];h.content&&R.push({type:"text",text:h.content});for(let P of h.toolCalls)R.push({type:"tool_use",id:P.id,name:P.name,input:P.arguments});return{role:"assistant",content:R}}return{role:"assistant",content:h.content}}return{role:"user",content:h.content}}),i={model:o.model,max_tokens:o.params.maxTokens,messages:p};a&&(i.system=a.content),o.params.temperature!==void 0&&(i.temperature=o.params.temperature),o.params.topP!==void 0&&(i.top_p=o.params.topP),o.params.stopSequences&&(i.stop_sequences=o.params.stopSequences),o.tools&&o.tools.length>0&&(i.tools=o.tools.map(h=>({name:h.name,description:h.description??"",input_schema:h.parameters??{type:"object",properties:{}}})),o.toolChoice&&(i.tool_choice=o.toolChoice==="required"?{type:"any"}:{type:o.toolChoice}));let u=Date.now(),c=await _(()=>e.fetch(`${s}/v1/messages`,{method:"POST",headers:{"Content-Type":"application/json","x-api-key":t,"anthropic-version":"2023-06-01"},body:JSON.stringify(i),timeoutMs:r}),{maxRetries:n,shouldRetry:h=>h instanceof g&&h.retryable}),f;try{f=await c.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from Anthropic API",c.status,c.status>=500)}let y=Date.now()-u;if(!c.ok)throw $t(c.status,f);let m=f,x="",C=[];for(let h of m.content??[])h.type==="text"?x+=h.text:h.type==="tool_use"&&C.push({id:h.id,name:h.name,arguments:h.input??{}});let b=m.usage?.input_tokens??0,d=m.usage?.output_tokens??0;return{text:x,toolCalls:C,usage:{inputTokens:b,outputTokens:d,totalTokens:b+d},raw:f,latencyMs:y,modelId:m.model??o.model,finishReason:Nt(m.stop_reason)}},estimateCost(o,a){let p=Object.entries(jt).find(([i])=>o.includes(i)||i.includes(o));return p?a.inputTokens/1e6*p[1].input+a.outputTokens/1e6*p[1].output:null},supportsTools(o){return!0}}}function Dt(e,t){if(t)return"tool_calls";switch(e){case"stop":return"stop";case"length":return"max_tokens";default:return"unknown"}}function Lt(e,t){let s=typeof t=="object"&&t!==null&&"error"in t?String(t.error):"Unknown error";switch(e){case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function se(e){let t="http://localhost:11434",s=6e4,r=2;return{name:"ollama",async initialize(n){n.baseUrl&&(t=n.baseUrl),s=n.timeoutMs,r=n.maxRetries},async complete(n){let o=n.messages.map(d=>d.role==="tool"?{role:"tool",content:d.content}:d.role==="assistant"&&d.toolCalls&&d.toolCalls.length>0?{role:"assistant",content:d.content||"",tool_calls:d.toolCalls.map(h=>({function:{name:h.name,arguments:h.arguments}}))}:{role:d.role,content:d.content}),a={model:n.model,messages:o,stream:!1,options:{temperature:n.params.temperature,num_predict:n.params.maxTokens}},p=a.options;n.params.topP!==void 0&&(p.top_p=n.params.topP),n.params.seed!==void 0&&(p.seed=n.params.seed),n.params.stopSequences&&(p.stop=n.params.stopSequences),n.tools&&n.tools.length>0&&(a.tools=n.tools.map(d=>({type:"function",function:{name:d.name,description:d.description,parameters:d.parameters}})));let i={"Content-Type":"application/json"},u=Date.now(),c=await _(()=>e.fetch(`${t}/api/chat`,{method:"POST",headers:i,body:JSON.stringify(a),timeoutMs:s}),{maxRetries:r,shouldRetry:d=>d instanceof g&&d.retryable}),f;try{f=await c.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from Ollama API",c.status,c.status>=500)}let y=Date.now()-u;if(!c.ok)throw Lt(c.status,f);let m=f,x=m.message,C=x?.tool_calls??[],b=C.map((d,h)=>({id:`ollama_call_${h}`,name:d.function.name,arguments:d.function.arguments??{}}));return{text:x?.content??"",toolCalls:b,usage:{inputTokens:m.prompt_eval_count??0,outputTokens:m.eval_count??0,totalTokens:(m.prompt_eval_count??0)+(m.eval_count??0)},raw:f,latencyMs:y,modelId:m.model??n.model,finishReason:Dt(m.done_reason,C.length>0)}},estimateCost(){return 0},supportsTools(){return!0}}}var Ft={"gemini-2.0-flash":{input:.1,output:.4},"gemini-2.0-flash-lite":{input:.075,output:.3},"gemini-1.5-pro":{input:1.25,output:5},"gemini-1.5-flash":{input:.075,output:.3},"gemini-1.5-flash-8b":{input:.0375,output:.15}};function Ut(e,t){if(t)return"tool_calls";switch(e){case"STOP":return"stop";case"MAX_TOKENS":return"max_tokens";case"SAFETY":return"stop";default:return"unknown"}}function Bt(e,t){let s=typeof t=="object"&&t!==null&&"error"in t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 400:return s.toLowerCase().includes("api key")?new g("AUTH_FAILED",s,e,!1,t):new g("PROVIDER_ERROR",s,e,!1,t);case 401:case 403:return new g("AUTH_FAILED",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function re(e){let t="",s="https://generativelanguage.googleapis.com/v1beta",r=6e4,n=2;return{name:"gemini",async initialize(o){if(!o.apiKey)throw new g("AUTH_FAILED","API key is required");t=o.apiKey,o.baseUrl&&(s=o.baseUrl),r=o.timeoutMs,n=o.maxRetries},async complete(o){let a=[],p;for(let T of o.messages){if(T.role==="system"){p={parts:[{text:T.content}]};continue}if(T.role==="tool"){a.push({role:"function",parts:[{functionResponse:{name:T.toolName??"unknown",response:Gt(T.content)}}]});continue}if(T.role==="assistant"&&T.toolCalls&&T.toolCalls.length>0){let O=[];T.content&&O.push({text:T.content});for(let Ce of T.toolCalls)O.push({functionCall:{name:Ce.name,args:Ce.arguments}});a.push({role:"model",parts:O});continue}let v=T.role==="assistant"?"model":"user";a.push({role:v,parts:[{text:T.content}]})}let i={contents:a,generationConfig:{temperature:o.params.temperature,maxOutputTokens:o.params.maxTokens}};p&&(i.systemInstruction=p);let u=i.generationConfig;o.params.topP!==void 0&&(u.topP=o.params.topP),o.params.stopSequences&&(u.stopSequences=o.params.stopSequences),o.tools&&o.tools.length>0&&(i.tools=[{functionDeclarations:o.tools.map(T=>({name:T.name,description:T.description,parameters:T.parameters}))}],o.toolChoice&&(i.toolConfig=Jt(o.toolChoice)));let c={"Content-Type":"application/json","x-goog-api-key":t},f=Date.now(),y=`${s}/models/${o.model}:generateContent`,m=await _(()=>e.fetch(y,{method:"POST",headers:c,body:JSON.stringify(i),timeoutMs:r}),{maxRetries:n,shouldRetry:T=>T instanceof g&&T.retryable}),x;try{x=await m.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from Gemini API",m.status,m.status>=500)}let C=Date.now()-f;if(!m.ok)throw Bt(m.status,x);let b=x,d=b.candidates?.[0],h=d?.content?.parts??[],R="",P=[],E=0;for(let T of h)T.text!==void 0&&(R+=T.text),T.functionCall&&(P.push({id:`gemini_call_${E}`,name:T.functionCall.name,arguments:T.functionCall.args??{}}),E++);return{text:R,toolCalls:P,usage:{inputTokens:b.usageMetadata?.promptTokenCount??0,outputTokens:b.usageMetadata?.candidatesTokenCount??0,totalTokens:b.usageMetadata?.totalTokenCount??0},raw:x,latencyMs:C,modelId:b.modelVersion??o.model,finishReason:Ut(d?.finishReason,P.length>0)}},estimateCost(o,a){let p=Ft[o];return p?a.inputTokens/1e6*p.input+a.outputTokens/1e6*p.output:null},supportsTools(){return!0}}}function Gt(e){try{return JSON.parse(e)}catch{return{result:e}}}function Jt(e){switch(e){case"auto":return{functionCallingConfig:{mode:"AUTO"}};case"required":return{functionCallingConfig:{mode:"ANY"}};case"none":return{functionCallingConfig:{mode:"NONE"}}}}function Vt(e){switch(e){case"stop":return"stop";case"length":return"max_tokens";case"tool_calls":return"tool_calls";default:return"unknown"}}function Kt(e,t){let s=typeof t=="object"&&t!==null&&"message"in t?String(t.message??"Unknown error"):typeof t=="object"&&t!==null&&"error"in t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 401:return new g("AUTH_FAILED",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new g("CONTEXT_LENGTH",s,e,!1,t):new g("PROVIDER_ERROR",s,e,!1,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function oe(e){let t="",s="https://api.mistral.ai/v1",r=6e4,n=2;return{name:"mistral",async initialize(o){if(!o.apiKey)throw new g("AUTH_FAILED","API key is required");t=o.apiKey,o.baseUrl&&(s=o.baseUrl),r=o.timeoutMs,n=o.maxRetries},async complete(o){let a={model:o.model,messages:o.messages.map(b=>b.role==="tool"?{role:"tool",content:b.content,tool_call_id:b.toolCallId}:b.role==="assistant"&&b.toolCalls&&b.toolCalls.length>0?{role:"assistant",content:b.content||null,tool_calls:b.toolCalls.map(d=>({id:d.id,type:"function",function:{name:d.name,arguments:JSON.stringify(d.arguments)}}))}:{role:b.role,content:b.content}),temperature:o.params.temperature,max_tokens:o.params.maxTokens};o.params.topP!==void 0&&(a.top_p=o.params.topP),o.params.stopSequences&&(a.stop=o.params.stopSequences),o.tools&&o.tools.length>0&&(a.tools=o.tools.map(b=>({type:"function",function:{name:b.name,description:b.description,parameters:b.parameters}})),o.toolChoice&&(a.tool_choice=o.toolChoice));let p={"Content-Type":"application/json",Authorization:`Bearer ${t}`},i=Date.now(),u=await _(()=>e.fetch(`${s}/chat/completions`,{method:"POST",headers:p,body:JSON.stringify(a),timeoutMs:r}),{maxRetries:n,shouldRetry:b=>b instanceof g&&b.retryable}),c;try{c=await u.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from Mistral API",u.status,u.status>=500)}let f=Date.now()-i;if(!u.ok)throw Kt(u.status,c);let y=c,m=y.choices?.[0],x=m?.message,C=(x?.tool_calls??[]).map(b=>{let d;try{d=JSON.parse(b.function.arguments||"{}")}catch{d={_raw:b.function.arguments}}return{id:b.id,name:b.function.name,arguments:d}});return{text:x?.content??"",toolCalls:C,usage:{inputTokens:y.usage?.prompt_tokens??0,outputTokens:y.usage?.completion_tokens??0,totalTokens:y.usage?.total_tokens??0},raw:c,latencyMs:f,modelId:y.model??o.model,finishReason:Vt(m?.finish_reason)}},estimateCost(o,a){return null},supportsTools(o){return!0}}}function Ht(e){switch(e){case"COMPLETE":return"stop";case"MAX_TOKENS":return"max_tokens";case"TOOL_CALL":return"tool_calls";default:return"unknown"}}function zt(e,t){let s=typeof t=="object"&&t!==null&&"message"in t?String(t.message??"Unknown error"):"Unknown error";switch(e){case 401:return new g("AUTH_FAILED",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new g("CONTEXT_LENGTH",s,e,!1,t):new g("PROVIDER_ERROR",s,e,!1,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function ne(e){let t="",s="https://api.cohere.com",r=6e4,n=2;return{name:"cohere",async initialize(o){if(!o.apiKey)throw new g("AUTH_FAILED","API key is required");t=o.apiKey,o.baseUrl&&(s=o.baseUrl),r=o.timeoutMs,n=o.maxRetries},async complete(o){let a={model:o.model,messages:o.messages.map(d=>d.role==="tool"?{role:"tool",content:d.content,tool_call_id:d.toolCallId}:d.role==="assistant"&&d.toolCalls&&d.toolCalls.length>0?{role:"assistant",content:d.content||null,tool_calls:d.toolCalls.map(h=>({id:h.id,type:"function",function:{name:h.name,arguments:JSON.stringify(h.arguments)}}))}:{role:d.role,content:d.content}),temperature:o.params.temperature,max_tokens:o.params.maxTokens};o.params.topP!==void 0&&(a.p=o.params.topP),o.params.stopSequences&&(a.stop_sequences=o.params.stopSequences),o.tools&&o.tools.length>0&&(a.tools=o.tools.map(d=>({type:"function",function:{name:d.name,description:d.description,parameters:d.parameters}})),o.toolChoice&&(a.tool_choice=o.toolChoice));let p={"Content-Type":"application/json",Authorization:`Bearer ${t}`},i=Date.now(),u=await _(()=>e.fetch(`${s}/v2/chat`,{method:"POST",headers:p,body:JSON.stringify(a),timeoutMs:r}),{maxRetries:n,shouldRetry:d=>d instanceof g&&d.retryable}),c;try{c=await u.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from Cohere API",u.status,u.status>=500)}let f=Date.now()-i;if(!u.ok)throw zt(u.status,c);let y=c,m=y.message?.content?.filter(d=>d.type==="text").map(d=>d.text).join("")??"",x=(y.message?.tool_calls??[]).map(d=>{let h;try{h=JSON.parse(d.function.arguments||"{}")}catch{h={_raw:d.function.arguments}}return{id:d.id,name:d.function.name,arguments:h}}),C=y.usage?.tokens?.input_tokens??0,b=y.usage?.tokens?.output_tokens??0;return{text:m,toolCalls:x,usage:{inputTokens:C,outputTokens:b,totalTokens:C+b},raw:c,latencyMs:f,modelId:y.model??o.model,finishReason:Ht(y.finish_reason)}},estimateCost(o,a){return null},supportsTools(o){return!0}}}var _e={openai:ee,anthropic:te,ollama:se,gemini:re,mistral:oe,cohere:ne};function ke(e,t){let s=_e[e];if(!s){let r=Object.keys(_e).join(", ");throw new Error(`Unknown provider: "${e}". Supported providers: ${r}`)}return s(t)}async function ae(e,t,s,r){let n=r?.maxTurns??10,o=[],a=[],p=[...t.messages],i={inputTokens:0,outputTokens:0,totalTokens:0},u=0;for(let y=0;y<n;y++){let m={...t,messages:p},x=await e.complete(m);if(o.push({request:m,response:x}),i.inputTokens+=x.usage.inputTokens,i.outputTokens+=x.usage.outputTokens,i.totalTokens+=x.usage.totalTokens,u+=x.latencyMs,x.toolCalls.length===0)return{turns:o,finalText:x.text,allToolCalls:a,totalUsage:i,totalLatencyMs:u};a.push(...x.toolCalls),p=[...p,{role:"assistant",content:x.text,toolCalls:x.toolCalls}];for(let C of x.toolCalls){let b=s.find(h=>h.name===C.name),d;b?d=Wt(b,C.arguments):d={error:`Tool "${C.name}" not simulated`},p.push({role:"tool",content:JSON.stringify(d),toolCallId:C.id,toolName:C.name})}}let f=o[o.length-1]?.response??{text:"",toolCalls:[],usage:{inputTokens:0,outputTokens:0,totalTokens:0},raw:null,latencyMs:0,modelId:t.model,finishReason:"unknown"};return{turns:o,finalText:f.text,allToolCalls:a,totalUsage:i,totalLatencyMs:u}}function Wt(e,t){if(e.responses){for(let s of e.responses)if(Xt(s.when,t))return s.then}return e.defaultResponse!==void 0?e.defaultResponse:{error:"No matching simulation response"}}function Xt(e,t){for(let[s,r]of Object.entries(e))if(JSON.stringify(t[s])!==JSON.stringify(r))return!1;return!0}var l=require("zod");var M=require("zod"),fe=M.z.object({outputTextAttr:M.z.string().default("gen_ai.completion.0.content"),modelAttr:M.z.string().default("gen_ai.response.model"),systemAttr:M.z.string().default("gen_ai.system"),inputTokensAttr:M.z.string().default("gen_ai.usage.input_tokens"),outputTokensAttr:M.z.string().default("gen_ai.usage.output_tokens")}),ge=M.z.object({namePattern:M.z.string().optional().describe("Regex to filter span names"),attributeMatch:M.z.record(M.z.string()).optional().describe("Attributes that must match"),minDurationMs:M.z.number().min(0).optional()}),ie=M.z.object({port:M.z.number().int().min(1).max(65535).default(4318),timeoutMs:M.z.number().int().min(1e3).default(3e4),spanMapping:fe.default({}),spanFilter:ge.optional()});var k=l.z.string().min(1,"Must not be empty"),Yt=l.z.number().min(0).max(2).default(.2),$=l.z.number().min(0).max(1),he=l.z.string().refine(e=>{try{return new RegExp(e),!0}catch{return!1}},{message:"Must be a valid regex pattern"}),H=l.z.object({apiKeyEnv:k.describe("Environment variable name containing the API key. Never a raw key."),baseUrl:l.z.string().url().optional().describe("Custom base URL for API-compatible proxies (e.g., Azure OpenAI, LiteLLM)"),organization:l.z.string().optional().describe("Organization ID (OpenAI-specific)")}),Qt=l.z.object({apiKeyEnv:l.z.string().min(1).optional().describe("Environment variable name containing the API key. Optional for Ollama (local)."),baseUrl:l.z.string().url().optional().describe("Ollama server URL. Defaults to http://localhost:11434.")}),Zt=l.z.object({openai:H.optional(),anthropic:H.optional(),ollama:Qt.optional(),gemini:H.optional(),mistral:H.optional(),cohere:H.optional()}).refine(e=>Object.keys(e).some(t=>e[t]!==void 0),{message:"At least one provider must be configured"}),qt=l.z.object({temperature:Yt,maxTokens:l.z.number().int().min(1).max(128e3).default(1024),topP:l.z.number().min(0).max(1).optional(),stopSequences:l.z.array(l.z.string()).optional(),seed:l.z.number().int().optional().describe("Seed for reproducibility (provider-dependent support)")}),es=l.z.object({id:k.describe("Unique identifier for this model config, referenced in reports"),provider:l.z.enum(["openai","anthropic","ollama","gemini","mistral","cohere"]).describe("Must match a key in the providers section"),model:k.describe("Model name as the provider expects it (e.g., 'gpt-4o', 'claude-sonnet-4-5-20250929')"),params:qt.default({})}),ts=l.z.object({system:l.z.string().optional().describe("System prompt template. Supports {{variable}} interpolation."),user:k.describe("User prompt template. Supports {{variable}} interpolation."),assistant:l.z.string().optional().describe("Prefill for assistant response (Anthropic-specific)")}),ss=l.z.object({format:l.z.enum(["text","json"]).default("text"),schemaFile:l.z.string().optional().describe("Path to JSON Schema file (relative to config file). Required if format is 'json'."),contains:l.z.array(l.z.string()).optional().describe("Output must contain all of these substrings"),notContains:l.z.array(l.z.string()).optional().describe("Output must not contain any of these substrings"),maxLength:l.z.number().int().positive().optional().describe("Maximum character length of the output")}).refine(e=>!(e.format==="json"&&!e.schemaFile),{message:"schemaFile is required when format is 'json'"}),rs=l.z.object({enabled:l.z.boolean().default(!0),denyPatterns:l.z.array(he).default(["\\b\\d{3}-\\d{2}-\\d{4}\\b","\\b\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}\\b","\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b"]).describe("Regex patterns that must NOT appear in output. Defaults include SSN, credit card, email."),customPatterns:l.z.array(l.z.object({name:k,pattern:he})).optional().describe("Named custom PII patterns for reporting clarity")}),os=l.z.object({deny:l.z.array(l.z.string()).default([]).describe("Words/phrases that must NOT appear in output (case-insensitive)"),allow:l.z.array(l.z.string()).optional().describe("If set, output MUST contain at least one of these words/phrases")}),ns=l.z.object({criteria:k.describe("Natural language description of what to evaluate (e.g., 'Response is empathetic and professional')"),minScore:$.default(.7).describe("Minimum score (0-1) for this criterion to pass"),model:l.z.string().optional().describe("Override judge model for this criterion. Defaults to first model in models list."),rubric:l.z.string().optional().describe("Detailed rubric for the judge. If omitted, a default rubric is generated from criteria.")}),as=l.z.object({tool:k.describe("Expected tool/function name"),shouldNotCall:l.z.boolean().optional().default(!1).describe("If true, assert this tool was NOT called"),argsMatch:l.z.record(l.z.unknown()).optional().describe("Key-value pairs that must be present in the tool call arguments (partial match)"),argsSchema:l.z.string().optional().describe("Path to JSON Schema file to validate the tool call arguments"),order:l.z.number().int().min(0).optional().describe("Expected position in the sequence of tool calls (0-indexed)"),responseContains:l.z.string().optional().describe("Assert the simulated tool response contains this substring")}),is=l.z.object({maxScore:$.default(.15).describe("Maximum drift score (0-1). Higher = more drift allowed. Fail if exceeded."),method:l.z.enum(["judge","embedding","field-diff"]).default("judge").describe("Drift detection method. 'judge' uses LLM comparison, 'embedding' uses cosine similarity, 'field-diff' compares JSON fields."),fields:l.z.array(l.z.string()).optional().describe("For field-diff method: JSON paths to compare (e.g., ['response.action', 'response.message'])")}),ls=l.z.object({pii:rs.optional(),keywords:os.optional()}),us=l.z.object({output:ss.optional(),guardrails:ls.optional(),judge:l.z.array(ns).optional().describe("LLM-as-judge evaluations. Each criterion is scored independently."),toolCalls:l.z.array(as).optional().describe("Expected tool/function calls in the model response"),baseline:l.z.object({drift:is.optional()}).optional()}),ps=l.z.object({name:k.describe("Tool/function name as the model sees it"),description:l.z.string().optional().describe("Tool description for documentation"),parameters:l.z.record(l.z.unknown()).optional().describe("JSON Schema for the tool's parameters"),responses:l.z.array(l.z.object({when:l.z.record(l.z.unknown()).describe("Condition: match tool call arguments (partial match)"),then:l.z.unknown().describe("Simulated response to return when condition matches")})).optional().describe("Simulated responses based on argument matching"),defaultResponse:l.z.unknown().optional().describe("Response when no 'when' condition matches")}),cs=l.z.object({name:k.describe("Unique test case name within the suite. Used in reports and JUnit output."),prompt:k.optional().describe("Reference to a key in the prompts section. Exactly one of prompt or command must be set."),command:k.optional().describe("Shell command to execute. Stdout is captured and assertions run against it. Exactly one of prompt or command must be set."),vars:l.z.record(l.z.string()).default({}).describe("Variables to interpolate into the prompt template or command"),models:l.z.array(l.z.string()).optional().describe("Override: run this test only against these model IDs. Defaults to all models. Ignored for command tests."),repeat:l.z.number().int().min(1).optional().describe("Override: number of repeat runs for this specific test case"),tools:l.z.array(ps).optional().describe("Simulated tools available to the model for this test case"),expect:us.describe("Assertions to evaluate against the model output"),tags:l.z.array(l.z.string()).optional().describe("Tags for filtering test cases in CLI (e.g., --tags regression)"),skip:l.z.boolean().optional().default(!1).describe("Skip this test case during execution")}).refine(e=>{let t=e.prompt!==void 0,s=e.command!==void 0;return(t||s)&&!(t&&s)},{message:"Exactly one of 'prompt' or 'command' must be set on each test case"}),ds=l.z.object({passRateMin:$.default(.95).describe("Minimum overall pass rate (0-1). Computed after repeats and aggregation."),schemaFailuresMax:l.z.number().int().min(0).default(0).describe("Maximum allowed schema validation failures across entire suite"),judgeAvgMin:$.optional().describe("Minimum average LLM-as-judge score across all criteria and test cases"),driftScoreMax:$.optional().describe("Maximum allowed drift score against active baseline"),piiFailuresMax:l.z.number().int().min(0).default(0).describe("Maximum allowed PII detection failures"),keywordFailuresMax:l.z.number().int().min(0).default(0).describe("Maximum allowed keyword guardrail failures"),costMaxUsd:l.z.number().positive().optional().describe("Maximum total cost in USD for the entire run. Aborts if exceeded mid-run."),latencyMaxMs:l.z.number().positive().optional().describe("Maximum average latency in ms. Fails gate if exceeded."),deterministicPassRate:$.optional().describe("Minimum pass rate for deterministic assertions only (tool_called, schema, pii, keywords, etc.)"),probabilisticPassRate:$.optional().describe("Minimum pass rate for probabilistic assertions only (judge, drift)")}),ms=l.z.object({enabled:l.z.boolean().default(!1),framework:l.z.enum(["eu-ai-act","custom"]).default("eu-ai-act"),outputDir:l.z.string().default("./compliance-reports"),metadata:l.z.object({systemName:l.z.string().optional().describe("Name of the AI system being tested"),systemVersion:l.z.string().optional().describe("Version of the AI system"),riskLevel:l.z.enum(["high","limited","minimal"]).optional(),operator:l.z.string().optional().describe("Organization operating the AI system"),intendedPurpose:l.z.string().optional().describe("Documented intended purpose of the AI system"),dataGovernanceNotes:l.z.string().optional()}).optional()}),fs=l.z.object({enabled:l.z.boolean().default(!1),includeArtifacts:l.z.boolean().default(!1).describe("Upload raw prompt inputs and model outputs. Disabled by default for privacy."),redactPatterns:l.z.array(he).optional().describe("Patterns to redact from artifacts before upload (applied on top of PII guardrails)"),apiUrl:l.z.string().url().default("https://api.kindlm.com/v1").describe("Cloud API URL. Override for self-hosted deployments.")}),gs=l.z.object({name:k,description:l.z.string().optional(),tags:l.z.array(l.z.string()).optional()}),Re=l.z.object({kindlm:l.z.literal(1).describe("Config schema version. Must be 1."),project:k.describe("Project identifier for cloud upload and report grouping"),suite:gs,providers:Zt,models:l.z.array(es).min(1,"At least one model must be configured"),prompts:l.z.record(ts).refine(e=>Object.keys(e).length>0,{message:"At least one prompt must be defined"}),tests:l.z.array(cs).min(1,"At least one test case must be defined"),gates:ds.default({}),compliance:ms.optional(),trace:ie.optional().describe("OpenTelemetry trace ingestion configuration for the 'kindlm trace' command"),upload:fs.default({}),defaults:l.z.object({repeat:l.z.number().int().min(1).default(1).describe("Default repeat count per test case"),concurrency:l.z.number().int().min(1).max(32).default(4).describe("Default concurrency for test execution"),timeoutMs:l.z.number().int().min(1e3).default(6e4).describe("Default timeout per provider call in ms"),judgeModel:l.z.string().optional().describe("Default model ID for LLM-as-judge assertions. Must reference a configured model.")}).default({})});function le(e){let t=Re.safeParse(e);return t.success?S(t.data):A({code:"CONFIG_VALIDATION_ERROR",message:"Config validation failed",details:{errors:t.error.issues.map(s=>`${s.path.join(".")}: ${s.message}`)}})}var De=require("yaml");var hs=1048576,Ie=1e3,je=50;function Le(e,t){if(e.length>hs)return A({code:"CONFIG_TOO_LARGE",message:`Config exceeds maximum size of 1MB (got ${(e.length/1048576).toFixed(1)}MB)`});let s;try{s=(0,De.parse)(e)}catch(i){return A({code:"CONFIG_PARSE_ERROR",message:`Failed to parse YAML: ${i.message}`,cause:i})}let r=le(s);if(!r.success)return r;let n=r.data;if(n.tests.length>Ie)return A({code:"CONFIG_VALIDATION_ERROR",message:`Config exceeds maximum of ${Ie} tests (got ${n.tests.length})`});if(n.models.length>je)return A({code:"CONFIG_VALIDATION_ERROR",message:`Config exceeds maximum of ${je} models (got ${n.models.length})`});let o=[],a=new Set;for(let i of n.models)a.has(i.id)&&o.push(`Duplicate model ID "${i.id}"`),a.add(i.id);let p=new Set;for(let i of n.tests)p.has(i.name)&&o.push(`Duplicate test name "${i.name}"`),p.add(i.name);for(let i of n.tests)i.prompt&&!(i.prompt in n.prompts)&&o.push(`Test "${i.name}" references prompt "${i.prompt}" which is not defined`);for(let i of n.tests)if(i.models)for(let u of i.models)a.has(u)||o.push(`Test "${i.name}" references model "${u}" which is not configured`);for(let i of n.models)n.providers[i.provider]||o.push(`Model "${i.id}" references provider "${i.provider}" which is not configured`);if(n.defaults.judgeModel&&!a.has(n.defaults.judgeModel)&&o.push(`defaults.judgeModel "${n.defaults.judgeModel}" is not a configured model`),t.fileReader)for(let i of n.tests){if(i.expect.output?.schemaFile){let u=Ne(t.configDir,i.expect.output.schemaFile);u.success?t.fileReader.readFile(u.data).success||o.push(`Test "${i.name}": schemaFile "${i.expect.output.schemaFile}" not found at ${u.data}`):o.push(`Test "${i.name}": schemaFile "${i.expect.output.schemaFile}" \u2014 ${u.error.message}`)}if(i.expect.toolCalls){for(let u of i.expect.toolCalls)if(u.argsSchema){let c=Ne(t.configDir,u.argsSchema);c.success?t.fileReader.readFile(c.data).success||o.push(`Test "${i.name}": argsSchema "${u.argsSchema}" for tool "${u.tool}" not found at ${c.data}`):o.push(`Test "${i.name}": argsSchema "${u.argsSchema}" for tool "${u.tool}" \u2014 ${c.error.message}`)}}}return o.length>0?A({code:"CONFIG_VALIDATION_ERROR",message:"Config cross-reference validation failed",details:{errors:o}}):S(n)}function Ne(e,t){if(t.startsWith("/")||t.startsWith("\\"))return A({code:"PATH_TRAVERSAL",message:"Absolute paths are not allowed in config references"});if(/^[a-zA-Z]:/.test(t))return A({code:"PATH_TRAVERSAL",message:"Absolute paths are not allowed in config references"});let s=e.endsWith("/")?e.slice(0,-1):e,r=`${s}/${t}`,n=$e(r),o=$e(s);return!n.startsWith(o+"/")&&n!==o?A({code:"PATH_TRAVERSAL",message:`Path "${t}" escapes the config directory`}):S(n)}function $e(e){let t=e.split("/"),s=[];for(let n of t)n==="."||n===""||(n===".."?s.pop():s.push(n));return(e.startsWith("/")?"/":"")+s.join("/")}var Fe=/\{\{(\w+)\}\}/g;function L(e,t){let s=ye(e,t);if(s.length>0)return A({code:"CONFIG_VALIDATION_ERROR",message:`Missing template variables: ${s.join(", ")}`,details:{missing:s}});let r=e.replace(Fe,(n,o)=>t[o]);return S(r)}function ye(e,t){let s=new Set;for(let r of e.matchAll(Fe)){let n=r[1];n!==void 0&&!(n in t)&&s.add(n)}return[...s]}function ue(e){let t=e[0];if(!t)throw new Error("aggregateRuns requires at least one run");let{testCaseName:s,modelId:r}=t,o=e.filter(y=>y.assertions.every(m=>m.passed)).length/e.length,a=new Map;for(let y of e)for(let m of y.assertions){let x=a.get(m.assertionType);x||(x=[],a.set(m.assertionType,x)),x.push(m.score)}let p={};for(let[y,m]of a){let x=m.reduce((C,b)=>C+b,0);p[y]={mean:x/m.length,min:Math.min(...m),max:Math.max(...m)}}let i=new Set;for(let y of e)for(let m of y.assertions)!m.passed&&m.failureCode&&i.add(m.failureCode);let u=e.reduce((y,m)=>y+m.latencyMs,0)/e.length,c=e.reduce((y,m)=>y+(m.costEstimateUsd??0),0),f=e.reduce((y,m)=>y+m.tokenUsage.totalTokens,0);return{testCaseName:s,modelId:r,runCount:e.length,passed:o===1,passRate:o,assertionScores:p,failureCodes:[...i],latencyAvgMs:u,totalCostUsd:c,totalTokens:f,runs:e}}function pe(e){let t=e.stdout.split(`
|
|
32
|
+
`),s=[],r=[],n,o=0;for(let a of t){let p=a.trimStart();if(!p.startsWith('{"kindlm":')){s.push(a);continue}let i=Rs(p);if(!i){s.push(a);continue}i.kindlm==="tool_call"?r.push({id:i.id??`cmd_tc_${o++}`,name:i.name,arguments:i.arguments}):i.kindlm==="output_json"&&(n=i.data)}return{outputText:s.join(`
|
|
33
|
+
`).trim(),toolCalls:r,outputJson:n,exitCode:e.exitCode,stderr:e.stderr}}function Rs(e){try{let t=JSON.parse(e);return typeof t.kindlm!="string"?null:t.kindlm==="tool_call"?typeof t.name!="string"?null:{kindlm:"tool_call",id:typeof t.id=="string"?t.id:void 0,name:t.name,arguments:typeof t.arguments=="object"&&t.arguments!==null?t.arguments:{}}:t.kindlm==="output_json"?{kindlm:"output_json",data:t.data}:null}catch{return null}}function Ue(e,t){return{async run(){let s=Date.now(),r=new Map;for(let R of e.tests)if(R.expect.output?.schemaFile){let P=Ts(t.configDir,R.expect.output.schemaFile),E=t.fileReader.readFile(P);if(!E.success)return A({code:"SCHEMA_FILE_ERROR",message:`Failed to read schema file "${R.expect.output.schemaFile}": ${E.error.message}`});try{r.set(R.name,JSON.parse(E.data))}catch(T){return A({code:"SCHEMA_FILE_ERROR",message:`Failed to parse schema file "${R.expect.output.schemaFile}" as JSON: ${T instanceof Error?T.message:String(T)}`})}}let n=[];for(let R of e.tests){if(R.skip)continue;let P=R.repeat??e.defaults.repeat;if(R.command)for(let E=0;E<P;E++)n.push({test:R,modelConfig:null,runIndex:E});else{let E=R.models??e.models.map(T=>T.id);for(let T of E){let v=e.models.find(O=>O.id===T);if(v)for(let O=0;O<P;O++)n.push({test:R,modelConfig:v,runIndex:O})}}}let o=await xs(n.map(R=>()=>ys(e,t,R,r)),e.defaults.concurrency),a=R=>`${R.testCaseName}::${R.modelId}`,p=new Map;for(let R of o){let P=a(R),E=p.get(P);E||(E=[],p.set(P,E)),E.push(R)}let i=[];for(let R of p.values())i.push(ue(R));let u=i.map(R=>({name:R.testCaseName,modelId:R.modelId,status:R.passed?"passed":"failed",assertions:R.runs[0]?.assertions??[],latencyMs:R.latencyAvgMs,costUsd:R.totalCostUsd})),c=e.tests.filter(R=>R.skip).map(R=>({name:R.name,modelId:"",status:"skipped",assertions:[],latencyMs:0,costUsd:0})),f=[...u,...c],y=f.filter(R=>R.status==="passed").length,m=f.filter(R=>R.status==="failed").length,x=f.filter(R=>R.status==="errored").length,C=f.filter(R=>R.status==="skipped").length,b=x>0?"errored":m>0?"failed":"passed",h={suites:[{name:e.suite.name,status:b,tests:f}],totalTests:f.length,passed:y,failed:m,errored:x,skipped:C,durationMs:Date.now()-s};return S({runResult:h,aggregated:i})}}}async function ys(e,t,s,r){let{test:n,modelConfig:o,runIndex:a}=s;if(n.command)return bs(e,t,n,a,r);if(!o)return j(n.name,"unknown",a,"No model config for prompt-based test");t.onProgress?.({type:"test_start",test:n.name,model:o.id,run:a});try{let p=t.adapters.get(o.provider);if(!p)return j(n.name,o.id,a,`Provider adapter "${o.provider}" not found`);let i=n.prompt?e.prompts[n.prompt]:void 0;if(!i)return j(n.name,o.id,a,`Prompt "${n.prompt}" not defined`);let u=L(i.user,n.vars);if(!u.success)return j(n.name,o.id,a,u.error.message);let c=[];if(i.system){let v=L(i.system,n.vars);if(!v.success)return j(n.name,o.id,a,v.error.message);c.push({role:"system",content:v.data})}c.push({role:"user",content:u.data});let f=(n.tools??[]).map(v=>({name:v.name,description:v.description,parameters:v.parameters})),y={model:o.model,messages:c,params:{temperature:o.params.temperature,maxTokens:o.params.maxTokens,topP:o.params.topP,stopSequences:o.params.stopSequences,seed:o.params.seed},tools:f.length>0?f:void 0},m=await ae(p,y,n.tools??[]),x=p.estimateCost(o.model,m.totalUsage),C={};n.expect.output?.schemaFile&&r.has(n.name)&&(C.schemaContent=r.get(n.name));let b=K(n.expect,C),d=e.defaults.judgeModel??e.models[0]?.id,h=e.models.find(v=>v.id===d),R=h?t.adapters.get(h.provider):void 0,P={outputText:m.finalText,toolCalls:m.allToolCalls,configDir:t.configDir,latencyMs:m.totalLatencyMs,costUsd:x??void 0,judgeAdapter:R,judgeModel:h?.model};if(t.baselineData){let v=`${n.name}::${o.id}`,O=t.baselineData.results[v];O&&(P.baselineText=O.outputText)}let E=[];for(let v of b){let O=await v.evaluate(P);E.push(...O)}let T=E.every(v=>v.passed);return t.onProgress?.({type:"test_complete",test:n.name,model:o.id,run:a,passed:T}),{testCaseName:n.name,modelId:o.id,runIndex:a,outputText:m.finalText,assertions:E,latencyMs:m.totalLatencyMs,tokenUsage:m.totalUsage,costEstimateUsd:x}}catch(p){return t.onProgress?.({type:"test_complete",test:n.name,model:o.id,run:a,passed:!1}),j(n.name,o.id,a,p instanceof Error?p.message:String(p))}}async function bs(e,t,s,r,n){let o="command";t.onProgress?.({type:"test_start",test:s.name,model:o,run:r});try{if(!t.commandExecutor)return j(s.name,o,r,"Command executor not available");if(!s.command)return j(s.name,o,r,"No command specified");let a=L(s.command,s.vars);if(!a.success)return j(s.name,o,r,a.error.message);let p=Date.now(),i=await t.commandExecutor.execute(a.data,{timeoutMs:e.defaults.timeoutMs,cwd:t.configDir});if(!i.success)return j(s.name,o,r,i.error.message);let u=Date.now()-p,c=pe(i.data),f={};s.expect.output?.schemaFile&&n.has(s.name)&&(f.schemaContent=n.get(s.name));let y=K(s.expect,f),m=e.defaults.judgeModel??e.models[0]?.id,x=e.models.find(R=>R.id===m),C=x?t.adapters.get(x.provider):void 0,b={outputText:c.outputText,outputJson:c.outputJson,toolCalls:c.toolCalls,configDir:t.configDir,latencyMs:u,judgeAdapter:C,judgeModel:x?.model};if(t.baselineData){let R=`${s.name}::${o}`,P=t.baselineData.results[R];P&&(b.baselineText=P.outputText)}let d=[];for(let R of y){let P=await R.evaluate(b);d.push(...P)}let h=d.every(R=>R.passed);return t.onProgress?.({type:"test_complete",test:s.name,model:o,run:r,passed:h}),{testCaseName:s.name,modelId:o,runIndex:r,outputText:c.outputText,assertions:d,latencyMs:u,tokenUsage:{inputTokens:0,outputTokens:0,totalTokens:0},costEstimateUsd:null}}catch(a){return t.onProgress?.({type:"test_complete",test:s.name,model:o,run:r,passed:!1}),j(s.name,o,r,a instanceof Error?a.message:String(a))}}function j(e,t,s,r){return{testCaseName:e,modelId:t,runIndex:s,outputText:"",assertions:[{assertionType:"internal",label:"Execution error",passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:r}],latencyMs:0,tokenUsage:{inputTokens:0,outputTokens:0,totalTokens:0},costEstimateUsd:null}}async function xs(e,t){let s=new Array(e.length),r=0;async function n(){for(;r<e.length;){let a=r++,p=e[a];p&&(s[a]=await p())}}let o=Array.from({length:Math.min(t,e.length)},()=>n());return await Promise.all(o),s}function Ts(e,t){return t.startsWith("/")?t:`${e.endsWith("/")?e.slice(0,-1):e}/${t}`}function Je(e,t){let s=[],r=t.reduce((u,c)=>u+c.runCount,0),n=t.reduce((u,c)=>u+Math.round(c.passRate*c.runCount),0),o=r>0?n/r:0;s.push({gateName:"passRateMin",passed:o>=e.passRateMin,actual:o,threshold:e.passRateMin,message:o>=e.passRateMin?`Pass rate ${w(o)} meets minimum ${w(e.passRateMin)}`:`Pass rate ${w(o)} below minimum ${w(e.passRateMin)}`});let a=be(t,["SCHEMA_INVALID","SCHEMA_PARSE_ERROR"]);if(s.push({gateName:"schemaFailuresMax",passed:a<=e.schemaFailuresMax,actual:a,threshold:e.schemaFailuresMax,message:a<=e.schemaFailuresMax?`Schema failures ${a} within limit ${e.schemaFailuresMax}`:`Schema failures ${a} exceed limit ${e.schemaFailuresMax}`}),e.judgeAvgMin!==void 0){let u=Ge(t,"judge"),c=u.length>0?u.reduce((f,y)=>f+y,0)/u.length:1;s.push({gateName:"judgeAvgMin",passed:c>=e.judgeAvgMin,actual:c,threshold:e.judgeAvgMin,message:c>=e.judgeAvgMin?`Judge average ${w(c)} meets minimum ${w(e.judgeAvgMin)}`:`Judge average ${w(c)} below minimum ${w(e.judgeAvgMin)}`})}if(e.driftScoreMax!==void 0){let u=Ge(t,"drift"),c=u.length>0?Math.max(...u):0;s.push({gateName:"driftScoreMax",passed:c<=e.driftScoreMax,actual:c,threshold:e.driftScoreMax,message:c<=e.driftScoreMax?`Drift score ${w(c)} within limit ${w(e.driftScoreMax)}`:`Drift score ${w(c)} exceeds limit ${w(e.driftScoreMax)}`})}let p=be(t,["PII_DETECTED"]);s.push({gateName:"piiFailuresMax",passed:p<=e.piiFailuresMax,actual:p,threshold:e.piiFailuresMax,message:p<=e.piiFailuresMax?`PII failures ${p} within limit ${e.piiFailuresMax}`:`PII failures ${p} exceed limit ${e.piiFailuresMax}`});let i=be(t,["KEYWORD_DENIED","KEYWORD_MISSING"]);if(s.push({gateName:"keywordFailuresMax",passed:i<=e.keywordFailuresMax,actual:i,threshold:e.keywordFailuresMax,message:i<=e.keywordFailuresMax?`Keyword failures ${i} within limit ${e.keywordFailuresMax}`:`Keyword failures ${i} exceed limit ${e.keywordFailuresMax}`}),e.costMaxUsd!==void 0){let u=t.reduce((c,f)=>c+f.totalCostUsd,0);s.push({gateName:"costMaxUsd",passed:u<=e.costMaxUsd,actual:u,threshold:e.costMaxUsd,message:u<=e.costMaxUsd?`Total cost $${u.toFixed(4)} within limit $${e.costMaxUsd.toFixed(4)}`:`Total cost $${u.toFixed(4)} exceeds limit $${e.costMaxUsd.toFixed(4)}`})}if(e.latencyMaxMs!==void 0){let u=t.length>0?t.reduce((c,f)=>c+f.latencyAvgMs,0)/t.length:0;s.push({gateName:"latencyMaxMs",passed:u<=e.latencyMaxMs,actual:u,threshold:e.latencyMaxMs,message:u<=e.latencyMaxMs?`Average latency ${Math.round(u)}ms within limit ${e.latencyMaxMs}ms`:`Average latency ${Math.round(u)}ms exceeds limit ${e.latencyMaxMs}ms`})}if(e.deterministicPassRate!==void 0){let u=Be(t,"deterministic");s.push({gateName:"deterministicPassRate",passed:u>=e.deterministicPassRate,actual:u,threshold:e.deterministicPassRate,message:u>=e.deterministicPassRate?`Deterministic pass rate ${w(u)} meets minimum ${w(e.deterministicPassRate)}`:`Deterministic pass rate ${w(u)} below minimum ${w(e.deterministicPassRate)}`})}if(e.probabilisticPassRate!==void 0){let u=Be(t,"probabilistic");s.push({gateName:"probabilisticPassRate",passed:u>=e.probabilisticPassRate,actual:u,threshold:e.probabilisticPassRate,message:u>=e.probabilisticPassRate?`Probabilistic pass rate ${w(u)} meets minimum ${w(e.probabilisticPassRate)}`:`Probabilistic pass rate ${w(u)} below minimum ${w(e.probabilisticPassRate)}`})}return{passed:s.every(u=>u.passed),gates:s}}function Be(e,t){let s=0,r=0;for(let n of e)for(let o of n.runs)for(let a of o.assertions)D(a.assertionType)===t&&(s++,a.passed&&r++);return s>0?r/s:1}function be(e,t){let s=0;for(let r of e)for(let n of t)r.failureCodes.includes(n)&&s++;return s}function Ge(e,t){let s=[];for(let r of e){let n=r.assertionScores[t];n&&s.push(n.mean)}return s}function w(e){return(e*100).toFixed(1)+"%"}var N=e=>e,ce={bold:N,red:N,green:N,yellow:N,cyan:N,dim:N,greenBold:N,redBold:N};function Ke(e=ce){return{name:"pretty",generate(t,s){let r=[],n=e;r.push(""),r.push(n.bold(" KindLM Test Results")),r.push("");let o=0;for(let c of t.suites){r.push(Cs(c,n));for(let f of c.tests){r.push(As(f,n));let y=Ps(f,n);y&&r.push(y);for(let m of f.assertions)r.push(Es(m,n));o+=f.costUsd}r.push("")}r.push(n.bold(" Summary"));let a=n.green(`${t.passed} passed`),p=t.failed>0?n.red(`${t.failed} failed`):`${t.failed} failed`,i=t.errored>0?n.yellow(`${t.errored} errored`):`${t.errored} errored`;if(r.push(` ${a}, ${p}, ${i} (${t.totalTests} total)`),r.push(` Duration: ${He(t.durationMs)}`),o>0&&r.push(` Cost: ${ze(o)}`),r.push(""),s.gates.length>0){r.push(n.bold(" Quality Gates"));for(let c of s.gates){let f=c.passed?n.green("\u2713"):n.red("\u2717");r.push(` ${f} ${c.message}`)}r.push("")}return t.failed===0&&t.errored===0&&s.passed?r.push(n.greenBold(" \u2713 All tests passed")):r.push(n.redBold(" \u2717 Some tests failed")),r.push(""),{content:r.join(`
|
|
34
|
+
`),format:"text"}}}}function Cs(e,t){return` ${e.status==="passed"?t.green("\u2713"):e.status==="skipped"?t.yellow("\u25CB"):t.red("\u2717")} ${t.bold(e.name)}`}function As(e,t){return` ${e.status==="passed"?t.green("\u2713"):e.status==="skipped"?t.yellow("\u25CB"):t.red("\u2717")} ${e.name}`}function Ps(e,t){if(e.status==="skipped")return null;let s=[];return e.modelId&&s.push(e.modelId),e.latencyMs>0&&s.push(He(e.latencyMs)),e.costUsd>=5e-5&&s.push(ze(e.costUsd)),s.length===0?null:` ${t.dim(s.join(" \xB7 "))}`}function Es(e,t){if(e.passed){let o=Ve(e),a=o?`${e.label} ${t.cyan(o)}`:e.label;return` ${t.green("\u2713")} ${t.dim(a)}`}let s=Ve(e),r=e.failureMessage??"failed",n=s?`${e.label} ${t.cyan(s)}`:e.label;return` ${t.red("\u2717")} ${n}: ${r}`}function Ve(e){if(e.assertionType==="judge"||e.assertionType==="drift"){let t=vs(e);if(t!==null){let s=e.passed?"\u2265":"<";return`(${e.score.toFixed(2)} ${s} ${t.toFixed(2)})`}return`(${e.score.toFixed(2)})`}return""}function vs(e){if(e.metadata&&typeof e.metadata=="object"&&"threshold"in e.metadata){let t=e.metadata.threshold;if(typeof t=="number")return t}if(e.failureMessage){let t=e.failureMessage.match(/threshold (\d+\.?\d*)/i);if(t?.[1])return parseFloat(t[1]);let s=e.failureMessage.match(/below (\d+\.?\d*)/i);if(s?.[1])return parseFloat(s[1])}return null}function He(e){return e<1e3?`${e}ms`:`${(e/1e3).toFixed(2)}s`}function ze(e){return e<.01?`$${e.toFixed(4)}`:`$${e.toFixed(2)}`}function We(){return{name:"json",generate(e,t){let s={kindlm:{version:"1.0.0",timestamp:new Date().toISOString()},summary:{totalTests:e.totalTests,passed:e.passed,failed:e.failed,errored:e.errored,skipped:e.skipped,durationMs:e.durationMs},gates:{passed:t.passed,results:t.gates},suites:e.suites.map(r=>({name:r.name,status:r.status,tests:r.tests.map(n=>({name:n.name,status:n.status,assertions:n.assertions,latencyMs:n.latencyMs,costUsd:n.costUsd}))}))};return{content:JSON.stringify(s,null,2),format:"json"}}}}function Xe(){return{name:"junit",generate(e,t){let s=e.durationMs/1e3,r=[];r.push('<?xml version="1.0" encoding="UTF-8"?>'),r.push(`<testsuites name="KindLM" tests="${e.totalTests}" failures="${e.failed}" errors="${e.errored}" time="${s.toFixed(3)}">`);for(let n of e.suites){let o=n.tests.filter(i=>i.status==="failed").length,a=n.tests.filter(i=>i.status==="errored").length,p=n.tests.reduce((i,u)=>i+u.latencyMs,0)/1e3;r.push(` <testsuite name="${I(n.name)}" tests="${n.tests.length}" failures="${o}" errors="${a}" time="${p.toFixed(3)}">`);for(let i of n.tests){let u=i.latencyMs/1e3;if(r.push(` <testcase name="${I(i.name)}" classname="${I(n.name)}" time="${u.toFixed(3)}">`),i.status==="skipped")r.push(" <skipped/>");else if(i.status==="errored"&&i.error)r.push(` <error message="${I(i.error.message)}" type="${I(i.error.code)}">${I(i.error.message)}</error>`);else if(i.status==="failed"){let c=i.assertions.filter(f=>!f.passed);for(let f of c)r.push(` <failure message="${I(f.label)}" type="${I(f.failureCode??"ASSERTION_FAILED")}">${I(f.failureMessage??"Assertion failed")}</failure>`)}r.push(" </testcase>")}r.push(" </testsuite>")}if(t.gates.length>0){let n=t.gates.filter(o=>!o.passed).length;r.push(` <testsuite name="Quality Gates" tests="${t.gates.length}" failures="${n}" errors="0" time="0.000">`);for(let o of t.gates)r.push(` <testcase name="${I(o.gateName)}" classname="Quality Gates" time="0.000">`),o.passed||r.push(` <failure message="${I(o.message)}" type="GATE_FAILED">${I(o.message)}</failure>`),r.push(" </testcase>");r.push(" </testsuite>")}return r.push("</testsuites>"),{content:r.join(`
|
|
35
|
+
`),format:"xml"}}}}function I(e){return e.replace(/&/g,"&").replace(/</g,"<").replace(/>/g,">").replace(/"/g,""").replace(/'/g,"'")}var Qe=require("crypto");function Ze(){return{name:"compliance",generate(e,t){let s=new Date().toISOString(),r=[];r.push("# EU AI Act \u2014 Annex IV Compliance Report"),r.push(""),r.push(`**Generated:** ${s}`),r.push("**Framework:** EU AI Act (Regulation 2024/1689)"),r.push("**Tool:** KindLM v1.0.0"),r.push(""),r.push("## Article 9 \u2014 Risk Management System"),r.push(""),r.push("Testing demonstrates ongoing risk identification and mitigation through automated behavioral regression tests."),r.push(""),r.push(de(t,["passRateMin"])),r.push("## Article 10 \u2014 Data and Data Governance"),r.push(""),r.push("PII detection guardrails verify that personal data is not exposed in AI system outputs."),r.push(""),r.push(de(t,["piiFailuresMax","keywordFailuresMax"])),r.push("## Article 12 \u2014 Record-Keeping"),r.push(""),r.push("### Test Execution Log"),r.push(""),r.push("| Metric | Value |"),r.push("|--------|-------|"),r.push(`| Total Tests | ${e.totalTests} |`),r.push(`| Passed | ${e.passed} |`),r.push(`| Failed | ${e.failed} |`),r.push(`| Errored | ${e.errored} |`),r.push(`| Duration | ${e.durationMs}ms |`),r.push(""),r.push("### Suite Results"),r.push("");for(let p of e.suites){r.push(`**${p.name}** \u2014 ${p.status}`);for(let i of p.tests){let u=i.status==="passed"?"PASS":"FAIL";r.push(`- [${u}] ${i.name}`)}r.push("")}r.push("## Article 13 \u2014 Transparency and Provision of Information"),r.push(""),r.push("This report provides transparent documentation of AI system testing methodology, results, and quality gate evaluations as required under Article 13."),r.push(""),r.push(de(t,["judgeAvgMin","driftScoreMax"])),r.push("## Article 15 \u2014 Accuracy, Robustness and Cybersecurity"),r.push(""),r.push("Schema validation and behavioral assertions verify output accuracy and robustness."),r.push(""),r.push(de(t,["schemaFailuresMax","costMaxUsd","latencyMaxMs"])),r.push("## Quality Gate Summary"),r.push(""),r.push("| Gate | Result | Actual | Threshold |"),r.push("|------|--------|--------|-----------|");for(let p of t.gates){let i=p.passed?"PASS":"FAIL";r.push(`| ${p.gateName} | ${i} | ${Ye(p.actual)} | ${Ye(p.threshold)} |`)}r.push("");let n=t.passed?"PASS":"FAIL";r.push(`**Overall Verdict:** ${n}`),r.push("");let o=r.join(`
|
|
36
|
+
`),a=(0,Qe.createHash)("sha256").update(o).digest("hex");return r.push("---"),r.push(`**Tamper Evidence Hash (SHA-256):** \`${a}\``),r.push(""),{content:r.join(`
|
|
37
|
+
`),format:"markdown"}}}}function de(e,t){let s=e.gates.filter(n=>t.includes(n.gateName));if(s.length===0)return"";let r=[];r.push("**Gate Evidence:**");for(let n of s){let o=n.passed?"PASS":"FAIL";r.push(`- [${o}] ${n.message}`)}return r.push(""),r.join(`
|
|
38
|
+
`)}function Ye(e){return Number.isInteger(e)?String(e):e.toFixed(4)}var F="1";function xe(e){return JSON.stringify(e,null,2)}function Te(e){let t;try{t=JSON.parse(e)}catch{return A({code:"BASELINE_CORRUPT",message:"Baseline file is not valid JSON"})}if(typeof t!="object"||t===null)return A({code:"BASELINE_CORRUPT",message:"Baseline file is not a JSON object"});let s=t;return typeof s.version!="string"?A({code:"BASELINE_CORRUPT",message:"Baseline file missing required field: version"}):s.version!==F?A({code:"BASELINE_VERSION_MISMATCH",message:`Baseline version "${s.version}" does not match expected "${F}". Re-run \`kindlm baseline set\` to update.`}):typeof s.suiteName!="string"?A({code:"BASELINE_CORRUPT",message:"Baseline file missing required field: suiteName"}):typeof s.createdAt!="string"?A({code:"BASELINE_CORRUPT",message:"Baseline file missing required field: createdAt"}):typeof s.results!="object"||s.results===null?A({code:"BASELINE_CORRUPT",message:"Baseline file missing required field: results"}):S(t)}function qe(e,t){let s=t.read(e);return s.success?Te(s.data):s}function et(e,t){let s=xe(e);return t.write(e.suiteName,s)}function tt(e){return e.list()}function st(e,t,s){let r={};for(let n of t){let o=`${n.testCaseName}::${n.modelId}`,p=n.runs.find(i=>i.assertions.every(u=>u.passed))??n.runs[0];r[o]={passRate:n.passRate,outputText:p?.outputText??"",failureCodes:n.failureCodes,latencyAvgMs:n.latencyAvgMs,costUsd:n.totalCostUsd,runCount:n.runCount}}return{version:F,suiteName:e,createdAt:s,results:r}}function rt(e,t){let s=[],r=[],n=[],o=[],a=[],p=new Set(Object.keys(e.results));for(let[i,u]of Object.entries(t)){let c=e.results[i];if(!c){o.push(i);continue}p.delete(i);let f=u.passRate-c.passRate;if(f<-.001){let y=u.failureCodes.filter(m=>!c.failureCodes.includes(m));s.push({testName:i,baselinePassRate:c.passRate,currentPassRate:u.passRate,newFailureCodes:y})}else f>.001?r.push({testName:i,baselinePassRate:c.passRate,currentPassRate:u.passRate}):n.push({testName:i,passRate:u.passRate})}for(let i of p)a.push(i);return{suiteName:e.suiteName,hasBaseline:!0,regressions:s,improvements:r,unchanged:n,newTests:o,removedTests:a}}function at(e){if(!e||typeof e!="object")return A({code:"CONFIG_PARSE_ERROR",message:"OTLP payload must be a non-null object"});let t=e;if(!Array.isArray(t.resourceSpans))return A({code:"CONFIG_PARSE_ERROR",message:"OTLP payload missing resourceSpans array"});let s=[];for(let r of t.resourceSpans){let n=nt(r.resource?.attributes??[]);if(Array.isArray(r.scopeSpans)){for(let o of r.scopeSpans)if(Array.isArray(o.spans))for(let a of o.spans){let p=ot(a.startTimeUnixNano),i=ot(a.endTimeUnixNano);s.push({traceId:a.traceId,spanId:a.spanId,parentSpanId:a.parentSpanId||void 0,name:a.name,kind:a.kind,startTimeMs:p,endTimeMs:i,durationMs:i-p,attributes:nt(a.attributes??[]),resourceAttributes:n,statusCode:a.status?.code,statusMessage:a.status?.message})}}}return S(s)}function ot(e){let t=BigInt(e);return Number(t/1000000n)}function nt(e){let t={};for(let s of e){let r=ws(s.value);r!==void 0&&(t[s.key]=r)}return t}function ws(e){if(e.stringValue!==void 0)return e.stringValue;if(e.intValue!==void 0)return parseInt(e.intValue,10);if(e.doubleValue!==void 0)return e.doubleValue;if(e.boolValue!==void 0)return e.boolValue}function it(e,t){return t?e.filter(s=>{if(t.namePattern&&!new RegExp(t.namePattern).test(s.name))return!1;if(t.attributeMatch){for(let[r,n]of Object.entries(t.attributeMatch))if(String(s.attributes[r])!==n)return!1}return!(t.minDurationMs!==void 0&&s.durationMs<t.minDurationMs)}):e}function lt(e,t){let s="",r=[],n=0,o=0,a=0,p,i;for(let u of e){let c={...u.resourceAttributes,...u.attributes},f=c[t.outputTextAttr];typeof f=="string"&&f&&(s=s?`${s}
|
|
39
|
+
${f}`:f);let y=c[t.modelAttr];typeof y=="string"&&y&&(p=y);let m=c[t.systemAttr];typeof m=="string"&&m&&(i=m);let x=c[t.inputTokensAttr];typeof x=="number"&&(o+=x);let C=c[t.outputTokensAttr];typeof C=="number"&&(a+=C),u.parentSpanId||(n+=u.durationMs);let b=c["gen_ai.tool.name"],d=c["gen_ai.tool.arguments"];if(typeof b=="string"){let h={};if(typeof d=="string")try{h=JSON.parse(d)}catch{}r.push({id:u.spanId,name:b,arguments:h})}}return{outputText:s,toolCalls:r,latencyMs:n,inputTokens:o,outputTokens:a,model:p,system:i}}function ut(e,t){let s=e.toolCalls.map(r=>({id:r.id,name:r.name,arguments:r.arguments}));return{outputText:e.outputText,toolCalls:s,configDir:t.configDir,latencyMs:e.latencyMs,judgeAdapter:t.judgeAdapter,judgeModel:t.judgeModel,baselineText:t.baselineText}}0&&(module.exports={BASELINE_VERSION,KindLMConfigSchema,ProviderError,SpanFilterSchema,SpanMappingSchema,TraceConfigSchema,aggregateRuns,buildBaselineData,buildContextFromTrace,classifyAssertion,compareBaseline,createAnthropicAdapter,createAssertionRegistry,createAssertionsFromExpect,createCohereAdapter,createComplianceReporter,createCostAssertion,createDriftAssertion,createGeminiAdapter,createJsonReporter,createJudgeAssertion,createJunitReporter,createKeywordsAbsentAssertion,createKeywordsPresentAssertion,createLatencyAssertion,createMistralAdapter,createOllamaAdapter,createOpenAIAdapter,createPiiAssertion,createPrettyReporter,createProvider,createRunner,createSchemaAssertion,createToolCalledAssertion,createToolNotCalledAssertion,createToolOrderAssertion,deserializeBaseline,err,evaluateGates,filterSpans,findMissingVars,interpolate,isDeterministic,isProbabilistic,listBaselines,mapSpansToResult,noColor,ok,parseCommandOutput,parseConfig,parseOtlpPayload,readBaseline,runConversation,serializeBaseline,validateConfig,withRetry,writeBaseline});
|
|
40
|
+
//# sourceMappingURL=index.cjs.map
|