@kindlm/core 0.2.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +1 -1
- package/dist/index.cjs +15 -14
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +245 -100
- package/dist/index.d.ts +245 -100
- package/dist/index.js +15 -14
- package/dist/index.js.map +1 -1
- package/package.json +3 -3
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 KindLM Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# KindLM
|
|
2
2
|
|
|
3
|
-

|
|
4
4
|
|
|
5
5
|
Behavioral regression testing for AI agents. Test what your agents **do** — not just what they say.
|
|
6
6
|
|
package/dist/index.cjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"use strict";var pt=Object.create;var z=Object.defineProperty;var ct=Object.getOwnPropertyDescriptor;var dt=Object.getOwnPropertyNames;var mt=Object.getPrototypeOf,ft=Object.prototype.hasOwnProperty;var gt=(e,t)=>{for(var s in t)z(e,s,{get:t[s],enumerable:!0})},Ae=(e,t,s,r)=>{if(t&&typeof t=="object"||typeof t=="function")for(let n of dt(t))!ft.call(e,n)&&n!==s&&z(e,n,{get:()=>t[n],enumerable:!(r=ct(t,n))||r.enumerable});return e};var Pe=(e,t,s)=>(s=e!=null?pt(mt(e)):{},Ae(t||!e||!e.__esModule?z(s,"default",{value:e,enumerable:!0}):s,e)),ht=e=>Ae(z({},"__esModule",{value:!0}),e);var Ss={};gt(Ss,{BASELINE_VERSION:()=>F,KindLMConfigSchema:()=>Re,ProviderError:()=>g,SpanFilterSchema:()=>ge,SpanMappingSchema:()=>fe,TraceConfigSchema:()=>ie,aggregateRuns:()=>ue,buildBaselineData:()=>st,buildContextFromTrace:()=>ut,classifyAssertion:()=>D,compareBaseline:()=>rt,createAnthropicAdapter:()=>te,createAssertionRegistry:()=>Se,createAssertionsFromExpect:()=>K,createCohereAdapter:()=>ne,createComplianceReporter:()=>Ze,createCostAssertion:()=>q,createDriftAssertion:()=>V,createGeminiAdapter:()=>re,createJsonReporter:()=>We,createJudgeAssertion:()=>J,createJunitReporter:()=>Xe,createKeywordsAbsentAssertion:()=>Q,createKeywordsPresentAssertion:()=>Y,createLatencyAssertion:()=>Z,createMistralAdapter:()=>oe,createOllamaAdapter:()=>se,createOpenAIAdapter:()=>ee,createPiiAssertion:()=>G,createPrettyReporter:()=>Ke,createProvider:()=>ke,createRunner:()=>Ue,createSchemaAssertion:()=>B,createToolCalledAssertion:()=>U,createToolNotCalledAssertion:()=>W,createToolOrderAssertion:()=>X,deserializeBaseline:()=>Te,err:()=>A,evaluateGates:()=>Je,filterSpans:()=>it,findMissingVars:()=>ye,interpolate:()=>L,isDeterministic:()=>Me,isProbabilistic:()=>Oe,listBaselines:()=>tt,mapSpansToResult:()=>lt,noColor:()=>ce,ok:()=>S,parseCommandOutput:()=>pe,parseConfig:()=>Le,parseOtlpPayload:()=>at,readBaseline:()=>qe,runConversation:()=>ae,serializeBaseline:()=>xe,validateConfig:()=>le,withRetry:()=>_,writeB
aseline:()=>et});module.exports=ht(Ss);var g=class extends Error{constructor(s,r,n,o=!1,a){super(r);this.code=s;this.statusCode=n;this.retryable=o;this.raw=a;this.name="ProviderError"}};function S(e){return{success:!0,data:e}}function A(e){return{success:!1,error:e}}function Ee(e,t){for(let[s,r]of Object.entries(t))if(JSON.stringify(e[s])!==JSON.stringify(r))return!1;return!0}function U(e,t){return{type:"tool_called",evaluate(s){let r=[],n=s.toolCalls.filter(a=>a.name===e),o=s.toolCalls.map(a=>a.name);if(n.length===0)return r.push({assertionType:"tool_called",label:`Tool "${e}" called`,passed:!1,score:0,failureCode:"TOOL_CALL_MISSING",failureMessage:`Expected tool "${e}" to be called, but got: [${o.join(", ")}]`}),Promise.resolve(r);if(r.push({assertionType:"tool_called",label:`Tool "${e}" called`,passed:!0,score:1}),t){let a=n.some(p=>Ee(p.arguments,t));r.push({assertionType:"tool_called",label:`Tool "${e}" args match`,passed:a,score:a?1:0,failureCode:a?void 0:"TOOL_CALL_ARGS_MISMATCH",failureMessage:a?void 0:`Expected args ${JSON.stringify(t)}, got ${JSON.stringify(n[0]?.arguments)}`})}return Promise.resolve(r)}}}function W(e){return{type:"tool_not_called",evaluate(t){let s=t.toolCalls.some(r=>r.name===e);return Promise.resolve([{assertionType:"tool_not_called",label:`Tool "${e}" not called`,passed:!s,score:s?0:1,failureCode:s?"TOOL_CALL_UNEXPECTED":void 0,failureMessage:s?`Expected tool "${e}" to NOT be called, but it was`:void 0}])}}}function X(e){return{type:"tool_order",evaluate(t){let s=[];for(let r of e){if(r.shouldNotCall){let o=t.toolCalls.some(a=>a.name===r.tool);s.push({assertionType:"tool_order",label:`Tool "${r.tool}" not called`,passed:!o,score:o?0:1,failureCode:o?"TOOL_CALL_UNEXPECTED":void 0,failureMessage:o?`Expected tool "${r.tool}" to NOT be called, but it was`:void 0});continue}let n=t.toolCalls.filter(o=>o.name===r.tool);if(n.length===0){s.push({assertionType:"tool_order",label:`Tool "${r.tool}" 
called`,passed:!1,score:0,failureCode:"TOOL_CALL_MISSING",failureMessage:`Expected tool "${r.tool}" to be called, but it was not`});continue}if(s.push({assertionType:"tool_order",label:`Tool "${r.tool}" called`,passed:!0,score:1}),r.argsMatch){let o=n.some(a=>Ee(a.arguments,r.argsMatch??{}));s.push({assertionType:"tool_order",label:`Tool "${r.tool}" args match`,passed:o,score:o?1:0,failureCode:o?void 0:"TOOL_CALL_ARGS_MISMATCH",failureMessage:o?void 0:`Expected args ${JSON.stringify(r.argsMatch)}, got ${JSON.stringify(n[0]?.arguments)}`})}if(r.order!==void 0){let o=t.toolCalls.findIndex(p=>p.name===r.tool),a=o===r.order;s.push({assertionType:"tool_order",label:`Tool "${r.tool}" at position ${r.order}`,passed:a,score:a?1:0,failureCode:a?void 0:"TOOL_CALL_ORDER_WRONG",failureMessage:a?void 0:`Expected "${r.tool}" at position ${r.order}, but found at position ${o}`})}}return Promise.resolve(s)}}}var me;async function Rt(){if(me)return me;let e=await import("ajv"),t=await import("ajv-formats"),s=e.default,r=t.default,n=new s({allErrors:!0,strict:!1});return r(n),me=n,n}var ve=new Map;function yt(e,t){let s=JSON.stringify(t),r=ve.get(s);if(r)return r;let n=e.compile(t);return ve.set(s,n),n}function B(e){return{type:"schema",async evaluate(t){let s=[],r;if(e.format==="json")try{r=JSON.parse(t.outputText),s.push({assertionType:"schema",label:"Output is valid JSON",passed:!0,score:1})}catch(n){return s.push({assertionType:"schema",label:"Output is valid JSON",passed:!1,score:0,failureCode:"SCHEMA_PARSE_ERROR",failureMessage:`Failed to parse output as JSON: ${n instanceof Error?n.message:String(n)}`}),s}if(e.schemaContent){let n=await Rt(),o=yt(n,e.schemaContent),a=o(r??t.outputText);s.push({assertionType:"schema",label:"Output matches JSON Schema",passed:a,score:a?1:0,failureCode:a?void 0:"SCHEMA_INVALID",failureMessage:a?void 0:`Schema validation failed: ${n.errorsText(o.errors)}`,metadata:a?void 0:{errors:o.errors}})}if(e.contains){let 
n=t.outputText.toLowerCase();for(let o of e.contains){let a=n.includes(o.toLowerCase());s.push({assertionType:"schema",label:`Output contains "${o}"`,passed:a,score:a?1:0,failureCode:a?void 0:"CONTAINS_FAILED",failureMessage:a?void 0:`Expected output to contain "${o}"`})}}if(e.notContains){let n=t.outputText.toLowerCase();for(let o of e.notContains){let a=n.includes(o.toLowerCase());s.push({assertionType:"schema",label:`Output does not contain "${o}"`,passed:!a,score:a?0:1,failureCode:a?"NOT_CONTAINS_FAILED":void 0,failureMessage:a?`Expected output to NOT contain "${o}"`:void 0})}}if(e.maxLength!==void 0){let n=t.outputText.length<=e.maxLength;s.push({assertionType:"schema",label:`Output length <= ${e.maxLength}`,passed:n,score:n?1:0,failureCode:n?void 0:"MAX_LENGTH_EXCEEDED",failureMessage:n?void 0:`Output length ${t.outputText.length} exceeds max ${e.maxLength}`})}return s}}}function bt(e){return e.length<=4?"*".repeat(e.length):e.slice(0,2)+"*".repeat(e.length-4)+e.slice(-2)}var xt=/(\+|\*|\{[^}]+\})\)?(\+|\*|\{[^}]+\})/;function Tt(e){return xt.test(e)}function Ct(e,t,s){let r=[],n=Date.now();e.lastIndex=0;let o;for(;(o=e.exec(t))!==null&&(r.push(o[0]),!(r.length>=s||Date.now()-n>100)););return r}function G(e){let t=[],s;for(let r=0;r<e.denyPatterns.length;r++){let n=e.denyPatterns[r];n!==void 0&&t.push({name:`pii-pattern-${r+1}`,regex:new RegExp(n,"gi")})}if(e.customPatterns)for(let r of e.customPatterns){if(Tt(r.pattern)){s=[{assertionType:"pii",label:"No PII detected",passed:!1,score:0,failureCode:"INVALID_PATTERN",failureMessage:`Custom pattern "${r.name}" contains nested quantifiers and may cause catastrophic backtracking`}];break}t.push({name:r.name,regex:new RegExp(r.pattern,"gi")})}return{type:"pii",evaluate(r){if(s)return Promise.resolve(s);let n=[],o=0;for(let{name:p,regex:i}of t){if(o>=1e3)break;let u=1e3-o,c=Ct(i,r.outputText,u);for(let f of c)n.push({name:p,redacted:bt(f)});o+=c.length}let a=n.length===0;return 
Promise.resolve([{assertionType:"pii",label:"No PII detected",passed:a,score:a?1:0,failureCode:a?void 0:"PII_DETECTED",failureMessage:a?void 0:`Found ${n.length} PII match(es): ${n.map(p=>`${p.name}=${p.redacted}`).join(", ")}`,metadata:a?void 0:{matches:n}}])}}}function Y(e){return{type:"keywords_present",evaluate(t){let s=t.outputText.toLowerCase(),r=e.some(n=>s.includes(n.toLowerCase()));return Promise.resolve([{assertionType:"keywords_present",label:"Required keyword present",passed:r,score:r?1:0,failureCode:r?void 0:"KEYWORD_MISSING",failureMessage:r?void 0:`Expected at least one of [${e.join(", ")}] in output`}])}}}function Q(e){return{type:"keywords_absent",evaluate(t){let s=t.outputText.toLowerCase(),r=[];for(let n of e){let o=s.includes(n.toLowerCase());r.push({assertionType:"keywords_absent",label:`Keyword "${n}" absent`,passed:!o,score:o?0:1,failureCode:o?"KEYWORD_DENIED":void 0,failureMessage:o?`Denied keyword "${n}" found in output`:void 0})}return Promise.resolve(r)}}}var At=`You are an impartial AI judge evaluating an AI assistant's response.
|
|
1
|
+
"use strict";var Pt=Object.create;var Y=Object.defineProperty;var wt=Object.getOwnPropertyDescriptor;var Mt=Object.getOwnPropertyNames;var St=Object.getPrototypeOf,Ot=Object.prototype.hasOwnProperty;var kt=(e,t)=>{for(var s in t)Y(e,s,{get:t[s],enumerable:!0})},Ne=(e,t,s,n)=>{if(t&&typeof t=="object"||typeof t=="function")for(let o of Mt(t))!Ot.call(e,o)&&o!==s&&Y(e,o,{get:()=>t[o],enumerable:!(n=wt(t,o))||n.enumerable});return e};var je=(e,t,s)=>(s=e!=null?Pt(St(e)):{},Ne(t||!e||!e.__esModule?Y(s,"default",{value:e,enumerable:!0}):s,e)),vt=e=>Ne(Y({},"__esModule",{value:!0}),e);var Vr={};kt(Vr,{BASELINE_VERSION:()=>G,KindLMConfigSchema:()=>we,ProviderError:()=>h,SpanFilterSchema:()=>Pe,SpanMappingSchema:()=>Ee,TraceConfigSchema:()=>he,aggregateRuns:()=>ye,buildBaselineData:()=>Rt,buildContextFromTrace:()=>Et,classifyAssertion:()=>H,compareBaseline:()=>yt,cosineSimilarity:()=>Ce,createAnthropicAdapter:()=>ce,createAssertionsFromExpect:()=>z,createCohereAdapter:()=>fe,createComplianceReporter:()=>pt,createCostAssertion:()=>le,createDriftAssertion:()=>ae,createGeminiAdapter:()=>pe,createJsonReporter:()=>lt,createJudgeAssertion:()=>oe,createJunitReporter:()=>ut,createKeywordsAbsentAssertion:()=>ne,createKeywordsPresentAssertion:()=>se,createLatencyAssertion:()=>ie,createMistralAdapter:()=>me,createOllamaAdapter:()=>de,createOpenAIAdapter:()=>ue,createPiiAssertion:()=>re,createPrettyReporter:()=>ot,createProvider:()=>Ke,createRunner:()=>et,createSchemaAssertion:()=>ee,createToolCalledAssertion:()=>Q,createToolNotCalledAssertion:()=>Z,createToolOrderAssertion:()=>q,deserializeBaseline:()=>Ie,err:()=>T,evaluateGates:()=>st,filterSpans:()=>Tt,findMissingVars:()=>Me,interpolate:()=>J,isDeterministic:()=>Fe,isProbabilistic:()=>Be,listBaselines:()=>ht,lookupModelPricing:()=>U,mapSpansToResult:()=>Ct,migrateBaseline:()=>ve,noColor:()=>xe,ok:()=>k,parseCommandOutput:()=>be,parseConfig:()=>Ze,parseOtlpPayload:()=>At,readBaseline:()=>ft,runConversation:()=>ge,serializeBaseline:()
=>_e,validateConfig:()=>Re,validateUnitIntervalScore:()=>K,withRetry:()=>v,writeBaseline:()=>gt});module.exports=vt(Vr);var h=class extends Error{constructor(s,n,o,r=!1,a){super(n);this.code=s;this.statusCode=o;this.retryable=r;this.raw=a;this.name="ProviderError"}};function k(e){return{success:!0,data:e}}function T(e){return{success:!1,error:e}}function $e(e){return typeof e=="object"&&e!==null&&!Array.isArray(e)}function Te(e,t){if(Object.is(e,t))return!0;if(Array.isArray(t)){if(!Array.isArray(e)||e.length!==t.length)return!1;for(let s=0;s<e.length;s++)if(!Te(e[s],t[s]))return!1;return!0}if($e(t)){if(!$e(e))return!1;for(let[s,n]of Object.entries(t))if(!(s in e)||!Te(e[s],n))return!1;return!0}return!1}function De(e,t){for(let[s,n]of Object.entries(t))if(!(s in e)||!Te(e[s],n))return!1;return!0}function Le(e,t,s,n,o,r){if(!s)return;if(!o.validateJsonSchema){r.push({assertionType:t,label:`Tool "${e}" args schema valid`,passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:"argsSchema provided but no JSON Schema validator was injected"});return}let a=typeof s=="string"?JSON.parse(s):s,c=n.some(i=>{let l=o.validateJsonSchema;return l?l(a,i.arguments).valid:!1});r.push({assertionType:t,label:`Tool "${e}" args schema valid`,passed:c,score:c?1:0,failureCode:c?void 0:"TOOL_CALL_ARGS_SCHEMA_INVALID",failureMessage:c?void 0:`Tool "${e}" arguments did not match argsSchema`})}function Q(e,t,s){return{type:"tool_called",evaluate(n){let o=[],r=n.toolCalls.filter(c=>c.name===e),a=n.toolCalls.map(c=>c.name);if(r.length===0)return o.push({assertionType:"tool_called",label:`Tool "${e}" called`,passed:!1,score:0,failureCode:"TOOL_CALL_MISSING",failureMessage:`Expected tool "${e}" to be called, but got: [${a.join(", ")}]`}),Promise.resolve(o);if(o.push({assertionType:"tool_called",label:`Tool "${e}" called`,passed:!0,score:1}),t){let c=r.some(i=>De(i.arguments,t));o.push({assertionType:"tool_called",label:`Tool "${e}" args match`,passed:c,score:c?1:0,failureCode:c?void 
0:"TOOL_CALL_ARGS_MISMATCH",failureMessage:c?void 0:`Expected args ${JSON.stringify(t)}, got ${JSON.stringify(r[0]?.arguments)}`})}return Le(e,"tool_called",s,r,n,o),Promise.resolve(o)}}}function Z(e){return{type:"tool_not_called",evaluate(t){let s=t.toolCalls.some(n=>n.name===e);return Promise.resolve([{assertionType:"tool_not_called",label:`Tool "${e}" not called`,passed:!s,score:s?0:1,failureCode:s?"TOOL_CALL_UNEXPECTED":void 0,failureMessage:s?`Expected tool "${e}" to NOT be called, but it was`:void 0}])}}}function q(e){return{type:"tool_order",evaluate(t){let s=[];for(let n of e){if(n.shouldNotCall){let r=t.toolCalls.some(a=>a.name===n.tool);s.push({assertionType:"tool_order",label:`Tool "${n.tool}" not called`,passed:!r,score:r?0:1,failureCode:r?"TOOL_CALL_UNEXPECTED":void 0,failureMessage:r?`Expected tool "${n.tool}" to NOT be called, but it was`:void 0});continue}let o=t.toolCalls.filter(r=>r.name===n.tool);if(o.length===0){s.push({assertionType:"tool_order",label:`Tool "${n.tool}" called`,passed:!1,score:0,failureCode:"TOOL_CALL_MISSING",failureMessage:`Expected tool "${n.tool}" to be called, but it was not`});continue}if(s.push({assertionType:"tool_order",label:`Tool "${n.tool}" called`,passed:!0,score:1}),n.argsMatch){let r=o.some(a=>De(a.arguments,n.argsMatch??{}));s.push({assertionType:"tool_order",label:`Tool "${n.tool}" args match`,passed:r,score:r?1:0,failureCode:r?void 0:"TOOL_CALL_ARGS_MISMATCH",failureMessage:r?void 0:`Expected args ${JSON.stringify(n.argsMatch)}, got ${JSON.stringify(o[0]?.arguments)}`})}if(Le(n.tool,"tool_order",n.argsSchema,o,t,s),n.order!==void 0){let a=t.toolCalls[n.order]?.name===n.tool,c=t.toolCalls.findIndex(i=>i.name===n.tool);s.push({assertionType:"tool_order",label:`Tool "${n.tool}" at position ${n.order}`,passed:a,score:a?1:0,failureCode:a?void 0:"TOOL_CALL_ORDER_WRONG",failureMessage:a?void 0:`Expected "${n.tool}" at position ${n.order}, but found at position ${c}`})}}return Promise.resolve(s)}}}function ee(e){let 
t;async function s(){if(t)return t;let r=await import("ajv"),a=await import("ajv-formats"),c=r.default,i=a.default,l=new c({allErrors:!0,strict:!1});return i(l),t=l,l}let n=new Map;function o(r,a){let c=JSON.stringify(a),i=n.get(c);if(i)return i;try{let l=r.compile(a);return n.set(c,l),l}catch(l){return{compileError:l instanceof Error?l.message:String(l)}}}return{type:"schema",async evaluate(r){let a=[],c;if(e.format==="json")try{c=JSON.parse(r.outputText),a.push({assertionType:"schema",label:"Output is valid JSON",passed:!0,score:1})}catch(i){return a.push({assertionType:"schema",label:"Output is valid JSON",passed:!1,score:0,failureCode:"SCHEMA_PARSE_ERROR",failureMessage:`Failed to parse output as JSON: ${i instanceof Error?i.message:String(i)}`}),a}if(e.schemaContent){let i=await s(),l=o(i,e.schemaContent);if("compileError"in l)a.push({assertionType:"schema",label:"Output matches JSON Schema",passed:!1,score:0,failureCode:"SCHEMA_INVALID",failureMessage:`Schema compilation failed: ${l.compileError}`});else{let p=l,f=p(c??r.outputText);a.push({assertionType:"schema",label:"Output matches JSON Schema",passed:f,score:f?1:0,failureCode:f?void 0:"SCHEMA_INVALID",failureMessage:f?void 0:`Schema validation failed: ${i.errorsText(p.errors)}`,metadata:f?void 0:{errors:p.errors}})}}if(e.contains){let i=r.outputText.toLowerCase();for(let l of e.contains){let p=i.includes(l.toLowerCase());a.push({assertionType:"schema",label:`Output contains "${l}"`,passed:p,score:p?1:0,failureCode:p?void 0:"CONTAINS_FAILED",failureMessage:p?void 0:`Expected output to contain "${l}"`})}}if(e.notContains){let i=r.outputText.toLowerCase();for(let l of e.notContains){let p=i.includes(l.toLowerCase());a.push({assertionType:"schema",label:`Output does not contain "${l}"`,passed:!p,score:p?0:1,failureCode:p?"NOT_CONTAINS_FAILED":void 0,failureMessage:p?`Expected output to NOT contain "${l}"`:void 0})}}if(e.maxLength!==void 0){let 
i=r.outputText.length<=e.maxLength;a.push({assertionType:"schema",label:`Output length <= ${e.maxLength}`,passed:i,score:i?1:0,failureCode:i?void 0:"MAX_LENGTH_EXCEEDED",failureMessage:i?void 0:`Output length ${r.outputText.length} exceeds max ${e.maxLength}`})}return a}}}function _t(e){return e.length<=4?"*".repeat(e.length):e.slice(0,2)+"*".repeat(e.length-4)+e.slice(-2)}var It=/(\+|\*|\{[^}]+\})\)?(\+|\*|\{[^}]+\})/;function te(e){return It.test(e)}function Nt(e,t,s){let n=[],o=Date.now();e.lastIndex=0;let r;for(;(r=e.exec(t))!==null&&(n.push(r[0]),!(n.length>=s||Date.now()-o>100)););return n}function re(e){let t=[],s;for(let n=0;n<e.denyPatterns.length;n++){let o=e.denyPatterns[n];if(o!==void 0){if(te(o)){s=[{assertionType:"pii",label:"No PII detected",passed:!1,score:0,failureCode:"INVALID_PATTERN",failureMessage:`Deny pattern "pii-pattern-${n+1}" contains nested quantifiers and may cause catastrophic backtracking`}];break}t.push({name:`pii-pattern-${n+1}`,regex:new RegExp(o,"gi")})}}if(e.customPatterns)for(let n of e.customPatterns){if(te(n.pattern)){s=[{assertionType:"pii",label:"No PII detected",passed:!1,score:0,failureCode:"INVALID_PATTERN",failureMessage:`Custom pattern "${n.name}" contains nested quantifiers and may cause catastrophic backtracking`}];break}t.push({name:n.name,regex:new RegExp(n.pattern,"gi")})}return{type:"pii",evaluate(n){if(s)return Promise.resolve(s);let o=[],r=0,a=Date.now();for(let{name:i,regex:l}of t){if(r>=1e3||Date.now()-a>500)break;let p=1e3-r,f=Nt(l,n.outputText,p);for(let y of f)o.push({name:i,redacted:_t(y)});r+=f.length}let c=o.length===0;return Promise.resolve([{assertionType:"pii",label:"No PII detected",passed:c,score:c?1:0,failureCode:c?void 0:"PII_DETECTED",failureMessage:c?void 0:`Found ${o.length} PII match(es): ${o.map(i=>`${i.name}=${i.redacted}`).join(", ")}`,metadata:c?void 0:{matches:o}}])}}}function se(e){return{type:"keywords_present",evaluate(t){let 
s=t.outputText.toLowerCase(),n=e.some(o=>s.includes(o.toLowerCase()));return Promise.resolve([{assertionType:"keywords_present",label:"Required keyword present",passed:n,score:n?1:0,failureCode:n?void 0:"KEYWORD_MISSING",failureMessage:n?void 0:`Expected at least one of [${e.join(", ")}] in output`}])}}}function ne(e){return{type:"keywords_absent",evaluate(t){let s=t.outputText.toLowerCase(),n=[];for(let o of e){let r=s.includes(o.toLowerCase());n.push({assertionType:"keywords_absent",label:`Keyword "${o}" absent`,passed:!r,score:r?0:1,failureCode:r?"KEYWORD_DENIED":void 0,failureMessage:r?`Denied keyword "${o}" found in output`:void 0})}return Promise.resolve(n)}}}function K(e,t){return typeof e!="number"?{ok:!1,reason:`${t} must be a number`}:Number.isFinite(e)?e<0||e>1?{ok:!1,reason:`${t} must be between 0 and 1 inclusive`}:{ok:!0,score:e}:{ok:!1,reason:`${t} must be a finite number`}}var jt=`You are an impartial AI judge evaluating an AI assistant's response.
|
|
2
2
|
You will be given:
|
|
3
3
|
- The assistant's response
|
|
4
4
|
- Evaluation criteria
|
|
@@ -7,14 +7,14 @@ You will be given:
|
|
|
7
7
|
Score the response from 0.0 to 1.0 based on how well it meets the criteria.
|
|
8
8
|
|
|
9
9
|
Respond ONLY with a JSON object in this exact format:
|
|
10
|
-
{"score": <number between 0.0 and 1.0>, "reasoning": "<brief explanation>"}`;function
|
|
10
|
+
{"score": <number between 0.0 and 1.0>, "reasoning": "<brief explanation>"}`;function $t(e,t,s){let n=`## Assistant Response
|
|
11
11
|
${e}
|
|
12
12
|
|
|
13
13
|
## Criteria
|
|
14
|
-
${t}`;return s&&(
|
|
14
|
+
${t}`;return s&&(n+=`
|
|
15
15
|
|
|
16
16
|
## Rubric
|
|
17
|
-
${s}`),
|
|
17
|
+
${s}`),n}function Dt(e){let t=e.match(/```(?:json)?\s*([\s\S]*?)```/)??e.match(/(\{[\s\S]*\})/);if(!t?.[1])return{ok:!1,reason:"No JSON object found in judge response"};let s;try{s=JSON.parse(t[1])}catch{return{ok:!1,reason:"Invalid JSON in judge response"}}let n=K(s.score,"score");return n.ok?typeof s.reasoning!="string"?{ok:!1,reason:"reasoning must be a string"}:{ok:!0,score:n.score,reasoning:s.reasoning}:{ok:!1,reason:n.reason}}function oe(e){return{type:"judge",async evaluate(t){if(!t.judgeAdapter||!t.judgeModel)return[{assertionType:"judge",label:`Judge: ${e.criteria}`,passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:"Judge assertion requires judgeAdapter and judgeModel in context"}];let s;try{s=await t.judgeAdapter.complete({model:e.model??t.judgeModel,messages:[{role:"system",content:jt},{role:"user",content:$t(t.outputText,e.criteria,e.rubric)}],params:{temperature:0,maxTokens:512}})}catch(r){return[{assertionType:"judge",label:`Judge: ${e.criteria}`,passed:!1,score:0,failureCode:"JUDGE_EVAL_ERROR",failureMessage:`Judge adapter error: ${r instanceof Error?r.message:String(r)}`}]}let n=Dt(s.text);if(!n.ok)return[{assertionType:"judge",label:`Judge: ${e.criteria}`,passed:!1,score:0,failureCode:"JUDGE_PARSE_ERROR",failureMessage:`Failed to parse judge response: ${n.reason}`}];let o=n.score>=e.minScore;return[{assertionType:"judge",label:`Judge: ${e.criteria}`,passed:o,score:n.score,failureCode:o?void 0:"JUDGE_BELOW_THRESHOLD",failureMessage:o?void 0:`Score ${n.score} below threshold ${e.minScore}: ${n.reasoning}`,metadata:{reasoning:n.reasoning,threshold:e.minScore}}]}}}var Lt=`You are an impartial AI judge comparing two AI assistant responses.
|
|
18
18
|
You will be given a baseline response and a new response.
|
|
19
19
|
Evaluate how much the new response has drifted from the baseline.
|
|
20
20
|
|
|
@@ -24,17 +24,18 @@ Score from 0.0 to 1.0 where:
|
|
|
24
24
|
- 1.0 = completely different meaning or contradictory
|
|
25
25
|
|
|
26
26
|
Respond ONLY with a JSON object in this exact format:
|
|
27
|
-
{"driftScore": <number between 0.0 and 1.0>, "reasoning": "<brief explanation>"}`;function
|
|
27
|
+
{"driftScore": <number between 0.0 and 1.0>, "reasoning": "<brief explanation>"}`;function Ut(e){let t=e.match(/```(?:json)?\s*([\s\S]*?)```/)??e.match(/(\{[\s\S]*\})/);if(!t?.[1])return{ok:!1,reason:"No JSON object found in drift judge response"};let s;try{s=JSON.parse(t[1])}catch{return{ok:!1,reason:"Invalid JSON in drift judge response"}}let n=K(s.driftScore,"driftScore");return n.ok?typeof s.reasoning!="string"?{ok:!1,reason:"reasoning must be a string"}:{ok:!0,driftScore:n.score,reasoning:s.reasoning}:{ok:!1,reason:n.reason}}function Ue(e,t){let s=t.split("."),n=e;for(let o of s){if(n==null||typeof n!="object")return;n=n[o]}return n}function Ft(e,t,s){let n,o;try{n=JSON.parse(e),o=JSON.parse(t)}catch{return{driftScore:1,mismatched:["(parse error)"]}}let r=[];for(let c of s){let i=Ue(n,c),l=Ue(o,c);JSON.stringify(i)!==JSON.stringify(l)&&r.push(c)}return{driftScore:s.length>0?r.length/s.length:0,mismatched:r}}function Ce(e,t){if(e.length!==t.length||e.length===0)return 0;let s=0,n=0,o=0;for(let a=0;a<e.length;a++){let c=e[a]??0,i=t[a]??0;s+=c*i,n+=c*c,o+=i*i}let r=Math.sqrt(n)*Math.sqrt(o);return r===0?0:s/r}function ae(e){return{type:"drift",async evaluate(t){if(!t.baselineText)return[{assertionType:"drift",label:"Drift check",passed:!1,score:0,failureCode:"DRIFT_EXCEEDED",failureMessage:"No baseline available \u2014 run `kindlm baseline set` first",metadata:{reason:"No baseline available"}}];if(e.method==="embedding"){if(!t.getEmbedding)return[{assertionType:"drift",label:"Drift check (embedding)",passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:"Drift embedding method requires getEmbedding in context"}];let a,c;try{[a,c]=await Promise.all([t.getEmbedding(t.baselineText),t.getEmbedding(t.outputText)])}catch(y){return[{assertionType:"drift",label:"Drift check (embedding)",passed:!1,score:0,failureCode:"JUDGE_EVAL_ERROR",failureMessage:`Embedding error: ${y instanceof Error?y.message:String(y)}`}]}let 
i=Ce(a,c),l=1-i,p=i,f=l<=e.maxScore;return[{assertionType:"drift",label:"Drift check (embedding)",passed:f,score:p,failureCode:f?void 0:"DRIFT_EXCEEDED",failureMessage:f?void 0:`Drift score ${l.toFixed(3)} exceeds max ${e.maxScore}`,metadata:{driftScore:l,similarity:i,threshold:e.maxScore}}]}if(e.method==="field-diff"){let a=e.fields??[],{driftScore:c,mismatched:i}=Ft(t.baselineText,t.outputText,a),l=1-c,p=c<=e.maxScore;return[{assertionType:"drift",label:"Drift check (field-diff)",passed:p,score:l,failureCode:p?void 0:"DRIFT_EXCEEDED",failureMessage:p?void 0:`Drift score ${c.toFixed(2)} exceeds max ${e.maxScore}. Mismatched: [${i.join(", ")}]`,metadata:{driftScore:c,mismatched:i,threshold:e.maxScore}}]}if(!t.judgeAdapter||!t.judgeModel)return[{assertionType:"drift",label:"Drift check (judge)",passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:"Drift judge method requires judgeAdapter and judgeModel in context"}];let s;try{s=await t.judgeAdapter.complete({model:t.judgeModel,messages:[{role:"system",content:Lt},{role:"user",content:`## Baseline Response
|
|
28
28
|
${t.baselineText}
|
|
29
29
|
|
|
30
30
|
## New Response
|
|
31
|
-
${t.outputText}`}],params:{temperature:0,maxTokens:512}}),r=wt(s.text);if(!r)return[{assertionType:"drift",label:"Drift check (judge)",passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:`Failed to parse drift judge response: ${s.text.slice(0,200)}`}];let n=1-r.driftScore,o=r.driftScore<=e.maxScore;return[{assertionType:"drift",label:"Drift check (judge)",passed:o,score:n,failureCode:o?void 0:"DRIFT_EXCEEDED",failureMessage:o?void 0:`Drift score ${r.driftScore.toFixed(2)} exceeds max ${e.maxScore}: ${r.reasoning}`,metadata:{driftScore:r.driftScore,reasoning:r.reasoning}}]}}}function Z(e){return{type:"latency",evaluate(t){let s=t.latencyMs??0,r=s<=e.maxMs;return Promise.resolve([{assertionType:"latency",label:`Latency <= ${e.maxMs}ms`,passed:r,score:r?1:0,failureCode:r?void 0:"PROVIDER_TIMEOUT",failureMessage:r?void 0:`Latency ${s}ms exceeds max ${e.maxMs}ms`,metadata:{latencyMs:s}}])}}}function q(e){return{type:"cost",evaluate(t){let s=t.costUsd??0,r=s<=e.maxUsd;return Promise.resolve([{assertionType:"cost",label:`Cost <= $${e.maxUsd}`,passed:r,score:r?1:0,failureCode:r?void 0:"INTERNAL_ERROR",failureMessage:r?void 0:`Cost $${s.toFixed(4)} exceeds max $${e.maxUsd}`,metadata:{costUsd:s}}])}}}function K(e,t){let s=[];if(e.toolCalls)if(e.toolCalls.some(n=>n.order!==void 0))s.push(X(e.toolCalls));else for(let n of e.toolCalls)n.shouldNotCall?s.push(W(n.tool)):s.push(U(n.tool,n.argsMatch??void 0));if(e.output&&s.push(B({format:e.output.format,schemaFile:e.output.schemaFile,schemaContent:t?.schemaContent,contains:e.output.contains,notContains:e.output.notContains,maxLength:e.output.maxLength})),e.guardrails?.pii&&s.push(G({denyPatterns:e.guardrails.pii.denyPatterns,customPatterns:e.guardrails.pii.customPatterns})),e.guardrails?.keywords&&(e.guardrails.keywords.allow&&e.guardrails.keywords.allow.length>0&&s.push(Y(e.guardrails.keywords.allow)),e.guardrails.keywords.deny.length>0&&s.push(Q(e.guardrails.keywords.deny))),e.judge)for(let r of 
e.judge)s.push(J({criteria:r.criteria,minScore:r.minScore,rubric:r.rubric}));return e.baseline?.drift&&s.push(V({maxScore:e.baseline.drift.maxScore,method:e.baseline.drift.method,fields:e.baseline.drift.fields})),s}function Se(){return new Map([["tool_called",e=>U(e.toolCalls?.[0]?.tool??"")],["schema",e=>B(e.output??{format:"text"})],["pii",e=>G(e.guardrails?.pii??{denyPatterns:[]})],["judge",e=>J(e.judge?.[0]??{criteria:"",minScore:.7})],["drift",e=>V(e.baseline?.drift??{maxScore:.15,method:"judge"})],["latency",()=>Z({maxMs:6e4})],["cost",()=>q({maxUsd:1})]])}var Mt=new Set(["judge","drift"]);function D(e){return Mt.has(e)?"probabilistic":"deterministic"}function Me(e){return D(e)==="deterministic"}function Oe(e){return D(e)==="probabilistic"}async function _(e,t){let{maxRetries:s,shouldRetry:r,baseDelayMs:n=500}=t,o;for(let a=0;a<=s;a++)try{return await e()}catch(p){if(o=p,a>=s||!r(p))throw p;let i=n*Math.pow(2,a);await Ot(i)}throw o}function Ot(e){return new Promise(t=>setTimeout(t,e))}var _t={"gpt-4o":{input:2.5,output:10},"gpt-4o-mini":{input:.15,output:.6},"gpt-4-turbo":{input:10,output:30},"o3-mini":{input:1.1,output:4.4}};function kt(e){switch(e){case"stop":return"stop";case"length":return"max_tokens";case"tool_calls":return"tool_calls";default:return"unknown"}}function It(e,t){let s=typeof t=="object"&&t!==null&&"error"in t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 401:return new g("AUTH_FAILED",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new g("CONTEXT_LENGTH",s,e,!1,t):new g("PROVIDER_ERROR",s,e,!1,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function ee(e){let t="",s="https://api.openai.com/v1",r,n=6e4,o=2;return{name:"openai",async initialize(a){if(!a.apiKey)throw new g("AUTH_FAILED","API key is 
required");t=a.apiKey,a.baseUrl&&(s=a.baseUrl),r=a.organization,n=a.timeoutMs,o=a.maxRetries},async complete(a){let p={model:a.model,messages:a.messages.map(d=>d.role==="tool"?{role:"tool",content:d.content,tool_call_id:d.toolCallId}:d.role==="assistant"&&d.toolCalls&&d.toolCalls.length>0?{role:"assistant",content:d.content||null,tool_calls:d.toolCalls.map(h=>({id:h.id,type:"function",function:{name:h.name,arguments:JSON.stringify(h.arguments)}}))}:{role:d.role,content:d.content}),temperature:a.params.temperature,max_tokens:a.params.maxTokens};a.params.topP!==void 0&&(p.top_p=a.params.topP),a.params.seed!==void 0&&(p.seed=a.params.seed),a.params.stopSequences&&(p.stop=a.params.stopSequences),a.tools&&a.tools.length>0&&(p.tools=a.tools.map(d=>({type:"function",function:{name:d.name,description:d.description,parameters:d.parameters}})),a.toolChoice&&(p.tool_choice=a.toolChoice));let i={"Content-Type":"application/json",Authorization:`Bearer ${t}`};r&&(i["OpenAI-Organization"]=r);let u=Date.now(),c=await _(()=>e.fetch(`${s}/chat/completions`,{method:"POST",headers:i,body:JSON.stringify(p),timeoutMs:n}),{maxRetries:o,shouldRetry:d=>d instanceof g&&d.retryable}),f;try{f=await c.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from OpenAI API",c.status,c.status>=500)}let y=Date.now()-u;if(!c.ok)throw It(c.status,f);let m=f,x=m.choices?.[0],C=x?.message,b=(C?.tool_calls??[]).map(d=>{let h;try{h=JSON.parse(d.function.arguments||"{}")}catch{h={_raw:d.function.arguments}}return{id:d.id,name:d.function.name,arguments:h}});return{text:C?.content??"",toolCalls:b,usage:{inputTokens:m.usage?.prompt_tokens??0,outputTokens:m.usage?.completion_tokens??0,totalTokens:m.usage?.total_tokens??0},raw:f,latencyMs:y,modelId:m.model??a.model,finishReason:kt(x?.finish_reason)}},estimateCost(a,p){let i=_t[a];return i?p.inputTokens/1e6*i.input+p.outputTokens/1e6*i.output:null},supportsTools(a){return!a.startsWith("o1-")}}}var 
jt={"claude-opus-4-5-20250929":{input:15,output:75},"claude-sonnet-4-5-20250929":{input:3,output:15},"claude-haiku-4-5-20251001":{input:.8,output:4}};function Nt(e){switch(e){case"end_turn":return"stop";case"max_tokens":return"max_tokens";case"tool_use":return"tool_calls";default:return"unknown"}}function $t(e,t){let s=typeof t=="object"&&t!==null&&"error"in t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 401:return new g("AUTH_FAILED",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new g("CONTEXT_LENGTH",s,e,!1,t):new g("PROVIDER_ERROR",s,e,!1,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function te(e){let t="",s="https://api.anthropic.com",r=6e4,n=2;return{name:"anthropic",async initialize(o){if(!o.apiKey)throw new g("AUTH_FAILED","API key is required");t=o.apiKey,o.baseUrl&&(s=o.baseUrl),r=o.timeoutMs,n=o.maxRetries},async complete(o){let a=o.messages.find(h=>h.role==="system"),p=o.messages.filter(h=>h.role!=="system").map(h=>{if(h.role==="tool")return{role:"user",content:[{type:"tool_result",tool_use_id:h.toolCallId,content:h.content}]};if(h.role==="assistant"){if(h.toolCalls&&h.toolCalls.length>0){let R=[];h.content&&R.push({type:"text",text:h.content});for(let P of h.toolCalls)R.push({type:"tool_use",id:P.id,name:P.name,input:P.arguments});return{role:"assistant",content:R}}return{role:"assistant",content:h.content}}return{role:"user",content:h.content}}),i={model:o.model,max_tokens:o.params.maxTokens,messages:p};a&&(i.system=a.content),o.params.temperature!==void 0&&(i.temperature=o.params.temperature),o.params.topP!==void 
0&&(i.top_p=o.params.topP),o.params.stopSequences&&(i.stop_sequences=o.params.stopSequences),o.tools&&o.tools.length>0&&(i.tools=o.tools.map(h=>({name:h.name,description:h.description??"",input_schema:h.parameters??{type:"object",properties:{}}})),o.toolChoice&&(i.tool_choice=o.toolChoice==="required"?{type:"any"}:{type:o.toolChoice}));let u=Date.now(),c=await _(()=>e.fetch(`${s}/v1/messages`,{method:"POST",headers:{"Content-Type":"application/json","x-api-key":t,"anthropic-version":"2023-06-01"},body:JSON.stringify(i),timeoutMs:r}),{maxRetries:n,shouldRetry:h=>h instanceof g&&h.retryable}),f;try{f=await c.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from Anthropic API",c.status,c.status>=500)}let y=Date.now()-u;if(!c.ok)throw $t(c.status,f);let m=f,x="",C=[];for(let h of m.content??[])h.type==="text"?x+=h.text:h.type==="tool_use"&&C.push({id:h.id,name:h.name,arguments:h.input??{}});let b=m.usage?.input_tokens??0,d=m.usage?.output_tokens??0;return{text:x,toolCalls:C,usage:{inputTokens:b,outputTokens:d,totalTokens:b+d},raw:f,latencyMs:y,modelId:m.model??o.model,finishReason:Nt(m.stop_reason)}},estimateCost(o,a){let p=Object.entries(jt).find(([i])=>o.includes(i)||i.includes(o));return p?a.inputTokens/1e6*p[1].input+a.outputTokens/1e6*p[1].output:null},supportsTools(o){return!0}}}function Dt(e,t){if(t)return"tool_calls";switch(e){case"stop":return"stop";case"length":return"max_tokens";default:return"unknown"}}function Lt(e,t){let s=typeof t=="object"&&t!==null&&"error"in t?String(t.error):"Unknown error";switch(e){case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function se(e){let t="http://localhost:11434",s=6e4,r=2;return{name:"ollama",async initialize(n){n.baseUrl&&(t=n.baseUrl),s=n.timeoutMs,r=n.maxRetries},async complete(n){let 
o=n.messages.map(d=>d.role==="tool"?{role:"tool",content:d.content}:d.role==="assistant"&&d.toolCalls&&d.toolCalls.length>0?{role:"assistant",content:d.content||"",tool_calls:d.toolCalls.map(h=>({function:{name:h.name,arguments:h.arguments}}))}:{role:d.role,content:d.content}),a={model:n.model,messages:o,stream:!1,options:{temperature:n.params.temperature,num_predict:n.params.maxTokens}},p=a.options;n.params.topP!==void 0&&(p.top_p=n.params.topP),n.params.seed!==void 0&&(p.seed=n.params.seed),n.params.stopSequences&&(p.stop=n.params.stopSequences),n.tools&&n.tools.length>0&&(a.tools=n.tools.map(d=>({type:"function",function:{name:d.name,description:d.description,parameters:d.parameters}})));let i={"Content-Type":"application/json"},u=Date.now(),c=await _(()=>e.fetch(`${t}/api/chat`,{method:"POST",headers:i,body:JSON.stringify(a),timeoutMs:s}),{maxRetries:r,shouldRetry:d=>d instanceof g&&d.retryable}),f;try{f=await c.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from Ollama API",c.status,c.status>=500)}let y=Date.now()-u;if(!c.ok)throw Lt(c.status,f);let m=f,x=m.message,C=x?.tool_calls??[],b=C.map((d,h)=>({id:`ollama_call_${h}`,name:d.function.name,arguments:d.function.arguments??{}}));return{text:x?.content??"",toolCalls:b,usage:{inputTokens:m.prompt_eval_count??0,outputTokens:m.eval_count??0,totalTokens:(m.prompt_eval_count??0)+(m.eval_count??0)},raw:f,latencyMs:y,modelId:m.model??n.model,finishReason:Dt(m.done_reason,C.length>0)}},estimateCost(){return 0},supportsTools(){return!0}}}var Ft={"gemini-2.0-flash":{input:.1,output:.4},"gemini-2.0-flash-lite":{input:.075,output:.3},"gemini-1.5-pro":{input:1.25,output:5},"gemini-1.5-flash":{input:.075,output:.3},"gemini-1.5-flash-8b":{input:.0375,output:.15}};function Ut(e,t){if(t)return"tool_calls";switch(e){case"STOP":return"stop";case"MAX_TOKENS":return"max_tokens";case"SAFETY":return"stop";default:return"unknown"}}function Bt(e,t){let s=typeof t=="object"&&t!==null&&"error"in 
t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 400:return s.toLowerCase().includes("api key")?new g("AUTH_FAILED",s,e,!1,t):new g("PROVIDER_ERROR",s,e,!1,t);case 401:case 403:return new g("AUTH_FAILED",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function re(e){let t="",s="https://generativelanguage.googleapis.com/v1beta",r=6e4,n=2;return{name:"gemini",async initialize(o){if(!o.apiKey)throw new g("AUTH_FAILED","API key is required");t=o.apiKey,o.baseUrl&&(s=o.baseUrl),r=o.timeoutMs,n=o.maxRetries},async complete(o){let a=[],p;for(let T of o.messages){if(T.role==="system"){p={parts:[{text:T.content}]};continue}if(T.role==="tool"){a.push({role:"function",parts:[{functionResponse:{name:T.toolName??"unknown",response:Gt(T.content)}}]});continue}if(T.role==="assistant"&&T.toolCalls&&T.toolCalls.length>0){let O=[];T.content&&O.push({text:T.content});for(let Ce of T.toolCalls)O.push({functionCall:{name:Ce.name,args:Ce.arguments}});a.push({role:"model",parts:O});continue}let v=T.role==="assistant"?"model":"user";a.push({role:v,parts:[{text:T.content}]})}let i={contents:a,generationConfig:{temperature:o.params.temperature,maxOutputTokens:o.params.maxTokens}};p&&(i.systemInstruction=p);let u=i.generationConfig;o.params.topP!==void 0&&(u.topP=o.params.topP),o.params.stopSequences&&(u.stopSequences=o.params.stopSequences),o.tools&&o.tools.length>0&&(i.tools=[{functionDeclarations:o.tools.map(T=>({name:T.name,description:T.description,parameters:T.parameters}))}],o.toolChoice&&(i.toolConfig=Jt(o.toolChoice)));let c={"Content-Type":"application/json","x-goog-api-key":t},f=Date.now(),y=`${s}/models/${o.model}:generateContent`,m=await _(()=>e.fetch(y,{method:"POST",headers:c,body:JSON.stringify(i),timeoutMs:r}),{maxRetries:n,shouldRetry:T=>T instanceof g&&T.retryable}),x;try{x=await 
m.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from Gemini API",m.status,m.status>=500)}let C=Date.now()-f;if(!m.ok)throw Bt(m.status,x);let b=x,d=b.candidates?.[0],h=d?.content?.parts??[],R="",P=[],E=0;for(let T of h)T.text!==void 0&&(R+=T.text),T.functionCall&&(P.push({id:`gemini_call_${E}`,name:T.functionCall.name,arguments:T.functionCall.args??{}}),E++);return{text:R,toolCalls:P,usage:{inputTokens:b.usageMetadata?.promptTokenCount??0,outputTokens:b.usageMetadata?.candidatesTokenCount??0,totalTokens:b.usageMetadata?.totalTokenCount??0},raw:x,latencyMs:C,modelId:b.modelVersion??o.model,finishReason:Ut(d?.finishReason,P.length>0)}},estimateCost(o,a){let p=Ft[o];return p?a.inputTokens/1e6*p.input+a.outputTokens/1e6*p.output:null},supportsTools(){return!0}}}function Gt(e){try{return JSON.parse(e)}catch{return{result:e}}}function Jt(e){switch(e){case"auto":return{functionCallingConfig:{mode:"AUTO"}};case"required":return{functionCallingConfig:{mode:"ANY"}};case"none":return{functionCallingConfig:{mode:"NONE"}}}}function Vt(e){switch(e){case"stop":return"stop";case"length":return"max_tokens";case"tool_calls":return"tool_calls";default:return"unknown"}}function Kt(e,t){let s=typeof t=="object"&&t!==null&&"message"in t?String(t.message??"Unknown error"):typeof t=="object"&&t!==null&&"error"in t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 401:return new g("AUTH_FAILED",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new g("CONTEXT_LENGTH",s,e,!1,t):new g("PROVIDER_ERROR",s,e,!1,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function oe(e){let t="",s="https://api.mistral.ai/v1",r=6e4,n=2;return{name:"mistral",async initialize(o){if(!o.apiKey)throw new g("AUTH_FAILED","API key is 
required");t=o.apiKey,o.baseUrl&&(s=o.baseUrl),r=o.timeoutMs,n=o.maxRetries},async complete(o){let a={model:o.model,messages:o.messages.map(b=>b.role==="tool"?{role:"tool",content:b.content,tool_call_id:b.toolCallId}:b.role==="assistant"&&b.toolCalls&&b.toolCalls.length>0?{role:"assistant",content:b.content||null,tool_calls:b.toolCalls.map(d=>({id:d.id,type:"function",function:{name:d.name,arguments:JSON.stringify(d.arguments)}}))}:{role:b.role,content:b.content}),temperature:o.params.temperature,max_tokens:o.params.maxTokens};o.params.topP!==void 0&&(a.top_p=o.params.topP),o.params.stopSequences&&(a.stop=o.params.stopSequences),o.tools&&o.tools.length>0&&(a.tools=o.tools.map(b=>({type:"function",function:{name:b.name,description:b.description,parameters:b.parameters}})),o.toolChoice&&(a.tool_choice=o.toolChoice));let p={"Content-Type":"application/json",Authorization:`Bearer ${t}`},i=Date.now(),u=await _(()=>e.fetch(`${s}/chat/completions`,{method:"POST",headers:p,body:JSON.stringify(a),timeoutMs:r}),{maxRetries:n,shouldRetry:b=>b instanceof g&&b.retryable}),c;try{c=await u.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from Mistral API",u.status,u.status>=500)}let f=Date.now()-i;if(!u.ok)throw Kt(u.status,c);let y=c,m=y.choices?.[0],x=m?.message,C=(x?.tool_calls??[]).map(b=>{let d;try{d=JSON.parse(b.function.arguments||"{}")}catch{d={_raw:b.function.arguments}}return{id:b.id,name:b.function.name,arguments:d}});return{text:x?.content??"",toolCalls:C,usage:{inputTokens:y.usage?.prompt_tokens??0,outputTokens:y.usage?.completion_tokens??0,totalTokens:y.usage?.total_tokens??0},raw:c,latencyMs:f,modelId:y.model??o.model,finishReason:Vt(m?.finish_reason)}},estimateCost(o,a){return null},supportsTools(o){return!0}}}function Ht(e){switch(e){case"COMPLETE":return"stop";case"MAX_TOKENS":return"max_tokens";case"TOOL_CALL":return"tool_calls";default:return"unknown"}}function zt(e,t){let s=typeof t=="object"&&t!==null&&"message"in 
t?String(t.message??"Unknown error"):"Unknown error";switch(e){case 401:return new g("AUTH_FAILED",s,e,!1,t);case 429:return new g("RATE_LIMITED",s,e,!0,t);case 404:return new g("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new g("CONTEXT_LENGTH",s,e,!1,t):new g("PROVIDER_ERROR",s,e,!1,t);case 408:return new g("TIMEOUT",s,e,!0,t);default:return new g(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function ne(e){let t="",s="https://api.cohere.com",r=6e4,n=2;return{name:"cohere",async initialize(o){if(!o.apiKey)throw new g("AUTH_FAILED","API key is required");t=o.apiKey,o.baseUrl&&(s=o.baseUrl),r=o.timeoutMs,n=o.maxRetries},async complete(o){let a={model:o.model,messages:o.messages.map(d=>d.role==="tool"?{role:"tool",content:d.content,tool_call_id:d.toolCallId}:d.role==="assistant"&&d.toolCalls&&d.toolCalls.length>0?{role:"assistant",content:d.content||null,tool_calls:d.toolCalls.map(h=>({id:h.id,type:"function",function:{name:h.name,arguments:JSON.stringify(h.arguments)}}))}:{role:d.role,content:d.content}),temperature:o.params.temperature,max_tokens:o.params.maxTokens};o.params.topP!==void 0&&(a.p=o.params.topP),o.params.stopSequences&&(a.stop_sequences=o.params.stopSequences),o.tools&&o.tools.length>0&&(a.tools=o.tools.map(d=>({type:"function",function:{name:d.name,description:d.description,parameters:d.parameters}})),o.toolChoice&&(a.tool_choice=o.toolChoice));let p={"Content-Type":"application/json",Authorization:`Bearer ${t}`},i=Date.now(),u=await _(()=>e.fetch(`${s}/v2/chat`,{method:"POST",headers:p,body:JSON.stringify(a),timeoutMs:r}),{maxRetries:n,shouldRetry:d=>d instanceof g&&d.retryable}),c;try{c=await u.json()}catch{throw new g("PROVIDER_ERROR","Malformed response body from Cohere API",u.status,u.status>=500)}let f=Date.now()-i;if(!u.ok)throw zt(u.status,c);let y=c,m=y.message?.content?.filter(d=>d.type==="text").map(d=>d.text).join("")??"",x=(y.message?.tool_calls??[]).map(d=>{let 
h;try{h=JSON.parse(d.function.arguments||"{}")}catch{h={_raw:d.function.arguments}}return{id:d.id,name:d.function.name,arguments:h}}),C=y.usage?.tokens?.input_tokens??0,b=y.usage?.tokens?.output_tokens??0;return{text:m,toolCalls:x,usage:{inputTokens:C,outputTokens:b,totalTokens:C+b},raw:c,latencyMs:f,modelId:y.model??o.model,finishReason:Ht(y.finish_reason)}},estimateCost(o,a){return null},supportsTools(o){return!0}}}var _e={openai:ee,anthropic:te,ollama:se,gemini:re,mistral:oe,cohere:ne};function ke(e,t){let s=_e[e];if(!s){let r=Object.keys(_e).join(", ");throw new Error(`Unknown provider: "${e}". Supported providers: ${r}`)}return s(t)}async function ae(e,t,s,r){let n=r?.maxTurns??10,o=[],a=[],p=[...t.messages],i={inputTokens:0,outputTokens:0,totalTokens:0},u=0;for(let y=0;y<n;y++){let m={...t,messages:p},x=await e.complete(m);if(o.push({request:m,response:x}),i.inputTokens+=x.usage.inputTokens,i.outputTokens+=x.usage.outputTokens,i.totalTokens+=x.usage.totalTokens,u+=x.latencyMs,x.toolCalls.length===0)return{turns:o,finalText:x.text,allToolCalls:a,totalUsage:i,totalLatencyMs:u};a.push(...x.toolCalls),p=[...p,{role:"assistant",content:x.text,toolCalls:x.toolCalls}];for(let C of x.toolCalls){let b=s.find(h=>h.name===C.name),d;b?d=Wt(b,C.arguments):d={error:`Tool "${C.name}" not simulated`},p.push({role:"tool",content:JSON.stringify(d),toolCallId:C.id,toolName:C.name})}}let f=o[o.length-1]?.response??{text:"",toolCalls:[],usage:{inputTokens:0,outputTokens:0,totalTokens:0},raw:null,latencyMs:0,modelId:t.model,finishReason:"unknown"};return{turns:o,finalText:f.text,allToolCalls:a,totalUsage:i,totalLatencyMs:u}}function Wt(e,t){if(e.responses){for(let s of e.responses)if(Xt(s.when,t))return s.then}return e.defaultResponse!==void 0?e.defaultResponse:{error:"No matching simulation response"}}function Xt(e,t){for(let[s,r]of Object.entries(e))if(JSON.stringify(t[s])!==JSON.stringify(r))return!1;return!0}var l=require("zod");var 
M=require("zod"),fe=M.z.object({outputTextAttr:M.z.string().default("gen_ai.completion.0.content"),modelAttr:M.z.string().default("gen_ai.response.model"),systemAttr:M.z.string().default("gen_ai.system"),inputTokensAttr:M.z.string().default("gen_ai.usage.input_tokens"),outputTokensAttr:M.z.string().default("gen_ai.usage.output_tokens")}),ge=M.z.object({namePattern:M.z.string().optional().describe("Regex to filter span names"),attributeMatch:M.z.record(M.z.string()).optional().describe("Attributes that must match"),minDurationMs:M.z.number().min(0).optional()}),ie=M.z.object({port:M.z.number().int().min(1).max(65535).default(4318),timeoutMs:M.z.number().int().min(1e3).default(3e4),spanMapping:fe.default({}),spanFilter:ge.optional()});var k=l.z.string().min(1,"Must not be empty"),Yt=l.z.number().min(0).max(2).default(.2),$=l.z.number().min(0).max(1),he=l.z.string().refine(e=>{try{return new RegExp(e),!0}catch{return!1}},{message:"Must be a valid regex pattern"}),H=l.z.object({apiKeyEnv:k.describe("Environment variable name containing the API key. Never a raw key."),baseUrl:l.z.string().url().optional().describe("Custom base URL for API-compatible proxies (e.g., Azure OpenAI, LiteLLM)"),organization:l.z.string().optional().describe("Organization ID (OpenAI-specific)")}),Qt=l.z.object({apiKeyEnv:l.z.string().min(1).optional().describe("Environment variable name containing the API key. Optional for Ollama (local)."),baseUrl:l.z.string().url().optional().describe("Ollama server URL. 
Defaults to http://localhost:11434.")}),Zt=l.z.object({openai:H.optional(),anthropic:H.optional(),ollama:Qt.optional(),gemini:H.optional(),mistral:H.optional(),cohere:H.optional()}).refine(e=>Object.keys(e).some(t=>e[t]!==void 0),{message:"At least one provider must be configured"}),qt=l.z.object({temperature:Yt,maxTokens:l.z.number().int().min(1).max(128e3).default(1024),topP:l.z.number().min(0).max(1).optional(),stopSequences:l.z.array(l.z.string()).optional(),seed:l.z.number().int().optional().describe("Seed for reproducibility (provider-dependent support)")}),es=l.z.object({id:k.describe("Unique identifier for this model config, referenced in reports"),provider:l.z.enum(["openai","anthropic","ollama","gemini","mistral","cohere"]).describe("Must match a key in the providers section"),model:k.describe("Model name as the provider expects it (e.g., 'gpt-4o', 'claude-sonnet-4-5-20250929')"),params:qt.default({})}),ts=l.z.object({system:l.z.string().optional().describe("System prompt template. Supports {{variable}} interpolation."),user:k.describe("User prompt template. Supports {{variable}} interpolation."),assistant:l.z.string().optional().describe("Prefill for assistant response (Anthropic-specific)")}),ss=l.z.object({format:l.z.enum(["text","json"]).default("text"),schemaFile:l.z.string().optional().describe("Path to JSON Schema file (relative to config file). 
Required if format is 'json'."),contains:l.z.array(l.z.string()).optional().describe("Output must contain all of these substrings"),notContains:l.z.array(l.z.string()).optional().describe("Output must not contain any of these substrings"),maxLength:l.z.number().int().positive().optional().describe("Maximum character length of the output")}).refine(e=>!(e.format==="json"&&!e.schemaFile),{message:"schemaFile is required when format is 'json'"}),rs=l.z.object({enabled:l.z.boolean().default(!0),denyPatterns:l.z.array(he).default(["\\b\\d{3}-\\d{2}-\\d{4}\\b","\\b\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}\\b","\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b"]).describe("Regex patterns that must NOT appear in output. Defaults include SSN, credit card, email."),customPatterns:l.z.array(l.z.object({name:k,pattern:he})).optional().describe("Named custom PII patterns for reporting clarity")}),os=l.z.object({deny:l.z.array(l.z.string()).default([]).describe("Words/phrases that must NOT appear in output (case-insensitive)"),allow:l.z.array(l.z.string()).optional().describe("If set, output MUST contain at least one of these words/phrases")}),ns=l.z.object({criteria:k.describe("Natural language description of what to evaluate (e.g., 'Response is empathetic and professional')"),minScore:$.default(.7).describe("Minimum score (0-1) for this criterion to pass"),model:l.z.string().optional().describe("Override judge model for this criterion. Defaults to first model in models list."),rubric:l.z.string().optional().describe("Detailed rubric for the judge. 
If omitted, a default rubric is generated from criteria.")}),as=l.z.object({tool:k.describe("Expected tool/function name"),shouldNotCall:l.z.boolean().optional().default(!1).describe("If true, assert this tool was NOT called"),argsMatch:l.z.record(l.z.unknown()).optional().describe("Key-value pairs that must be present in the tool call arguments (partial match)"),argsSchema:l.z.string().optional().describe("Path to JSON Schema file to validate the tool call arguments"),order:l.z.number().int().min(0).optional().describe("Expected position in the sequence of tool calls (0-indexed)"),responseContains:l.z.string().optional().describe("Assert the simulated tool response contains this substring")}),is=l.z.object({maxScore:$.default(.15).describe("Maximum drift score (0-1). Higher = more drift allowed. Fail if exceeded."),method:l.z.enum(["judge","embedding","field-diff"]).default("judge").describe("Drift detection method. 'judge' uses LLM comparison, 'embedding' uses cosine similarity, 'field-diff' compares JSON fields."),fields:l.z.array(l.z.string()).optional().describe("For field-diff method: JSON paths to compare (e.g., ['response.action', 'response.message'])")}),ls=l.z.object({pii:rs.optional(),keywords:os.optional()}),us=l.z.object({output:ss.optional(),guardrails:ls.optional(),judge:l.z.array(ns).optional().describe("LLM-as-judge evaluations. 
Each criterion is scored independently."),toolCalls:l.z.array(as).optional().describe("Expected tool/function calls in the model response"),baseline:l.z.object({drift:is.optional()}).optional()}),ps=l.z.object({name:k.describe("Tool/function name as the model sees it"),description:l.z.string().optional().describe("Tool description for documentation"),parameters:l.z.record(l.z.unknown()).optional().describe("JSON Schema for the tool's parameters"),responses:l.z.array(l.z.object({when:l.z.record(l.z.unknown()).describe("Condition: match tool call arguments (partial match)"),then:l.z.unknown().describe("Simulated response to return when condition matches")})).optional().describe("Simulated responses based on argument matching"),defaultResponse:l.z.unknown().optional().describe("Response when no 'when' condition matches")}),cs=l.z.object({name:k.describe("Unique test case name within the suite. Used in reports and JUnit output."),prompt:k.optional().describe("Reference to a key in the prompts section. Exactly one of prompt or command must be set."),command:k.optional().describe("Shell command to execute. Stdout is captured and assertions run against it. Exactly one of prompt or command must be set."),vars:l.z.record(l.z.string()).default({}).describe("Variables to interpolate into the prompt template or command"),models:l.z.array(l.z.string()).optional().describe("Override: run this test only against these model IDs. Defaults to all models. 
Ignored for command tests."),repeat:l.z.number().int().min(1).optional().describe("Override: number of repeat runs for this specific test case"),tools:l.z.array(ps).optional().describe("Simulated tools available to the model for this test case"),expect:us.describe("Assertions to evaluate against the model output"),tags:l.z.array(l.z.string()).optional().describe("Tags for filtering test cases in CLI (e.g., --tags regression)"),skip:l.z.boolean().optional().default(!1).describe("Skip this test case during execution")}).refine(e=>{let t=e.prompt!==void 0,s=e.command!==void 0;return(t||s)&&!(t&&s)},{message:"Exactly one of 'prompt' or 'command' must be set on each test case"}),ds=l.z.object({passRateMin:$.default(.95).describe("Minimum overall pass rate (0-1). Computed after repeats and aggregation."),schemaFailuresMax:l.z.number().int().min(0).default(0).describe("Maximum allowed schema validation failures across entire suite"),judgeAvgMin:$.optional().describe("Minimum average LLM-as-judge score across all criteria and test cases"),driftScoreMax:$.optional().describe("Maximum allowed drift score against active baseline"),piiFailuresMax:l.z.number().int().min(0).default(0).describe("Maximum allowed PII detection failures"),keywordFailuresMax:l.z.number().int().min(0).default(0).describe("Maximum allowed keyword guardrail failures"),costMaxUsd:l.z.number().positive().optional().describe("Maximum total cost in USD for the entire run. Aborts if exceeded mid-run."),latencyMaxMs:l.z.number().positive().optional().describe("Maximum average latency in ms. 
Fails gate if exceeded."),deterministicPassRate:$.optional().describe("Minimum pass rate for deterministic assertions only (tool_called, schema, pii, keywords, etc.)"),probabilisticPassRate:$.optional().describe("Minimum pass rate for probabilistic assertions only (judge, drift)")}),ms=l.z.object({enabled:l.z.boolean().default(!1),framework:l.z.enum(["eu-ai-act","custom"]).default("eu-ai-act"),outputDir:l.z.string().default("./compliance-reports"),metadata:l.z.object({systemName:l.z.string().optional().describe("Name of the AI system being tested"),systemVersion:l.z.string().optional().describe("Version of the AI system"),riskLevel:l.z.enum(["high","limited","minimal"]).optional(),operator:l.z.string().optional().describe("Organization operating the AI system"),intendedPurpose:l.z.string().optional().describe("Documented intended purpose of the AI system"),dataGovernanceNotes:l.z.string().optional()}).optional()}),fs=l.z.object({enabled:l.z.boolean().default(!1),includeArtifacts:l.z.boolean().default(!1).describe("Upload raw prompt inputs and model outputs. Disabled by default for privacy."),redactPatterns:l.z.array(he).optional().describe("Patterns to redact from artifacts before upload (applied on top of PII guardrails)"),apiUrl:l.z.string().url().default("https://api.kindlm.com/v1").describe("Cloud API URL. Override for self-hosted deployments.")}),gs=l.z.object({name:k,description:l.z.string().optional(),tags:l.z.array(l.z.string()).optional()}),Re=l.z.object({kindlm:l.z.literal(1).describe("Config schema version. 
Must be 1."),project:k.describe("Project identifier for cloud upload and report grouping"),suite:gs,providers:Zt,models:l.z.array(es).min(1,"At least one model must be configured"),prompts:l.z.record(ts).refine(e=>Object.keys(e).length>0,{message:"At least one prompt must be defined"}),tests:l.z.array(cs).min(1,"At least one test case must be defined"),gates:ds.default({}),compliance:ms.optional(),trace:ie.optional().describe("OpenTelemetry trace ingestion configuration for the 'kindlm trace' command"),upload:fs.default({}),defaults:l.z.object({repeat:l.z.number().int().min(1).default(1).describe("Default repeat count per test case"),concurrency:l.z.number().int().min(1).max(32).default(4).describe("Default concurrency for test execution"),timeoutMs:l.z.number().int().min(1e3).default(6e4).describe("Default timeout per provider call in ms"),judgeModel:l.z.string().optional().describe("Default model ID for LLM-as-judge assertions. Must reference a configured model.")}).default({})});function le(e){let t=Re.safeParse(e);return t.success?S(t.data):A({code:"CONFIG_VALIDATION_ERROR",message:"Config validation failed",details:{errors:t.error.issues.map(s=>`${s.path.join(".")}: ${s.message}`)}})}var De=require("yaml");var hs=1048576,Ie=1e3,je=50;function Le(e,t){if(e.length>hs)return A({code:"CONFIG_TOO_LARGE",message:`Config exceeds maximum size of 1MB (got ${(e.length/1048576).toFixed(1)}MB)`});let s;try{s=(0,De.parse)(e)}catch(i){return A({code:"CONFIG_PARSE_ERROR",message:`Failed to parse YAML: ${i.message}`,cause:i})}let r=le(s);if(!r.success)return r;let n=r.data;if(n.tests.length>Ie)return A({code:"CONFIG_VALIDATION_ERROR",message:`Config exceeds maximum of ${Ie} tests (got ${n.tests.length})`});if(n.models.length>je)return A({code:"CONFIG_VALIDATION_ERROR",message:`Config exceeds maximum of ${je} models (got ${n.models.length})`});let o=[],a=new Set;for(let i of n.models)a.has(i.id)&&o.push(`Duplicate model ID "${i.id}"`),a.add(i.id);let p=new Set;for(let i of 
n.tests)p.has(i.name)&&o.push(`Duplicate test name "${i.name}"`),p.add(i.name);for(let i of n.tests)i.prompt&&!(i.prompt in n.prompts)&&o.push(`Test "${i.name}" references prompt "${i.prompt}" which is not defined`);for(let i of n.tests)if(i.models)for(let u of i.models)a.has(u)||o.push(`Test "${i.name}" references model "${u}" which is not configured`);for(let i of n.models)n.providers[i.provider]||o.push(`Model "${i.id}" references provider "${i.provider}" which is not configured`);if(n.defaults.judgeModel&&!a.has(n.defaults.judgeModel)&&o.push(`defaults.judgeModel "${n.defaults.judgeModel}" is not a configured model`),t.fileReader)for(let i of n.tests){if(i.expect.output?.schemaFile){let u=Ne(t.configDir,i.expect.output.schemaFile);u.success?t.fileReader.readFile(u.data).success||o.push(`Test "${i.name}": schemaFile "${i.expect.output.schemaFile}" not found at ${u.data}`):o.push(`Test "${i.name}": schemaFile "${i.expect.output.schemaFile}" \u2014 ${u.error.message}`)}if(i.expect.toolCalls){for(let u of i.expect.toolCalls)if(u.argsSchema){let c=Ne(t.configDir,u.argsSchema);c.success?t.fileReader.readFile(c.data).success||o.push(`Test "${i.name}": argsSchema "${u.argsSchema}" for tool "${u.tool}" not found at ${c.data}`):o.push(`Test "${i.name}": argsSchema "${u.argsSchema}" for tool "${u.tool}" \u2014 ${c.error.message}`)}}}return o.length>0?A({code:"CONFIG_VALIDATION_ERROR",message:"Config cross-reference validation failed",details:{errors:o}}):S(n)}function Ne(e,t){if(t.startsWith("/")||t.startsWith("\\"))return A({code:"PATH_TRAVERSAL",message:"Absolute paths are not allowed in config references"});if(/^[a-zA-Z]:/.test(t))return A({code:"PATH_TRAVERSAL",message:"Absolute paths are not allowed in config references"});let s=e.endsWith("/")?e.slice(0,-1):e,r=`${s}/${t}`,n=$e(r),o=$e(s);return!n.startsWith(o+"/")&&n!==o?A({code:"PATH_TRAVERSAL",message:`Path "${t}" escapes the config directory`}):S(n)}function $e(e){let t=e.split("/"),s=[];for(let n of 
t)n==="."||n===""||(n===".."?s.pop():s.push(n));return(e.startsWith("/")?"/":"")+s.join("/")}var Fe=/\{\{(\w+)\}\}/g;function L(e,t){let s=ye(e,t);if(s.length>0)return A({code:"CONFIG_VALIDATION_ERROR",message:`Missing template variables: ${s.join(", ")}`,details:{missing:s}});let r=e.replace(Fe,(n,o)=>t[o]);return S(r)}function ye(e,t){let s=new Set;for(let r of e.matchAll(Fe)){let n=r[1];n!==void 0&&!(n in t)&&s.add(n)}return[...s]}function ue(e){let t=e[0];if(!t)throw new Error("aggregateRuns requires at least one run");let{testCaseName:s,modelId:r}=t,o=e.filter(y=>y.assertions.every(m=>m.passed)).length/e.length,a=new Map;for(let y of e)for(let m of y.assertions){let x=a.get(m.assertionType);x||(x=[],a.set(m.assertionType,x)),x.push(m.score)}let p={};for(let[y,m]of a){let x=m.reduce((C,b)=>C+b,0);p[y]={mean:x/m.length,min:Math.min(...m),max:Math.max(...m)}}let i=new Set;for(let y of e)for(let m of y.assertions)!m.passed&&m.failureCode&&i.add(m.failureCode);let u=e.reduce((y,m)=>y+m.latencyMs,0)/e.length,c=e.reduce((y,m)=>y+(m.costEstimateUsd??0),0),f=e.reduce((y,m)=>y+m.tokenUsage.totalTokens,0);return{testCaseName:s,modelId:r,runCount:e.length,passed:o===1,passRate:o,assertionScores:p,failureCodes:[...i],latencyAvgMs:u,totalCostUsd:c,totalTokens:f,runs:e}}function pe(e){let t=e.stdout.split(`
|
|
32
|
-
`),s=[],r=[],n,o=0;for(let a of t){let p=a.trimStart();if(!p.startsWith('{"kindlm":')){s.push(a);continue}let i=Rs(p);if(!i){s.push(a);continue}i.kindlm==="tool_call"?r.push({id:i.id??`cmd_tc_${o++}`,name:i.name,arguments:i.arguments}):i.kindlm==="output_json"&&(n=i.data)}return{outputText:s.join(`
|
|
33
|
-
`).trim(),toolCalls:r,outputJson:n,exitCode:e.exitCode,stderr:e.stderr}}function Rs(e){try{let t=JSON.parse(e);return typeof t.kindlm!="string"?null:t.kindlm==="tool_call"?typeof t.name!="string"?null:{kindlm:"tool_call",id:typeof t.id=="string"?t.id:void 0,name:t.name,arguments:typeof t.arguments=="object"&&t.arguments!==null?t.arguments:{}}:t.kindlm==="output_json"?{kindlm:"output_json",data:t.data}:null}catch{return null}}function Ue(e,t){return{async run(){let s=Date.now(),r=new Map;for(let R of e.tests)if(R.expect.output?.schemaFile){let P=Ts(t.configDir,R.expect.output.schemaFile),E=t.fileReader.readFile(P);if(!E.success)return A({code:"SCHEMA_FILE_ERROR",message:`Failed to read schema file "${R.expect.output.schemaFile}": ${E.error.message}`});try{r.set(R.name,JSON.parse(E.data))}catch(T){return A({code:"SCHEMA_FILE_ERROR",message:`Failed to parse schema file "${R.expect.output.schemaFile}" as JSON: ${T instanceof Error?T.message:String(T)}`})}}let n=[];for(let R of e.tests){if(R.skip)continue;let P=R.repeat??e.defaults.repeat;if(R.command)for(let E=0;E<P;E++)n.push({test:R,modelConfig:null,runIndex:E});else{let E=R.models??e.models.map(T=>T.id);for(let T of E){let v=e.models.find(O=>O.id===T);if(v)for(let O=0;O<P;O++)n.push({test:R,modelConfig:v,runIndex:O})}}}let o=await xs(n.map(R=>()=>ys(e,t,R,r)),e.defaults.concurrency),a=R=>`${R.testCaseName}::${R.modelId}`,p=new Map;for(let R of o){let P=a(R),E=p.get(P);E||(E=[],p.set(P,E)),E.push(R)}let i=[];for(let R of p.values())i.push(ue(R));let 
u=i.map(R=>({name:R.testCaseName,modelId:R.modelId,status:R.passed?"passed":"failed",assertions:R.runs[0]?.assertions??[],latencyMs:R.latencyAvgMs,costUsd:R.totalCostUsd})),c=e.tests.filter(R=>R.skip).map(R=>({name:R.name,modelId:"",status:"skipped",assertions:[],latencyMs:0,costUsd:0})),f=[...u,...c],y=f.filter(R=>R.status==="passed").length,m=f.filter(R=>R.status==="failed").length,x=f.filter(R=>R.status==="errored").length,C=f.filter(R=>R.status==="skipped").length,b=x>0?"errored":m>0?"failed":"passed",h={suites:[{name:e.suite.name,status:b,tests:f}],totalTests:f.length,passed:y,failed:m,errored:x,skipped:C,durationMs:Date.now()-s};return S({runResult:h,aggregated:i})}}}async function ys(e,t,s,r){let{test:n,modelConfig:o,runIndex:a}=s;if(n.command)return bs(e,t,n,a,r);if(!o)return j(n.name,"unknown",a,"No model config for prompt-based test");t.onProgress?.({type:"test_start",test:n.name,model:o.id,run:a});try{let p=t.adapters.get(o.provider);if(!p)return j(n.name,o.id,a,`Provider adapter "${o.provider}" not found`);let i=n.prompt?e.prompts[n.prompt]:void 0;if(!i)return j(n.name,o.id,a,`Prompt "${n.prompt}" not defined`);let u=L(i.user,n.vars);if(!u.success)return j(n.name,o.id,a,u.error.message);let c=[];if(i.system){let v=L(i.system,n.vars);if(!v.success)return j(n.name,o.id,a,v.error.message);c.push({role:"system",content:v.data})}c.push({role:"user",content:u.data});let f=(n.tools??[]).map(v=>({name:v.name,description:v.description,parameters:v.parameters})),y={model:o.model,messages:c,params:{temperature:o.params.temperature,maxTokens:o.params.maxTokens,topP:o.params.topP,stopSequences:o.params.stopSequences,seed:o.params.seed},tools:f.length>0?f:void 0},m=await ae(p,y,n.tools??[]),x=p.estimateCost(o.model,m.totalUsage),C={};n.expect.output?.schemaFile&&r.has(n.name)&&(C.schemaContent=r.get(n.name));let b=K(n.expect,C),d=e.defaults.judgeModel??e.models[0]?.id,h=e.models.find(v=>v.id===d),R=h?t.adapters.get(h.provider):void 
0,P={outputText:m.finalText,toolCalls:m.allToolCalls,configDir:t.configDir,latencyMs:m.totalLatencyMs,costUsd:x??void 0,judgeAdapter:R,judgeModel:h?.model};if(t.baselineData){let v=`${n.name}::${o.id}`,O=t.baselineData.results[v];O&&(P.baselineText=O.outputText)}let E=[];for(let v of b){let O=await v.evaluate(P);E.push(...O)}let T=E.every(v=>v.passed);return t.onProgress?.({type:"test_complete",test:n.name,model:o.id,run:a,passed:T}),{testCaseName:n.name,modelId:o.id,runIndex:a,outputText:m.finalText,assertions:E,latencyMs:m.totalLatencyMs,tokenUsage:m.totalUsage,costEstimateUsd:x}}catch(p){return t.onProgress?.({type:"test_complete",test:n.name,model:o.id,run:a,passed:!1}),j(n.name,o.id,a,p instanceof Error?p.message:String(p))}}async function bs(e,t,s,r,n){let o="command";t.onProgress?.({type:"test_start",test:s.name,model:o,run:r});try{if(!t.commandExecutor)return j(s.name,o,r,"Command executor not available");if(!s.command)return j(s.name,o,r,"No command specified");let a=L(s.command,s.vars);if(!a.success)return j(s.name,o,r,a.error.message);let p=Date.now(),i=await t.commandExecutor.execute(a.data,{timeoutMs:e.defaults.timeoutMs,cwd:t.configDir});if(!i.success)return j(s.name,o,r,i.error.message);let u=Date.now()-p,c=pe(i.data),f={};s.expect.output?.schemaFile&&n.has(s.name)&&(f.schemaContent=n.get(s.name));let y=K(s.expect,f),m=e.defaults.judgeModel??e.models[0]?.id,x=e.models.find(R=>R.id===m),C=x?t.adapters.get(x.provider):void 0,b={outputText:c.outputText,outputJson:c.outputJson,toolCalls:c.toolCalls,configDir:t.configDir,latencyMs:u,judgeAdapter:C,judgeModel:x?.model};if(t.baselineData){let R=`${s.name}::${o}`,P=t.baselineData.results[R];P&&(b.baselineText=P.outputText)}let d=[];for(let R of y){let P=await R.evaluate(b);d.push(...P)}let h=d.every(R=>R.passed);return 
t.onProgress?.({type:"test_complete",test:s.name,model:o,run:r,passed:h}),{testCaseName:s.name,modelId:o,runIndex:r,outputText:c.outputText,assertions:d,latencyMs:u,tokenUsage:{inputTokens:0,outputTokens:0,totalTokens:0},costEstimateUsd:null}}catch(a){return t.onProgress?.({type:"test_complete",test:s.name,model:o,run:r,passed:!1}),j(s.name,o,r,a instanceof Error?a.message:String(a))}}function j(e,t,s,r){return{testCaseName:e,modelId:t,runIndex:s,outputText:"",assertions:[{assertionType:"internal",label:"Execution error",passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:r}],latencyMs:0,tokenUsage:{inputTokens:0,outputTokens:0,totalTokens:0},costEstimateUsd:null}}async function xs(e,t){let s=new Array(e.length),r=0;async function n(){for(;r<e.length;){let a=r++,p=e[a];p&&(s[a]=await p())}}let o=Array.from({length:Math.min(t,e.length)},()=>n());return await Promise.all(o),s}function Ts(e,t){return t.startsWith("/")?t:`${e.endsWith("/")?e.slice(0,-1):e}/${t}`}function Je(e,t){let s=[],r=t.reduce((u,c)=>u+c.runCount,0),n=t.reduce((u,c)=>u+Math.round(c.passRate*c.runCount),0),o=r>0?n/r:0;s.push({gateName:"passRateMin",passed:o>=e.passRateMin,actual:o,threshold:e.passRateMin,message:o>=e.passRateMin?`Pass rate ${w(o)} meets minimum ${w(e.passRateMin)}`:`Pass rate ${w(o)} below minimum ${w(e.passRateMin)}`});let a=be(t,["SCHEMA_INVALID","SCHEMA_PARSE_ERROR"]);if(s.push({gateName:"schemaFailuresMax",passed:a<=e.schemaFailuresMax,actual:a,threshold:e.schemaFailuresMax,message:a<=e.schemaFailuresMax?`Schema failures ${a} within limit ${e.schemaFailuresMax}`:`Schema failures ${a} exceed limit ${e.schemaFailuresMax}`}),e.judgeAvgMin!==void 0){let u=Ge(t,"judge"),c=u.length>0?u.reduce((f,y)=>f+y,0)/u.length:1;s.push({gateName:"judgeAvgMin",passed:c>=e.judgeAvgMin,actual:c,threshold:e.judgeAvgMin,message:c>=e.judgeAvgMin?`Judge average ${w(c)} meets minimum ${w(e.judgeAvgMin)}`:`Judge average ${w(c)} below minimum ${w(e.judgeAvgMin)}`})}if(e.driftScoreMax!==void 
0){let u=Ge(t,"drift"),c=u.length>0?Math.max(...u):0;s.push({gateName:"driftScoreMax",passed:c<=e.driftScoreMax,actual:c,threshold:e.driftScoreMax,message:c<=e.driftScoreMax?`Drift score ${w(c)} within limit ${w(e.driftScoreMax)}`:`Drift score ${w(c)} exceeds limit ${w(e.driftScoreMax)}`})}let p=be(t,["PII_DETECTED"]);s.push({gateName:"piiFailuresMax",passed:p<=e.piiFailuresMax,actual:p,threshold:e.piiFailuresMax,message:p<=e.piiFailuresMax?`PII failures ${p} within limit ${e.piiFailuresMax}`:`PII failures ${p} exceed limit ${e.piiFailuresMax}`});let i=be(t,["KEYWORD_DENIED","KEYWORD_MISSING"]);if(s.push({gateName:"keywordFailuresMax",passed:i<=e.keywordFailuresMax,actual:i,threshold:e.keywordFailuresMax,message:i<=e.keywordFailuresMax?`Keyword failures ${i} within limit ${e.keywordFailuresMax}`:`Keyword failures ${i} exceed limit ${e.keywordFailuresMax}`}),e.costMaxUsd!==void 0){let u=t.reduce((c,f)=>c+f.totalCostUsd,0);s.push({gateName:"costMaxUsd",passed:u<=e.costMaxUsd,actual:u,threshold:e.costMaxUsd,message:u<=e.costMaxUsd?`Total cost $${u.toFixed(4)} within limit $${e.costMaxUsd.toFixed(4)}`:`Total cost $${u.toFixed(4)} exceeds limit $${e.costMaxUsd.toFixed(4)}`})}if(e.latencyMaxMs!==void 0){let u=t.length>0?t.reduce((c,f)=>c+f.latencyAvgMs,0)/t.length:0;s.push({gateName:"latencyMaxMs",passed:u<=e.latencyMaxMs,actual:u,threshold:e.latencyMaxMs,message:u<=e.latencyMaxMs?`Average latency ${Math.round(u)}ms within limit ${e.latencyMaxMs}ms`:`Average latency ${Math.round(u)}ms exceeds limit ${e.latencyMaxMs}ms`})}if(e.deterministicPassRate!==void 0){let u=Be(t,"deterministic");s.push({gateName:"deterministicPassRate",passed:u>=e.deterministicPassRate,actual:u,threshold:e.deterministicPassRate,message:u>=e.deterministicPassRate?`Deterministic pass rate ${w(u)} meets minimum ${w(e.deterministicPassRate)}`:`Deterministic pass rate ${w(u)} below minimum ${w(e.deterministicPassRate)}`})}if(e.probabilisticPassRate!==void 0){let 
u=Be(t,"probabilistic");s.push({gateName:"probabilisticPassRate",passed:u>=e.probabilisticPassRate,actual:u,threshold:e.probabilisticPassRate,message:u>=e.probabilisticPassRate?`Probabilistic pass rate ${w(u)} meets minimum ${w(e.probabilisticPassRate)}`:`Probabilistic pass rate ${w(u)} below minimum ${w(e.probabilisticPassRate)}`})}return{passed:s.every(u=>u.passed),gates:s}}function Be(e,t){let s=0,r=0;for(let n of e)for(let o of n.runs)for(let a of o.assertions)D(a.assertionType)===t&&(s++,a.passed&&r++);return s>0?r/s:1}function be(e,t){let s=0;for(let r of e)for(let n of t)r.failureCodes.includes(n)&&s++;return s}function Ge(e,t){let s=[];for(let r of e){let n=r.assertionScores[t];n&&s.push(n.mean)}return s}function w(e){return(e*100).toFixed(1)+"%"}var N=e=>e,ce={bold:N,red:N,green:N,yellow:N,cyan:N,dim:N,greenBold:N,redBold:N};function Ke(e=ce){return{name:"pretty",generate(t,s){let r=[],n=e;r.push(""),r.push(n.bold(" KindLM Test Results")),r.push("");let o=0;for(let c of t.suites){r.push(Cs(c,n));for(let f of c.tests){r.push(As(f,n));let y=Ps(f,n);y&&r.push(y);for(let m of f.assertions)r.push(Es(m,n));o+=f.costUsd}r.push("")}r.push(n.bold(" Summary"));let a=n.green(`${t.passed} passed`),p=t.failed>0?n.red(`${t.failed} failed`):`${t.failed} failed`,i=t.errored>0?n.yellow(`${t.errored} errored`):`${t.errored} errored`;if(r.push(` ${a}, ${p}, ${i} (${t.totalTests} total)`),r.push(` Duration: ${He(t.durationMs)}`),o>0&&r.push(` Cost: ${ze(o)}`),r.push(""),s.gates.length>0){r.push(n.bold(" Quality Gates"));for(let c of s.gates){let f=c.passed?n.green("\u2713"):n.red("\u2717");r.push(` ${f} ${c.message}`)}r.push("")}return t.failed===0&&t.errored===0&&s.passed?r.push(n.greenBold(" \u2713 All tests passed")):r.push(n.redBold(" \u2717 Some tests failed")),r.push(""),{content:r.join(`
|
|
34
|
-
`),format:"text"}}}}function Cs(e,t){return` ${e.status==="passed"?t.green("\u2713"):e.status==="skipped"?t.yellow("\u25CB"):t.red("\u2717")} ${t.bold(e.name)}`}function As(e,t){return` ${e.status==="passed"?t.green("\u2713"):e.status==="skipped"?t.yellow("\u25CB"):t.red("\u2717")} ${e.name}`}function Ps(e,t){if(e.status==="skipped")return null;let s=[];return e.modelId&&s.push(e.modelId),e.latencyMs>0&&s.push(He(e.latencyMs)),e.costUsd>=5e-5&&s.push(ze(e.costUsd)),s.length===0?null:` ${t.dim(s.join(" \xB7 "))}`}function Es(e,t){if(e.passed){let o=Ve(e),a=o?`${e.label} ${t.cyan(o)}`:e.label;return` ${t.green("\u2713")} ${t.dim(a)}`}let s=Ve(e),r=e.failureMessage??"failed",n=s?`${e.label} ${t.cyan(s)}`:e.label;return` ${t.red("\u2717")} ${n}: ${r}`}function Ve(e){if(e.assertionType==="judge"||e.assertionType==="drift"){let t=vs(e);if(t!==null){let s=e.passed?"\u2265":"<";return`(${e.score.toFixed(2)} ${s} ${t.toFixed(2)})`}return`(${e.score.toFixed(2)})`}return""}function vs(e){if(e.metadata&&typeof e.metadata=="object"&&"threshold"in e.metadata){let t=e.metadata.threshold;if(typeof t=="number")return t}if(e.failureMessage){let t=e.failureMessage.match(/threshold (\d+\.?\d*)/i);if(t?.[1])return parseFloat(t[1]);let s=e.failureMessage.match(/below (\d+\.?\d*)/i);if(s?.[1])return parseFloat(s[1])}return null}function He(e){return e<1e3?`${e}ms`:`${(e/1e3).toFixed(2)}s`}function ze(e){return e<.01?`$${e.toFixed(4)}`:`$${e.toFixed(2)}`}function We(){return{name:"json",generate(e,t){let s={kindlm:{version:"1.0.0",timestamp:new Date().toISOString()},summary:{totalTests:e.totalTests,passed:e.passed,failed:e.failed,errored:e.errored,skipped:e.skipped,durationMs:e.durationMs},gates:{passed:t.passed,results:t.gates},suites:e.suites.map(r=>({name:r.name,status:r.status,tests:r.tests.map(n=>({name:n.name,status:n.status,assertions:n.assertions,latencyMs:n.latencyMs,costUsd:n.costUsd}))}))};return{content:JSON.stringify(s,null,2),format:"json"}}}}function 
Xe(){return{name:"junit",generate(e,t){let s=e.durationMs/1e3,r=[];r.push('<?xml version="1.0" encoding="UTF-8"?>'),r.push(`<testsuites name="KindLM" tests="${e.totalTests}" failures="${e.failed}" errors="${e.errored}" time="${s.toFixed(3)}">`);for(let n of e.suites){let o=n.tests.filter(i=>i.status==="failed").length,a=n.tests.filter(i=>i.status==="errored").length,p=n.tests.reduce((i,u)=>i+u.latencyMs,0)/1e3;r.push(` <testsuite name="${I(n.name)}" tests="${n.tests.length}" failures="${o}" errors="${a}" time="${p.toFixed(3)}">`);for(let i of n.tests){let u=i.latencyMs/1e3;if(r.push(` <testcase name="${I(i.name)}" classname="${I(n.name)}" time="${u.toFixed(3)}">`),i.status==="skipped")r.push(" <skipped/>");else if(i.status==="errored"&&i.error)r.push(` <error message="${I(i.error.message)}" type="${I(i.error.code)}">${I(i.error.message)}</error>`);else if(i.status==="failed"){let c=i.assertions.filter(f=>!f.passed);for(let f of c)r.push(` <failure message="${I(f.label)}" type="${I(f.failureCode??"ASSERTION_FAILED")}">${I(f.failureMessage??"Assertion failed")}</failure>`)}r.push(" </testcase>")}r.push(" </testsuite>")}if(t.gates.length>0){let n=t.gates.filter(o=>!o.passed).length;r.push(` <testsuite name="Quality Gates" tests="${t.gates.length}" failures="${n}" errors="0" time="0.000">`);for(let o of t.gates)r.push(` <testcase name="${I(o.gateName)}" classname="Quality Gates" time="0.000">`),o.passed||r.push(` <failure message="${I(o.message)}" type="GATE_FAILED">${I(o.message)}</failure>`),r.push(" </testcase>");r.push(" </testsuite>")}return r.push("</testsuites>"),{content:r.join(`
|
|
35
|
-
`),format:"
|
|
36
|
-
`),
|
|
37
|
-
`),
|
|
38
|
-
`)
|
|
39
|
-
|
|
31
|
+
${t.outputText}`}],params:{temperature:0,maxTokens:512}})}catch(a){return[{assertionType:"drift",label:"Drift check (judge)",passed:!1,score:0,failureCode:"JUDGE_EVAL_ERROR",failureMessage:`Drift judge adapter error: ${a instanceof Error?a.message:String(a)}`}]}let n=Ut(s.text);if(!n.ok)return[{assertionType:"drift",label:"Drift check (judge)",passed:!1,score:0,failureCode:"DRIFT_PARSE_ERROR",failureMessage:`Failed to parse drift judge response: ${n.reason}`}];let o=1-n.driftScore,r=n.driftScore<=e.maxScore;return[{assertionType:"drift",label:"Drift check (judge)",passed:r,score:o,failureCode:r?void 0:"DRIFT_EXCEEDED",failureMessage:r?void 0:`Drift score ${n.driftScore.toFixed(2)} exceeds max ${e.maxScore}: ${n.reasoning}`,metadata:{driftScore:n.driftScore,reasoning:n.reasoning,threshold:e.maxScore}}]}}}function ie(e){return{type:"latency",evaluate(t){let s=t.latencyMs??0,n=s<=e.maxMs;return Promise.resolve([{assertionType:"latency",label:`Latency <= ${e.maxMs}ms`,passed:n,score:n?1:0,failureCode:n?void 0:"PROVIDER_TIMEOUT",failureMessage:n?void 0:`Latency ${s}ms exceeds max ${e.maxMs}ms`,metadata:{latencyMs:s}}])}}}function le(e){return{type:"cost",evaluate(t){let s=t.costUsd??0,n=s<=e.maxUsd;return Promise.resolve([{assertionType:"cost",label:`Cost <= $${e.maxUsd}`,passed:n,score:n?1:0,failureCode:n?void 0:"BUDGET_EXCEEDED",failureMessage:n?void 0:`Cost $${s.toFixed(4)} exceeds max $${e.maxUsd}`,metadata:{costUsd:s}}])}}}function z(e,t){let s=[];if(e.toolCalls)if(e.toolCalls.some(o=>o.order!==void 0)){let o=e.toolCalls.map(r=>({...r,argsSchema:r.argsSchemaResolved?JSON.stringify(r.argsSchemaResolved):r.argsSchema}));s.push(q(o))}else for(let o of e.toolCalls)if(o.shouldNotCall)s.push(Z(o.tool));else{let r=o.argsSchemaResolved?JSON.stringify(o.argsSchemaResolved):o.argsSchema??void 0;s.push(Q(o.tool,o.argsMatch??void 
0,r))}if(e.output&&s.push(ee({format:e.output.format,schemaFile:e.output.schemaFile,schemaContent:t?.schemaContent,contains:e.output.contains,notContains:e.output.notContains,maxLength:e.output.maxLength})),e.guardrails?.pii&&s.push(re({denyPatterns:e.guardrails.pii.denyPatterns,customPatterns:e.guardrails.pii.customPatterns})),e.guardrails?.keywords&&(e.guardrails.keywords.allow&&e.guardrails.keywords.allow.length>0&&s.push(se(e.guardrails.keywords.allow)),e.guardrails.keywords.deny.length>0&&s.push(ne(e.guardrails.keywords.deny))),e.judge)for(let n of e.judge)s.push(oe({criteria:n.criteria,minScore:n.minScore,rubric:n.rubric,model:n.model}));return e.baseline?.drift&&s.push(ae({maxScore:e.baseline.drift.maxScore,method:e.baseline.drift.method,fields:e.baseline.drift.fields})),e.latency&&s.push(ie({maxMs:e.latency.maxMs})),e.cost&&s.push(le({maxUsd:e.cost.maxUsd})),s}var Bt=new Set(["judge","drift"]);function H(e){return Bt.has(e)?"probabilistic":"deterministic"}function Fe(e){return H(e)==="deterministic"}function Be(e){return H(e)==="probabilistic"}function U(e,t){let s=t[e];if(s)return{ok:!0,price:s,matchedModel:e,matchType:"exact"};let n=Object.keys(t).filter(o=>e.startsWith(`${o}-`)||e.startsWith(`${o}:`));if(n.length===1){let o=n[0];if(!o)return{ok:!1};let r=t[o];if(r)return{ok:!0,price:r,matchedModel:o,matchType:"prefix"}}return{ok:!1}}async function v(e,t){let{maxRetries:s,shouldRetry:n,baseDelayMs:o=500,maxDelayMs:r=3e4,getRetryAfterMs:a}=t,c;for(let i=0;i<=s;i++)try{return await e()}catch(l){if(c=l,i>=s||!n(l))throw l;let p=a?.(l),f=Vt({attempt:i,baseDelayMs:o,maxDelayMs:r,retryAfterMs:p});await Jt(f)}throw c}function Vt(e){let{attempt:t,baseDelayMs:s,maxDelayMs:n,retryAfterMs:o}=e;if(o!==void 0&&o>0)return o;let r=Math.min(n,s*Math.pow(2,t));return Math.max(Math.floor(s/2),Math.floor(Math.random()*r))}function Jt(e){return new Promise(t=>setTimeout(t,e))}var 
Gt={"gpt-4o":{input:2.5,output:10},"gpt-4o-mini":{input:.15,output:.6},"gpt-4-turbo":{input:10,output:30},"o3-mini":{input:1.1,output:4.4}};function Kt(e){switch(e){case"stop":return"stop";case"length":return"max_tokens";case"tool_calls":return"tool_calls";default:return"unknown"}}function Ve(e,t){let s=typeof t=="object"&&t!==null&&"error"in t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 401:return new h("AUTH_FAILED",s,e,!1,t);case 429:return new h("RATE_LIMITED",s,e,!0,t);case 404:return new h("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new h("CONTEXT_LENGTH",s,e,!1,t):new h("PROVIDER_ERROR",s,e,!1,t);case 408:return new h("TIMEOUT",s,e,!0,t);default:return new h(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function ue(e){let t="",s="https://api.openai.com/v1",n,o=6e4,r=2;return{name:"openai",async initialize(a){if(!a.apiKey)throw new h("AUTH_FAILED","API key is required");t=a.apiKey,a.baseUrl&&(s=a.baseUrl),n=a.organization,o=a.timeoutMs,r=a.maxRetries},async complete(a){let c={model:a.model,messages:a.messages.map(u=>u.role==="tool"?{role:"tool",content:u.content,tool_call_id:u.toolCallId}:u.role==="assistant"&&u.toolCalls&&u.toolCalls.length>0?{role:"assistant",content:u.content||null,tool_calls:u.toolCalls.map(m=>({id:m.id,type:"function",function:{name:m.name,arguments:JSON.stringify(m.arguments)}}))}:{role:u.role,content:u.content}),temperature:a.params.temperature,max_tokens:a.params.maxTokens};a.params.topP!==void 0&&(c.top_p=a.params.topP),a.params.seed!==void 0&&(c.seed=a.params.seed),a.params.stopSequences&&(c.stop=a.params.stopSequences),a.tools&&a.tools.length>0&&(c.tools=a.tools.map(u=>({type:"function",function:{name:u.name,description:u.description,parameters:u.parameters}})),a.toolChoice&&(c.tool_choice=a.toolChoice));let i={"Content-Type":"application/json",Authorization:`Bearer ${t}`};n&&(i["OpenAI-Organization"]=n);let l=Date.now(),p=await v(async()=>{let u=await 
e.fetch(`${s}/chat/completions`,{method:"POST",headers:i,body:JSON.stringify(c),timeoutMs:o});if(!u.ok){let m;try{m=await u.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from OpenAI API",u.status,u.status>=500)}throw Ve(u.status,m)}try{return await u.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from OpenAI API",u.status,u.status>=500)}},{maxRetries:r,shouldRetry:u=>u instanceof h&&u.retryable,getRetryAfterMs:u=>{if(!(u instanceof h)||!u.raw)return;let C=u.raw.headers?.["retry-after"];if(typeof C=="string"){let w=Number(C);if(!Number.isNaN(w)&&w>0)return w*1e3}}}),f=Date.now()-l,y=p,R=y.choices?.[0],x=R?.message,g=(x?.tool_calls??[]).map(u=>{let m;try{m=JSON.parse(u.function.arguments||"{}")}catch{m={_raw:u.function.arguments}}return{id:u.id,name:u.function.name,arguments:m}});return{text:x?.content??"",toolCalls:g,usage:{inputTokens:y.usage?.prompt_tokens??0,outputTokens:y.usage?.completion_tokens??0,totalTokens:y.usage?.total_tokens??0},raw:p,latencyMs:f,modelId:y.model??a.model,finishReason:Kt(R?.finish_reason)}},estimateCost(a,c){let i=U(a,Gt);return i.ok?c.inputTokens/1e6*i.price.input+c.outputTokens/1e6*i.price.output:null},supportsTools(a){return!a.startsWith("o1-")},async embed(a,c){let i=c??"text-embedding-3-small",l={"Content-Type":"application/json",Authorization:`Bearer ${t}`};n&&(l["OpenAI-Organization"]=n);let y=(await v(async()=>{let R=await e.fetch(`${s}/embeddings`,{method:"POST",headers:l,body:JSON.stringify({model:i,input:a}),timeoutMs:o});if(!R.ok){let x;try{x=await R.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from OpenAI Embeddings API",R.status,R.status>=500)}throw Ve(R.status,x)}try{return await R.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from OpenAI Embeddings API",R.status,R.status>=500)}},{maxRetries:r,shouldRetry:R=>R instanceof h&&R.retryable})).data?.[0]?.embedding;if(!y||!Array.isArray(y))throw new h("PROVIDER_ERROR","No embedding returned from 
OpenAI Embeddings API");return y}}}var Ht={"claude-opus-4-5-20250929":{input:15,output:75},"claude-sonnet-4-5-20250929":{input:3,output:15},"claude-haiku-4-5-20251001":{input:.8,output:4}};function zt(e){switch(e){case"end_turn":return"stop";case"max_tokens":return"max_tokens";case"tool_use":return"tool_calls";default:return"unknown"}}function Wt(e,t){let s=typeof t=="object"&&t!==null&&"error"in t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 401:return new h("AUTH_FAILED",s,e,!1,t);case 429:return new h("RATE_LIMITED",s,e,!0,t);case 404:return new h("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new h("CONTEXT_LENGTH",s,e,!1,t):new h("PROVIDER_ERROR",s,e,!1,t);case 408:return new h("TIMEOUT",s,e,!0,t);default:return new h(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function ce(e){let t="",s="https://api.anthropic.com",n=6e4,o=2;return{name:"anthropic",async initialize(r){if(!r.apiKey)throw new h("AUTH_FAILED","API key is required");t=r.apiKey,r.baseUrl&&(s=r.baseUrl),n=r.timeoutMs,o=r.maxRetries},async complete(r){let a=r.messages.find(m=>m.role==="system"),c=r.messages.filter(m=>m.role!=="system").map(m=>{if(m.role==="tool")return{role:"user",content:[{type:"tool_result",tool_use_id:m.toolCallId,content:m.content}]};if(m.role==="assistant"){if(m.toolCalls&&m.toolCalls.length>0){let E=[];m.content&&E.push({type:"text",text:m.content});for(let C of m.toolCalls)E.push({type:"tool_use",id:C.id,name:C.name,input:C.arguments});return{role:"assistant",content:E}}return{role:"assistant",content:m.content}}return{role:"user",content:m.content}}),i={model:r.model,max_tokens:r.params.maxTokens,messages:c};a&&(i.system=a.content),r.params.temperature!==void 0&&(i.temperature=r.params.temperature),r.params.topP!==void 
0&&(i.top_p=r.params.topP),r.params.stopSequences&&(i.stop_sequences=r.params.stopSequences),r.tools&&r.tools.length>0&&(i.tools=r.tools.map(m=>({name:m.name,description:m.description??"",input_schema:m.parameters??{type:"object",properties:{}}})),r.toolChoice&&(i.tool_choice=r.toolChoice==="required"?{type:"any"}:{type:r.toolChoice}));let l=Date.now(),p=await v(async()=>{let m=await e.fetch(`${s}/v1/messages`,{method:"POST",headers:{"Content-Type":"application/json","x-api-key":t,"anthropic-version":"2023-06-01"},body:JSON.stringify(i),timeoutMs:n});if(!m.ok){let E;try{E=await m.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from Anthropic API",m.status,m.status>=500)}throw Wt(m.status,E)}try{return await m.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from Anthropic API",m.status,m.status>=500)}},{maxRetries:o,shouldRetry:m=>m instanceof h&&m.retryable,getRetryAfterMs:m=>{if(!(m instanceof h)||!m.raw)return;let w=m.raw.headers?.["retry-after"];if(typeof w=="string"){let A=Number(w);if(!Number.isNaN(A)&&A>0)return A*1e3}}}),f=Date.now()-l,y=p,R="",x=[];for(let m of y.content??[])m.type==="text"?R+=m.text:m.type==="tool_use"&&x.push({id:m.id,name:m.name,arguments:m.input??{}});let g=y.usage?.input_tokens??0,u=y.usage?.output_tokens??0;return{text:R,toolCalls:x,usage:{inputTokens:g,outputTokens:u,totalTokens:g+u},raw:p,latencyMs:f,modelId:y.model??r.model,finishReason:zt(y.stop_reason)}},estimateCost(r,a){let c=U(r,Ht);return c.ok?a.inputTokens/1e6*c.price.input+a.outputTokens/1e6*c.price.output:null},supportsTools(r){return!0}}}function Xt(e,t){if(t)return"tool_calls";switch(e){case"stop":return"stop";case"length":return"max_tokens";default:return"unknown"}}function Yt(e,t){let s=typeof t=="object"&&t!==null&&"error"in t?String(t.error):"Unknown error";switch(e){case 404:return new h("MODEL_NOT_FOUND",s,e,!1,t);case 429:return new h("RATE_LIMITED",s,e,!0,t);case 408:return new h("TIMEOUT",s,e,!0,t);default:return new 
h(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function de(e){let t="http://localhost:11434",s=6e4,n=2;return{name:"ollama",async initialize(o){o.baseUrl&&(t=o.baseUrl),s=o.timeoutMs,n=o.maxRetries},async complete(o){let r=o.messages.map(u=>u.role==="tool"?{role:"tool",content:u.content}:u.role==="assistant"&&u.toolCalls&&u.toolCalls.length>0?{role:"assistant",content:u.content||"",tool_calls:u.toolCalls.map(m=>({function:{name:m.name,arguments:m.arguments}}))}:{role:u.role,content:u.content}),a={model:o.model,messages:r,stream:!1,options:{temperature:o.params.temperature,num_predict:o.params.maxTokens}},c=a.options;o.params.topP!==void 0&&(c.top_p=o.params.topP),o.params.seed!==void 0&&(c.seed=o.params.seed),o.params.stopSequences&&(c.stop=o.params.stopSequences),o.tools&&o.tools.length>0&&(a.tools=o.tools.map(u=>({type:"function",function:{name:u.name,description:u.description,parameters:u.parameters}})));let i={"Content-Type":"application/json"},l=Date.now(),p=await v(async()=>{let u=await e.fetch(`${t}/api/chat`,{method:"POST",headers:i,body:JSON.stringify(a),timeoutMs:s});if(!u.ok){let m;try{m=await u.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from Ollama API",u.status,u.status>=500)}throw Yt(u.status,m)}try{return await u.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from Ollama API",u.status,u.status>=500)}},{maxRetries:n,shouldRetry:u=>u instanceof h&&u.retryable}),f=Date.now()-l,y=p,R=y.message,x=R?.tool_calls??[],g=x.map((u,m)=>({id:`ollama_call_${Date.now()}_${m}`,name:u.function.name,arguments:u.function.arguments??{}}));return{text:R?.content??"",toolCalls:g,usage:{inputTokens:y.prompt_eval_count??0,outputTokens:y.eval_count??0,totalTokens:(y.prompt_eval_count??0)+(y.eval_count??0)},raw:p,latencyMs:f,modelId:y.model??o.model,finishReason:Xt(y.done_reason,x.length>0)}},estimateCost(){return 0},supportsTools(){return!0}}}var 
Qt={"gemini-2.0-flash":{input:.1,output:.4},"gemini-2.0-flash-lite":{input:.075,output:.3},"gemini-1.5-pro":{input:1.25,output:5},"gemini-1.5-flash":{input:.075,output:.3},"gemini-1.5-flash-8b":{input:.0375,output:.15}};function Zt(e,t){if(t)return"tool_calls";switch(e){case"STOP":return"stop";case"MAX_TOKENS":return"max_tokens";case"SAFETY":return"stop";default:return"unknown"}}function Je(e,t){let s=typeof t=="object"&&t!==null&&"error"in t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 400:return s.toLowerCase().includes("api key")?new h("AUTH_FAILED",s,e,!1,t):new h("PROVIDER_ERROR",s,e,!1,t);case 401:case 403:return new h("AUTH_FAILED",s,e,!1,t);case 429:return new h("RATE_LIMITED",s,e,!0,t);case 404:return new h("MODEL_NOT_FOUND",s,e,!1,t);case 408:return new h("TIMEOUT",s,e,!0,t);default:return new h(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function pe(e){let t="",s="https://generativelanguage.googleapis.com/v1beta",n=6e4,o=2;return{name:"gemini",async initialize(r){if(!r.apiKey)throw new h("AUTH_FAILED","API key is required");t=r.apiKey,r.baseUrl&&(s=r.baseUrl),n=r.timeoutMs,o=r.maxRetries},async complete(r){let a=[],c;for(let A of r.messages){if(A.role==="system"){c={parts:[{text:A.content}]};continue}if(A.role==="tool"){a.push({role:"function",parts:[{functionResponse:{name:A.toolName??"unknown",response:qt(A.content)}}]});continue}if(A.role==="assistant"&&A.toolCalls&&A.toolCalls.length>0){let P=[];A.content&&P.push({text:A.content});for(let b of A.toolCalls)P.push({functionCall:{name:b.name,args:b.arguments}});a.push({role:"model",parts:P});continue}let B=A.role==="assistant"?"model":"user";a.push({role:B,parts:[{text:A.content}]})}let i={contents:a,generationConfig:{temperature:r.params.temperature,maxOutputTokens:r.params.maxTokens}};c&&(i.systemInstruction=c);let l=i.generationConfig;r.params.topP!==void 
0&&(l.topP=r.params.topP),r.params.stopSequences&&(l.stopSequences=r.params.stopSequences),r.tools&&r.tools.length>0&&(i.tools=[{functionDeclarations:r.tools.map(A=>({name:A.name,description:A.description,parameters:A.parameters}))}],r.toolChoice&&(i.toolConfig=er(r.toolChoice)));let p={"Content-Type":"application/json","x-goog-api-key":t},f=Date.now(),y=`${s}/models/${r.model}:generateContent`,R=await v(async()=>{let A=await e.fetch(y,{method:"POST",headers:p,body:JSON.stringify(i),timeoutMs:n});if(!A.ok){let B;try{B=await A.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from Gemini API",A.status,A.status>=500)}throw Je(A.status,B)}try{return await A.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from Gemini API",A.status,A.status>=500)}},{maxRetries:o,shouldRetry:A=>A instanceof h&&A.retryable}),x=Date.now()-f,g=R;if(g.error){let A=g.error.code??400;throw Je(A,{error:g.error})}let u=g.candidates?.[0],m=u?.content?.parts??[],E="",C=[],w=0;for(let A of m)A.text!==void 0&&(E+=A.text),A.functionCall&&(C.push({id:`gemini_call_${w}`,name:A.functionCall.name,arguments:A.functionCall.args??{}}),w++);return{text:E,toolCalls:C,usage:{inputTokens:g.usageMetadata?.promptTokenCount??0,outputTokens:g.usageMetadata?.candidatesTokenCount??0,totalTokens:g.usageMetadata?.totalTokenCount??0},raw:R,latencyMs:x,modelId:g.modelVersion??r.model,finishReason:Zt(u?.finishReason,C.length>0)}},estimateCost(r,a){let c=U(r,Qt);return c.ok?a.inputTokens/1e6*c.price.input+a.outputTokens/1e6*c.price.output:null},supportsTools(){return!0}}}function qt(e){try{return JSON.parse(e)}catch{return{result:e}}}function er(e){switch(e){case"auto":return{functionCallingConfig:{mode:"AUTO"}};case"required":return{functionCallingConfig:{mode:"ANY"}};case"none":return{functionCallingConfig:{mode:"NONE"}}}}function tr(e){switch(e){case"stop":return"stop";case"length":return"max_tokens";case"tool_calls":return"tool_calls";default:return"unknown"}}function rr(e,t){let 
s=typeof t=="object"&&t!==null&&"message"in t?String(t.message??"Unknown error"):typeof t=="object"&&t!==null&&"error"in t?String(t.error?.message??"Unknown error"):"Unknown error";switch(e){case 401:return new h("AUTH_FAILED",s,e,!1,t);case 429:return new h("RATE_LIMITED",s,e,!0,t);case 404:return new h("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new h("CONTEXT_LENGTH",s,e,!1,t):new h("PROVIDER_ERROR",s,e,!1,t);case 408:return new h("TIMEOUT",s,e,!0,t);default:return new h(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function me(e){let t="",s="https://api.mistral.ai/v1",n=6e4,o=2;return{name:"mistral",async initialize(r){if(!r.apiKey)throw new h("AUTH_FAILED","API key is required");t=r.apiKey,r.baseUrl&&(s=r.baseUrl),n=r.timeoutMs,o=r.maxRetries},async complete(r){let a={model:r.model,messages:r.messages.map(g=>g.role==="tool"?{role:"tool",content:g.content,tool_call_id:g.toolCallId}:g.role==="assistant"&&g.toolCalls&&g.toolCalls.length>0?{role:"assistant",content:g.content||null,tool_calls:g.toolCalls.map(u=>({id:u.id,type:"function",function:{name:u.name,arguments:JSON.stringify(u.arguments)}}))}:{role:g.role,content:g.content}),temperature:r.params.temperature,max_tokens:r.params.maxTokens};r.params.topP!==void 0&&(a.top_p=r.params.topP),r.params.stopSequences&&(a.stop=r.params.stopSequences),r.tools&&r.tools.length>0&&(a.tools=r.tools.map(g=>({type:"function",function:{name:g.name,description:g.description,parameters:g.parameters}})),r.toolChoice&&(a.tool_choice=r.toolChoice));let c={"Content-Type":"application/json",Authorization:`Bearer ${t}`},i=Date.now(),l=await v(async()=>{let g=await e.fetch(`${s}/chat/completions`,{method:"POST",headers:c,body:JSON.stringify(a),timeoutMs:n});if(!g.ok){let u;try{u=await g.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from Mistral API",g.status,g.status>=500)}throw rr(g.status,u)}try{return await g.json()}catch{throw new h("PROVIDER_ERROR","Malformed 
response body from Mistral API",g.status,g.status>=500)}},{maxRetries:o,shouldRetry:g=>g instanceof h&&g.retryable}),p=Date.now()-i,f=l,y=f.choices?.[0],R=y?.message,x=(R?.tool_calls??[]).map(g=>{let u;try{u=JSON.parse(g.function.arguments||"{}")}catch{u={_raw:g.function.arguments}}return{id:g.id,name:g.function.name,arguments:u}});return{text:R?.content??"",toolCalls:x,usage:{inputTokens:f.usage?.prompt_tokens??0,outputTokens:f.usage?.completion_tokens??0,totalTokens:f.usage?.total_tokens??0},raw:l,latencyMs:p,modelId:f.model??r.model,finishReason:tr(y?.finish_reason)}},estimateCost(r,a){return null},supportsTools(r){return!0}}}function sr(e){switch(e){case"COMPLETE":return"stop";case"MAX_TOKENS":return"max_tokens";case"TOOL_CALL":return"tool_calls";default:return"unknown"}}function nr(e,t){let s=typeof t=="object"&&t!==null&&"message"in t?String(t.message??"Unknown error"):"Unknown error";switch(e){case 401:return new h("AUTH_FAILED",s,e,!1,t);case 429:return new h("RATE_LIMITED",s,e,!0,t);case 404:return new h("MODEL_NOT_FOUND",s,e,!1,t);case 400:return s.toLowerCase().includes("context length")?new h("CONTEXT_LENGTH",s,e,!1,t):new h("PROVIDER_ERROR",s,e,!1,t);case 408:return new h("TIMEOUT",s,e,!0,t);default:return new h(e>=500?"PROVIDER_ERROR":"UNKNOWN",s,e,e>=500,t)}}function fe(e){let t="",s="https://api.cohere.com",n=6e4,o=2;return{name:"cohere",async initialize(r){if(!r.apiKey)throw new h("AUTH_FAILED","API key is required");t=r.apiKey,r.baseUrl&&(s=r.baseUrl),n=r.timeoutMs,o=r.maxRetries},async complete(r){let 
a={model:r.model,messages:r.messages.map(u=>u.role==="tool"?{role:"tool",content:u.content,tool_call_id:u.toolCallId}:u.role==="assistant"&&u.toolCalls&&u.toolCalls.length>0?{role:"assistant",content:u.content||null,tool_calls:u.toolCalls.map(m=>({id:m.id,type:"function",function:{name:m.name,arguments:JSON.stringify(m.arguments)}}))}:{role:u.role,content:u.content}),temperature:r.params.temperature,max_tokens:r.params.maxTokens};r.params.topP!==void 0&&(a.p=r.params.topP),r.params.stopSequences&&(a.stop_sequences=r.params.stopSequences),r.tools&&r.tools.length>0&&(a.tools=r.tools.map(u=>({type:"function",function:{name:u.name,description:u.description,parameters:u.parameters}})),r.toolChoice&&(a.tool_choice=r.toolChoice));let c={"Content-Type":"application/json",Authorization:`Bearer ${t}`},i=Date.now(),l=await v(async()=>{let u=await e.fetch(`${s}/v2/chat`,{method:"POST",headers:c,body:JSON.stringify(a),timeoutMs:n});if(!u.ok){let m;try{m=await u.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from Cohere API",u.status,u.status>=500)}throw nr(u.status,m)}try{return await u.json()}catch{throw new h("PROVIDER_ERROR","Malformed response body from Cohere API",u.status,u.status>=500)}},{maxRetries:o,shouldRetry:u=>u instanceof h&&u.retryable}),p=Date.now()-i,f=l,y=f.message?.content?.filter(u=>u.type==="text").map(u=>u.text).join("")??"",R=(f.message?.tool_calls??[]).map(u=>{let m;try{m=JSON.parse(u.function.arguments||"{}")}catch{m={_raw:u.function.arguments}}return{id:u.id,name:u.function.name,arguments:m}}),x=f.usage?.tokens?.input_tokens??0,g=f.usage?.tokens?.output_tokens??0;return{text:y,toolCalls:R,usage:{inputTokens:x,outputTokens:g,totalTokens:x+g},raw:l,latencyMs:p,modelId:f.model??r.model,finishReason:sr(f.finish_reason)}},estimateCost(r,a){return null},supportsTools(r){return!0}}}var Ge={openai:ue,anthropic:ce,ollama:de,gemini:pe,mistral:me,cohere:fe};function Ke(e,t){let s=Ge[e];if(!s){let n=Object.keys(Ge).join(", ");throw new 
Error(`Unknown provider: "${e}". Supported providers: ${n}`)}return s(t)}async function ge(e,t,s,n){let o=n?.maxTurns??10,r=[],a=[],c=[...t.messages],i={inputTokens:0,outputTokens:0,totalTokens:0},l=0;for(let R=0;R<o;R++){let x={...t,messages:c},g=await e.complete(x);if(r.push({request:x,response:g}),i.inputTokens+=g.usage.inputTokens,i.outputTokens+=g.usage.outputTokens,i.totalTokens+=g.usage.totalTokens,l+=g.latencyMs,g.toolCalls.length===0)return{turns:r,finalText:g.text,allToolCalls:a,totalUsage:i,totalLatencyMs:l,truncated:!1};a.push(...g.toolCalls),c=[...c,{role:"assistant",content:g.text,toolCalls:g.toolCalls}];for(let u of g.toolCalls){let m=s.find(C=>C.name===u.name),E;m?E=or(m,u.arguments):E={error:`Tool "${u.name}" not simulated`},c.push({role:"tool",content:JSON.stringify(E),toolCallId:u.id,toolName:u.name})}}let f=r[r.length-1]?.response??{text:"",toolCalls:[],usage:{inputTokens:0,outputTokens:0,totalTokens:0},raw:null,latencyMs:0,modelId:t.model,finishReason:"unknown"},y=f.text?`${f.text}
|
|
32
|
+
[Note: Conversation truncated after ${o} turns]`:`[Note: Conversation truncated after ${o} turns]`;return{turns:r,finalText:y,allToolCalls:a,totalUsage:i,totalLatencyMs:l,truncated:!0}}function or(e,t){if(e.responses){for(let s of e.responses)if(ar(s.when,t))return s.then}return e.defaultResponse!==void 0?e.defaultResponse:{error:"No matching simulation response"}}function ar(e,t){for(let[s,n]of Object.entries(e))if(JSON.stringify(t[s])!==JSON.stringify(n))return!1;return!0}var d=require("zod");var _=require("zod"),Ee=_.z.object({outputTextAttr:_.z.string().default("gen_ai.completion.0.content"),modelAttr:_.z.string().default("gen_ai.response.model"),systemAttr:_.z.string().default("gen_ai.system"),inputTokensAttr:_.z.string().default("gen_ai.usage.input_tokens"),outputTokensAttr:_.z.string().default("gen_ai.usage.output_tokens")}),Pe=_.z.object({namePattern:_.z.string().optional().describe("Regex to filter span names"),attributeMatch:_.z.record(_.z.string()).optional().describe("Attributes that must match"),minDurationMs:_.z.number().min(0).optional()}),he=_.z.object({port:_.z.number().int().min(1).max(65535).default(4318),timeoutMs:_.z.number().int().min(1e3).default(3e4),spanMapping:Ee.default({}),spanFilter:Pe.optional()});var N=d.z.string().min(1,"Must not be empty"),ir=d.z.number().min(0).max(2).default(.2),V=d.z.number().min(0).max(1),He=d.z.string().refine(e=>{try{return new RegExp(e),!0}catch{return!1}},{message:"Must be a valid regex pattern"}),W=d.z.object({apiKeyEnv:N.describe("Environment variable name containing the API key. Never a raw key."),baseUrl:d.z.string().url().optional().describe("Custom base URL for API-compatible proxies (e.g., Azure OpenAI, LiteLLM)"),organization:d.z.string().optional().describe("Organization ID (OpenAI-specific)")}),lr=d.z.object({apiKeyEnv:d.z.string().min(1).optional().describe("Environment variable name containing the API key. 
Optional for Ollama (local)."),baseUrl:d.z.string().url().optional().describe("Ollama server URL. Defaults to http://localhost:11434.")}),ur=d.z.object({openai:W.optional(),anthropic:W.optional(),ollama:lr.optional(),gemini:W.optional(),mistral:W.optional(),cohere:W.optional()}).refine(e=>Object.keys(e).some(t=>e[t]!==void 0),{message:"At least one provider must be configured"}),cr=d.z.object({temperature:ir,maxTokens:d.z.number().int().min(1).max(128e3).default(1024),topP:d.z.number().min(0).max(1).optional(),stopSequences:d.z.array(d.z.string()).optional(),seed:d.z.number().int().optional().describe("Seed for reproducibility (provider-dependent support)")}),dr=d.z.object({id:N.describe("Unique identifier for this model config, referenced in reports"),provider:d.z.enum(["openai","anthropic","ollama","gemini","mistral","cohere"]).describe("Must match a key in the providers section"),model:N.describe("Model name as the provider expects it (e.g., 'gpt-4o', 'claude-sonnet-4-5-20250929')"),params:cr.default({})}),pr=d.z.object({system:d.z.string().optional().describe("System prompt template. Supports {{variable}} interpolation."),user:N.describe("User prompt template. Supports {{variable}} interpolation."),assistant:d.z.string().optional().describe("Prefill for assistant response (Anthropic-specific)")}),mr=d.z.object({format:d.z.enum(["text","json"]).default("text"),schemaFile:d.z.string().optional().describe("Path to JSON Schema file (relative to config file). 
Required if format is 'json'."),contains:d.z.array(d.z.string()).optional().describe("Output must contain all of these substrings"),notContains:d.z.array(d.z.string()).optional().describe("Output must not contain any of these substrings"),maxLength:d.z.number().int().positive().optional().describe("Maximum character length of the output")}).refine(e=>!(e.format==="json"&&!e.schemaFile),{message:"schemaFile is required when format is 'json'"}),fr=d.z.object({enabled:d.z.boolean().default(!0),denyPatterns:d.z.array(He).default(["\\b\\d{3}-\\d{2}-\\d{4}\\b","\\b\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}\\b","\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b"]).describe("Regex patterns that must NOT appear in output. Defaults include SSN, credit card, email."),customPatterns:d.z.array(d.z.object({name:N,pattern:He})).optional().describe("Named custom PII patterns for reporting clarity")}),gr=d.z.object({deny:d.z.array(d.z.string()).default([]).describe("Words/phrases that must NOT appear in output (case-insensitive)"),allow:d.z.array(d.z.string()).optional().describe("If set, output MUST contain at least one of these words/phrases")}),hr=d.z.object({criteria:N.describe("Natural language description of what to evaluate (e.g., 'Response is empathetic and professional')"),minScore:V.default(.7).describe("Minimum score (0-1) for this criterion to pass"),model:d.z.string().optional().describe("Override judge model for this criterion. Defaults to first model in models list."),rubric:d.z.string().optional().describe("Detailed rubric for the judge. 
If omitted, a default rubric is generated from criteria.")}),Rr=d.z.object({tool:N.describe("Expected tool/function name"),shouldNotCall:d.z.boolean().optional().default(!1).describe("If true, assert this tool was NOT called"),argsMatch:d.z.record(d.z.unknown()).optional().describe("Key-value pairs that must be present in the tool call arguments (partial match)"),argsSchema:d.z.string().optional().describe("Path to JSON Schema file to validate the tool call arguments"),argsSchemaResolved:d.z.record(d.z.unknown()).optional().describe("Resolved JSON Schema content from argsSchema file. Populated during config parsing \u2014 not user-supplied."),order:d.z.number().int().min(0).optional().describe("Expected position in the sequence of tool calls (0-indexed)")}),yr=d.z.object({maxScore:V.default(.15).describe("Maximum drift score (0-1). Higher = more drift allowed. Fail if exceeded."),method:d.z.enum(["judge","embedding","field-diff"]).default("judge").describe("Drift detection method. 'judge' uses LLM comparison, 'embedding' uses cosine similarity, 'field-diff' compares JSON fields."),fields:d.z.array(d.z.string()).optional().describe("For field-diff method: JSON paths to compare (e.g., ['response.action', 'response.message'])")}),br=d.z.object({pii:fr.optional(),keywords:gr.optional()}),xr=d.z.object({output:mr.optional(),guardrails:br.optional(),judge:d.z.array(hr).optional().describe("LLM-as-judge evaluations. 
Each criterion is scored independently."),toolCalls:d.z.array(Rr).optional().describe("Expected tool/function calls in the model response"),baseline:d.z.object({drift:yr.optional()}).optional(),latency:d.z.object({maxMs:d.z.number().positive().describe("Maximum allowed latency in milliseconds")}).optional().describe("Assert response latency is within threshold"),cost:d.z.object({maxUsd:d.z.number().positive().describe("Maximum allowed cost in USD")}).optional().describe("Assert response cost is within budget")}),Ar=d.z.object({name:N.describe("Tool/function name as the model sees it"),description:d.z.string().optional().describe("Tool description for documentation"),parameters:d.z.record(d.z.unknown()).optional().describe("JSON Schema for the tool's parameters"),responses:d.z.array(d.z.object({when:d.z.record(d.z.unknown()).describe("Condition: match tool call arguments (partial match)"),then:d.z.unknown().describe("Simulated response to return when condition matches")})).optional().describe("Simulated responses based on argument matching"),defaultResponse:d.z.unknown().optional().describe("Response when no 'when' condition matches")}),Tr=d.z.object({name:N.describe("Unique test case name within the suite. Used in reports and JUnit output."),prompt:N.optional().describe("Reference to a key in the prompts section. Exactly one of prompt or command must be set."),command:N.optional().describe("Shell command to execute. Stdout is captured and assertions run against it. Exactly one of prompt or command must be set."),vars:d.z.record(d.z.string()).default({}).describe("Variables to interpolate into the prompt template or command"),models:d.z.array(d.z.string()).optional().describe("Override: run this test only against these model IDs. Defaults to all models. 
Ignored for command tests."),repeat:d.z.number().int().min(1).max(100).optional().describe("Override: number of repeat runs for this specific test case"),tools:d.z.array(Ar).optional().describe("Simulated tools available to the model for this test case"),expect:xr.describe("Assertions to evaluate against the model output"),tags:d.z.array(d.z.string()).optional().describe("Tags for filtering test cases in CLI (e.g., --tags regression)"),skip:d.z.boolean().optional().default(!1).describe("Skip this test case during execution")}).refine(e=>{let t=e.prompt!==void 0,s=e.command!==void 0;return(t||s)&&!(t&&s)},{message:"Exactly one of 'prompt' or 'command' must be set on each test case"}),Cr=d.z.object({passRateMin:V.default(.95).describe("Minimum overall pass rate (0-1). Computed after repeats and aggregation."),schemaFailuresMax:d.z.number().int().min(0).default(0).describe("Maximum allowed schema validation failures across entire suite"),judgeAvgMin:V.optional().describe("Minimum average LLM-as-judge score across all criteria and test cases"),driftScoreMax:V.optional().describe("Maximum allowed drift score against active baseline"),piiFailuresMax:d.z.number().int().min(0).default(0).describe("Maximum allowed PII detection failures"),keywordFailuresMax:d.z.number().int().min(0).default(0).describe("Maximum allowed keyword guardrail failures"),costMaxUsd:d.z.number().positive().optional().describe("Maximum total cost in USD for the entire run. Aborts if exceeded mid-run."),latencyMaxMs:d.z.number().positive().optional().describe("Maximum average latency in ms. 
Fails gate if exceeded."),deterministicPassRate:V.optional().describe("Minimum pass rate for deterministic assertions only (tool_called, schema, pii, keywords, etc.)"),probabilisticPassRate:V.optional().describe("Minimum pass rate for probabilistic assertions only (judge, drift)")}),Er=d.z.object({enabled:d.z.boolean().default(!1),framework:d.z.enum(["eu-ai-act","custom"]).default("eu-ai-act"),outputDir:d.z.string().default("./compliance-reports"),metadata:d.z.object({systemName:d.z.string().optional().describe("Name of the AI system being tested"),systemVersion:d.z.string().optional().describe("Version of the AI system"),riskLevel:d.z.enum(["high","limited","minimal"]).optional(),operator:d.z.string().optional().describe("Organization operating the AI system"),intendedPurpose:d.z.string().optional().describe("Documented intended purpose of the AI system"),dataGovernanceNotes:d.z.string().optional()}).optional()}),Pr=d.z.object({enabled:d.z.boolean().default(!1),apiUrl:d.z.string().url().default("https://api.kindlm.com/v1").describe("Cloud API URL. Override for self-hosted deployments.")}),wr=d.z.object({name:N,description:d.z.string().optional(),tags:d.z.array(d.z.string()).optional()}),we=d.z.object({kindlm:d.z.literal(1).describe("Config schema version. 
Must be 1."),project:N.describe("Project identifier for cloud upload and report grouping"),suite:wr,providers:ur,models:d.z.array(dr).min(1,"At least one model must be configured"),prompts:d.z.record(pr).refine(e=>Object.keys(e).length>0,{message:"At least one prompt must be defined"}),tests:d.z.array(Tr).min(1,"At least one test case must be defined"),gates:Cr.default({}),compliance:Er.optional(),trace:he.optional().describe("OpenTelemetry trace ingestion configuration for the 'kindlm trace' command"),upload:Pr.default({}),defaults:d.z.object({repeat:d.z.number().int().min(1).max(100).default(1).describe("Default repeat count per test case"),concurrency:d.z.number().int().min(1).max(32).default(4).describe("Default concurrency for test execution"),timeoutMs:d.z.number().int().min(1e3).default(6e4).describe("Default timeout per provider call in ms"),judgeModel:d.z.string().optional().describe("Default model ID for LLM-as-judge assertions. Must reference a configured model.")}).default({})});function Re(e){let t=we.safeParse(e);return t.success?k(t.data):T({code:"CONFIG_VALIDATION_ERROR",message:"Config validation failed",details:{errors:t.error.issues.map(s=>`${s.path.join(".")}: ${s.message}`)}})}var Qe=require("yaml");var Mr=1048576,ze=1e3,We=50;function Ze(e,t){if(e.length>Mr)return T({code:"CONFIG_TOO_LARGE",message:`Config exceeds maximum size of 1MB (got ${(e.length/1048576).toFixed(1)}MB)`});let s;try{s=(0,Qe.parse)(e)}catch(i){return T({code:"CONFIG_PARSE_ERROR",message:`Failed to parse YAML: ${i.message}`,cause:i})}let n=Re(s);if(!n.success)return n;let o=n.data;if(o.tests.length>ze)return T({code:"CONFIG_VALIDATION_ERROR",message:`Config exceeds maximum of ${ze} tests (got ${o.tests.length})`});if(o.models.length>We)return T({code:"CONFIG_VALIDATION_ERROR",message:`Config exceeds maximum of ${We} models (got ${o.models.length})`});let r=[],a=new Set;for(let i of o.models)a.has(i.id)&&r.push(`Duplicate model ID "${i.id}"`),a.add(i.id);let c=new Set;for(let 
i of o.tests)c.has(i.name)&&r.push(`Duplicate test name "${i.name}"`),c.add(i.name);for(let i of o.tests)i.prompt&&!(i.prompt in o.prompts)&&r.push(`Test "${i.name}" references prompt "${i.prompt}" which is not defined`);for(let i of o.tests)if(i.models)for(let l of i.models)a.has(l)||r.push(`Test "${i.name}" references model "${l}" which is not configured`);for(let i of o.models)o.providers[i.provider]||r.push(`Model "${i.id}" references provider "${i.provider}" which is not configured`);if(o.defaults.judgeModel&&!a.has(o.defaults.judgeModel)&&r.push(`defaults.judgeModel "${o.defaults.judgeModel}" is not a configured model`),t.fileReader)for(let i of o.tests){if(i.expect.output?.schemaFile){let l=Xe(t.configDir,i.expect.output.schemaFile);l.success?t.fileReader.readFile(l.data).success||r.push(`Test "${i.name}": schemaFile "${i.expect.output.schemaFile}" not found at ${l.data}`):r.push(`Test "${i.name}": schemaFile "${i.expect.output.schemaFile}" \u2014 ${l.error.message}`)}if(i.expect.toolCalls){for(let l of i.expect.toolCalls)if(l.argsSchema){let p=Xe(t.configDir,l.argsSchema);if(!p.success)r.push(`Test "${i.name}": argsSchema "${l.argsSchema}" for tool "${l.tool}" \u2014 ${p.error.message}`);else{let f=t.fileReader.readFile(p.data);if(!f.success)r.push(`Test "${i.name}": argsSchema "${l.argsSchema}" for tool "${l.tool}" not found at ${p.data}`);else try{l.argsSchemaResolved=JSON.parse(f.data)}catch{r.push(`Test "${i.name}": argsSchema "${l.argsSchema}" for tool "${l.tool}" is not valid JSON`)}}}}}return r.length>0?T({code:"CONFIG_VALIDATION_ERROR",message:"Config cross-reference validation failed",details:{errors:r}}):k(o)}function Xe(e,t){if(t.startsWith("/")||t.startsWith("\\"))return T({code:"PATH_TRAVERSAL",message:"Absolute paths are not allowed in config references"});if(/^[a-zA-Z]:/.test(t))return T({code:"PATH_TRAVERSAL",message:"Absolute paths are not allowed in config references"});let 
s=e.endsWith("/")?e.slice(0,-1):e,n=`${s}/${t}`,o=Ye(n),r=Ye(s);return!o.startsWith(r+"/")&&o!==r?T({code:"PATH_TRAVERSAL",message:`Path "${t}" escapes the config directory`}):k(o)}function Ye(e){let t=e.split("/"),s=[];for(let o of t)o==="."||o===""||(o===".."?s.pop():s.push(o));return(e.startsWith("/")?"/":"")+s.join("/")}var qe=/\{\{([\w.]+)\}\}/g;function J(e,t,s){let n=Me(e,t,s);if(n.length>0)return T({code:"CONFIG_VALIDATION_ERROR",message:`Missing template variables: ${n.join(", ")}`,details:{missing:n}});let o=e.replace(qe,(r,a)=>{if(a.startsWith("env.")){let c=a.slice(4),i=s?.[c];return i===void 0?r:i}return t[a]});return k(o)}function Me(e,t,s){let n=new Set;for(let o of e.matchAll(qe)){let r=o[1];r!==void 0&&(r.startsWith("env.")||r in t||n.add(r))}return[...n]}function ye(e){let t=e[0];if(!t)return T("aggregateRuns requires at least one run");let{testCaseName:s,modelId:n}=t,r=e.filter(R=>R.assertions.every(x=>x.passed)).length/e.length,a=new Map;for(let R of e)for(let x of R.assertions){let g=x.label?`${x.assertionType}:${x.label}`:x.assertionType,u=a.get(g);u||(u=[],a.set(g,u)),u.push(x.score)}let c={};for(let[R,x]of a){let g=x.reduce((u,m)=>u+m,0);c[R]={mean:g/x.length,min:Math.min(...x),max:Math.max(...x)}}let i=new Set;for(let R of e)for(let x of R.assertions)!x.passed&&x.failureCode&&i.add(x.failureCode);let l=e.reduce((R,x)=>R+x.latencyMs,0)/e.length,p=e.reduce((R,x)=>R+(x.costEstimateUsd??0),0),f=e.reduce((R,x)=>R+x.tokenUsage.totalTokens,0),y=e.some(R=>R.errored===!0);return k({testCaseName:s,modelId:n,runCount:e.length,passed:r===1,errored:y,passRate:r,assertionScores:c,failureCodes:[...i],latencyAvgMs:l,totalCostUsd:p,totalTokens:f,runs:e})}function be(e){let t=e.stdout.split(`
|
|
33
|
+
`),s=[],n=[],o,r=0;for(let a of t){let c=a.trimStart();if(!c.startsWith('{"kindlm":')){s.push(a);continue}let i=Sr(c);if(!i){s.push(a);continue}i.kindlm==="tool_call"?n.push({id:i.id??`cmd_tc_${r++}`,name:i.name,arguments:i.arguments}):i.kindlm==="output_json"&&(o=i.data)}return{outputText:s.join(`
|
|
34
|
+
`).trim(),toolCalls:n,outputJson:o,exitCode:e.exitCode,stderr:e.stderr}}function Sr(e){try{let t=JSON.parse(e);return typeof t.kindlm!="string"?null:t.kindlm==="tool_call"?typeof t.name!="string"?null:{kindlm:"tool_call",id:typeof t.id=="string"?t.id:void 0,name:t.name,arguments:typeof t.arguments=="object"&&t.arguments!==null?t.arguments:{}}:t.kindlm==="output_json"?{kindlm:"output_json",data:t.data}:null}catch{return null}}function et(e,t,s){return{async run(){let n=Date.now(),o=new Map;for(let b of e.tests)if(b.expect.output?.schemaFile){let S=Ir(t.configDir,b.expect.output.schemaFile),O=t.fileReader.readFile(S);if(!O.success)return T({code:"SCHEMA_FILE_ERROR",message:`Failed to read schema file "${b.expect.output.schemaFile}": ${O.error.message}`});try{o.set(b.name,JSON.parse(O.data))}catch(j){return T({code:"SCHEMA_FILE_ERROR",message:`Failed to parse schema file "${b.expect.output.schemaFile}" as JSON: ${j instanceof Error?j.message:String(j)}`})}}let r=s?.tags,a=[];for(let b of e.tests){if(b.skip)continue;if(r&&r.length>0){let O=b.tags??[];if(!r.some(L=>O.includes(L)))continue}let S=b.repeat??e.defaults.repeat;if(b.command)for(let O=0;O<S;O++)a.push({test:b,modelConfig:null,runIndex:O});else{let O=b.models??e.models.map(j=>j.id);for(let j of O){let L=e.models.find(I=>I.id===j);if(L)for(let I=0;I<S;I++)a.push({test:b,modelConfig:L,runIndex:I})}}}let c=0,i=!1,l=e.gates?.costMaxUsd,p=await _r(a.map(b=>async()=>{if(i)return vr(b.test.name,b.modelConfig?.id??"command",b.runIndex,c,l??0);let S=await Or(e,t,b,o);return S.costEstimateUsd!==null&&S.costEstimateUsd!==void 0&&(c+=S.costEstimateUsd,l!==void 0&&c>l&&(i=!0)),S}),e.defaults.concurrency),f=b=>`${b.testCaseName}::${b.modelId}`,y=new Map;for(let b of p){let S=f(b),O=y.get(S);O||(O=[],y.set(S,O)),O.push(b)}let R=[];for(let b of y.values()){let S=ye(b);if(!S.success)return T({code:"UNKNOWN_ERROR",message:S.error});R.push(S.data)}let x=R.map(b=>{let 
S=b.errored?"errored":b.passed?"passed":"failed",O=b.passed?b.runs[0]:b.runs.find(L=>!L.assertions.every(I=>I.passed))??b.runs[0],j;if(S==="errored"){let L=b.runs.find(I=>I.errored&&I.error);if(L?.error)j={code:"UNKNOWN_ERROR",message:L.error.message};else{let I=b.runs.flatMap(X=>X.assertions).find(X=>!X.passed&&X.failureMessage);I?.failureMessage&&(j={code:"UNKNOWN_ERROR",message:I.failureMessage})}}return{name:b.testCaseName,modelId:b.modelId,status:S,assertions:O?.assertions??[],error:j,latencyMs:b.latencyAvgMs,costUsd:b.totalCostUsd}}),g=e.tests.filter(b=>b.skip).map(b=>({name:b.name,modelId:"",status:"skipped",assertions:[],latencyMs:0,costUsd:0})),u=[...x,...g],m=u.filter(b=>b.status==="passed").length,E=u.filter(b=>b.status==="failed").length,C=u.filter(b=>b.status==="errored").length,w=u.filter(b=>b.status==="skipped").length,A=C>0?"errored":E>0?"failed":"passed",P={suites:[{name:e.suite.name,status:A,tests:u}],totalTests:u.length,passed:m,failed:E,errored:C,skipped:w,durationMs:Date.now()-n};return k({runResult:P,aggregated:R})}}}async function Or(e,t,s,n){let{test:o,modelConfig:r,runIndex:a}=s;if(o.command)return kr(e,t,o,a,n);if(!r)return $(o.name,"unknown",a,"No model config for prompt-based test");try{t.onProgress?.({type:"test_start",test:o.name,model:r.id,run:a})}catch{}try{let c=t.adapters.get(r.provider);if(!c)return $(o.name,r.id,a,`Provider adapter "${r.provider}" not found`);let i=o.prompt?e.prompts[o.prompt]:void 0;if(!i)return $(o.name,r.id,a,`Prompt "${o.prompt}" not defined`);let l=J(i.user,o.vars);if(!l.success)return $(o.name,r.id,a,l.error.message);let p=[];if(i.system){let P=J(i.system,o.vars);if(!P.success)return $(o.name,r.id,a,P.error.message);p.push({role:"system",content:P.data})}if(p.push({role:"user",content:l.data}),i.assistant){let P=J(i.assistant,o.vars);if(!P.success)return $(o.name,r.id,a,P.error.message);p.push({role:"assistant",content:P.data})}let 
f=(o.tools??[]).map(P=>({name:P.name,description:P.description,parameters:P.parameters})),y={model:r.model,messages:p,params:{temperature:r.params.temperature,maxTokens:r.params.maxTokens,topP:r.params.topP,stopSequences:r.params.stopSequences,seed:r.params.seed},tools:f.length>0?f:void 0},R=await ge(c,y,o.tools??[]),x=c.estimateCost(r.model,R.totalUsage),g={};o.expect.output?.schemaFile&&n.has(o.name)&&(g.schemaContent=n.get(o.name));let u=z(o.expect,g),m=e.defaults.judgeModel??e.models[0]?.id,E=e.models.find(P=>P.id===m),C=E?t.adapters.get(E.provider):void 0,w={outputText:R.finalText,toolCalls:R.allToolCalls,configDir:t.configDir,latencyMs:R.totalLatencyMs,costUsd:x??void 0,judgeAdapter:C,judgeModel:E?.model,getEmbedding:c.embed?(P=>b=>P(b))(c.embed):void 0};if(t.baselineData){let P=`${o.name}::${r.id}`,b=t.baselineData.results[P];b&&(w.baselineText=b.outputText)}let A=[];for(let P of u){let b=await P.evaluate(w);A.push(...b)}let B=A.every(P=>P.passed);try{t.onProgress?.({type:"test_complete",test:o.name,model:r.id,run:a,passed:B})}catch{}return{testCaseName:o.name,modelId:r.id,runIndex:a,outputText:R.finalText,assertions:A,latencyMs:R.totalLatencyMs,tokenUsage:R.totalUsage,costEstimateUsd:x}}catch(c){try{t.onProgress?.({type:"test_complete",test:o.name,model:r.id,run:a,passed:!1})}catch{}return $(o.name,r.id,a,c instanceof Error?c.message:String(c))}}async function kr(e,t,s,n,o){let r="command";try{t.onProgress?.({type:"test_start",test:s.name,model:r,run:n})}catch{}try{if(!t.commandExecutor)return $(s.name,r,n,"Command executor not available");if(!s.command)return $(s.name,r,n,"No command specified");let a=J(s.command,s.vars);if(!a.success)return $(s.name,r,n,a.error.message);let c=Date.now(),i=await t.commandExecutor.execute(a.data,{timeoutMs:e.defaults.timeoutMs,cwd:t.configDir});if(!i.success)return $(s.name,r,n,i.error.message);let l=Date.now()-c,p=be(i.data),f={};s.expect.output?.schemaFile&&o.has(s.name)&&(f.schemaContent=o.get(s.name));let 
y=z(s.expect,f),R=e.defaults.judgeModel??e.models[0]?.id,x=e.models.find(C=>C.id===R),g=x?t.adapters.get(x.provider):void 0,u={outputText:p.outputText,outputJson:p.outputJson,toolCalls:p.toolCalls,configDir:t.configDir,latencyMs:l,judgeAdapter:g,judgeModel:x?.model,getEmbedding:g?.embed?(C=>w=>C(w))(g.embed):void 0};if(t.baselineData){let C=`${s.name}::${r}`,w=t.baselineData.results[C];w&&(u.baselineText=w.outputText)}let m=[];for(let C of y){let w=await C.evaluate(u);m.push(...w)}let E=m.every(C=>C.passed);try{t.onProgress?.({type:"test_complete",test:s.name,model:r,run:n,passed:E})}catch{}return{testCaseName:s.name,modelId:r,runIndex:n,outputText:p.outputText,assertions:m,latencyMs:l,tokenUsage:{inputTokens:0,outputTokens:0,totalTokens:0},costEstimateUsd:null}}catch(a){try{t.onProgress?.({type:"test_complete",test:s.name,model:r,run:n,passed:!1})}catch{}return $(s.name,r,n,a instanceof Error?a.message:String(a))}}function vr(e,t,s,n,o){return{testCaseName:e,modelId:t,runIndex:s,outputText:"",assertions:[{assertionType:"cost",label:"Budget exceeded",passed:!1,score:0,failureCode:"BUDGET_EXCEEDED",failureMessage:`Run budget exceeded: $${n.toFixed(4)} > $${o.toFixed(4)}`}],latencyMs:0,tokenUsage:{inputTokens:0,outputTokens:0,totalTokens:0},costEstimateUsd:0}}function $(e,t,s,n){return{testCaseName:e,modelId:t,runIndex:s,outputText:"",assertions:[{assertionType:"internal",label:"Execution error",passed:!1,score:0,failureCode:"INTERNAL_ERROR",failureMessage:n}],latencyMs:0,tokenUsage:{inputTokens:0,outputTokens:0,totalTokens:0},costEstimateUsd:null,errored:!0,error:{code:"UNKNOWN_ERROR",message:n}}}async function _r(e,t){let s=Array.from({length:e.length}),n=0;async function o(){for(;n<e.length;){let a=n++,c=e[a];if(c===void 0)throw new Error(`Task at index ${a} is undefined`);s[a]=await c()}}let r=Array.from({length:Math.min(t,e.length)},()=>o());return await Promise.all(r),s}function Ir(e,t){return 
t.startsWith("/")?t:`${e.endsWith("/")?e.slice(0,-1):e}/${t}`}function st(e,t){let s=[],n=t.reduce((l,p)=>l+p.runCount,0),o=t.reduce((l,p)=>l+Math.round(p.passRate*p.runCount),0),r=n>0?o/n:0;s.push({gateName:"passRateMin",passed:r>=e.passRateMin,actual:r,threshold:e.passRateMin,message:r>=e.passRateMin?`Pass rate ${M(r)} meets minimum ${M(e.passRateMin)}`:`Pass rate ${M(r)} below minimum ${M(e.passRateMin)}`});let a=Se(t,["SCHEMA_INVALID","SCHEMA_PARSE_ERROR"]);if(s.push({gateName:"schemaFailuresMax",passed:a<=e.schemaFailuresMax,actual:a,threshold:e.schemaFailuresMax,message:a<=e.schemaFailuresMax?`Schema failures ${a} within limit ${e.schemaFailuresMax}`:`Schema failures ${a} exceed limit ${e.schemaFailuresMax}`}),e.judgeAvgMin!==void 0){let l=rt(t,"judge"),p=l.length>0?l.reduce((f,y)=>f+y,0)/l.length:1;s.push({gateName:"judgeAvgMin",passed:p>=e.judgeAvgMin,actual:p,threshold:e.judgeAvgMin,message:p>=e.judgeAvgMin?`Judge average ${M(p)} meets minimum ${M(e.judgeAvgMin)}`:`Judge average ${M(p)} below minimum ${M(e.judgeAvgMin)}`})}if(e.driftScoreMax!==void 0){let p=rt(t,"drift").map(y=>1-y),f=p.length>0?Math.max(...p):0;s.push({gateName:"driftScoreMax",passed:f<=e.driftScoreMax,actual:f,threshold:e.driftScoreMax,message:f<=e.driftScoreMax?`Drift score ${M(f)} within limit ${M(e.driftScoreMax)}`:`Drift score ${M(f)} exceeds limit ${M(e.driftScoreMax)}`})}let c=Se(t,["PII_DETECTED"]);s.push({gateName:"piiFailuresMax",passed:c<=e.piiFailuresMax,actual:c,threshold:e.piiFailuresMax,message:c<=e.piiFailuresMax?`PII failures ${c} within limit ${e.piiFailuresMax}`:`PII failures ${c} exceed limit ${e.piiFailuresMax}`});let i=Se(t,["KEYWORD_DENIED","KEYWORD_MISSING"]);if(s.push({gateName:"keywordFailuresMax",passed:i<=e.keywordFailuresMax,actual:i,threshold:e.keywordFailuresMax,message:i<=e.keywordFailuresMax?`Keyword failures ${i} within limit ${e.keywordFailuresMax}`:`Keyword failures ${i} exceed limit ${e.keywordFailuresMax}`}),e.costMaxUsd!==void 0){let 
l=t.reduce((p,f)=>p+f.totalCostUsd,0);s.push({gateName:"costMaxUsd",passed:l<=e.costMaxUsd,actual:l,threshold:e.costMaxUsd,message:l<=e.costMaxUsd?`Total cost $${l.toFixed(4)} within limit $${e.costMaxUsd.toFixed(4)}`:`Total cost $${l.toFixed(4)} exceeds limit $${e.costMaxUsd.toFixed(4)}`})}if(e.latencyMaxMs!==void 0){let l=t.length>0?t.reduce((p,f)=>p+f.latencyAvgMs,0)/t.length:0;s.push({gateName:"latencyMaxMs",passed:l<=e.latencyMaxMs,actual:l,threshold:e.latencyMaxMs,message:l<=e.latencyMaxMs?`Average latency ${Math.round(l)}ms within limit ${e.latencyMaxMs}ms`:`Average latency ${Math.round(l)}ms exceeds limit ${e.latencyMaxMs}ms`})}if(e.deterministicPassRate!==void 0){let l=tt(t,"deterministic");s.push({gateName:"deterministicPassRate",passed:l>=e.deterministicPassRate,actual:l,threshold:e.deterministicPassRate,message:l>=e.deterministicPassRate?`Deterministic pass rate ${M(l)} meets minimum ${M(e.deterministicPassRate)}`:`Deterministic pass rate ${M(l)} below minimum ${M(e.deterministicPassRate)}`})}if(e.probabilisticPassRate!==void 0){let l=tt(t,"probabilistic");s.push({gateName:"probabilisticPassRate",passed:l>=e.probabilisticPassRate,actual:l,threshold:e.probabilisticPassRate,message:l>=e.probabilisticPassRate?`Probabilistic pass rate ${M(l)} meets minimum ${M(e.probabilisticPassRate)}`:`Probabilistic pass rate ${M(l)} below minimum ${M(e.probabilisticPassRate)}`})}return{passed:s.every(l=>l.passed),gates:s}}function tt(e,t){let s=0,n=0;for(let o of e)for(let r of o.runs)for(let a of r.assertions)H(a.assertionType)===t&&(s++,a.passed&&n++);return s>0?n/s:1}function Se(e,t){let s=0;for(let n of e)for(let o of n.runs)for(let r of o.assertions)!r.passed&&r.failureCode&&t.includes(r.failureCode)&&s++;return s}function rt(e,t){let s=[];for(let n of e)for(let[o,r]of Object.entries(n.assertionScores))(o===t||o.startsWith(`${t}:`))&&s.push(r.mean);return s}function M(e){return(e*100).toFixed(1)+"%"}var 
F=e=>e,xe={bold:F,red:F,green:F,yellow:F,cyan:F,dim:F,greenBold:F,redBold:F};
// F is an identity function and xe maps every style name to it, so the default
// palette returns text unstyled.
/**
 * Creates the "pretty" text reporter.
 *
 * `e` is a palette object with bold/red/green/yellow/cyan/dim/greenBold/redBold
 * functions (defaults to the unstyled palette xe).
 *
 * generate(t, s) takes the run summary `t` and the gate evaluation `s` and
 * resolves to { content, format: "text" }. The report lists every suite, test and
 * assertion, then a summary line, the duration, the total cost (only when the
 * summed costUsd is > 0), the quality gates (only when there are any) and a final
 * verdict. The verdict is "All tests passed" only when failed and errored are
 * both 0 and the gates passed.
 */
function ot(e=xe){return{name:"pretty",async generate(t,s){let n=[],o=e;n.push(""),n.push(o.bold(" KindLM Test Results")),n.push("");let r=0;for(let p of t.suites){n.push(Nr(p,o));for(let f of p.tests){n.push(jr(f,o));let y=$r(f,o);y&&n.push(y);for(let R of f.assertions)n.push(Dr(R,o));r+=f.costUsd}n.push("")}n.push(o.bold(" Summary"));let a=o.green(`${t.passed} passed`),c=t.failed>0?o.red(`${t.failed} failed`):`${t.failed} failed`,i=t.errored>0?o.yellow(`${t.errored} errored`):`${t.errored} errored`;if(n.push(` ${a}, ${c}, ${i} (${t.totalTests} total)`),n.push(` Duration: ${at(t.durationMs)}`),r>0&&n.push(` Cost: ${it(r)}`),n.push(""),s.gates.length>0){n.push(o.bold(" Quality Gates"));for(let p of s.gates){let f=p.passed?o.green("\u2713"):o.red("\u2717");n.push(` ${f} ${p.message}`)}n.push("")}return t.failed===0&&t.errored===0&&s.passed?n.push(o.greenBold(" \u2713 All tests passed")):n.push(o.redBold(" \u2717 Some tests failed")),n.push(""),{content:n.join(`
|
|
35
|
+
`),format:"text"}}}}
/**
 * Suite header line for the pretty reporter: a status mark (check when passed,
 * hollow circle when skipped, cross otherwise) followed by the bold suite name.
 */
function Nr(e,t){return` ${e.status==="passed"?t.green("\u2713"):e.status==="skipped"?t.yellow("\u25CB"):t.red("\u2717")} ${t.bold(e.name)}`}
/** Test line for the pretty reporter: same status marks as Nr, name left unstyled. */
function jr(e,t){return` ${e.status==="passed"?t.green("\u2713"):e.status==="skipped"?t.yellow("\u25CB"):t.red("\u2717")} ${e.name}`}
/**
 * Optional dimmed detail line for a test: model id, latency (only when > 0) and
 * cost (only when >= 0.00005), joined by a middle dot. Returns null for skipped
 * tests or when none of the three parts applies.
 */
function $r(e,t){if(e.status==="skipped")return null;let s=[];return e.modelId&&s.push(e.modelId),e.latencyMs>0&&s.push(at(e.latencyMs)),e.costUsd>=5e-5&&s.push(it(e.costUsd)),s.length===0?null:` ${t.dim(s.join(" \xB7 "))}`}
/**
 * Assertion line for the pretty reporter. Passing assertions are shown dimmed;
 * failing ones show the label followed by failureMessage (or "failed" when the
 * message is null/undefined). A score suffix from nt is appended in cyan when present.
 */
function Dr(e,t){if(e.passed){let r=nt(e),a=r?`${e.label} ${t.cyan(r)}`:e.label;return` ${t.green("\u2713")} ${t.dim(a)}`}let s=nt(e),n=e.failureMessage??"failed",o=s?`${e.label} ${t.cyan(s)}`:e.label;return` ${t.red("\u2717")} ${o}: ${n}`}
/**
 * Score suffix for "judge" and "drift" assertions: "(score >= threshold)" or
 * "(score < threshold)" depending on `passed` when a numeric threshold is
 * available, otherwise just "(score)". Returns "" for every other assertion type.
 * Calls e.score.toFixed, so it assumes judge/drift assertions carry a numeric score.
 */
function nt(e){if(e.assertionType==="judge"||e.assertionType==="drift"){let t=Lr(e);if(t!==null){let s=e.passed?"\u2265":"<";return`(${e.score.toFixed(2)} ${s} ${t.toFixed(2)})`}return`(${e.score.toFixed(2)})`}return""}
/** Returns e.metadata.threshold when metadata is an object holding a numeric threshold, else null. */
function Lr(e){if(e.metadata&&typeof e.metadata=="object"&&"threshold"in e.metadata){let t=e.metadata.threshold;if(typeof t=="number")return t}return null}
/** Formats a duration: below 1000 as "<n>ms", otherwise as seconds with two decimals. */
function at(e){return e<1e3?`${e}ms`:`${(e/1e3).toFixed(2)}s`}
/** Formats a dollar amount: four decimals below 0.01, two decimals otherwise. */
function it(e){return e<.01?`$${e.toFixed(4)}`:`$${e.toFixed(2)}`}
/**
 * Creates the "json" reporter. `e` is the tool version recorded in the output
 * ("0.0.0" when null/undefined). generate(t, s) resolves to { content, format: "json" }
 * where content is 2-space-indented JSON holding the version and timestamp, the
 * summary counters, the gate results and a per-suite / per-test breakdown.
 */
function lt(e){return{name:"json",async generate(t,s){let n={kindlm:{version:e??"0.0.0",timestamp:new Date().toISOString()},summary:{totalTests:t.totalTests,passed:t.passed,failed:t.failed,errored:t.errored,skipped:t.skipped,durationMs:t.durationMs},gates:{passed:s.passed,results:s.gates},suites:t.suites.map(o=>({name:o.name,status:o.status,tests:o.tests.map(r=>({name:r.name,status:r.status,assertions:r.assertions,latencyMs:r.latencyMs,costUsd:r.costUsd}))}))};return{content:JSON.stringify(n,null,2),format:"json"}}}}
/**
 * Creates the "junit" reporter. generate(e, t) takes the run summary `e` and the
 * gate evaluation `t` and resolves to { content, format: "xml" }. Times are
 * emitted in seconds with three decimals; dynamic text is escaped with D.
 */
function ut(){return{name:"junit",async generate(e,t){let s=e.durationMs/1e3,n=[];n.push('<?xml version="1.0" encoding="UTF-8"?>'),n.push(`<testsuites name="KindLM" tests="${e.totalTests}" failures="${e.failed}" 
errors="${e.errored}" time="${s.toFixed(3)}">`);
// One <testsuite> per suite. Failure/error counts come from the test statuses and
// the suite time is the sum of the tests' latencyMs. A skipped test gets <skipped/>,
// an errored test with an error object gets <error>, and a failed test gets one
// <failure> per failed assertion.
for(let o of e.suites){let r=o.tests.filter(i=>i.status==="failed").length,a=o.tests.filter(i=>i.status==="errored").length,c=o.tests.reduce((i,l)=>i+l.latencyMs,0)/1e3;n.push(` <testsuite name="${D(o.name)}" tests="${o.tests.length}" failures="${r}" errors="${a}" time="${c.toFixed(3)}">`);for(let i of o.tests){let l=i.latencyMs/1e3;if(n.push(` <testcase name="${D(i.name)}" classname="${D(o.name)}" time="${l.toFixed(3)}">`),i.status==="skipped")n.push(" <skipped/>");else if(i.status==="errored"&&i.error)n.push(` <error message="${D(i.error.message)}" type="${D(i.error.code)}">${D(i.error.message)}</error>`);else if(i.status==="failed"){let p=i.assertions.filter(f=>!f.passed);for(let f of p)n.push(` <failure message="${D(f.label)}" type="${D(f.failureCode??"ASSERTION_FAILED")}">${D(f.failureMessage??"Assertion failed")}</failure>`)}n.push(" </testcase>")}n.push(" </testsuite>")}
// Quality gates are reported as an extra synthetic suite, one <testcase> per gate,
// with a GATE_FAILED <failure> for each gate that did not pass.
if(t.gates.length>0){let o=t.gates.filter(r=>!r.passed).length;n.push(` <testsuite name="Quality Gates" tests="${t.gates.length}" failures="${o}" errors="0" time="0.000">`);for(let r of t.gates)n.push(` <testcase name="${D(r.gateName)}" classname="Quality Gates" time="0.000">`),r.passed||n.push(` <failure message="${D(r.message)}" type="GATE_FAILED">${D(r.message)}</failure>`),n.push(" </testcase>");n.push(" </testsuite>")}
return n.push("</testsuites>"),{content:n.join(`
|
|
36
|
+
`),format:"xml"}}}}function D(e){return e=e.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\uFFFE\uFFFF]/g,""),e.replace(/&/g,"&").replace(/</g,"<").replace(/>/g,">").replace(/"/g,""").replace(/'/g,"'")}async function ct(e){let t=new TextEncoder().encode(e),s=await globalThis.crypto.subtle.digest("SHA-256",t),n=new Uint8Array(s);return Array.from(n).map(o=>o.toString(16).padStart(2,"0")).join("")}function Oe(e){if(Array.isArray(e))return e.map(Oe);if(e&&typeof e=="object"){let t=Object.entries(e).sort(([s],[n])=>s.localeCompare(n)).map(([s,n])=>[s,Oe(n)]);return Object.fromEntries(t)}return e}function Ur(e){return JSON.stringify(Oe(e))}function pt(e){return{name:"compliance",async generate(t,s){let n=new Date().toISOString(),o=e?.kindlmVersion??"unknown",r=[];r.push("# EU AI Act \u2014 Annex IV Compliance Report"),r.push(""),r.push(`**Generated:** ${n}`),r.push("**Framework:** EU AI Act (Regulation 2024/1689)"),r.push(`**Tool:** KindLM v${o}`),e?.systemName&&r.push(`**System Name:** ${e.systemName}`),e?.operator&&r.push(`**Operator:** ${e.operator}`),e?.riskLevel&&r.push(`**Risk Level:** ${e.riskLevel}`),e?.intendedPurpose&&r.push(`**Intended Purpose:** ${e.intendedPurpose}`),r.push(""),r.push("## Article 9 \u2014 Risk Management System"),r.push(""),r.push("Testing demonstrates ongoing risk identification and mitigation through automated behavioral regression tests."),r.push(""),r.push(Ae(s,["passRateMin"])),r.push("## Article 10 \u2014 Data and Data Governance"),r.push(""),r.push("PII detection guardrails verify that personal data is not exposed in AI system outputs."),r.push(""),r.push(Ae(s,["piiFailuresMax","keywordFailuresMax"])),r.push("## Article 12 \u2014 Record-Keeping"),r.push(""),r.push("### Test Execution Log"),r.push(""),r.push("| Metric | Value |"),r.push("|--------|-------|"),r.push(`| Total Tests | ${t.totalTests} |`),r.push(`| Passed | ${t.passed} |`),r.push(`| Failed | ${t.failed} |`),r.push(`| Errored | ${t.errored} |`),r.push(`| Duration | ${t.durationMs}ms 
|`),r.push(""),r.push("### Suite Results"),r.push("");for(let p of t.suites){r.push(`**${p.name}** \u2014 ${p.status}`);for(let f of p.tests){let y=f.status==="passed"?"PASS":"FAIL";r.push(`- [${y}] ${f.name}`)}r.push("")}r.push("## Article 13 \u2014 Transparency and Provision of Information"),r.push(""),r.push("This report provides transparent documentation of AI system testing methodology, results, and quality gate evaluations as required under Article 13."),r.push(""),r.push(Ae(s,["judgeAvgMin","driftScoreMax"])),r.push("## Article 15 \u2014 Accuracy, Robustness and Cybersecurity"),r.push(""),r.push("Schema validation and behavioral assertions verify output accuracy and robustness."),r.push(""),r.push(Ae(s,["schemaFailuresMax","costMaxUsd","latencyMaxMs"])),r.push("## Quality Gate Summary"),r.push(""),r.push("| Gate | Result | Actual | Threshold |"),r.push("|------|--------|--------|-----------|");for(let p of s.gates){let f=p.passed?"PASS":"FAIL";r.push(`| ${p.gateName} | ${f} | ${dt(p.actual)} | ${dt(p.threshold)} |`)}r.push("");let a=s.passed?"PASS":"FAIL";r.push(`**Overall Verdict:** ${a}`),r.push("");let c=r.join(`
|
|
37
|
+
`),i=await ct(c),l=e?await ct(Ur({content:c,metadata:e})):i;
// i is the SHA-256 of the report text built so far (c). l is the SHA-256 of the
// key-sorted JSON of { content, metadata } when reporter metadata `e` was supplied,
// otherwise it is the same value as i. The footer lines below are appended after
// hashing, so neither hash covers them.
return r.push("---"),r.push(`**Tamper Evidence Hash (SHA-256):** \`${i}\``),r.push(`**Run Identity Hash (SHA-256):** \`${l}\``),r.push(`**Run ID:** ${e?.runId??"N/A"}`),r.push(`**KindLM Version:** ${e?.kindlmVersion??"N/A"}`),r.push(`**Git Commit:** ${e?.gitCommitSha??"N/A"}`),r.push(`**Models:** ${e?.modelIds?.join(", ")??"N/A"}`),r.push(""),{content:r.join(`
|
|
38
|
+
`),format:"markdown"}}}}
/**
 * Renders a "Gate Evidence" Markdown list for the gates in `e.gates` whose
 * gateName is listed in `t`: one "- [PASS|FAIL] message" bullet per gate followed
 * by a blank entry. Returns "" when none of the requested gates is present.
 */
function Ae(e,t){let s=e.gates.filter(o=>t.includes(o.gateName));if(s.length===0)return"";let n=[];n.push("**Gate Evidence:**");for(let o of s){let r=o.passed?"PASS":"FAIL";n.push(`- [${r}] ${o.message}`)}return n.push(""),n.join(`
|
|
39
|
+
`)}
/**
 * Formats a gate value for the summary table: integers via String(), any other
 * value with four decimals. Assumes `e` is a number (it calls e.toFixed).
 */
function dt(e){return Number.isInteger(e)?String(e):e.toFixed(4)}
// G: the current baseline format version.
// ke: the versions ve accepts.
// Fr: maps a version to a function that upgrades a baseline object one step (empty here).
var G="1",ke=["1"],Fr={};
/**
 * Validates a parsed baseline object and upgrades it to the current version G.
 *
 * Returns { ok: true, baseline, migrated } on success or { ok: false, error }
 * when the value is not an object, has no string `version`, uses an unknown
 * version, has no migration path, or fails the required-field check in mt.
 * With the constants above (only version "1" known, no migrations registered)
 * the migration loop is never reached.
 */
function ve(e){if(typeof e!="object"||e===null)return{ok:!1,error:"Baseline data is not an object"};let t=e,s=t.version;if(typeof s!="string")return{ok:!1,error:"Baseline data missing version field"};if(s===G){let a=mt(t);return a.ok?{ok:!0,baseline:t,migrated:!1}:a}if(!ke.includes(s))return{ok:!1,error:`Unsupported baseline version: "${s}". Known versions: ${ke.join(", ")}`};let n=s,o=!1;for(;n!==G;){let a=Fr[n];if(!a)return{ok:!1,error:`No migration path from version "${n}" to "${G}"`};t=a(t),n=t.version,o=!0}let r=mt(t);return r.ok?{ok:!0,baseline:t,migrated:o}:r}
/**
 * Checks the required baseline fields: suiteName and createdAt must be strings
 * and results must be a non-null object. Returns { ok: true } or { ok: false, error }.
 */
function mt(e){return typeof e.suiteName!="string"?{ok:!1,error:"Baseline file missing required field: suiteName"}:typeof e.createdAt!="string"?{ok:!1,error:"Baseline file missing required field: createdAt"}:typeof e.results!="object"||e.results===null?{ok:!1,error:"Baseline file missing required field: results"}:{ok:!0}}
/** Serializes a baseline object as 2-space-indented JSON. */
function _e(e){return JSON.stringify(e,null,2)}
/**
 * Parses baseline JSON text and validates/migrates it with ve.
 *
 * Failures are returned via T(...) with code "BASELINE_CORRUPT" (invalid JSON,
 * not an object, missing version, or a known version that fails validation) or
 * "BASELINE_VERSION_MISMATCH" (version not in ke). Success is returned via k(baseline).
 * T and k are defined outside this chunk - presumably the error/success result
 * constructors; confirm.
 */
function Ie(e){let t;try{t=JSON.parse(e)}catch{return T({code:"BASELINE_CORRUPT",message:"Baseline file is not valid JSON"})}if(typeof t!="object"||t===null)return T({code:"BASELINE_CORRUPT",message:"Baseline file is not a JSON object"});let s=t;if(typeof s.version!="string")return T({code:"BASELINE_CORRUPT",message:"Baseline file missing required field: version"});let n=ve(t);if(!n.ok){let o=ke.includes(s.version);return T({code:o?"BASELINE_CORRUPT":"BASELINE_VERSION_MISMATCH",message:n.error})}return k(n.baseline)}
/**
 * Reads the baseline named `e` through the I/O object `t` (t.read). When the read
 * result has a truthy `success`, its `data` is parsed with Ie; otherwise the read
 * result is returned unchanged.
 */
function ft(e,t){let s=t.read(e);return s.success?Ie(s.data):s}
/** Serializes baseline `e` and writes it through t.write under e.suiteName; returns what t.write returns. */
function gt(e,t){let s=_e(e);return t.write(e.suiteName,s)}
/** Returns e.list() unchanged. */
function ht(e){return e.list()}
/**
 * Builds a baseline object from aggregated results.
 *
 * `e` is the suite name, `t` the array of aggregated results and `s` the
 * createdAt value. Each result is stored under the key "<testCaseName>::<modelId>"
 * with passRate, outputText, failureCodes, latencyAvgMs, costUsd (taken from
 * totalCostUsd) and runCount. outputText comes from the first run whose assertions
 * all passed, falling back to the first run, and to "" when there is none.
 * Returns { version: G, suiteName, createdAt, results }.
 */
function Rt(e,t,s){let n={};for(let o of t){let 
r=`${o.testCaseName}::${o.modelId}`,c=o.runs.find(i=>i.assertions.every(l=>l.passed))??o.runs[0];n[r]={passRate:o.passRate,outputText:c?.outputText??"",failureCodes:o.failureCodes,latencyAvgMs:o.latencyAvgMs,costUsd:o.totalCostUsd,runCount:o.runCount}}return{version:G,suiteName:e,createdAt:s,results:n}}function yt(e,t){let s=[],n=[],o=[],r=[],a=[],c=new Set(Object.keys(e.results));for(let[i,l]of Object.entries(t)){let p=e.results[i];if(!p){r.push(i);continue}c.delete(i);let f=l.passRate-p.passRate;if(f<-.001){let y=l.failureCodes.filter(R=>!p.failureCodes.includes(R));s.push({testName:i,baselinePassRate:p.passRate,currentPassRate:l.passRate,newFailureCodes:y})}else f>.001?n.push({testName:i,baselinePassRate:p.passRate,currentPassRate:l.passRate}):o.push({testName:i,passRate:l.passRate})}for(let i of c)a.push(i);return{suiteName:e.suiteName,hasBaseline:!0,regressions:s,improvements:n,unchanged:o,newTests:r,removedTests:a}}function At(e){if(!e||typeof e!="object")return T({code:"CONFIG_PARSE_ERROR",message:"OTLP payload must be a non-null object"});let t=e;if(!Array.isArray(t.resourceSpans))return T({code:"CONFIG_PARSE_ERROR",message:"OTLP payload missing resourceSpans array"});let s=[];for(let n of t.resourceSpans){let o=xt(n.resource?.attributes??[]);if(Array.isArray(n.scopeSpans)){for(let r of n.scopeSpans)if(Array.isArray(r.spans))for(let a of r.spans){let c=bt(a.startTimeUnixNano),i=bt(a.endTimeUnixNano);s.push({traceId:a.traceId,spanId:a.spanId,parentSpanId:a.parentSpanId||void 0,name:a.name,kind:a.kind,startTimeMs:c,endTimeMs:i,durationMs:i-c,attributes:xt(a.attributes??[]),resourceAttributes:o,statusCode:a.status?.code,statusMessage:a.status?.message})}}}return k(s)}function bt(e){try{let t=BigInt(e);return Number(t/1000000n)}catch{return 0}}function xt(e){let t={};for(let s of e){let n=Br(s.value);n!==void 0&&(t[s.key]=n)}return t}function Br(e){if(e.stringValue!==void 0)return e.stringValue;if(e.intValue!==void 0)return 
parseInt(e.intValue,10);if(e.doubleValue!==void 0)return e.doubleValue;if(e.boolValue!==void 0)return e.boolValue}function Tt(e,t){return t?e.filter(s=>{if(t.namePattern){if(te(t.namePattern))return!0;if(!new RegExp(t.namePattern).test(s.name))return!1}if(t.attributeMatch){for(let[n,o]of Object.entries(t.attributeMatch))if(String(s.attributes[n])!==o)return!1}return!(t.minDurationMs!==void 0&&s.durationMs<t.minDurationMs)}):e}function Ct(e,t){let s="",n=[],o=0,r=0,a=0,c,i;for(let l of e){let p={...l.resourceAttributes,...l.attributes},f=p[t.outputTextAttr];typeof f=="string"&&f&&(s=s?`${s}
|
|
40
|
+
${f}`:f);
// Model and system keep the last non-empty string seen across the spans.
let y=p[t.modelAttr];typeof y=="string"&&y&&(c=y);let R=p[t.systemAttr];typeof R=="string"&&R&&(i=R);
// Token counts are summed only when numeric; latency only accumulates spans that
// have no parentSpanId.
let x=p[t.inputTokensAttr];typeof x=="number"&&(r+=x);let g=p[t.outputTokensAttr];typeof g=="number"&&(a+=g),l.parentSpanId||(o+=l.durationMs);
// A span with a string "gen_ai.tool.name" becomes a tool call. Its arguments are
// JSON-parsed from "gen_ai.tool.arguments"; a parse failure is deliberately
// ignored and leaves the arguments as {}.
let u=p["gen_ai.tool.name"],m=p["gen_ai.tool.arguments"];if(typeof u=="string"){let E={};if(typeof m=="string")try{E=JSON.parse(m)}catch{}n.push({id:l.spanId,name:u,arguments:E})}}return{outputText:s,toolCalls:n,latencyMs:o,inputTokens:r,outputTokens:a,model:c,system:i}}
/**
 * Builds an assertion context from a mapped trace result `e` and options `t`:
 * copies outputText and latencyMs from `e`, re-maps each tool call to
 * { id, name, arguments }, and takes configDir, judgeAdapter, judgeModel and
 * baselineText from `t`.
 */
function Et(e,t){let s=e.toolCalls.map(n=>({id:n.id,name:n.name,arguments:n.arguments}));return{outputText:e.outputText,toolCalls:s,configDir:t.configDir,latencyMs:e.latencyMs,judgeAdapter:t.judgeAdapter,judgeModel:t.judgeModel,baselineText:t.baselineText}}
// `0 && (...)` short-circuits, so this assignment never runs. Presumably it is
// emitted by the bundler so tooling can statically detect the CommonJS export names.
0&&(module.exports={BASELINE_VERSION,KindLMConfigSchema,ProviderError,SpanFilterSchema,SpanMappingSchema,TraceConfigSchema,aggregateRuns,buildBaselineData,buildContextFromTrace,classifyAssertion,compareBaseline,cosineSimilarity,createAnthropicAdapter,createAssertionsFromExpect,createCohereAdapter,createComplianceReporter,createCostAssertion,createDriftAssertion,createGeminiAdapter,createJsonReporter,createJudgeAssertion,createJunitReporter,createKeywordsAbsentAssertion,createKeywordsPresentAssertion,createLatencyAssertion,createMistralAdapter,createOllamaAdapter,createOpenAIAdapter,createPiiAssertion,createPrettyReporter,createProvider,createRunner,createSchemaAssertion,createToolCalledAssertion,createToolNotCalledAssertion,createToolOrderAssertion,deserializeBaseline,err,evaluateGates,filterSpans,findMissingVars,interpolate,isDeterministic,isProbabilistic,listBaselines,lookupModelPricing,mapSpansToResult,migrateBaseline,noColor,ok,parseCommandOutput,parseConfig,parseOtlpPayload,readBaseline,runConversation,serializeBaseline,validateConfig,validateUnitIntervalScore,withRetry,writeBaseline});
|
|
40
41
|
//# sourceMappingURL=index.cjs.map
|