npm - @vtstech/pi-model-test - Versions diffs - 1.1.2 → 1.1.4 - Mend

@vtstech/pi-model-test 1.1.2 → 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2)

package/model-test.js +110 -544
package/package.json +2 -2

package/model-test.js CHANGED

@@ -1,7 +1,4 @@
 // .build-npm/model-test/model-test.temp.ts
-import * as fs from "node:fs";
-import * as os from "node:os";
-import * as path from "node:path";
 import {
   section,
   ok,
@@ -13,79 +10,21 @@ import {
   sanitizeForReport
 } from "@vtstech/pi-shared/format";
 import { getOllamaBaseUrl, detectModelFamily, readModelsJson, writeModelsJson, fetchModelContextLength, EXTENSION_VERSION, detectProvider } from "@vtstech/pi-shared/ollama";
-var CONFIG = {
-  // General API settings
-  DEFAULT_TIMEOUT_MS: 999999,
-  // ~16.7 minutes — effectively unlimited for slow models
-  CONNECT_TIMEOUT_S: 60,
-  // 60 seconds to establish connection
-  MAX_RETRIES: 1,
-  // Single retry for transient failures
-  RETRY_DELAY_MS: 1e4,
-  // 10 seconds between retries
-  // Model generation settings
-  NUM_PREDICT: 1024,
-  // Max tokens in response
-  TEMPERATURE: 0.1,
-  // Low temperature for more deterministic output
-  // Test-specific settings
-  MIN_THINKING_LENGTH: 10,
-  // Minimum chars to consider thinking tokens valid
-  TOOL_TEST_TIMEOUT_MS: 999999,
-  // Effectively unlimited for slow tool usage tests
-  TOOL_SUPPORT_TIMEOUT_MS: 999999,
-  // Effectively unlimited for tool support detection
-  // Metadata retrieval
-  TAGS_TIMEOUT_MS: 15e3,
-  // 15 seconds for /api/tags
-  MODEL_INFO_TIMEOUT_MS: 3e4,
-  // 30 seconds for model info lookup
-  // Provider API settings
-  PROVIDER_TIMEOUT_MS: 999999,
-  // Effectively unlimited for cloud provider API calls
-  PROVIDER_TOOL_TIMEOUT_MS: 12e4,
-  // 120 seconds for tool usage tests on providers
-  // Rate limiting
-  TEST_DELAY_MS: 1e4
-  // 10 seconds between tests to avoid rate limiting
-};
-var TOOL_SUPPORT_CACHE_DIR = path.join(os.homedir(), ".pi", "agent", "cache");
-var TOOL_SUPPORT_CACHE_PATH = path.join(TOOL_SUPPORT_CACHE_DIR, "tool_support.json");
-var _toolSupportCacheInMemory = null;
-function readToolSupportCache() {
-  try {
-    if (fs.existsSync(TOOL_SUPPORT_CACHE_PATH)) {
-      const raw = fs.readFileSync(TOOL_SUPPORT_CACHE_PATH, "utf-8");
-      return JSON.parse(raw);
-    }
-  } catch {
-  }
-  return {};
-}
-function writeToolSupportCache(cache) {
-  if (!fs.existsSync(TOOL_SUPPORT_CACHE_DIR)) {
-    fs.mkdirSync(TOOL_SUPPORT_CACHE_DIR, { recursive: true });
-  }
-  fs.writeFileSync(TOOL_SUPPORT_CACHE_PATH, JSON.stringify(cache, null, 2) + "\n", "utf-8");
-}
-function getCachedToolSupport(model) {
-  const cache = _toolSupportCacheInMemory || readToolSupportCache();
-  if (!_toolSupportCacheInMemory) _toolSupportCacheInMemory = cache;
-  const entry = cache[model];
-  if (!entry) return null;
-  if (!entry.support || !["native", "react", "none"].includes(entry.support)) return null;
-  return entry;
-}
-function cacheToolSupport(model, support, family) {
-  const cache = _toolSupportCacheInMemory || readToolSupportCache();
-  cache[model] = {
-    support,
-    testedAt: (/* @__PURE__ */ new Date()).toISOString(),
-    family
-  };
-  _toolSupportCacheInMemory = cache;
-  writeToolSupportCache(cache);
-}
+import {
+  ALL_DIALECT_PATTERNS,
+  parseReactWithPatterns
+} from "@vtstech/pi-shared/react-parser";
+import {
+  CONFIG,
+  WEATHER_TOOL_DEFINITION,
+  scoreReasoning,
+  getCachedToolSupport,
+  cacheToolSupport,
+  testToolUsageUnified,
+  testReasoningUnified,
+  testInstructionFollowingUnified,
+  TOOL_SUPPORT_CACHE_PATH
+} from "@vtstech/pi-shared/model-test-utils";
 function model_test_temp_default(pi) {
   function ollamaBase() {
     return getOllamaBaseUrl();
@@ -96,65 +35,75 @@ function model_test_temp_default(pi) {
       await new Promise((r) => setTimeout(r, CONFIG.TEST_DELAY_MS));
     }
   }
-  function scoreReasoning(msg) {
-    const allNumbers = msg.match(/\b(\d+)\b/g) || [];
-    const answer = allNumbers.length > 0 ? allNumbers[allNumbers.length - 1] : "?";
-    const isCorrect = answer === "8";
-    const reasoningPatterns = [
-      "because",
-      "therefore",
-      "since",
-      "step",
-      "subtract",
-      "minus",
-      "each day",
-      "each night",
-      "slides",
-      "climbs",
-      "night",
-      "reaches",
-      "finally",
-      "last day"
-    ];
-    const hasReasoningWords = reasoningPatterns.some((w) => msg.toLowerCase().includes(w));
-    const hasNumberedSteps = /^\s*\d+\.\s/m.test(msg);
-    const hasReasoning = hasReasoningWords || hasNumberedSteps;
-    if (isCorrect && hasReasoning) return { score: "STRONG", pass: true };
-    if (isCorrect) return { score: "MODERATE", pass: true };
-    if (hasReasoning) return { score: "WEAK", pass: false };
-    return { score: "FAIL", pass: false };
-  }
-  function scoreNativeToolCall(fnName, args) {
-    const hasCorrectTool = fnName === "get_weather";
-    const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
-    const unitValid = args.unit === void 0 || typeof args.unit === "string" && ["celsius", "fahrenheit"].includes(args.unit.toLowerCase());
-    if (hasCorrectTool && hasLocation && unitValid) return { score: "STRONG", pass: true };
-    if (hasCorrectTool && hasLocation) return { score: "MODERATE", pass: true };
-    return { score: "WEAK", pass: false };
+  function makeOllamaChatFn() {
+    return async (model, messages, _options) => {
+      const result = await ollamaChat(model, messages);
+      return {
+        content: result.response?.message?.content || "",
+        elapsedMs: result.elapsedMs,
+        raw: result.response
+      };
+    };
   }
-  function scoreTextToolCall(fnName, args) {
-    const isWeatherTool = fnName === "get_weather";
-    const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
-    if (isWeatherTool && hasLocation) return { score: "STRONG", pass: true };
-    if (isWeatherTool) return { score: "MODERATE", pass: true };
-    return { score: "WEAK", pass: false };
+  function makeOllamaToolChatFn() {
+    return async (model, messages, options) => {
+      const tools = options?.tools || void 0;
+      const body = {
+        model,
+        messages,
+        stream: false,
+        options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE }
+      };
+      if (tools && tools.length > 0) {
+        body.tools = tools;
+      }
+      const controller = new AbortController();
+      const timeoutId = setTimeout(() => controller.abort(), CONFIG.TOOL_TEST_TIMEOUT_MS);
+      const start = Date.now();
+      try {
+        const res = await fetch(`${ollamaBase()}/api/chat`, {
+          method: "POST",
+          headers: { "Content-Type": "application/json" },
+          body: JSON.stringify(body),
+          signal: controller.signal
+        });
+        const elapsedMs = Date.now() - start;
+        clearTimeout(timeoutId);
+        if (!res.ok) {
+          const errorText = await res.text().catch(() => "unknown error");
+          throw new Error(`Ollama API returned ${res.status}: ${truncate(errorText, 200)}`);
+        }
+        const text = await res.text();
+        if (!text.trim()) throw new Error("Empty response from Ollama");
+        const parsed = JSON.parse(text);
+        const toolCalls = parsed?.message?.tool_calls;
+        const content = parsed?.message?.content || "";
+        return {
+          content,
+          toolCalls: toolCalls && toolCalls.length > 0 ? toolCalls : void 0,
+          elapsedMs,
+          raw: parsed
+        };
+      } catch (e) {
+        clearTimeout(timeoutId);
+        throw e;
+      }
+    };
   }
-  function parseTextToolCall(content) {
-    const firstBrace = content.indexOf("{");
-    if (firstBrace === -1) return null;
-    const lastBrace = content.lastIndexOf("}");
-    if (lastBrace <= firstBrace) return null;
-    const jsonCandidate = content.slice(firstBrace, lastBrace + 1);
-    let textToolParsed = null;
-    try {
-      textToolParsed = JSON.parse(jsonCandidate);
-    } catch {
-      return null;
-    }
-    if (!textToolParsed || typeof textToolParsed.name !== "string") return null;
-    const rawArgs = textToolParsed.arguments || { ...textToolParsed };
-    const { name: _, ...fnArgs } = rawArgs;
-    return { fnName: textToolParsed.name, args: fnArgs };
+  function makeProviderChatFn(providerInfo) {
+    return async (model, messages, options) => {
+      const result = await providerChat(providerInfo, model, messages, {
+        maxTokens: CONFIG.NUM_PREDICT,
+        tools: options?.tools,
+        timeoutMs: CONFIG.PROVIDER_TOOL_TIMEOUT_MS
+      });
+      return {
+        content: result.content,
+        toolCalls: result.toolCalls,
+        elapsedMs: result.elapsedMs,
+        raw: void 0
+      };
+    };
   }
   async function ollamaChat(model, messages, options = {}, timeoutMs = CONFIG.DEFAULT_TIMEOUT_MS, retries = CONFIG.MAX_RETRIES) {
     const body = { model, messages, stream: false, options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE, ...options } };
@@ -347,22 +296,7 @@ function model_test_temp_default(pi) {
     }
   }
   async function testReasoningProvider(providerInfo, model) {
-    const prompt = `A snail climbs 3 feet up a wall each day, but slides back 2 feet each night. The wall is 10 feet tall. How many days does it take the snail to reach the top? Think step by step and give the final answer on its own line like: ANSWER: <number>`;
-    try {
-      const result = await providerChat(providerInfo, model, [
-        { role: "user", content: prompt }
-      ]);
-      const msg = result.content.trim();
-      if (msg.length === 0) {
-        return { pass: false, score: "ERROR", reasoning: "Empty response from provider", answer: "?", elapsedMs: result.elapsedMs };
-      }
-      const allNumbers = msg.match(/\b(\d+)\b/g) || [];
-      const answer = allNumbers.length > 0 ? allNumbers[allNumbers.length - 1] : "?";
-      const { score, pass } = scoreReasoning(msg);
-      return { pass, score, reasoning: msg, answer, elapsedMs: result.elapsedMs };
-    } catch (e) {
-      return { pass: false, score: "ERROR", reasoning: e.message, answer: "?", elapsedMs: 0 };
-    }
+    return testReasoningUnified(makeProviderChatFn(providerInfo), model);
   }
   async function testThinking(model) {
     const prompt = "Multiply 37 by 43. Explain your reasoning step by step and give the final answer.";
@@ -386,182 +320,10 @@ function model_test_temp_default(pi) {
     }
   }
   async function testToolUsage(model) {
-    const tools = [
-      {
-        type: "function",
-        function: {
-          name: "get_weather",
-          description: "Get the current weather for a location",
-          parameters: {
-            type: "object",
-            properties: {
-              location: { type: "string", description: "City name" },
-              unit: { type: "string", enum: ["celsius", "fahrenheit"] }
-            },
-            required: ["location"]
-          }
-        }
-      }
-    ];
-    const body = {
-      model,
-      messages: [
-        { role: "system", content: "You are a helpful assistant. Use the available tools when needed." },
-        { role: "user", content: "What's the weather like in Paris right now?" }
-      ],
-      tools,
-      stream: false,
-      options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE }
-    };
-    try {
-      const controller = new AbortController();
-      const timeoutId = setTimeout(() => controller.abort(), CONFIG.TOOL_TEST_TIMEOUT_MS);
-      const start = Date.now();
-      const res = await fetch(`${ollamaBase()}/api/chat`, {
-        method: "POST",
-        headers: { "Content-Type": "application/json" },
-        body: JSON.stringify(body),
-        signal: controller.signal
-      });
-      const elapsedMs = Date.now() - start;
-      clearTimeout(timeoutId);
-      if (!res.ok) {
-        const errorText = await res.text().catch(() => "unknown error");
-        return { pass: false, score: "ERROR", hasToolCalls: false, toolCall: `fetch error: ${res.status}`, response: "", elapsedMs };
-      }
-      const text = await res.text();
-      if (!text.trim()) throw new Error("Empty response from Ollama");
-      const parsed = JSON.parse(text);
-      const toolCalls = parsed?.message?.tool_calls;
-      const content = parsed?.message?.content || "";
-      if (toolCalls && toolCalls.length > 0) {
-        const call = toolCalls[0];
-        const fn = call.function || {};
-        let args = {};
-        try {
-          args = typeof fn.arguments === "string" ? JSON.parse(fn.arguments) : fn.arguments || {};
-        } catch {
-          return {
-            pass: true,
-            score: "WEAK",
-            hasToolCalls: true,
-            toolCall: `malformed args: ${String(fn.arguments)}`,
-            response: content,
-            elapsedMs
-          };
-        }
-        const { score, pass } = scoreNativeToolCall(fn.name || "", args);
-        return {
-          pass,
-          score,
-          hasToolCalls: true,
-          toolCall: `${fn.name}(${JSON.stringify(args)})`,
-          response: content,
-          elapsedMs
-        };
-      }
-      const textParsed = parseTextToolCall(content);
-      if (textParsed) {
-        const { score, pass } = scoreTextToolCall(textParsed.fnName, textParsed.args);
-        return {
-          pass,
-          score,
-          hasToolCalls: true,
-          toolCall: `${textParsed.fnName}(${JSON.stringify(textParsed.args)})`,
-          response: content,
-          elapsedMs
-        };
-      }
-      return {
-        pass: false,
-        score: "FAIL",
-        hasToolCalls: false,
-        toolCall: "none",
-        response: content,
-        elapsedMs
-      };
-    } catch (e) {
-      return { pass: false, score: "ERROR", hasToolCalls: false, toolCall: `error: ${e.message}`, response: "", elapsedMs: 0 };
-    }
+    return testToolUsageUnified(makeOllamaToolChatFn(), model);
   }
   async function testToolUsageProvider(providerInfo, model) {
-    const tools = [
-      {
-        type: "function",
-        function: {
-          name: "get_weather",
-          description: "Get the current weather for a location",
-          parameters: {
-            type: "object",
-            properties: {
-              location: { type: "string", description: "City name" },
-              unit: { type: "string", enum: ["celsius", "fahrenheit"] }
-            },
-            required: ["location"]
-          }
-        }
-      }
-    ];
-    try {
-      const result = await providerChat(providerInfo, model, [
-        { role: "system", content: "You are a helpful assistant. Use the available tools when needed." },
-        { role: "user", content: "What's the weather like in Paris right now?" }
-      ], {
-        maxTokens: CONFIG.NUM_PREDICT,
-        tools,
-        timeoutMs: CONFIG.PROVIDER_TOOL_TIMEOUT_MS
-      });
-      const content = result.content;
-      const toolCalls = result.toolCalls;
-      if (toolCalls && toolCalls.length > 0) {
-        const call = toolCalls[0];
-        const fn = call.function || {};
-        let args = {};
-        try {
-          args = typeof fn.arguments === "string" ? JSON.parse(fn.arguments) : fn.arguments || {};
-        } catch {
-          return {
-            pass: true,
-            score: "WEAK",
-            hasToolCalls: true,
-            toolCall: `malformed args: ${String(fn.arguments)}`,
-            response: content,
-            elapsedMs: result.elapsedMs
-          };
-        }
-        const { score, pass } = scoreNativeToolCall(fn.name || "", args);
-        return {
-          pass,
-          score,
-          hasToolCalls: true,
-          toolCall: `${fn.name}(${JSON.stringify(args)})`,
-          response: content,
-          elapsedMs: result.elapsedMs
-        };
-      }
-      const textParsed = parseTextToolCall(content);
-      if (textParsed) {
-        const { score, pass } = scoreTextToolCall(textParsed.fnName, textParsed.args);
-        return {
-          pass,
-          score,
-          hasToolCalls: true,
-          toolCall: `${textParsed.fnName}(${JSON.stringify(textParsed.args)})`,
-          response: content,
-          elapsedMs: result.elapsedMs
-        };
-      }
-      return {
-        pass: false,
-        score: "FAIL",
-        hasToolCalls: false,
-        toolCall: "none",
-        response: content,
-        elapsedMs: result.elapsedMs
-      };
-    } catch (e) {
-      return { pass: false, score: "ERROR", hasToolCalls: false, toolCall: `error: ${e.message}`, response: "", elapsedMs: 0 };
-    }
+    return testToolUsageUnified(makeProviderChatFn(providerInfo), model);
   }
   async function testReactParsing(model) {
     const systemPrompt = [
@@ -642,67 +404,35 @@ function model_test_temp_default(pi) {
           }
         }
       } else {
-        const dialectDefs = [
-          { name: "react", action: "Action:", input: "Action Input:" },
-          { name: "function", action: "Function:", input: "Function Input:" },
-          { name: "tool", action: "Tool:", input: "Tool Input:" },
-          { name: "call", action: "Call:", input: "Input:" }
-        ];
-        for (const dd of dialectDefs) {
-          const esc = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
-          const aT = esc(dd.action);
-          const iT = esc(dd.input);
-          const primaryRe = new RegExp(`${aT}\\s*[\\x60"']?(\\w+)[\\x60"']?\\s*\\n?\\s*${iT}\\s*([\\s\\S]*?)(?=\\n\\s*(?:Observation:|Thought:|Final Answer:|${dd.action})|$)`, "is");
-          const sameRe = new RegExp(`${aT}\\s*[\\x60"']?(\\w+)[\\x60"']?\\s+${iT}\\s*([\\s\\S]*?)(?=\\n\\s*(?:Observation:|Thought:|Final Answer:|${dd.action})|$)`, "is");
-          const parenRe = new RegExp(`${aT}\\s*(\\w+)\\s*\\(([^)]*)\\)`, "i");
-          let m = primaryRe.exec(content) || sameRe.exec(content);
-          let isParen = false;
-          if (!m) {
-            m = parenRe.exec(content);
-            isParen = true;
-          }
-          if (m) {
-            const toolName = m[1].trim().replace(/[`"']/g, "");
-            const rawArgs = m[2].trim().replace(/^```\w*\s*/gm, "").replace(/```\s*$/gm, "").trim();
-            let argsStr = "";
-            if (isParen && rawArgs && !rawArgs.startsWith("{")) {
-              const pairs = rawArgs.match(/(\w+)\s*:\s*("[^"]*"|'[^']*'|\S+)/g);
-              if (pairs) {
-                const obj = {};
-                for (const p of pairs) {
-                  const ci = p.indexOf(":");
-                  let v = p.slice(ci + 1).trim();
-                  if (v.startsWith('"') && v.endsWith('"') || v.startsWith("'") && v.endsWith("'")) v = v.slice(1, -1);
-                  obj[p.slice(0, ci).trim()] = v;
-                }
-                argsStr = JSON.stringify(obj);
-              } else {
-                argsStr = rawArgs;
-              }
-            } else {
-              const js = rawArgs.indexOf("{");
-              if (js !== -1) {
-                let d = 0, je = -1;
-                for (let i = js; i < rawArgs.length; i++) {
-                  if (rawArgs[i] === "{") d++;
-                  else if (rawArgs[i] === "}") {
-                    d--;
-                    if (d === 0) {
-                      je = i;
+        for (const dp of ALL_DIALECT_PATTERNS) {
+          const result = parseReactWithPatterns(content, dp, true);
+          if (result) {
+            let argsStr;
+            const rawArgs = result.args ? JSON.stringify(result.args) : "";
+            if (rawArgs && rawArgs !== "{}") {
+              argsStr = rawArgs;
+            } else if (result.raw) {
+              const jsonStart = result.raw.indexOf("{");
+              if (jsonStart !== -1) {
+                let depth = 0, jsonEnd = -1;
+                for (let i = jsonStart; i < result.raw.length; i++) {
+                  if (result.raw[i] === "{") depth++;
+                  else if (result.raw[i] === "}") {
+                    depth--;
+                    if (depth === 0) {
+                      jsonEnd = i;
                       break;
                     }
                   }
                 }
-                argsStr = je !== -1 ? rawArgs.slice(js, je + 1) : rawArgs;
+                argsStr = jsonEnd !== -1 ? result.raw.slice(jsonStart, jsonEnd + 1) : "";
               } else {
-                argsStr = rawArgs;
+                argsStr = "";
               }
+            } else {
+              argsStr = "";
             }
-            let thought = "";
-            const thoughtRe = /Thought:\s*(.*?)(?=Action:|Function:|Tool:|Call:|Final Answer:|$)/is;
-            const tm = thoughtRe.exec(content);
-            if (tm) thought = tm[1].trim();
-            parsedResult = { name: toolName, args: argsStr, thought, dialect: dd.name };
+            parsedResult = { name: result.name, args: argsStr, thought: result.thought || "", dialect: result.dialect };
             break;
           }
         }
@@ -763,158 +493,10 @@ function model_test_temp_default(pi) {
     }
   }
   async function testInstructionFollowing(model) {
-    const prompt = `You must respond with ONLY a valid JSON object. No markdown, no explanation, no backticks, no extra text.
-The JSON object must have exactly these 4 keys:
-- "name" (string): your model name
-- "can_count" (boolean): true
-- "sum" (number): the result of 15 + 27
-- "language" (string): the language you are responding in`;
-    try {
-      const { response, elapsedMs } = await ollamaChat(model, [
-        { role: "user", content: prompt }
-      ], { num_predict: CONFIG.NUM_PREDICT });
-      const msg = (response?.message?.content || "").trim();
-      let parsed = null;
-      let repairNote = "";
-      try {
-        const cleaned = msg.replace(/```json?\s*/gi, "").replace(/```/g, "").trim();
-        parsed = JSON.parse(cleaned);
-      } catch {
-        const cleaned = msg.replace(/```json?\s*/gi, "").replace(/```/g, "").trim();
-        let braceDepth = 0, bracketDepth = 0;
-        let inString = false, escapeNext = false;
-        for (let i = 0; i < cleaned.length; i++) {
-          const c = cleaned[i];
-          if (escapeNext) {
-            escapeNext = false;
-            continue;
-          }
-          if (c === "\\") {
-            if (inString) escapeNext = true;
-            continue;
-          }
-          if (c === '"') {
-            inString = !inString;
-            continue;
-          }
-          if (inString) continue;
-          if (c === "{") braceDepth++;
-          else if (c === "}") braceDepth = Math.max(0, braceDepth - 1);
-          else if (c === "[") bracketDepth++;
-          else if (c === "]") bracketDepth = Math.max(0, bracketDepth - 1);
-        }
-        if (braceDepth > 0 || bracketDepth > 0) {
-          const repaired = cleaned + "}".repeat(braceDepth) + "]".repeat(bracketDepth);
-          try {
-            parsed = JSON.parse(repaired);
-            repairNote = " (repaired truncated JSON)";
-          } catch {
-          }
-        }
-      }
-      if (!parsed) {
-        return { pass: false, score: "FAIL", output: sanitizeForReport(msg), elapsedMs };
-      }
-      const hasKeys = parsed.name && parsed.can_count !== void 0 && parsed.sum !== void 0 && parsed.language;
-      const correctSum = parsed.sum === 42;
-      const hasCorrectCount = parsed.can_count === true;
-      let score;
-      if (hasKeys && correctSum && hasCorrectCount) {
-        score = "STRONG";
-      } else if (hasKeys && (correctSum || hasCorrectCount)) {
-        score = "MODERATE";
-      } else if (parsed.sum !== void 0 || parsed.name) {
-        score = "WEAK";
-      } else {
-        score = "FAIL";
-      }
-      return {
-        pass: hasKeys,
-        score,
-        output: JSON.stringify(parsed) + repairNote,
-        elapsedMs
-      };
-    } catch (e) {
-      return { pass: false, score: "ERROR", output: e.message, elapsedMs: 0 };
-    }
+    return testInstructionFollowingUnified(makeOllamaChatFn(), model);
   }
   async function testInstructionFollowingProvider(providerInfo, model) {
-    const prompt = `You must respond with ONLY a valid JSON object. No markdown, no explanation, no backticks, no extra text.
-The JSON object must have exactly these 4 keys:
-- "name" (string): your model name
-- "can_count" (boolean): true
-- "sum" (number): the result of 15 + 27
-- "language" (string): the language you are responding in`;
-    try {
-      const result = await providerChat(providerInfo, model, [
-        { role: "user", content: prompt }
-      ]);
-      const msg = result.content.trim();
-      let parsed = null;
-      let repairNote = "";
-      try {
-        const cleaned = msg.replace(/```json?\s*/gi, "").replace(/```/g, "").trim();
-        parsed = JSON.parse(cleaned);
-      } catch {
-        const cleaned = msg.replace(/```json?\s*/gi, "").replace(/```/g, "").trim();
-        let braceDepth = 0, bracketDepth = 0;
-        let inString = false, escapeNext = false;
-        for (let i = 0; i < cleaned.length; i++) {
-          const c = cleaned[i];
-          if (escapeNext) {
-            escapeNext = false;
-            continue;
-          }
-          if (c === "\\") {
-            if (inString) escapeNext = true;
-            continue;
-          }
-          if (c === '"') {
-            inString = !inString;
-            continue;
-          }
-          if (inString) continue;
-          if (c === "{") braceDepth++;
-          else if (c === "}") braceDepth = Math.max(0, braceDepth - 1);
-          else if (c === "[") bracketDepth++;
-          else if (c === "]") bracketDepth = Math.max(0, bracketDepth - 1);
-        }
-        if (braceDepth > 0 || bracketDepth > 0) {
-          const repaired = cleaned + "}".repeat(braceDepth) + "]".repeat(bracketDepth);
-          try {
-            parsed = JSON.parse(repaired);
-            repairNote = " (repaired truncated JSON)";
-          } catch {
-          }
-        }
-      }
-      if (!parsed) {
-        return { pass: false, score: "FAIL", output: sanitizeForReport(msg), elapsedMs: result.elapsedMs };
-      }
-      const hasKeys = parsed.name && parsed.can_count !== void 0 && parsed.sum !== void 0 && parsed.language;
-      const correctSum = parsed.sum === 42;
-      const hasCorrectCount = parsed.can_count === true;
-      let score;
-      if (hasKeys && correctSum && hasCorrectCount) {
-        score = "STRONG";
-      } else if (hasKeys && (correctSum || hasCorrectCount)) {
-        score = "MODERATE";
-      } else if (parsed.sum !== void 0 || parsed.name) {
-        score = "WEAK";
-      } else {
-        score = "FAIL";
-      }
-      return {
-        pass: hasKeys,
-        score,
-        output: JSON.stringify(parsed) + repairNote,
-        elapsedMs: result.elapsedMs
-      };
-    } catch (e) {
-      return { pass: false, score: "ERROR", output: e.message, elapsedMs: 0 };
-    }
+    return testInstructionFollowingUnified(makeProviderChatFn(providerInfo), model);
   }
   async function testToolSupport(model, family) {
     const cached = getCachedToolSupport(model);
@@ -926,23 +508,7 @@ The JSON object must have exactly these 4 keys:
         elapsedMs: 0
       };
     }
-    const tools = [
-      {
-        type: "function",
-        function: {
-          name: "get_weather",
-          description: "Get the current weather for a location",
-          parameters: {
-            type: "object",
-            properties: {
-              location: { type: "string", description: "City name" },
-              unit: { type: "string", enum: ["celsius", "fahrenheit"] }
-            },
-            required: ["location"]
-          }
-        }
-      }
-    ];
+    const tools = [WEATHER_TOOL_DEFINITION];
     const body = {
       model,
       messages: [

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@vtstech/pi-model-test",
-  "version": "1.1.2",
+  "version": "1.1.4",
   "description": "Model benchmark/testing extension for Pi Coding Agent",
   "main": "model-test.js",
   "keywords": ["pi-extensions"],
@@ -14,7 +14,7 @@
     "url": "https://github.com/VTSTech/pi-coding-agent"
   },
   "dependencies": {
-    "@vtstech/pi-shared": "1.1.2"
+    "@vtstech/pi-shared": "1.1.4"
   },
   "peerDependencies": {
     "@mariozechner/pi-coding-agent": ">=0.66"