npm - @vtstech/pi-model-test - Versions diffs - 1.1.7 → 1.1.9 - Mend

@vtstech/pi-model-test 1.1.7 → 1.1.9

This diff compares the contents of two publicly available versions of this package as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (2) [hide / show]

package/model-test.js +137 -249
package/package.json +2 -2

package/model-test.js CHANGED Viewed

@@ -9,10 +9,13 @@ import {
   truncate,
   sanitizeForReport
 } from "@vtstech/pi-shared/format";
-import { getOllamaBaseUrl, detectModelFamily, readModelsJson, writeModelsJson, fetchModelContextLength, EXTENSION_VERSION, detectProvider } from "@vtstech/pi-shared/ollama";
+import { getOllamaBaseUrl, detectModelFamily, readModelsJson, readModifyWriteModelsJson, fetchModelContextLength, detectProvider } from "@vtstech/pi-shared/ollama";
+import { debugLog } from "@vtstech/pi-shared/debug";
 import {
   ALL_DIALECT_PATTERNS,
-  parseReactWithPatterns
+  parseReactWithPatterns,
+  detectReactDialect,
+  extractBraceJson
 } from "@vtstech/pi-shared/react-parser";
 import {
   CONFIG,
@@ -28,15 +31,71 @@ import {
   testInstructionFollowingUnified,
   TOOL_SUPPORT_CACHE_PATH
 } from "@vtstech/pi-shared/model-test-utils";
+import {
+  branding as sharedBranding,
+  formatTestSummary,
+  formatRecommendation
+} from "@vtstech/pi-shared/test-report";
 function model_test_temp_default(pi) {
   const effectiveConfig = getEffectiveConfig();
   function ollamaBase() {
     return getOllamaBaseUrl();
   }
   async function rateLimitDelay(lines) {
-    if (CONFIG.TEST_DELAY_MS > 0) {
-      lines.push(info(`Waiting ${msHuman(CONFIG.TEST_DELAY_MS)} to avoid rate limiting...`));
-      await new Promise((r) => setTimeout(r, CONFIG.TEST_DELAY_MS));
+    if (effectiveConfig.TEST_DELAY_MS > 0) {
+      lines.push(info(`Waiting ${msHuman(effectiveConfig.TEST_DELAY_MS)} to avoid rate limiting...`));
+      await new Promise((r) => setTimeout(r, effectiveConfig.TEST_DELAY_MS));
+    }
+  }
+  function reportScore(lines, score, descriptions, fallback) {
+    const desc = descriptions[score] || descriptions["*"] || `(${score})`;
+    if (score === "STRONG" || score === "MODERATE") {
+      lines.push(ok(desc));
+    } else if (score === "WEAK") {
+      lines.push(warn(desc));
+    } else if (score === "FAIL") {
+      lines.push(fail(desc));
+    } else {
+      lines.push(fail(fallback));
+    }
+  }
+  function reportReasoningScore(lines, result) {
+    reportScore(lines, result.score, {
+      STRONG: `Answer: ${result.answer} \u2014 Correct with clear reasoning (${result.score})`,
+      MODERATE: `Answer: ${result.answer} \u2014 Correct but weak reasoning (${result.score})`,
+      WEAK: `Answer: ${result.answer} \u2014 Reasoned but wrong answer (${result.score})`,
+      FAIL: `Answer: ${result.answer} \u2014 No reasoning detected (${result.score})`
+    }, `Error: ${result.reasoning.includes("<!DOCTYPE") || result.reasoning.includes("<html") ? result.reasoning.split("\n")[0].slice(0, 100) + "..." : truncate(result.reasoning, 300)}`);
+  }
+  function reportInstructionScore(lines, result) {
+    reportScore(lines, result.score, {
+      STRONG: `JSON output valid with correct values (${result.score})`,
+      MODERATE: `JSON output valid but some values incorrect (${result.score})`,
+      WEAK: `Partial JSON compliance (${result.score})`
+    }, `Failed to produce valid JSON (${result.score})`);
+  }
+  function reportToolScore(lines, result) {
+    if (result.score === "STRONG" || result.score === "MODERATE") {
+      lines.push(ok(`Tool call: ${result.toolCall} (${result.score})`));
+    } else if (result.score === "WEAK") {
+      lines.push(warn(`Tool call: ${result.toolCall} (${result.score}) \u2014 malformed call`));
+    } else if (result.score === "FAIL") {
+      const hasResponse = result.response && result.response.trim().length > 0;
+      lines.push(fail(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${result.score})`));
+    } else {
+      lines.push(fail(`Error: ${result.toolCall}`));
+    }
+    if (result.score === "STRONG" || result.score === "MODERATE" || result.score === "WEAK") {
+      if (result.response) {
+        lines.push(info(`Raw response: ${sanitizeForReport(result.response)}`));
+      }
+    } else if (result.score === "FAIL") {
+      const hasResponse = result.response && result.response.trim().length > 0;
+      if (hasResponse) {
+        lines.push(info(`Text response: ${sanitizeForReport(result.response)}`));
+      } else {
+        lines.push(info("Text response: (empty)"));
+      }
     }
   }
   function makeOllamaChatFn(useStreaming = true) {
@@ -195,7 +254,8 @@ function model_test_temp_default(pi) {
             if (parsed.message?.content) messageContent += parsed.message.content;
             if (parsed.message?.thinking) thinkingContent += parsed.message.thinking;
             if (parsed.done) done = true;
-          } catch {
+          } catch (err) {
+            debugLog("model-test", "skipped malformed JSON chunk in streaming response", err);
           }
         }
       }
@@ -392,22 +452,6 @@ function model_test_temp_default(pi) {
   async function testToolUsageProvider(providerInfo, model) {
     return testToolUsageUnified(makeProviderChatFn(providerInfo), model);
   }
-  function extractBraceJson(raw) {
-    const jsonStart = raw.indexOf("{");
-    if (jsonStart === -1) return "";
-    let depth = 0, jsonEnd = -1;
-    for (let i = jsonStart; i < raw.length; i++) {
-      if (raw[i] === "{") depth++;
-      else if (raw[i] === "}") {
-        depth--;
-        if (depth === 0) {
-          jsonEnd = i;
-          break;
-        }
-      }
-    }
-    return jsonEnd !== -1 ? raw.slice(jsonStart, jsonEnd + 1) : "";
-  }
   async function testReactParsing(model) {
     const systemPrompt = [
       "You are a helpful assistant with access to tools.",
@@ -451,41 +495,20 @@ function model_test_temp_default(pi) {
         return { pass: false, score: "FAIL", toolCall: "empty response", thought: "", response: "", elapsedMs };
       }
       let parsedResult = null;
-      const sharedParser = pi._reactParser;
-      if (sharedParser?.ALL_DIALECT_PATTERNS) {
-        for (const dp of sharedParser.ALL_DIALECT_PATTERNS) {
-          const result = sharedParser.parseReactWithPatterns(content, dp, true);
-          if (result) {
-            let toolName = result.name;
-            let argsStr;
-            const rawArgs = result.args ? JSON.stringify(result.args) : "";
-            if (rawArgs && rawArgs !== "{}") {
-              argsStr = rawArgs;
-            } else if (result.raw) {
-              argsStr = extractBraceJson(result.raw);
-            } else {
-              argsStr = "";
-            }
-            parsedResult = { name: toolName, args: argsStr, thought: result.thought || "", dialect: result.dialect };
-            break;
-          }
-        }
-      } else {
-        for (const dp of ALL_DIALECT_PATTERNS) {
-          const result = parseReactWithPatterns(content, dp, true);
-          if (result) {
-            let argsStr;
-            const rawArgs = result.args ? JSON.stringify(result.args) : "";
-            if (rawArgs && rawArgs !== "{}") {
-              argsStr = rawArgs;
-            } else if (result.raw) {
-              argsStr = extractBraceJson(result.raw);
-            } else {
-              argsStr = "";
-            }
-            parsedResult = { name: result.name, args: argsStr, thought: result.thought || "", dialect: result.dialect };
-            break;
+      for (const dp of ALL_DIALECT_PATTERNS) {
+        const result = parseReactWithPatterns(content, dp, true);
+        if (result) {
+          let argsStr;
+          const rawArgs = result.args ? JSON.stringify(result.args) : "";
+          if (rawArgs && rawArgs !== "{}") {
+            argsStr = rawArgs;
+          } else if (result.raw) {
+            argsStr = extractBraceJson(result.raw);
+          } else {
+            argsStr = "";
           }
+          parsedResult = { name: result.name, args: argsStr, thought: result.thought || "", dialect: result.dialect };
+          break;
         }
       }
       if (parsedResult) {
@@ -576,7 +599,7 @@ function model_test_temp_default(pi) {
     try {
       const start = Date.now();
       const controller = new AbortController();
-      const timeoutId = setTimeout(() => controller.abort(), 13e4);
+      const timeoutId = setTimeout(() => controller.abort(), effectiveConfig.TOOL_SUPPORT_TIMEOUT_MS);
       const res = await fetch(`${ollamaBase()}/api/chat`, {
         method: "POST",
         headers: { "Content-Type": "application/json" },
@@ -607,7 +630,8 @@ function model_test_temp_default(pi) {
         try {
           const args = typeof fn.arguments === "string" ? JSON.parse(fn.arguments) : fn.arguments || {};
           argsStr = JSON.stringify(args);
-        } catch {
+        } catch (err) {
+          debugLog("model-test", "failed to parse tool call arguments", err);
           argsStr = String(fn.arguments);
         }
         const level2 = "native";
@@ -619,41 +643,14 @@ function model_test_temp_default(pi) {
           elapsedMs
         };
       }
-      const reactPatterns = [
-        // Classic ReAct
-        /^\s*Action:\s*/im,
-        /^\s*Action Input:\s*/im,
-        /^\s*Thought:\s*/im,
-        /Action:\s*\w+/i,
-        /Action Input:\s*\{/i,
-        // Function dialect
-        /^\s*Function:\s*/im,
-        /^\s*Function Input:\s*/im,
-        /Function:\s*\w+/i,
-        // Tool dialect
-        /^\s*Tool:\s*/im,
-        /^\s*Tool Input:\s*/im,
-        /Tool:\s*\w+/i,
-        // Call dialect
-        /^\s*Call:\s*/im,
-        /^\s*Input:\s*/im,
-        /Call:\s*\w+/i
-      ];
-      const matchedPatterns = [];
-      for (const p of reactPatterns) {
-        if (p.test(content)) matchedPatterns.push(p.source);
-      }
-      if (matchedPatterns.length > 0) {
-        let dialectName = "react";
-        if (/Function:/i.test(content)) dialectName = "function";
-        else if (/Tool:/i.test(content)) dialectName = "tool";
-        else if (/Call:/i.test(content)) dialectName = "call";
+      const detectedDialect = detectReactDialect(content);
+      if (detectedDialect) {
         const level2 = "react";
         cacheToolSupport(model, level2, family);
         return {
           level: level2,
           cached: false,
-          evidence: `ReAct format detected (${dialectName} dialect) in text response`,
+          evidence: `ReAct format detected (${detectedDialect.name} dialect) in text response`,
           elapsedMs
         };
       }
@@ -697,7 +694,8 @@ function model_test_temp_default(pi) {
       if (!res.ok) return [];
       const data = await res.json();
       return (data.models || []).map((m) => m.name).filter(Boolean);
-    } catch {
+    } catch (err) {
+      debugLog("model-test", "failed to list Ollama models", err);
       return [];
     }
   }
@@ -706,43 +704,44 @@ function model_test_temp_default(pi) {
   }
   function updateModelsJsonReasoning(model, hasReasoning) {
     try {
+      const written = readModifyWriteModelsJson((config2) => {
+        for (const provider of Object.values(config2.providers || {})) {
+          const models = provider.models || [];
+          for (const m of models) {
+            if (m.id === model) {
+              const current = m.reasoning;
+              if (current === hasReasoning) {
+                return null;
+              }
+              m.reasoning = hasReasoning;
+              return config2;
+            }
+          }
+        }
+        return null;
+      });
+      if (!written) {
+        return { updated: false, message: `${model} not found in models.json \u2014 skipped` };
+      }
       const config = readModelsJson();
-      let updated = false;
       for (const provider of Object.values(config.providers || {})) {
         const models = provider.models || [];
         for (const m of models) {
-          if (m.id === model) {
-            const current = m.reasoning;
-            if (current === hasReasoning) {
-              return { updated: false, message: `reasoning already "${hasReasoning}" for ${model} \u2014 no change` };
-            }
-            m.reasoning = hasReasoning;
-            updated = true;
-            break;
+          if (m.id === model && m.reasoning === hasReasoning) {
+            return { updated: false, message: `reasoning already "${hasReasoning}" for ${model} \u2014 no change` };
           }
         }
-        if (updated) break;
       }
-      if (!updated) {
-        return { updated: false, message: `${model} not found in models.json \u2014 skipped` };
-      }
-      writeModelsJson(config);
       const action = hasReasoning ? "set reasoning: true" : "set reasoning: false";
-      return { updated: true, message: `\u2705 Updated ${model}: ${action}` };
+      return { updated: true, message: `Updated ${model}: ${action}` };
     } catch (e) {
       return { updated: false, message: `Failed to update models.json: ${e.message}` };
     }
   }
-  const branding = [
-    `  \u26A1 Pi Model Benchmark v${EXTENSION_VERSION}`,
-    `  Written by VTSTech`,
-    `  GitHub: https://github.com/VTSTech`,
-    `  Website: www.vts-tech.org`
-  ].join("\n");
   async function testModelOllama(model, providerInfo, ctx) {
     const lines = [];
     const totalStart = Date.now();
-    lines.push(branding);
+    lines.push(sharedBranding);
     lines.push(section(`MODEL: ${model}`));
     lines.push(info("Provider: Ollama (local/remote)"));
     const modelsJson = readModelsJson();
@@ -783,7 +782,8 @@ function model_test_temp_default(pi) {
           modelModified = modDate ? modDate.toLocaleDateString() : "unknown";
         }
       }
-    } catch {
+    } catch (err) {
+      debugLog("model-test", "failed to fetch model metadata from /api/show", err);
     }
     const detectedFamily = detectModelFamily(model);
     lines.push(info(`Size: ${modelSize}  |  Params: ${modelParams}  |  Quant: ${modelQuant}`));
@@ -793,18 +793,7 @@ function model_test_temp_default(pi) {
     lines.push(info("Testing..."));
     const reasoning = await testReasoning(model);
     lines.push(info(`Time: ${msHuman(reasoning.elapsedMs)}`));
-    if (reasoning.score === "STRONG") {
-      lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct with clear reasoning (${reasoning.score})`));
-    } else if (reasoning.score === "MODERATE") {
-      lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct but weak reasoning (${reasoning.score})`));
-    } else if (reasoning.score === "WEAK") {
-      lines.push(fail(`Answer: ${reasoning.answer} \u2014 Reasoned but wrong answer (${reasoning.score})`));
-    } else if (reasoning.score === "FAIL") {
-      lines.push(fail(`Answer: ${reasoning.answer} \u2014 No reasoning detected (${reasoning.score})`));
-    } else {
-      const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." : truncate(reasoning.reasoning, 300);
-      lines.push(fail(`Error: ${errMsg}`));
-    }
+    reportReasoningScore(lines, reasoning);
     lines.push(info(`Response: ${sanitizeForReport(reasoning.reasoning)}`));
     lines.push(section("THINKING TEST"));
     lines.push(info('Prompt: "Multiply 37 by 43. Explain your reasoning step by step."'));
@@ -827,32 +816,7 @@ function model_test_temp_default(pi) {
     await rateLimitDelay(lines);
     const tools = await testToolUsage(model);
     lines.push(info(`Time: ${msHuman(tools.elapsedMs)}`));
-    if (tools.score === "STRONG") {
-      lines.push(ok(`Tool call: ${tools.toolCall} (${tools.score})`));
-      if (tools.response) {
-        lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
-      }
-    } else if (tools.score === "MODERATE") {
-      lines.push(ok(`Tool call: ${tools.toolCall} (${tools.score})`));
-      if (tools.response) {
-        lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
-      }
-    } else if (tools.score === "WEAK") {
-      lines.push(warn(`Tool call: ${tools.toolCall} (${tools.score}) \u2014 malformed call`));
-      if (tools.response) {
-        lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
-      }
-    } else if (tools.score === "FAIL") {
-      const hasResponse = tools.response && tools.response.trim().length > 0;
-      lines.push(fail(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${tools.score})`));
-      if (hasResponse) {
-        lines.push(info(`Text response: ${sanitizeForReport(tools.response)}`));
-      } else {
-        lines.push(info("Text response: (empty)"));
-      }
-    } else {
-      lines.push(fail(`Error: ${tools.toolCall}`));
-    }
+    reportToolScore(lines, tools);
     lines.push(section("REACT PARSING TEST"));
     lines.push(info(`Prompt: "What's the weather in Tokyo?" (ReAct format, no native tools)`));
     lines.push(info("Testing..."));
@@ -889,15 +853,7 @@ function model_test_temp_default(pi) {
     await rateLimitDelay(lines);
     const instructions = await testInstructionFollowing(model);
     lines.push(info(`Time: ${msHuman(instructions.elapsedMs)}`));
-    if (instructions.score === "STRONG") {
-      lines.push(ok(`JSON output valid with correct values (${instructions.score})`));
-    } else if (instructions.score === "MODERATE") {
-      lines.push(ok(`JSON output valid but some values incorrect (${instructions.score})`));
-    } else if (instructions.score === "WEAK") {
-      lines.push(warn(`Partial JSON compliance (${instructions.score})`));
-    } else {
-      lines.push(fail(`Failed to produce valid JSON (${instructions.score})`));
-    }
+    reportInstructionScore(lines, instructions);
     lines.push(info(`Output: ${sanitizeForReport(instructions.output)}`));
     lines.push(section("TOOL SUPPORT DETECTION"));
     lines.push(info("Probing model for tool calling capability (native / ReAct / none)"));
@@ -930,11 +886,10 @@ function model_test_temp_default(pi) {
     }
     lines.push(info(`Evidence: ${toolSupport.evidence}`));
     lines.push(info(`Cache: ${TOOL_SUPPORT_CACHE_PATH}`));
-    lines.push(section("SUMMARY"));
     const totalMs = Date.now() - totalStart;
     const toolPass = tools.score === "STRONG" || tools.score === "MODERATE";
     const reactPass = react.score === "STRONG" || react.score === "MODERATE";
-    const tests = [
+    const ollamaTests = [
       { name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
       { name: "Thinking", pass: thinking.supported, score: thinking.supported ? "YES" : "NO" },
       { name: "Tool Usage", pass: toolPass, score: tools.score },
@@ -942,23 +897,10 @@ function model_test_temp_default(pi) {
       { name: "Instructions", pass: instructions.pass, score: instructions.score },
       { name: "Tool Support", pass: toolSupport.level === "native" || toolSupport.level === "react", score: toolSupport.level.toUpperCase() }
     ];
-    const passed = tests.filter((t) => t.pass).length;
-    const total = tests.length;
-    for (const t of tests) {
-      lines.push(t.pass ? ok(`${t.name}: ${t.score}`) : fail(`${t.name}: ${t.score}`));
-    }
-    lines.push(info(`Total time: ${msHuman(totalMs)}`));
-    lines.push(info(`Score: ${passed}/${total} tests passed`));
-    lines.push(section("RECOMMENDATION"));
-    if (passed === 6) {
-      lines.push(ok(`${model} is a STRONG model \u2014 full capability`));
-    } else if (passed >= 5) {
-      lines.push(ok(`${model} is a GOOD model \u2014 most capabilities work`));
-    } else if (passed >= 4) {
-      lines.push(warn(`${model} is USABLE \u2014 some capabilities are limited`));
-    } else {
-      lines.push(fail(`${model} is WEAK \u2014 limited capabilities for agent use`));
-    }
+    const passed = ollamaTests.filter((t) => t.pass).length;
+    const total = ollamaTests.length;
+    lines.push(...formatTestSummary(ollamaTests, totalMs));
+    lines.push(...formatRecommendation(model, passed, total));
     try {
       const historyEntry = {
         timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -985,14 +927,15 @@ function model_test_temp_default(pi) {
           lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
         }
       }
-    } catch {
+    } catch (err) {
+      debugLog("model-test", "failed to save test history", err);
     }
     return lines.join("\n");
   }
   async function testModelProvider(providerInfo, model, ctx) {
     const lines = [];
     const totalStart = Date.now();
-    lines.push(branding);
+    lines.push(sharedBranding);
     lines.push(section(`MODEL: ${model}`));
     lines.push(info(`Provider: ${providerInfo.name} (built-in)`));
     lines.push(info(`API: ${providerInfo.apiMode || "openai-completions"}`));
@@ -1031,18 +974,7 @@ function model_test_temp_default(pi) {
     await rateLimitDelay(lines);
     const reasoning = await testReasoningProvider(providerInfo, model);
     lines.push(info(`Time: ${msHuman(reasoning.elapsedMs)}`));
-    if (reasoning.score === "STRONG") {
-      lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct with clear reasoning (${reasoning.score})`));
-    } else if (reasoning.score === "MODERATE") {
-      lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct but weak reasoning (${reasoning.score})`));
-    } else if (reasoning.score === "WEAK") {
-      lines.push(fail(`Answer: ${reasoning.answer} \u2014 Reasoned but wrong answer (${reasoning.score})`));
-    } else if (reasoning.score === "FAIL") {
-      lines.push(fail(`Answer: ${reasoning.answer} \u2014 No reasoning detected (${reasoning.score})`));
-    } else {
-      const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." : truncate(reasoning.reasoning, 300);
-      lines.push(fail(`Error: ${errMsg}`));
-    }
+    reportReasoningScore(lines, reasoning);
     lines.push(info(`Response: ${sanitizeForReport(reasoning.reasoning)}`));
     lines.push(section("INSTRUCTION FOLLOWING TEST"));
     lines.push(info("Prompt: Respond with ONLY a JSON object with keys: name, can_count, sum (15+27), language"));
@@ -1050,15 +982,7 @@ function model_test_temp_default(pi) {
     await rateLimitDelay(lines);
     const instructions = await testInstructionFollowingProvider(providerInfo, model);
     lines.push(info(`Time: ${msHuman(instructions.elapsedMs)}`));
-    if (instructions.score === "STRONG") {
-      lines.push(ok(`JSON output valid with correct values (${instructions.score})`));
-    } else if (instructions.score === "MODERATE") {
-      lines.push(ok(`JSON output valid but some values incorrect (${instructions.score})`));
-    } else if (instructions.score === "WEAK") {
-      lines.push(warn(`Partial JSON compliance (${instructions.score})`));
-    } else {
-      lines.push(fail(`Failed to produce valid JSON (${instructions.score})`));
-    }
+    reportInstructionScore(lines, instructions);
     lines.push(info(`Output: ${sanitizeForReport(instructions.output)}`));
     lines.push(section("TOOL USAGE TEST"));
     lines.push(info(`Prompt: "What's the weather in Paris?" (with get_weather tool available)`));
@@ -1066,62 +990,23 @@ function model_test_temp_default(pi) {
     await rateLimitDelay(lines);
     const toolTest = await testToolUsageProvider(providerInfo, model);
     lines.push(info(`Time: ${msHuman(toolTest.elapsedMs)}`));
-    if (toolTest.score === "STRONG") {
-      lines.push(ok(`Tool call: ${toolTest.toolCall} (${toolTest.score})`));
-      if (toolTest.response) {
-        lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
-      }
-    } else if (toolTest.score === "MODERATE") {
-      lines.push(ok(`Tool call: ${toolTest.toolCall} (${toolTest.score})`));
-      if (toolTest.response) {
-        lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
-      }
-    } else if (toolTest.score === "WEAK") {
-      lines.push(warn(`Tool call: ${toolTest.toolCall} (${toolTest.score}) \u2014 malformed call`));
-      if (toolTest.response) {
-        lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
-      }
-    } else if (toolTest.score === "FAIL") {
-      const hasResponse = toolTest.response && toolTest.response.trim().length > 0;
-      lines.push(fail(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${toolTest.score})`));
-      if (hasResponse) {
-        lines.push(info(`Text response: ${sanitizeForReport(toolTest.response)}`));
-      } else {
-        lines.push(info("Text response: (empty)"));
-      }
-    } else {
-      lines.push(fail(`Error: ${toolTest.toolCall}`));
-    }
+    reportToolScore(lines, toolTest);
     lines.push(section("SKIPPED TESTS (OLLAMA-ONLY)"));
     lines.push(warn("Thinking test \u2014 Ollama-specific think:true option and message.thinking field"));
     lines.push(warn("ReAct parsing test \u2014 only relevant for Ollama models without native tool calling"));
     lines.push(warn("Tool support detection \u2014 Ollama-specific tool support cache"));
     lines.push(warn("Model metadata \u2014 Ollama-specific /api/tags endpoint"));
-    lines.push(section("SUMMARY"));
     const totalMs = Date.now() - totalStart;
-    const tests = [
+    const providerTests = [
       { name: "Connectivity", pass: connectivity.pass, score: connectivity.pass ? "OK" : "FAIL" },
       { name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
       { name: "Instructions", pass: instructions.pass, score: instructions.score },
       { name: "Tool Usage", pass: toolTest.pass, score: toolTest.score }
     ];
-    const passed = tests.filter((t) => t.pass).length;
-    const total = tests.length;
-    for (const t of tests) {
-      lines.push(t.pass ? ok(`${t.name}: ${t.score}`) : fail(`${t.name}: ${t.score}`));
-    }
-    lines.push(info(`Total time: ${msHuman(totalMs)}`));
-    lines.push(info(`Score: ${passed}/${total} tests passed`));
-    lines.push(section("RECOMMENDATION"));
-    if (passed === 4) {
-      lines.push(ok(`${model} is a STRONG model via ${providerInfo.name} \u2014 full capability`));
-    } else if (passed >= 3) {
-      lines.push(ok(`${model} is a GOOD model via ${providerInfo.name} \u2014 most capabilities work`));
-    } else if (passed >= 2) {
-      lines.push(warn(`${model} is USABLE via ${providerInfo.name} \u2014 some capabilities are limited`));
-    } else {
-      lines.push(fail(`${model} is WEAK via ${providerInfo.name} \u2014 limited capabilities for agent use`));
-    }
+    const passed = providerTests.filter((t) => t.pass).length;
+    const total = providerTests.length;
+    lines.push(...formatTestSummary(providerTests, totalMs));
+    lines.push(...formatRecommendation(model, passed, total, providerInfo.name));
     try {
       const historyEntry = {
         timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -1148,7 +1033,8 @@ function model_test_temp_default(pi) {
           lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
         }
       }
-    } catch {
+    } catch (err) {
+      debugLog("model-test", "failed to save provider test history", err);
     }
     return lines.join("\n");
   }
@@ -1168,7 +1054,8 @@ function model_test_temp_default(pi) {
       try {
         const models = await getOllamaModels();
         return models.map((m) => ({ label: m, description: `Test ${m}` })).filter((m) => m.label.startsWith(prefix));
-      } catch {
+      } catch (err) {
+        debugLog("model-test", "failed to get model completions", err);
         return [];
       }
     },
@@ -1188,7 +1075,8 @@ function model_test_temp_default(pi) {
         let models;
         try {
           models = await getOllamaModels();
-        } catch {
+        } catch (err) {
+          debugLog("model-test", "failed to list Ollama models for --all", err);
           ctx.ui.notify("Could not list Ollama models", "error");
           return;
         }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@vtstech/pi-model-test",
-  "version": "1.1.7",
+  "version": "1.1.9",
   "description": "Model benchmark/testing extension for Pi Coding Agent",
   "main": "model-test.js",
   "keywords": ["pi-extensions"],
@@ -14,7 +14,7 @@
     "url": "https://github.com/VTSTech/pi-coding-agent"
   },
   "dependencies": {
-    "@vtstech/pi-shared": "1.1.7"
+    "@vtstech/pi-shared": "1.1.9"
   },
   "peerDependencies": {
     "@mariozechner/pi-coding-agent": ">=0.66"