@vtstech/pi-model-test 1.1.7 → 1.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/model-test.js +79 -155
  2. package/package.json +2 -2
package/model-test.js CHANGED
@@ -9,10 +9,12 @@ import {
9
9
  truncate,
10
10
  sanitizeForReport
11
11
  } from "@vtstech/pi-shared/format";
12
- import { getOllamaBaseUrl, detectModelFamily, readModelsJson, writeModelsJson, fetchModelContextLength, EXTENSION_VERSION, detectProvider } from "@vtstech/pi-shared/ollama";
12
+ import { getOllamaBaseUrl, detectModelFamily, readModelsJson, readModifyWriteModelsJson, fetchModelContextLength, detectProvider } from "@vtstech/pi-shared/ollama";
13
13
  import {
14
14
  ALL_DIALECT_PATTERNS,
15
- parseReactWithPatterns
15
+ parseReactWithPatterns,
16
+ detectReactDialect,
17
+ extractBraceJson
16
18
  } from "@vtstech/pi-shared/react-parser";
17
19
  import {
18
20
  CONFIG,
@@ -28,15 +30,20 @@ import {
28
30
  testInstructionFollowingUnified,
29
31
  TOOL_SUPPORT_CACHE_PATH
30
32
  } from "@vtstech/pi-shared/model-test-utils";
33
+ import {
34
+ branding as sharedBranding,
35
+ formatTestSummary,
36
+ formatRecommendation
37
+ } from "@vtstech/pi-shared/test-report";
31
38
  function model_test_temp_default(pi) {
32
39
  const effectiveConfig = getEffectiveConfig();
33
40
  function ollamaBase() {
34
41
  return getOllamaBaseUrl();
35
42
  }
36
43
  async function rateLimitDelay(lines) {
37
- if (CONFIG.TEST_DELAY_MS > 0) {
38
- lines.push(info(`Waiting ${msHuman(CONFIG.TEST_DELAY_MS)} to avoid rate limiting...`));
39
- await new Promise((r) => setTimeout(r, CONFIG.TEST_DELAY_MS));
44
+ if (effectiveConfig.TEST_DELAY_MS > 0) {
45
+ lines.push(info(`Waiting ${msHuman(effectiveConfig.TEST_DELAY_MS)} to avoid rate limiting...`));
46
+ await new Promise((r) => setTimeout(r, effectiveConfig.TEST_DELAY_MS));
40
47
  }
41
48
  }
42
49
  function makeOllamaChatFn(useStreaming = true) {
@@ -195,7 +202,8 @@ function model_test_temp_default(pi) {
195
202
  if (parsed.message?.content) messageContent += parsed.message.content;
196
203
  if (parsed.message?.thinking) thinkingContent += parsed.message.thinking;
197
204
  if (parsed.done) done = true;
198
- } catch {
205
+ } catch (err) {
206
+ debugLog("model-test", "skipped malformed JSON chunk in streaming response", err);
199
207
  }
200
208
  }
201
209
  }
@@ -392,22 +400,6 @@ function model_test_temp_default(pi) {
392
400
  async function testToolUsageProvider(providerInfo, model) {
393
401
  return testToolUsageUnified(makeProviderChatFn(providerInfo), model);
394
402
  }
395
- function extractBraceJson(raw) {
396
- const jsonStart = raw.indexOf("{");
397
- if (jsonStart === -1) return "";
398
- let depth = 0, jsonEnd = -1;
399
- for (let i = jsonStart; i < raw.length; i++) {
400
- if (raw[i] === "{") depth++;
401
- else if (raw[i] === "}") {
402
- depth--;
403
- if (depth === 0) {
404
- jsonEnd = i;
405
- break;
406
- }
407
- }
408
- }
409
- return jsonEnd !== -1 ? raw.slice(jsonStart, jsonEnd + 1) : "";
410
- }
411
403
  async function testReactParsing(model) {
412
404
  const systemPrompt = [
413
405
  "You are a helpful assistant with access to tools.",
@@ -451,41 +443,20 @@ function model_test_temp_default(pi) {
451
443
  return { pass: false, score: "FAIL", toolCall: "empty response", thought: "", response: "", elapsedMs };
452
444
  }
453
445
  let parsedResult = null;
454
- const sharedParser = pi._reactParser;
455
- if (sharedParser?.ALL_DIALECT_PATTERNS) {
456
- for (const dp of sharedParser.ALL_DIALECT_PATTERNS) {
457
- const result = sharedParser.parseReactWithPatterns(content, dp, true);
458
- if (result) {
459
- let toolName = result.name;
460
- let argsStr;
461
- const rawArgs = result.args ? JSON.stringify(result.args) : "";
462
- if (rawArgs && rawArgs !== "{}") {
463
- argsStr = rawArgs;
464
- } else if (result.raw) {
465
- argsStr = extractBraceJson(result.raw);
466
- } else {
467
- argsStr = "";
468
- }
469
- parsedResult = { name: toolName, args: argsStr, thought: result.thought || "", dialect: result.dialect };
470
- break;
471
- }
472
- }
473
- } else {
474
- for (const dp of ALL_DIALECT_PATTERNS) {
475
- const result = parseReactWithPatterns(content, dp, true);
476
- if (result) {
477
- let argsStr;
478
- const rawArgs = result.args ? JSON.stringify(result.args) : "";
479
- if (rawArgs && rawArgs !== "{}") {
480
- argsStr = rawArgs;
481
- } else if (result.raw) {
482
- argsStr = extractBraceJson(result.raw);
483
- } else {
484
- argsStr = "";
485
- }
486
- parsedResult = { name: result.name, args: argsStr, thought: result.thought || "", dialect: result.dialect };
487
- break;
446
+ for (const dp of ALL_DIALECT_PATTERNS) {
447
+ const result = parseReactWithPatterns(content, dp, true);
448
+ if (result) {
449
+ let argsStr;
450
+ const rawArgs = result.args ? JSON.stringify(result.args) : "";
451
+ if (rawArgs && rawArgs !== "{}") {
452
+ argsStr = rawArgs;
453
+ } else if (result.raw) {
454
+ argsStr = extractBraceJson(result.raw);
455
+ } else {
456
+ argsStr = "";
488
457
  }
458
+ parsedResult = { name: result.name, args: argsStr, thought: result.thought || "", dialect: result.dialect };
459
+ break;
489
460
  }
490
461
  }
491
462
  if (parsedResult) {
@@ -576,7 +547,7 @@ function model_test_temp_default(pi) {
576
547
  try {
577
548
  const start = Date.now();
578
549
  const controller = new AbortController();
579
- const timeoutId = setTimeout(() => controller.abort(), 13e4);
550
+ const timeoutId = setTimeout(() => controller.abort(), effectiveConfig.TOOL_SUPPORT_TIMEOUT_MS);
580
551
  const res = await fetch(`${ollamaBase()}/api/chat`, {
581
552
  method: "POST",
582
553
  headers: { "Content-Type": "application/json" },
@@ -607,7 +578,8 @@ function model_test_temp_default(pi) {
607
578
  try {
608
579
  const args = typeof fn.arguments === "string" ? JSON.parse(fn.arguments) : fn.arguments || {};
609
580
  argsStr = JSON.stringify(args);
610
- } catch {
581
+ } catch (err) {
582
+ debugLog("model-test", "failed to parse tool call arguments", err);
611
583
  argsStr = String(fn.arguments);
612
584
  }
613
585
  const level2 = "native";
@@ -619,41 +591,14 @@ function model_test_temp_default(pi) {
619
591
  elapsedMs
620
592
  };
621
593
  }
622
- const reactPatterns = [
623
- // Classic ReAct
624
- /^\s*Action:\s*/im,
625
- /^\s*Action Input:\s*/im,
626
- /^\s*Thought:\s*/im,
627
- /Action:\s*\w+/i,
628
- /Action Input:\s*\{/i,
629
- // Function dialect
630
- /^\s*Function:\s*/im,
631
- /^\s*Function Input:\s*/im,
632
- /Function:\s*\w+/i,
633
- // Tool dialect
634
- /^\s*Tool:\s*/im,
635
- /^\s*Tool Input:\s*/im,
636
- /Tool:\s*\w+/i,
637
- // Call dialect
638
- /^\s*Call:\s*/im,
639
- /^\s*Input:\s*/im,
640
- /Call:\s*\w+/i
641
- ];
642
- const matchedPatterns = [];
643
- for (const p of reactPatterns) {
644
- if (p.test(content)) matchedPatterns.push(p.source);
645
- }
646
- if (matchedPatterns.length > 0) {
647
- let dialectName = "react";
648
- if (/Function:/i.test(content)) dialectName = "function";
649
- else if (/Tool:/i.test(content)) dialectName = "tool";
650
- else if (/Call:/i.test(content)) dialectName = "call";
594
+ const detectedDialect = detectReactDialect(content);
595
+ if (detectedDialect) {
651
596
  const level2 = "react";
652
597
  cacheToolSupport(model, level2, family);
653
598
  return {
654
599
  level: level2,
655
600
  cached: false,
656
- evidence: `ReAct format detected (${dialectName} dialect) in text response`,
601
+ evidence: `ReAct format detected (${detectedDialect.name} dialect) in text response`,
657
602
  elapsedMs
658
603
  };
659
604
  }
@@ -697,7 +642,8 @@ function model_test_temp_default(pi) {
697
642
  if (!res.ok) return [];
698
643
  const data = await res.json();
699
644
  return (data.models || []).map((m) => m.name).filter(Boolean);
700
- } catch {
645
+ } catch (err) {
646
+ debugLog("model-test", "failed to list Ollama models", err);
701
647
  return [];
702
648
  }
703
649
  }
@@ -706,43 +652,44 @@ function model_test_temp_default(pi) {
706
652
  }
707
653
  function updateModelsJsonReasoning(model, hasReasoning) {
708
654
  try {
655
+ const written = readModifyWriteModelsJson((config2) => {
656
+ for (const provider of Object.values(config2.providers || {})) {
657
+ const models = provider.models || [];
658
+ for (const m of models) {
659
+ if (m.id === model) {
660
+ const current = m.reasoning;
661
+ if (current === hasReasoning) {
662
+ return null;
663
+ }
664
+ m.reasoning = hasReasoning;
665
+ return config2;
666
+ }
667
+ }
668
+ }
669
+ return null;
670
+ });
671
+ if (!written) {
672
+ return { updated: false, message: `${model} not found in models.json \u2014 skipped` };
673
+ }
709
674
  const config = readModelsJson();
710
- let updated = false;
711
675
  for (const provider of Object.values(config.providers || {})) {
712
676
  const models = provider.models || [];
713
677
  for (const m of models) {
714
- if (m.id === model) {
715
- const current = m.reasoning;
716
- if (current === hasReasoning) {
717
- return { updated: false, message: `reasoning already "${hasReasoning}" for ${model} \u2014 no change` };
718
- }
719
- m.reasoning = hasReasoning;
720
- updated = true;
721
- break;
678
+ if (m.id === model && m.reasoning === hasReasoning) {
679
+ return { updated: false, message: `reasoning already "${hasReasoning}" for ${model} \u2014 no change` };
722
680
  }
723
681
  }
724
- if (updated) break;
725
682
  }
726
- if (!updated) {
727
- return { updated: false, message: `${model} not found in models.json \u2014 skipped` };
728
- }
729
- writeModelsJson(config);
730
683
  const action = hasReasoning ? "set reasoning: true" : "set reasoning: false";
731
- return { updated: true, message: `\u2705 Updated ${model}: ${action}` };
684
+ return { updated: true, message: `Updated ${model}: ${action}` };
732
685
  } catch (e) {
733
686
  return { updated: false, message: `Failed to update models.json: ${e.message}` };
734
687
  }
735
688
  }
736
- const branding = [
737
- ` \u26A1 Pi Model Benchmark v${EXTENSION_VERSION}`,
738
- ` Written by VTSTech`,
739
- ` GitHub: https://github.com/VTSTech`,
740
- ` Website: www.vts-tech.org`
741
- ].join("\n");
742
689
  async function testModelOllama(model, providerInfo, ctx) {
743
690
  const lines = [];
744
691
  const totalStart = Date.now();
745
- lines.push(branding);
692
+ lines.push(sharedBranding);
746
693
  lines.push(section(`MODEL: ${model}`));
747
694
  lines.push(info("Provider: Ollama (local/remote)"));
748
695
  const modelsJson = readModelsJson();
@@ -783,7 +730,8 @@ function model_test_temp_default(pi) {
783
730
  modelModified = modDate ? modDate.toLocaleDateString() : "unknown";
784
731
  }
785
732
  }
786
- } catch {
733
+ } catch (err) {
734
+ debugLog("model-test", "failed to fetch model metadata from /api/show", err);
787
735
  }
788
736
  const detectedFamily = detectModelFamily(model);
789
737
  lines.push(info(`Size: ${modelSize} | Params: ${modelParams} | Quant: ${modelQuant}`));
@@ -930,11 +878,10 @@ function model_test_temp_default(pi) {
930
878
  }
931
879
  lines.push(info(`Evidence: ${toolSupport.evidence}`));
932
880
  lines.push(info(`Cache: ${TOOL_SUPPORT_CACHE_PATH}`));
933
- lines.push(section("SUMMARY"));
934
881
  const totalMs = Date.now() - totalStart;
935
882
  const toolPass = tools.score === "STRONG" || tools.score === "MODERATE";
936
883
  const reactPass = react.score === "STRONG" || react.score === "MODERATE";
937
- const tests = [
884
+ const ollamaTests = [
938
885
  { name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
939
886
  { name: "Thinking", pass: thinking.supported, score: thinking.supported ? "YES" : "NO" },
940
887
  { name: "Tool Usage", pass: toolPass, score: tools.score },
@@ -942,23 +889,10 @@ function model_test_temp_default(pi) {
942
889
  { name: "Instructions", pass: instructions.pass, score: instructions.score },
943
890
  { name: "Tool Support", pass: toolSupport.level === "native" || toolSupport.level === "react", score: toolSupport.level.toUpperCase() }
944
891
  ];
945
- const passed = tests.filter((t) => t.pass).length;
946
- const total = tests.length;
947
- for (const t of tests) {
948
- lines.push(t.pass ? ok(`${t.name}: ${t.score}`) : fail(`${t.name}: ${t.score}`));
949
- }
950
- lines.push(info(`Total time: ${msHuman(totalMs)}`));
951
- lines.push(info(`Score: ${passed}/${total} tests passed`));
952
- lines.push(section("RECOMMENDATION"));
953
- if (passed === 6) {
954
- lines.push(ok(`${model} is a STRONG model \u2014 full capability`));
955
- } else if (passed >= 5) {
956
- lines.push(ok(`${model} is a GOOD model \u2014 most capabilities work`));
957
- } else if (passed >= 4) {
958
- lines.push(warn(`${model} is USABLE \u2014 some capabilities are limited`));
959
- } else {
960
- lines.push(fail(`${model} is WEAK \u2014 limited capabilities for agent use`));
961
- }
892
+ const passed = ollamaTests.filter((t) => t.pass).length;
893
+ const total = ollamaTests.length;
894
+ lines.push(...formatTestSummary(ollamaTests, totalMs));
895
+ lines.push(...formatRecommendation(model, passed, total));
962
896
  try {
963
897
  const historyEntry = {
964
898
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -985,14 +919,15 @@ function model_test_temp_default(pi) {
985
919
  lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
986
920
  }
987
921
  }
988
- } catch {
922
+ } catch (err) {
923
+ debugLog("model-test", "failed to save test history", err);
989
924
  }
990
925
  return lines.join("\n");
991
926
  }
992
927
  async function testModelProvider(providerInfo, model, ctx) {
993
928
  const lines = [];
994
929
  const totalStart = Date.now();
995
- lines.push(branding);
930
+ lines.push(sharedBranding);
996
931
  lines.push(section(`MODEL: ${model}`));
997
932
  lines.push(info(`Provider: ${providerInfo.name} (built-in)`));
998
933
  lines.push(info(`API: ${providerInfo.apiMode || "openai-completions"}`));
@@ -1097,31 +1032,17 @@ function model_test_temp_default(pi) {
1097
1032
  lines.push(warn("ReAct parsing test \u2014 only relevant for Ollama models without native tool calling"));
1098
1033
  lines.push(warn("Tool support detection \u2014 Ollama-specific tool support cache"));
1099
1034
  lines.push(warn("Model metadata \u2014 Ollama-specific /api/tags endpoint"));
1100
- lines.push(section("SUMMARY"));
1101
1035
  const totalMs = Date.now() - totalStart;
1102
- const tests = [
1036
+ const providerTests = [
1103
1037
  { name: "Connectivity", pass: connectivity.pass, score: connectivity.pass ? "OK" : "FAIL" },
1104
1038
  { name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
1105
1039
  { name: "Instructions", pass: instructions.pass, score: instructions.score },
1106
1040
  { name: "Tool Usage", pass: toolTest.pass, score: toolTest.score }
1107
1041
  ];
1108
- const passed = tests.filter((t) => t.pass).length;
1109
- const total = tests.length;
1110
- for (const t of tests) {
1111
- lines.push(t.pass ? ok(`${t.name}: ${t.score}`) : fail(`${t.name}: ${t.score}`));
1112
- }
1113
- lines.push(info(`Total time: ${msHuman(totalMs)}`));
1114
- lines.push(info(`Score: ${passed}/${total} tests passed`));
1115
- lines.push(section("RECOMMENDATION"));
1116
- if (passed === 4) {
1117
- lines.push(ok(`${model} is a STRONG model via ${providerInfo.name} \u2014 full capability`));
1118
- } else if (passed >= 3) {
1119
- lines.push(ok(`${model} is a GOOD model via ${providerInfo.name} \u2014 most capabilities work`));
1120
- } else if (passed >= 2) {
1121
- lines.push(warn(`${model} is USABLE via ${providerInfo.name} \u2014 some capabilities are limited`));
1122
- } else {
1123
- lines.push(fail(`${model} is WEAK via ${providerInfo.name} \u2014 limited capabilities for agent use`));
1124
- }
1042
+ const passed = providerTests.filter((t) => t.pass).length;
1043
+ const total = providerTests.length;
1044
+ lines.push(...formatTestSummary(providerTests, totalMs));
1045
+ lines.push(...formatRecommendation(model, passed, total, providerInfo.name));
1125
1046
  try {
1126
1047
  const historyEntry = {
1127
1048
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -1148,7 +1069,8 @@ function model_test_temp_default(pi) {
1148
1069
  lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
1149
1070
  }
1150
1071
  }
1151
- } catch {
1072
+ } catch (err) {
1073
+ debugLog("model-test", "failed to save provider test history", err);
1152
1074
  }
1153
1075
  return lines.join("\n");
1154
1076
  }
@@ -1168,7 +1090,8 @@ function model_test_temp_default(pi) {
1168
1090
  try {
1169
1091
  const models = await getOllamaModels();
1170
1092
  return models.map((m) => ({ label: m, description: `Test ${m}` })).filter((m) => m.label.startsWith(prefix));
1171
- } catch {
1093
+ } catch (err) {
1094
+ debugLog("model-test", "failed to get model completions", err);
1172
1095
  return [];
1173
1096
  }
1174
1097
  },
@@ -1188,7 +1111,8 @@ function model_test_temp_default(pi) {
1188
1111
  let models;
1189
1112
  try {
1190
1113
  models = await getOllamaModels();
1191
- } catch {
1114
+ } catch (err) {
1115
+ debugLog("model-test", "failed to list Ollama models for --all", err);
1192
1116
  ctx.ui.notify("Could not list Ollama models", "error");
1193
1117
  return;
1194
1118
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vtstech/pi-model-test",
3
- "version": "1.1.7",
3
+ "version": "1.1.8",
4
4
  "description": "Model benchmark/testing extension for Pi Coding Agent",
5
5
  "main": "model-test.js",
6
6
  "keywords": ["pi-extensions"],
@@ -14,7 +14,7 @@
14
14
  "url": "https://github.com/VTSTech/pi-coding-agent"
15
15
  },
16
16
  "dependencies": {
17
- "@vtstech/pi-shared": "1.1.7"
17
+ "@vtstech/pi-shared": "1.1.8"
18
18
  },
19
19
  "peerDependencies": {
20
20
  "@mariozechner/pi-coding-agent": ">=0.66"