npm - @vtstech/pi-model-test - Versions diffs - 1.1.6 → 1.1.8 - Mend

@vtstech/pi-model-test 1.1.6 → 1.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/README.md CHANGED Viewed

@@ -44,8 +44,11 @@ pi install "npm:@vtstech/pi-model-test"
 - Auto-detects Ollama vs cloud provider (OpenRouter, Anthropic, Google, OpenAI, Groq, DeepSeek, Mistral, xAI, Together, Fireworks, Cohere)
 - Uses native `fetch()` for all HTTP communication (no shell subprocess or curl dependency)
+- **Streaming Ollama chat** — uses `/api/chat` with `stream: true` for earlier timeout detection and reduced memory
 - Automatic remote Ollama URL resolution (reads from `models.json` on every call — picks up config changes immediately)
-- Timeout resilience with auto-retry on empty responses
+- Timeout resilience with exponential backoff retry on connection failures
+- **Configurable test parameters** — override timeouts, delays, temperature via `~/.pi/agent/model-test-config.json`
+- **Test history with regression detection** — tracks results at `~/.pi/agent/cache/model-test-history.json`, flags score degradation
 - Rate limit delay between tests (configurable)
 - Thinking model fallback (retries with `think: true`)
 - Tool support cache (`~/.pi/agent/cache/tool_support.json`)

package/model-test.js CHANGED Viewed

@@ -9,10 +9,12 @@ import {
   truncate,
   sanitizeForReport
 } from "@vtstech/pi-shared/format";
-import { getOllamaBaseUrl, detectModelFamily, readModelsJson, writeModelsJson, fetchModelContextLength, EXTENSION_VERSION, detectProvider } from "@vtstech/pi-shared/ollama";
+import { getOllamaBaseUrl, detectModelFamily, readModelsJson, readModifyWriteModelsJson, fetchModelContextLength, detectProvider } from "@vtstech/pi-shared/ollama";
 import {
   ALL_DIALECT_PATTERNS,
-  parseReactWithPatterns
+  parseReactWithPatterns,
+  detectReactDialect,
+  extractBraceJson
 } from "@vtstech/pi-shared/react-parser";
 import {
   CONFIG,
@@ -20,24 +22,34 @@ import {
   scoreReasoning,
   getCachedToolSupport,
   cacheToolSupport,
+  getEffectiveConfig,
+  appendTestHistory,
+  detectRegression,
   testToolUsageUnified,
   testReasoningUnified,
   testInstructionFollowingUnified,
   TOOL_SUPPORT_CACHE_PATH
 } from "@vtstech/pi-shared/model-test-utils";
+import {
+  branding as sharedBranding,
+  formatTestSummary,
+  formatRecommendation
+} from "@vtstech/pi-shared/test-report";
 function model_test_temp_default(pi) {
+  const effectiveConfig = getEffectiveConfig();
   function ollamaBase() {
     return getOllamaBaseUrl();
   }
   async function rateLimitDelay(lines) {
-    if (CONFIG.TEST_DELAY_MS > 0) {
-      lines.push(info(`Waiting ${msHuman(CONFIG.TEST_DELAY_MS)} to avoid rate limiting...`));
-      await new Promise((r) => setTimeout(r, CONFIG.TEST_DELAY_MS));
+    if (effectiveConfig.TEST_DELAY_MS > 0) {
+      lines.push(info(`Waiting ${msHuman(effectiveConfig.TEST_DELAY_MS)} to avoid rate limiting...`));
+      await new Promise((r) => setTimeout(r, effectiveConfig.TEST_DELAY_MS));
     }
   }
-  function makeOllamaChatFn() {
+  function makeOllamaChatFn(useStreaming = true) {
     return async (model, messages, _options) => {
-      const result = await ollamaChat(model, messages);
+      const chatFn = useStreaming ? ollamaChatStream : ollamaChat;
+      const result = await chatFn(model, messages);
       return {
         content: result.response?.message?.content || "",
         elapsedMs: result.elapsedMs,
@@ -154,6 +166,69 @@ function model_test_temp_default(pi) {
     }
     throw new Error("Unreachable");
   }
+  // Sends a chat request to Ollama's /api/chat endpoint with `stream: true` and
+  // assembles the newline-delimited JSON reply into a single response object.
+  //
+  // Parameters:
+  //   model     - model name placed in the request body.
+  //   messages  - chat messages placed in the request body.
+  //   options   - extra entries spread over the default request `options`
+  //               (num_predict, temperature); later keys win.
+  //   timeoutMs - the request is aborted after this many milliseconds.
+  // Returns { response, elapsedMs }, where response.message holds the
+  // concatenated `content` and `thinking` text of all streamed lines.
+  // Throws on a non-OK HTTP status, a response without a body, a reply that
+  // contains neither content nor thinking text, or a timeout.
+  //
+  // NOTE(review): the defaults are read from CONFIG, not from the enclosing
+  // `effectiveConfig` -- confirm whether user overrides should apply here too.
+  async function ollamaChatStream(model, messages, options = {}, timeoutMs = CONFIG.DEFAULT_TIMEOUT_MS) {
+    const body = { model, messages, stream: true, options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE, ...options } };
+    const url = `${ollamaBase()}/api/chat`;
+    const controller = new AbortController();
+    const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
+    const start = Date.now();
+    try {
+      const res = await fetch(url, {
+        method: "POST",
+        headers: { "Content-Type": "application/json" },
+        body: JSON.stringify(body),
+        signal: controller.signal
+      });
+      if (!res.ok) {
+        const errorText = await res.text().catch(() => "unknown error");
+        throw new Error(`Ollama API returned ${res.status}: ${truncate(errorText, 200)}`);
+      }
+      if (!res.body) {
+        throw new Error("Ollama streaming response has no body");
+      }
+      let messageContent = "";
+      let thinkingContent = "";
+      let done = false;
+      // Folds one complete line of the stream into the accumulated text.
+      // Blank lines are ignored; unparseable lines are logged and skipped.
+      const consumeLine = (line) => {
+        if (line.trim().length === 0) return;
+        try {
+          const parsed = JSON.parse(line);
+          if (parsed.message?.content) messageContent += parsed.message.content;
+          if (parsed.message?.thinking) thinkingContent += parsed.message.thinking;
+          if (parsed.done) done = true;
+        } catch (err) {
+          debugLog("model-test", "skipped malformed JSON chunk in streaming response", err);
+        }
+      };
+      const reader = res.body.getReader();
+      const decoder = new TextDecoder();
+      // Trailing partial line of the previous read. A read() result can end in
+      // the middle of a JSON line, so only text up to the last "\n" is parsed
+      // and the remainder is kept until the rest of the line arrives.
+      let pending = "";
+      while (!done) {
+        const { value, done: streamDone } = await reader.read();
+        if (streamDone) break;
+        pending += decoder.decode(value, { stream: true });
+        const lines = pending.split("\n");
+        // The last piece has no terminating newline yet; hold it back.
+        pending = lines.pop();
+        for (const line of lines) {
+          consumeLine(line);
+        }
+      }
+      if (!done) {
+        // Stream ended without a `done` line: flush bytes still buffered in the
+        // decoder and parse a final line that had no trailing newline.
+        pending += decoder.decode();
+        consumeLine(pending);
+      }
+      const elapsedMs = Date.now() - start;
+      if (!messageContent.trim() && !thinkingContent.trim()) {
+        throw new Error("Empty streaming response from Ollama");
+      }
+      const response = {
+        message: {
+          content: messageContent,
+          thinking: thinkingContent,
+          role: "assistant"
+        },
+        done: true
+      };
+      return { response, elapsedMs };
+    } catch (e) {
+      // An abort here is triggered by the timeout timer above; report it as such.
+      if (e instanceof Error && e.name === "AbortError") {
+        throw new Error(`Ollama API timed out after ${msHuman(timeoutMs)}`);
+      }
+      throw e;
+    } finally {
+      clearTimeout(timeoutId);
+    }
+  }
   async function providerChat(providerInfo, model, messages, options = {}) {
     const { baseUrl, apiKey } = providerInfo;
     const maxTokens = options.maxTokens ?? CONFIG.NUM_PREDICT;
@@ -368,73 +443,20 @@ function model_test_temp_default(pi) {
         return { pass: false, score: "FAIL", toolCall: "empty response", thought: "", response: "", elapsedMs };
       }
       let parsedResult = null;
-      const sharedParser = pi._reactParser;
-      if (sharedParser?.ALL_DIALECT_PATTERNS) {
-        for (const dp of sharedParser.ALL_DIALECT_PATTERNS) {
-          const result = sharedParser.parseReactWithPatterns(content, dp, true);
-          if (result) {
-            let toolName = result.name;
-            let argsStr;
-            const rawArgs = result.args ? JSON.stringify(result.args) : "";
-            if (rawArgs && rawArgs !== "{}") {
-              argsStr = rawArgs;
-            } else if (result.raw) {
-              const jsonStart = result.raw.indexOf("{");
-              if (jsonStart !== -1) {
-                let depth = 0, jsonEnd = -1;
-                for (let i = jsonStart; i < result.raw.length; i++) {
-                  if (result.raw[i] === "{") depth++;
-                  else if (result.raw[i] === "}") {
-                    depth--;
-                    if (depth === 0) {
-                      jsonEnd = i;
-                      break;
-                    }
-                  }
-                }
-                argsStr = jsonEnd !== -1 ? result.raw.slice(jsonStart, jsonEnd + 1) : "";
-              } else {
-                argsStr = "";
-              }
-            } else {
-              argsStr = "";
-            }
-            parsedResult = { name: toolName, args: argsStr, thought: result.thought || "", dialect: result.dialect };
-            break;
-          }
-        }
-      } else {
-        for (const dp of ALL_DIALECT_PATTERNS) {
-          const result = parseReactWithPatterns(content, dp, true);
-          if (result) {
-            let argsStr;
-            const rawArgs = result.args ? JSON.stringify(result.args) : "";
-            if (rawArgs && rawArgs !== "{}") {
-              argsStr = rawArgs;
-            } else if (result.raw) {
-              const jsonStart = result.raw.indexOf("{");
-              if (jsonStart !== -1) {
-                let depth = 0, jsonEnd = -1;
-                for (let i = jsonStart; i < result.raw.length; i++) {
-                  if (result.raw[i] === "{") depth++;
-                  else if (result.raw[i] === "}") {
-                    depth--;
-                    if (depth === 0) {
-                      jsonEnd = i;
-                      break;
-                    }
-                  }
-                }
-                argsStr = jsonEnd !== -1 ? result.raw.slice(jsonStart, jsonEnd + 1) : "";
-              } else {
-                argsStr = "";
-              }
-            } else {
-              argsStr = "";
-            }
-            parsedResult = { name: result.name, args: argsStr, thought: result.thought || "", dialect: result.dialect };
-            break;
+      for (const dp of ALL_DIALECT_PATTERNS) {
+        const result = parseReactWithPatterns(content, dp, true);
+        if (result) {
+          let argsStr;
+          const rawArgs = result.args ? JSON.stringify(result.args) : "";
+          if (rawArgs && rawArgs !== "{}") {
+            argsStr = rawArgs;
+          } else if (result.raw) {
+            argsStr = extractBraceJson(result.raw);
+          } else {
+            argsStr = "";
           }
+          parsedResult = { name: result.name, args: argsStr, thought: result.thought || "", dialect: result.dialect };
+          break;
         }
       }
       if (parsedResult) {
@@ -525,7 +547,7 @@ function model_test_temp_default(pi) {
     try {
       const start = Date.now();
       const controller = new AbortController();
-      const timeoutId = setTimeout(() => controller.abort(), 13e4);
+      const timeoutId = setTimeout(() => controller.abort(), effectiveConfig.TOOL_SUPPORT_TIMEOUT_MS);
       const res = await fetch(`${ollamaBase()}/api/chat`, {
         method: "POST",
         headers: { "Content-Type": "application/json" },
@@ -556,7 +578,8 @@ function model_test_temp_default(pi) {
         try {
           const args = typeof fn.arguments === "string" ? JSON.parse(fn.arguments) : fn.arguments || {};
           argsStr = JSON.stringify(args);
-        } catch {
+        } catch (err) {
+          debugLog("model-test", "failed to parse tool call arguments", err);
           argsStr = String(fn.arguments);
         }
         const level2 = "native";
@@ -568,41 +591,14 @@ function model_test_temp_default(pi) {
           elapsedMs
         };
       }
-      const reactPatterns = [
-        // Classic ReAct
-        /^\s*Action:\s*/im,
-        /^\s*Action Input:\s*/im,
-        /^\s*Thought:\s*/im,
-        /Action:\s*\w+/i,
-        /Action Input:\s*\{/i,
-        // Function dialect
-        /^\s*Function:\s*/im,
-        /^\s*Function Input:\s*/im,
-        /Function:\s*\w+/i,
-        // Tool dialect
-        /^\s*Tool:\s*/im,
-        /^\s*Tool Input:\s*/im,
-        /Tool:\s*\w+/i,
-        // Call dialect
-        /^\s*Call:\s*/im,
-        /^\s*Input:\s*/im,
-        /Call:\s*\w+/i
-      ];
-      const matchedPatterns = [];
-      for (const p of reactPatterns) {
-        if (p.test(content)) matchedPatterns.push(p.source);
-      }
-      if (matchedPatterns.length > 0) {
-        let dialectName = "react";
-        if (/Function:/i.test(content)) dialectName = "function";
-        else if (/Tool:/i.test(content)) dialectName = "tool";
-        else if (/Call:/i.test(content)) dialectName = "call";
+      const detectedDialect = detectReactDialect(content);
+      if (detectedDialect) {
         const level2 = "react";
         cacheToolSupport(model, level2, family);
         return {
           level: level2,
           cached: false,
-          evidence: `ReAct format detected (${dialectName} dialect) in text response`,
+          evidence: `ReAct format detected (${detectedDialect.name} dialect) in text response`,
           elapsedMs
         };
       }
@@ -646,7 +642,8 @@ function model_test_temp_default(pi) {
       if (!res.ok) return [];
       const data = await res.json();
       return (data.models || []).map((m) => m.name).filter(Boolean);
-    } catch {
+    } catch (err) {
+      debugLog("model-test", "failed to list Ollama models", err);
       return [];
     }
   }
@@ -655,43 +652,44 @@ function model_test_temp_default(pi) {
   }
   function updateModelsJsonReasoning(model, hasReasoning) {
     try {
+      const written = readModifyWriteModelsJson((config2) => {
+        for (const provider of Object.values(config2.providers || {})) {
+          const models = provider.models || [];
+          for (const m of models) {
+            if (m.id === model) {
+              const current = m.reasoning;
+              if (current === hasReasoning) {
+                return null;
+              }
+              m.reasoning = hasReasoning;
+              return config2;
+            }
+          }
+        }
+        return null;
+      });
+      if (!written) {
+        return { updated: false, message: `${model} not found in models.json \u2014 skipped` };
+      }
       const config = readModelsJson();
-      let updated = false;
       for (const provider of Object.values(config.providers || {})) {
         const models = provider.models || [];
         for (const m of models) {
-          if (m.id === model) {
-            const current = m.reasoning;
-            if (current === hasReasoning) {
-              return { updated: false, message: `reasoning already "${hasReasoning}" for ${model} \u2014 no change` };
-            }
-            m.reasoning = hasReasoning;
-            updated = true;
-            break;
+          if (m.id === model && m.reasoning === hasReasoning) {
+            return { updated: false, message: `reasoning already "${hasReasoning}" for ${model} \u2014 no change` };
           }
         }
-        if (updated) break;
-      }
-      if (!updated) {
-        return { updated: false, message: `${model} not found in models.json \u2014 skipped` };
       }
-      writeModelsJson(config);
       const action = hasReasoning ? "set reasoning: true" : "set reasoning: false";
-      return { updated: true, message: `\u2705 Updated ${model}: ${action}` };
+      return { updated: true, message: `Updated ${model}: ${action}` };
     } catch (e) {
       return { updated: false, message: `Failed to update models.json: ${e.message}` };
     }
   }
-  const branding = [
-    `  \u26A1 Pi Model Benchmark v${EXTENSION_VERSION}`,
-    `  Written by VTSTech`,
-    `  GitHub: https://github.com/VTSTech`,
-    `  Website: www.vts-tech.org`
-  ].join("\n");
   async function testModelOllama(model, providerInfo, ctx) {
     const lines = [];
     const totalStart = Date.now();
-    lines.push(branding);
+    lines.push(sharedBranding);
     lines.push(section(`MODEL: ${model}`));
     lines.push(info("Provider: Ollama (local/remote)"));
     const modelsJson = readModelsJson();
@@ -732,7 +730,8 @@ function model_test_temp_default(pi) {
           modelModified = modDate ? modDate.toLocaleDateString() : "unknown";
         }
       }
-    } catch {
+    } catch (err) {
+      debugLog("model-test", "failed to fetch model metadata from /api/show", err);
     }
     const detectedFamily = detectModelFamily(model);
     lines.push(info(`Size: ${modelSize}  |  Params: ${modelParams}  |  Quant: ${modelQuant}`));
@@ -879,11 +878,10 @@ function model_test_temp_default(pi) {
     }
     lines.push(info(`Evidence: ${toolSupport.evidence}`));
     lines.push(info(`Cache: ${TOOL_SUPPORT_CACHE_PATH}`));
-    lines.push(section("SUMMARY"));
     const totalMs = Date.now() - totalStart;
     const toolPass = tools.score === "STRONG" || tools.score === "MODERATE";
     const reactPass = react.score === "STRONG" || react.score === "MODERATE";
-    const tests = [
+    const ollamaTests = [
       { name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
       { name: "Thinking", pass: thinking.supported, score: thinking.supported ? "YES" : "NO" },
       { name: "Tool Usage", pass: toolPass, score: tools.score },
@@ -891,29 +889,45 @@ function model_test_temp_default(pi) {
       { name: "Instructions", pass: instructions.pass, score: instructions.score },
       { name: "Tool Support", pass: toolSupport.level === "native" || toolSupport.level === "react", score: toolSupport.level.toUpperCase() }
     ];
-    const passed = tests.filter((t) => t.pass).length;
-    const total = tests.length;
-    for (const t of tests) {
-      lines.push(t.pass ? ok(`${t.name}: ${t.score}`) : fail(`${t.name}: ${t.score}`));
-    }
-    lines.push(info(`Total time: ${msHuman(totalMs)}`));
-    lines.push(info(`Score: ${passed}/${total} tests passed`));
-    lines.push(section("RECOMMENDATION"));
-    if (passed === 6) {
-      lines.push(ok(`${model} is a STRONG model \u2014 full capability`));
-    } else if (passed >= 5) {
-      lines.push(ok(`${model} is a GOOD model \u2014 most capabilities work`));
-    } else if (passed >= 4) {
-      lines.push(warn(`${model} is USABLE \u2014 some capabilities are limited`));
-    } else {
-      lines.push(fail(`${model} is WEAK \u2014 limited capabilities for agent use`));
+    const passed = ollamaTests.filter((t) => t.pass).length;
+    const total = ollamaTests.length;
+    lines.push(...formatTestSummary(ollamaTests, totalMs));
+    lines.push(...formatRecommendation(model, passed, total));
+    try {
+      const historyEntry = {
+        timestamp: (/* @__PURE__ */ new Date()).toISOString(),
+        model,
+        providerKind: "ollama",
+        providerName: providerName || "ollama",
+        tests: {
+          reasoning: { score: reasoning.score, pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", answer: reasoning.answer },
+          thinking: { supported: thinking.supported },
+          toolUsage: { score: tools.score, pass: tools.score === "STRONG" || tools.score === "MODERATE", toolCall: tools.toolCall },
+          reactParsing: { score: react.score, pass: react.score === "STRONG" || react.score === "MODERATE", toolCall: react.toolCall, dialect: react.dialect },
+          instructionFollowing: { score: instructions.score, pass: instructions.pass },
+          toolSupport: { level: toolSupport.level, evidence: toolSupport.evidence }
+        },
+        passedCount: passed,
+        totalCount: total,
+        totalMs
+      };
+      appendTestHistory(historyEntry);
+      const regressions = detectRegression(model, historyEntry);
+      if (regressions.length > 0) {
+        lines.push(section("REGRESSION DETECTED"));
+        for (const reg of regressions) {
+          lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
+        }
+      }
+    } catch (err) {
+      debugLog("model-test", "failed to save test history", err);
     }
     return lines.join("\n");
   }
   async function testModelProvider(providerInfo, model, ctx) {
     const lines = [];
     const totalStart = Date.now();
-    lines.push(branding);
+    lines.push(sharedBranding);
     lines.push(section(`MODEL: ${model}`));
     lines.push(info(`Provider: ${providerInfo.name} (built-in)`));
     lines.push(info(`API: ${providerInfo.apiMode || "openai-completions"}`));
@@ -1018,30 +1032,45 @@ function model_test_temp_default(pi) {
     lines.push(warn("ReAct parsing test \u2014 only relevant for Ollama models without native tool calling"));
     lines.push(warn("Tool support detection \u2014 Ollama-specific tool support cache"));
     lines.push(warn("Model metadata \u2014 Ollama-specific /api/tags endpoint"));
-    lines.push(section("SUMMARY"));
     const totalMs = Date.now() - totalStart;
-    const tests = [
+    const providerTests = [
       { name: "Connectivity", pass: connectivity.pass, score: connectivity.pass ? "OK" : "FAIL" },
       { name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
       { name: "Instructions", pass: instructions.pass, score: instructions.score },
       { name: "Tool Usage", pass: toolTest.pass, score: toolTest.score }
     ];
-    const passed = tests.filter((t) => t.pass).length;
-    const total = tests.length;
-    for (const t of tests) {
-      lines.push(t.pass ? ok(`${t.name}: ${t.score}`) : fail(`${t.name}: ${t.score}`));
-    }
-    lines.push(info(`Total time: ${msHuman(totalMs)}`));
-    lines.push(info(`Score: ${passed}/${total} tests passed`));
-    lines.push(section("RECOMMENDATION"));
-    if (passed === 4) {
-      lines.push(ok(`${model} is a STRONG model via ${providerInfo.name} \u2014 full capability`));
-    } else if (passed >= 3) {
-      lines.push(ok(`${model} is a GOOD model via ${providerInfo.name} \u2014 most capabilities work`));
-    } else if (passed >= 2) {
-      lines.push(warn(`${model} is USABLE via ${providerInfo.name} \u2014 some capabilities are limited`));
-    } else {
-      lines.push(fail(`${model} is WEAK via ${providerInfo.name} \u2014 limited capabilities for agent use`));
+    const passed = providerTests.filter((t) => t.pass).length;
+    const total = providerTests.length;
+    lines.push(...formatTestSummary(providerTests, totalMs));
+    lines.push(...formatRecommendation(model, passed, total, providerInfo.name));
+    try {
+      const historyEntry = {
+        timestamp: (/* @__PURE__ */ new Date()).toISOString(),
+        model,
+        providerKind: "builtin",
+        providerName: providerInfo.name,
+        tests: {
+          reasoning: { score: reasoning.score, pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", answer: reasoning.answer },
+          thinking: { supported: false },
+          toolUsage: { score: toolTest.score, pass: toolTest.pass, toolCall: toolTest.toolCall },
+          reactParsing: { score: "SKIP", pass: false, toolCall: "n/a" },
+          instructionFollowing: { score: instructions.score, pass: instructions.pass },
+          toolSupport: { level: "native", evidence: "provider-native (not probed)" }
+        },
+        passedCount: passed,
+        totalCount: total,
+        totalMs
+      };
+      appendTestHistory(historyEntry);
+      const regressions = detectRegression(model, historyEntry);
+      if (regressions.length > 0) {
+        lines.push(section("REGRESSION DETECTED"));
+        for (const reg of regressions) {
+          lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
+        }
+      }
+    } catch (err) {
+      debugLog("model-test", "failed to save provider test history", err);
     }
     return lines.join("\n");
   }
@@ -1061,7 +1090,8 @@ function model_test_temp_default(pi) {
       try {
         const models = await getOllamaModels();
         return models.map((m) => ({ label: m, description: `Test ${m}` })).filter((m) => m.label.startsWith(prefix));
-      } catch {
+      } catch (err) {
+        debugLog("model-test", "failed to get model completions", err);
         return [];
       }
     },
@@ -1081,7 +1111,8 @@ function model_test_temp_default(pi) {
         let models;
         try {
           models = await getOllamaModels();
-        } catch {
+        } catch (err) {
+          debugLog("model-test", "failed to list Ollama models for --all", err);
           ctx.ui.notify("Could not list Ollama models", "error");
           return;
         }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@vtstech/pi-model-test",
-  "version": "1.1.6",
+  "version": "1.1.8",
   "description": "Model benchmark/testing extension for Pi Coding Agent",
   "main": "model-test.js",
   "keywords": ["pi-extensions"],
@@ -14,7 +14,7 @@
     "url": "https://github.com/VTSTech/pi-coding-agent"
   },
   "dependencies": {
-    "@vtstech/pi-shared": "1.1.6"
+    "@vtstech/pi-shared": "1.1.8"
   },
   "peerDependencies": {
     "@mariozechner/pi-coding-agent": ">=0.66"