npm - @vtstech/pi-model-test - Versions diffs - 1.1.6 → 1.1.7 - Mend

@vtstech/pi-model-test 1.1.6 → 1.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/README.md CHANGED Viewed

@@ -44,8 +44,11 @@ pi install "npm:@vtstech/pi-model-test"
 - Auto-detects Ollama vs cloud provider (OpenRouter, Anthropic, Google, OpenAI, Groq, DeepSeek, Mistral, xAI, Together, Fireworks, Cohere)
 - Uses native `fetch()` for all HTTP communication (no shell subprocess or curl dependency)
+- **Streaming Ollama chat** — uses `/api/chat` with `stream: true` for earlier timeout detection and reduced memory
 - Automatic remote Ollama URL resolution (reads from `models.json` on every call — picks up config changes immediately)
-- Timeout resilience with auto-retry on empty responses
+- Timeout resilience with exponential backoff retry on connection failures
+- **Configurable test parameters** — override timeouts, delays, temperature via `~/.pi/agent/model-test-config.json`
+- **Test history with regression detection** — tracks results at `~/.pi/agent/cache/model-test-history.json`, flags score degradation
 - Rate limit delay between tests (configurable)
 - Thinking model fallback (retries with `think: true`)
 - Tool support cache (`~/.pi/agent/cache/tool_support.json`)

package/model-test.js CHANGED Viewed

@@ -20,12 +20,16 @@ import {
   scoreReasoning,
   getCachedToolSupport,
   cacheToolSupport,
+  getEffectiveConfig,
+  appendTestHistory,
+  detectRegression,
   testToolUsageUnified,
   testReasoningUnified,
   testInstructionFollowingUnified,
   TOOL_SUPPORT_CACHE_PATH
 } from "@vtstech/pi-shared/model-test-utils";
 function model_test_temp_default(pi) {
+  const effectiveConfig = getEffectiveConfig();
   function ollamaBase() {
     return getOllamaBaseUrl();
   }
@@ -35,9 +39,10 @@ function model_test_temp_default(pi) {
       await new Promise((r) => setTimeout(r, CONFIG.TEST_DELAY_MS));
     }
   }
-  function makeOllamaChatFn() {
+  function makeOllamaChatFn(useStreaming = true) {
     return async (model, messages, _options) => {
-      const result = await ollamaChat(model, messages);
+      const chatFn = useStreaming ? ollamaChatStream : ollamaChat;
+      const result = await chatFn(model, messages);
       return {
         content: result.response?.message?.content || "",
         elapsedMs: result.elapsedMs,
@@ -154,6 +159,68 @@ function model_test_temp_default(pi) {
     }
     throw new Error("Unreachable");
   }
+  /**
+   * Chat with Ollama through `/api/chat` using `stream: true`, folding the
+   * newline-delimited JSON records of the reply into one assistant message.
+   *
+   * @param {string} model - Model name sent in the request body.
+   * @param {Array<object>} messages - Chat messages sent in the request body.
+   * @param {object} [options] - Generation options spread over the CONFIG
+   *   defaults for `num_predict` and `temperature`.
+   * @param {number} [timeoutMs] - Milliseconds before the request is aborted.
+   * @returns {Promise<{response: object, elapsedMs: number}>} The assembled
+   *   message (`content`, `thinking`, `role`) plus the wall-clock duration.
+   * @throws {Error} On a non-OK HTTP status, a missing response body, a reply
+   *   with neither content nor thinking text, or a timeout.
+   */
+  async function ollamaChatStream(model, messages, options = {}, timeoutMs = CONFIG.DEFAULT_TIMEOUT_MS) {
+    const body = { model, messages, stream: true, options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE, ...options } };
+    const url = `${ollamaBase()}/api/chat`;
+    const controller = new AbortController();
+    const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
+    const start = Date.now();
+    try {
+      const res = await fetch(url, {
+        method: "POST",
+        headers: { "Content-Type": "application/json" },
+        body: JSON.stringify(body),
+        signal: controller.signal
+      });
+      if (!res.ok) {
+        const errorText = await res.text().catch(() => "unknown error");
+        throw new Error(`Ollama API returned ${res.status}: ${truncate(errorText, 200)}`);
+      }
+      if (!res.body) {
+        throw new Error("Ollama streaming response has no body");
+      }
+      let messageContent = "";
+      let thinkingContent = "";
+      let done = false;
+      // Fold one NDJSON line into the accumulators. Lines that are not valid
+      // JSON are skipped on purpose: the stream is consumed best-effort.
+      const consumeLine = (line) => {
+        if (line.trim().length === 0) return;
+        let parsed;
+        try {
+          parsed = JSON.parse(line);
+        } catch {
+          return;
+        }
+        if (parsed?.message?.content) messageContent += parsed.message.content;
+        if (parsed?.message?.thinking) thinkingContent += parsed.message.thinking;
+        if (parsed?.done) done = true;
+      };
+      const reader = res.body.getReader();
+      const decoder = new TextDecoder();
+      // A network chunk can end in the middle of a JSON line, so the
+      // unterminated tail is carried over and completed by the next chunk.
+      let pending = "";
+      while (!done) {
+        const { value, done: streamDone } = await reader.read();
+        if (streamDone) {
+          // Flush bytes still held by the decoder and parse a final line
+          // that arrived without a trailing newline.
+          consumeLine(pending + decoder.decode());
+          break;
+        }
+        pending += decoder.decode(value, { stream: true });
+        const lines = pending.split("\n");
+        pending = lines.pop();
+        for (const line of lines) consumeLine(line);
+      }
+      // Reading may have stopped before the stream closed (a `done` record was
+      // seen); cancel so the connection is not left open. Failure is harmless.
+      void reader.cancel().catch(() => {});
+      const elapsedMs = Date.now() - start;
+      if (!messageContent.trim() && !thinkingContent.trim()) {
+        throw new Error("Empty streaming response from Ollama");
+      }
+      const response = {
+        message: {
+          content: messageContent,
+          thinking: thinkingContent,
+          role: "assistant"
+        },
+        done: true
+      };
+      return { response, elapsedMs };
+    } catch (e) {
+      // The abort is triggered only by the timeout timer above, so report it as a timeout.
+      if (e instanceof Error && e.name === "AbortError") {
+        throw new Error(`Ollama API timed out after ${msHuman(timeoutMs)}`);
+      }
+      throw e;
+    } finally {
+      clearTimeout(timeoutId);
+    }
+  }
   async function providerChat(providerInfo, model, messages, options = {}) {
     const { baseUrl, apiKey } = providerInfo;
     const maxTokens = options.maxTokens ?? CONFIG.NUM_PREDICT;
@@ -325,6 +392,22 @@ function model_test_temp_default(pi) {
   async function testToolUsageProvider(providerInfo, model) {
     return testToolUsageUnified(makeProviderChatFn(providerInfo), model);
   }
+  /**
+   * Extract the first balanced `{...}` span from `raw`.
+   *
+   * Scanning starts at the first "{" and stops at the "}" that brings the
+   * nesting depth back to zero. Braces inside double-quoted strings are
+   * ignored so a value such as `{"a": "}"}` is returned whole.
+   *
+   * @param {string} raw - Text that may contain a JSON object.
+   * @returns {string} The balanced span, or "" when `raw` is not a string,
+   *   has no "{", or the braces never balance.
+   */
+  function extractBraceJson(raw) {
+    if (typeof raw !== "string") return "";
+    const jsonStart = raw.indexOf("{");
+    if (jsonStart === -1) return "";
+    // Walk from the first "{" to its matching "}". When `respectStrings` is
+    // true, characters inside double-quoted strings do not affect the depth.
+    const scan = (respectStrings) => {
+      let depth = 0;
+      let inString = false;
+      let escaped = false;
+      for (let i = jsonStart; i < raw.length; i++) {
+        const ch = raw[i];
+        if (inString) {
+          if (escaped) escaped = false;
+          else if (ch === "\\") escaped = true;
+          else if (ch === '"') inString = false;
+          continue;
+        }
+        if (respectStrings && ch === '"') {
+          inString = true;
+        } else if (ch === "{") {
+          depth++;
+        } else if (ch === "}") {
+          depth--;
+          if (depth === 0) return raw.slice(jsonStart, i + 1);
+        }
+      }
+      return "";
+    };
+    // Prefer the string-aware match; fall back to plain brace counting so
+    // malformed text (e.g. an unterminated quote) still yields what it did before.
+    return scan(true) || scan(false);
+  }
   async function testReactParsing(model) {
     const systemPrompt = [
       "You are a helpful assistant with access to tools.",
@@ -379,23 +462,7 @@ function model_test_temp_default(pi) {
             if (rawArgs && rawArgs !== "{}") {
               argsStr = rawArgs;
             } else if (result.raw) {
-              const jsonStart = result.raw.indexOf("{");
-              if (jsonStart !== -1) {
-                let depth = 0, jsonEnd = -1;
-                for (let i = jsonStart; i < result.raw.length; i++) {
-                  if (result.raw[i] === "{") depth++;
-                  else if (result.raw[i] === "}") {
-                    depth--;
-                    if (depth === 0) {
-                      jsonEnd = i;
-                      break;
-                    }
-                  }
-                }
-                argsStr = jsonEnd !== -1 ? result.raw.slice(jsonStart, jsonEnd + 1) : "";
-              } else {
-                argsStr = "";
-              }
+              argsStr = extractBraceJson(result.raw);
             } else {
               argsStr = "";
             }
@@ -412,23 +479,7 @@ function model_test_temp_default(pi) {
             if (rawArgs && rawArgs !== "{}") {
               argsStr = rawArgs;
             } else if (result.raw) {
-              const jsonStart = result.raw.indexOf("{");
-              if (jsonStart !== -1) {
-                let depth = 0, jsonEnd = -1;
-                for (let i = jsonStart; i < result.raw.length; i++) {
-                  if (result.raw[i] === "{") depth++;
-                  else if (result.raw[i] === "}") {
-                    depth--;
-                    if (depth === 0) {
-                      jsonEnd = i;
-                      break;
-                    }
-                  }
-                }
-                argsStr = jsonEnd !== -1 ? result.raw.slice(jsonStart, jsonEnd + 1) : "";
-              } else {
-                argsStr = "";
-              }
+              argsStr = extractBraceJson(result.raw);
             } else {
               argsStr = "";
             }
@@ -908,6 +959,34 @@ function model_test_temp_default(pi) {
     } else {
       lines.push(fail(`${model} is WEAK \u2014 limited capabilities for agent use`));
     }
+    try {
+      const historyEntry = {
+        timestamp: (/* @__PURE__ */ new Date()).toISOString(),
+        model,
+        providerKind: "ollama",
+        providerName: providerName || "ollama",
+        tests: {
+          reasoning: { score: reasoning.score, pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", answer: reasoning.answer },
+          thinking: { supported: thinking.supported },
+          toolUsage: { score: tools.score, pass: tools.score === "STRONG" || tools.score === "MODERATE", toolCall: tools.toolCall },
+          reactParsing: { score: react.score, pass: react.score === "STRONG" || react.score === "MODERATE", toolCall: react.toolCall, dialect: react.dialect },
+          instructionFollowing: { score: instructions.score, pass: instructions.pass },
+          toolSupport: { level: toolSupport.level, evidence: toolSupport.evidence }
+        },
+        passedCount: passed,
+        totalCount: total,
+        totalMs
+      };
+      appendTestHistory(historyEntry);
+      const regressions = detectRegression(model, historyEntry);
+      if (regressions.length > 0) {
+        lines.push(section("REGRESSION DETECTED"));
+        for (const reg of regressions) {
+          lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
+        }
+      }
+    } catch {
+    }
     return lines.join("\n");
   }
   async function testModelProvider(providerInfo, model, ctx) {
@@ -1043,6 +1122,34 @@ function model_test_temp_default(pi) {
     } else {
       lines.push(fail(`${model} is WEAK via ${providerInfo.name} \u2014 limited capabilities for agent use`));
     }
+    try {
+      const historyEntry = {
+        timestamp: (/* @__PURE__ */ new Date()).toISOString(),
+        model,
+        providerKind: "builtin",
+        providerName: providerInfo.name,
+        tests: {
+          reasoning: { score: reasoning.score, pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", answer: reasoning.answer },
+          thinking: { supported: false },
+          toolUsage: { score: toolTest.score, pass: toolTest.pass, toolCall: toolTest.toolCall },
+          reactParsing: { score: "SKIP", pass: false, toolCall: "n/a" },
+          instructionFollowing: { score: instructions.score, pass: instructions.pass },
+          toolSupport: { level: "native", evidence: "provider-native (not probed)" }
+        },
+        passedCount: passed,
+        totalCount: total,
+        totalMs
+      };
+      appendTestHistory(historyEntry);
+      const regressions = detectRegression(model, historyEntry);
+      if (regressions.length > 0) {
+        lines.push(section("REGRESSION DETECTED"));
+        for (const reg of regressions) {
+          lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
+        }
+      }
+    } catch {
+    }
     return lines.join("\n");
   }
   async function testModel(model, ctx) {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@vtstech/pi-model-test",
-  "version": "1.1.6",
+  "version": "1.1.7",
   "description": "Model benchmark/testing extension for Pi Coding Agent",
   "main": "model-test.js",
   "keywords": ["pi-extensions"],
@@ -14,7 +14,7 @@
     "url": "https://github.com/VTSTech/pi-coding-agent"
   },
   "dependencies": {
-    "@vtstech/pi-shared": "1.1.6"
+    "@vtstech/pi-shared": "1.1.7"
   },
   "peerDependencies": {
     "@mariozechner/pi-coding-agent": ">=0.66"