npm - @vtstech/pi-model-test - Version diff - 1.1.8 → 1.1.9 - Mend

@vtstech/pi-model-test 1.1.8 → 1.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2)

package/model-test.js +58 -94
package/package.json +2 -2

package/model-test.js CHANGED

@@ -10,6 +10,7 @@ import {
   sanitizeForReport
 } from "@vtstech/pi-shared/format";
 import { getOllamaBaseUrl, detectModelFamily, readModelsJson, readModifyWriteModelsJson, fetchModelContextLength, detectProvider } from "@vtstech/pi-shared/ollama";
+import { debugLog } from "@vtstech/pi-shared/debug";
 import {
   ALL_DIALECT_PATTERNS,
   parseReactWithPatterns,
@@ -46,6 +47,57 @@ function model_test_temp_default(pi) {
       await new Promise((r) => setTimeout(r, effectiveConfig.TEST_DELAY_MS));
     }
   }
+  function reportScore(lines, score, descriptions, fallback) {
+    const desc = descriptions[score] || descriptions["*"] || `(${score})`;
+    if (score === "STRONG" || score === "MODERATE") {
+      lines.push(ok(desc));
+    } else if (score === "WEAK") {
+      lines.push(warn(desc));
+    } else if (score === "FAIL") {
+      lines.push(fail(desc));
+    } else {
+      lines.push(fail(fallback));
+    }
+  }
+  function reportReasoningScore(lines, result) {
+    reportScore(lines, result.score, {
+      STRONG: `Answer: ${result.answer} \u2014 Correct with clear reasoning (${result.score})`,
+      MODERATE: `Answer: ${result.answer} \u2014 Correct but weak reasoning (${result.score})`,
+      WEAK: `Answer: ${result.answer} \u2014 Reasoned but wrong answer (${result.score})`,
+      FAIL: `Answer: ${result.answer} \u2014 No reasoning detected (${result.score})`
+    }, `Error: ${result.reasoning.includes("<!DOCTYPE") || result.reasoning.includes("<html") ? result.reasoning.split("\n")[0].slice(0, 100) + "..." : truncate(result.reasoning, 300)}`);
+  }
+  function reportInstructionScore(lines, result) {
+    reportScore(lines, result.score, {
+      STRONG: `JSON output valid with correct values (${result.score})`,
+      MODERATE: `JSON output valid but some values incorrect (${result.score})`,
+      WEAK: `Partial JSON compliance (${result.score})`
+    }, `Failed to produce valid JSON (${result.score})`);
+  }
+  function reportToolScore(lines, result) {
+    if (result.score === "STRONG" || result.score === "MODERATE") {
+      lines.push(ok(`Tool call: ${result.toolCall} (${result.score})`));
+    } else if (result.score === "WEAK") {
+      lines.push(warn(`Tool call: ${result.toolCall} (${result.score}) \u2014 malformed call`));
+    } else if (result.score === "FAIL") {
+      const hasResponse = result.response && result.response.trim().length > 0;
+      lines.push(fail(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${result.score})`));
+    } else {
+      lines.push(fail(`Error: ${result.toolCall}`));
+    }
+    if (result.score === "STRONG" || result.score === "MODERATE" || result.score === "WEAK") {
+      if (result.response) {
+        lines.push(info(`Raw response: ${sanitizeForReport(result.response)}`));
+      }
+    } else if (result.score === "FAIL") {
+      const hasResponse = result.response && result.response.trim().length > 0;
+      if (hasResponse) {
+        lines.push(info(`Text response: ${sanitizeForReport(result.response)}`));
+      } else {
+        lines.push(info("Text response: (empty)"));
+      }
+    }
+  }
   function makeOllamaChatFn(useStreaming = true) {
     return async (model, messages, _options) => {
       const chatFn = useStreaming ? ollamaChatStream : ollamaChat;
@@ -741,18 +793,7 @@ function model_test_temp_default(pi) {
     lines.push(info("Testing..."));
     const reasoning = await testReasoning(model);
     lines.push(info(`Time: ${msHuman(reasoning.elapsedMs)}`));
-    if (reasoning.score === "STRONG") {
-      lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct with clear reasoning (${reasoning.score})`));
-    } else if (reasoning.score === "MODERATE") {
-      lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct but weak reasoning (${reasoning.score})`));
-    } else if (reasoning.score === "WEAK") {
-      lines.push(fail(`Answer: ${reasoning.answer} \u2014 Reasoned but wrong answer (${reasoning.score})`));
-    } else if (reasoning.score === "FAIL") {
-      lines.push(fail(`Answer: ${reasoning.answer} \u2014 No reasoning detected (${reasoning.score})`));
-    } else {
-      const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." : truncate(reasoning.reasoning, 300);
-      lines.push(fail(`Error: ${errMsg}`));
-    }
+    reportReasoningScore(lines, reasoning);
     lines.push(info(`Response: ${sanitizeForReport(reasoning.reasoning)}`));
     lines.push(section("THINKING TEST"));
     lines.push(info('Prompt: "Multiply 37 by 43. Explain your reasoning step by step."'));
@@ -775,32 +816,7 @@ function model_test_temp_default(pi) {
     await rateLimitDelay(lines);
     const tools = await testToolUsage(model);
     lines.push(info(`Time: ${msHuman(tools.elapsedMs)}`));
-    if (tools.score === "STRONG") {
-      lines.push(ok(`Tool call: ${tools.toolCall} (${tools.score})`));
-      if (tools.response) {
-        lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
-      }
-    } else if (tools.score === "MODERATE") {
-      lines.push(ok(`Tool call: ${tools.toolCall} (${tools.score})`));
-      if (tools.response) {
-        lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
-      }
-    } else if (tools.score === "WEAK") {
-      lines.push(warn(`Tool call: ${tools.toolCall} (${tools.score}) \u2014 malformed call`));
-      if (tools.response) {
-        lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
-      }
-    } else if (tools.score === "FAIL") {
-      const hasResponse = tools.response && tools.response.trim().length > 0;
-      lines.push(fail(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${tools.score})`));
-      if (hasResponse) {
-        lines.push(info(`Text response: ${sanitizeForReport(tools.response)}`));
-      } else {
-        lines.push(info("Text response: (empty)"));
-      }
-    } else {
-      lines.push(fail(`Error: ${tools.toolCall}`));
-    }
+    reportToolScore(lines, tools);
     lines.push(section("REACT PARSING TEST"));
     lines.push(info(`Prompt: "What's the weather in Tokyo?" (ReAct format, no native tools)`));
     lines.push(info("Testing..."));
@@ -837,15 +853,7 @@ function model_test_temp_default(pi) {
     await rateLimitDelay(lines);
     const instructions = await testInstructionFollowing(model);
     lines.push(info(`Time: ${msHuman(instructions.elapsedMs)}`));
-    if (instructions.score === "STRONG") {
-      lines.push(ok(`JSON output valid with correct values (${instructions.score})`));
-    } else if (instructions.score === "MODERATE") {
-      lines.push(ok(`JSON output valid but some values incorrect (${instructions.score})`));
-    } else if (instructions.score === "WEAK") {
-      lines.push(warn(`Partial JSON compliance (${instructions.score})`));
-    } else {
-      lines.push(fail(`Failed to produce valid JSON (${instructions.score})`));
-    }
+    reportInstructionScore(lines, instructions);
     lines.push(info(`Output: ${sanitizeForReport(instructions.output)}`));
     lines.push(section("TOOL SUPPORT DETECTION"));
     lines.push(info("Probing model for tool calling capability (native / ReAct / none)"));
@@ -966,18 +974,7 @@ function model_test_temp_default(pi) {
     await rateLimitDelay(lines);
     const reasoning = await testReasoningProvider(providerInfo, model);
     lines.push(info(`Time: ${msHuman(reasoning.elapsedMs)}`));
-    if (reasoning.score === "STRONG") {
-      lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct with clear reasoning (${reasoning.score})`));
-    } else if (reasoning.score === "MODERATE") {
-      lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct but weak reasoning (${reasoning.score})`));
-    } else if (reasoning.score === "WEAK") {
-      lines.push(fail(`Answer: ${reasoning.answer} \u2014 Reasoned but wrong answer (${reasoning.score})`));
-    } else if (reasoning.score === "FAIL") {
-      lines.push(fail(`Answer: ${reasoning.answer} \u2014 No reasoning detected (${reasoning.score})`));
-    } else {
-      const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." : truncate(reasoning.reasoning, 300);
-      lines.push(fail(`Error: ${errMsg}`));
-    }
+    reportReasoningScore(lines, reasoning);
     lines.push(info(`Response: ${sanitizeForReport(reasoning.reasoning)}`));
     lines.push(section("INSTRUCTION FOLLOWING TEST"));
     lines.push(info("Prompt: Respond with ONLY a JSON object with keys: name, can_count, sum (15+27), language"));
@@ -985,15 +982,7 @@ function model_test_temp_default(pi) {
     await rateLimitDelay(lines);
     const instructions = await testInstructionFollowingProvider(providerInfo, model);
     lines.push(info(`Time: ${msHuman(instructions.elapsedMs)}`));
-    if (instructions.score === "STRONG") {
-      lines.push(ok(`JSON output valid with correct values (${instructions.score})`));
-    } else if (instructions.score === "MODERATE") {
-      lines.push(ok(`JSON output valid but some values incorrect (${instructions.score})`));
-    } else if (instructions.score === "WEAK") {
-      lines.push(warn(`Partial JSON compliance (${instructions.score})`));
-    } else {
-      lines.push(fail(`Failed to produce valid JSON (${instructions.score})`));
-    }
+    reportInstructionScore(lines, instructions);
     lines.push(info(`Output: ${sanitizeForReport(instructions.output)}`));
     lines.push(section("TOOL USAGE TEST"));
     lines.push(info(`Prompt: "What's the weather in Paris?" (with get_weather tool available)`));
@@ -1001,32 +990,7 @@ function model_test_temp_default(pi) {
     await rateLimitDelay(lines);
     const toolTest = await testToolUsageProvider(providerInfo, model);
     lines.push(info(`Time: ${msHuman(toolTest.elapsedMs)}`));
-    if (toolTest.score === "STRONG") {
-      lines.push(ok(`Tool call: ${toolTest.toolCall} (${toolTest.score})`));
-      if (toolTest.response) {
-        lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
-      }
-    } else if (toolTest.score === "MODERATE") {
-      lines.push(ok(`Tool call: ${toolTest.toolCall} (${toolTest.score})`));
-      if (toolTest.response) {
-        lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
-      }
-    } else if (toolTest.score === "WEAK") {
-      lines.push(warn(`Tool call: ${toolTest.toolCall} (${toolTest.score}) \u2014 malformed call`));
-      if (toolTest.response) {
-        lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
-      }
-    } else if (toolTest.score === "FAIL") {
-      const hasResponse = toolTest.response && toolTest.response.trim().length > 0;
-      lines.push(fail(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${toolTest.score})`));
-      if (hasResponse) {
-        lines.push(info(`Text response: ${sanitizeForReport(toolTest.response)}`));
-      } else {
-        lines.push(info("Text response: (empty)"));
-      }
-    } else {
-      lines.push(fail(`Error: ${toolTest.toolCall}`));
-    }
+    reportToolScore(lines, toolTest);
     lines.push(section("SKIPPED TESTS (OLLAMA-ONLY)"));
     lines.push(warn("Thinking test \u2014 Ollama-specific think:true option and message.thinking field"));
     lines.push(warn("ReAct parsing test \u2014 only relevant for Ollama models without native tool calling"));

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@vtstech/pi-model-test",
-  "version": "1.1.8",
+  "version": "1.1.9",
   "description": "Model benchmark/testing extension for Pi Coding Agent",
   "main": "model-test.js",
   "keywords": ["pi-extensions"],
@@ -14,7 +14,7 @@
     "url": "https://github.com/VTSTech/pi-coding-agent"
   },
   "dependencies": {
-    "@vtstech/pi-shared": "1.1.8"
+    "@vtstech/pi-shared": "1.1.9"
   },
   "peerDependencies": {
     "@mariozechner/pi-coding-agent": ">=0.66"