@vtstech/pi-model-test 1.1.8 → 1.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/model-test.js +58 -94
  2. package/package.json +2 -2
package/model-test.js CHANGED
@@ -10,6 +10,7 @@ import {
10
10
  sanitizeForReport
11
11
  } from "@vtstech/pi-shared/format";
12
12
  import { getOllamaBaseUrl, detectModelFamily, readModelsJson, readModifyWriteModelsJson, fetchModelContextLength, detectProvider } from "@vtstech/pi-shared/ollama";
13
+ import { debugLog } from "@vtstech/pi-shared/debug";
13
14
  import {
14
15
  ALL_DIALECT_PATTERNS,
15
16
  parseReactWithPatterns,
@@ -46,6 +47,57 @@ function model_test_temp_default(pi) {
46
47
  await new Promise((r) => setTimeout(r, effectiveConfig.TEST_DELAY_MS));
47
48
  }
48
49
  }
50
/**
 * Append a single verdict line for a graded test result.
 *
 * STRONG and MODERATE are rendered through `ok`, WEAK through `warn` and
 * FAIL through `fail`. Any other score is rendered through `fail` using
 * `fallback`.
 *
 * @param {Array} lines - Report lines collected so far; mutated in place.
 * @param {string} score - Score label of the test result.
 * @param {Object} descriptions - Message per score label; a "*" entry acts
 *   as a wildcard for labels that have no entry of their own.
 * @param {string} fallback - Message for unrecognised scores, and for FAIL
 *   when `descriptions` has neither a FAIL nor a "*" entry.
 */
function reportScore(lines, score, descriptions, fallback) {
  const described = descriptions[score] || descriptions["*"];
  const placeholder = `(${score})`;
  switch (score) {
    case "STRONG":
    case "MODERATE":
      lines.push(ok(described || placeholder));
      break;
    case "WEAK":
      lines.push(warn(described || placeholder));
      break;
    case "FAIL":
      // A caller may leave FAIL out of `descriptions` and rely on `fallback`
      // for its failure wording; without this the line would degrade to the
      // bare "(FAIL)" placeholder and the caller's message would be lost.
      lines.push(fail(described || fallback || placeholder));
      break;
    default:
      lines.push(fail(fallback));
  }
}
62
/**
 * Append the verdict line for a reasoning test result.
 *
 * Builds one message per score label and hands them to `reportScore`,
 * together with an "Error: ..." fallback derived from `result.reasoning`.
 *
 * @param {Array} lines - Report lines collected so far; mutated in place.
 * @param {Object} result - Reasoning test result; `score`, `answer` and
 *   `reasoning` are read.
 */
function reportReasoningScore(lines, result) {
  const score = result.score;
  const answer = result.answer;
  // The fallback text is built for every score, not only for errors, so a
  // missing or non-string `reasoning` must not throw on a passing result.
  const raw = result.reasoning == null ? "" : String(result.reasoning);
  // An HTML payload is cut down to the first 100 characters of its first
  // line; anything else is passed through `truncate` with a limit of 300.
  const looksLikeHtml = raw.includes("<!DOCTYPE") || raw.includes("<html");
  const errorText = looksLikeHtml
    ? raw.split("\n")[0].slice(0, 100) + "..."
    : truncate(raw, 300);
  reportScore(lines, score, {
    STRONG: `Answer: ${answer} \u2014 Correct with clear reasoning (${score})`,
    MODERATE: `Answer: ${answer} \u2014 Correct but weak reasoning (${score})`,
    WEAK: `Answer: ${answer} \u2014 Reasoned but wrong answer (${score})`,
    FAIL: `Answer: ${answer} \u2014 No reasoning detected (${score})`
  }, `Error: ${errorText}`);
}
70
/**
 * Append the verdict line for an instruction-following (JSON output) test.
 *
 * Supplies messages for STRONG, MODERATE and WEAK only; the "Failed to
 * produce valid JSON" text is passed to `reportScore` as its fallback.
 *
 * @param {Array} lines - Report lines collected so far; mutated in place.
 * @param {Object} result - Test result; only `score` is read.
 */
function reportInstructionScore(lines, result) {
  const grade = result.score;
  const messages = {
    STRONG: `JSON output valid with correct values (${grade})`,
    MODERATE: `JSON output valid but some values incorrect (${grade})`,
    WEAK: `Partial JSON compliance (${grade})`
  };
  const failureMessage = `Failed to produce valid JSON (${grade})`;
  reportScore(lines, grade, messages, failureMessage);
}
77
/**
 * Append the verdict line for a tool-usage test, followed by the model's
 * response when there is one to show.
 *
 * - STRONG / MODERATE: `ok` verdict, then the raw response if present.
 * - WEAK: `warn` verdict flagged as a malformed call, then the raw response
 *   if present.
 * - FAIL: `fail` verdict stating whether the model answered in text or
 *   returned nothing, then the text response or "(empty)".
 * - anything else: `fail` verdict carrying `result.toolCall` as the error
 *   text; no response line is added.
 *
 * @param {Array} lines - Report lines collected so far; mutated in place.
 * @param {Object} result - Test result; `score`, `toolCall` and `response`
 *   are read.
 */
function reportToolScore(lines, result) {
  const grade = result.score;

  // Shared tail for every score that produced a tool call.
  const pushRawResponse = () => {
    if (result.response) {
      lines.push(info(`Raw response: ${sanitizeForReport(result.response)}`));
    }
  };

  switch (grade) {
    case "STRONG":
    case "MODERATE":
      lines.push(ok(`Tool call: ${result.toolCall} (${grade})`));
      pushRawResponse();
      return;
    case "WEAK":
      lines.push(warn(`Tool call: ${result.toolCall} (${grade}) \u2014 malformed call`));
      pushRawResponse();
      return;
    case "FAIL": {
      // A whitespace-only response counts as empty.
      const answeredInText = result.response && result.response.trim().length > 0;
      const reason = answeredInText ? "model responded in text instead" : "model returned empty response";
      lines.push(fail(`Tool call: none \u2014 ${reason} (${grade})`));
      lines.push(
        answeredInText
          ? info(`Text response: ${sanitizeForReport(result.response)}`)
          : info("Text response: (empty)")
      );
      return;
    }
    default:
      lines.push(fail(`Error: ${result.toolCall}`));
  }
}
49
101
  function makeOllamaChatFn(useStreaming = true) {
50
102
  return async (model, messages, _options) => {
51
103
  const chatFn = useStreaming ? ollamaChatStream : ollamaChat;
@@ -741,18 +793,7 @@ function model_test_temp_default(pi) {
741
793
  lines.push(info("Testing..."));
742
794
  const reasoning = await testReasoning(model);
743
795
  lines.push(info(`Time: ${msHuman(reasoning.elapsedMs)}`));
744
- if (reasoning.score === "STRONG") {
745
- lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct with clear reasoning (${reasoning.score})`));
746
- } else if (reasoning.score === "MODERATE") {
747
- lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct but weak reasoning (${reasoning.score})`));
748
- } else if (reasoning.score === "WEAK") {
749
- lines.push(fail(`Answer: ${reasoning.answer} \u2014 Reasoned but wrong answer (${reasoning.score})`));
750
- } else if (reasoning.score === "FAIL") {
751
- lines.push(fail(`Answer: ${reasoning.answer} \u2014 No reasoning detected (${reasoning.score})`));
752
- } else {
753
- const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." : truncate(reasoning.reasoning, 300);
754
- lines.push(fail(`Error: ${errMsg}`));
755
- }
796
+ reportReasoningScore(lines, reasoning);
756
797
  lines.push(info(`Response: ${sanitizeForReport(reasoning.reasoning)}`));
757
798
  lines.push(section("THINKING TEST"));
758
799
  lines.push(info('Prompt: "Multiply 37 by 43. Explain your reasoning step by step."'));
@@ -775,32 +816,7 @@ function model_test_temp_default(pi) {
775
816
  await rateLimitDelay(lines);
776
817
  const tools = await testToolUsage(model);
777
818
  lines.push(info(`Time: ${msHuman(tools.elapsedMs)}`));
778
- if (tools.score === "STRONG") {
779
- lines.push(ok(`Tool call: ${tools.toolCall} (${tools.score})`));
780
- if (tools.response) {
781
- lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
782
- }
783
- } else if (tools.score === "MODERATE") {
784
- lines.push(ok(`Tool call: ${tools.toolCall} (${tools.score})`));
785
- if (tools.response) {
786
- lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
787
- }
788
- } else if (tools.score === "WEAK") {
789
- lines.push(warn(`Tool call: ${tools.toolCall} (${tools.score}) \u2014 malformed call`));
790
- if (tools.response) {
791
- lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
792
- }
793
- } else if (tools.score === "FAIL") {
794
- const hasResponse = tools.response && tools.response.trim().length > 0;
795
- lines.push(fail(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${tools.score})`));
796
- if (hasResponse) {
797
- lines.push(info(`Text response: ${sanitizeForReport(tools.response)}`));
798
- } else {
799
- lines.push(info("Text response: (empty)"));
800
- }
801
- } else {
802
- lines.push(fail(`Error: ${tools.toolCall}`));
803
- }
819
+ reportToolScore(lines, tools);
804
820
  lines.push(section("REACT PARSING TEST"));
805
821
  lines.push(info(`Prompt: "What's the weather in Tokyo?" (ReAct format, no native tools)`));
806
822
  lines.push(info("Testing..."));
@@ -837,15 +853,7 @@ function model_test_temp_default(pi) {
837
853
  await rateLimitDelay(lines);
838
854
  const instructions = await testInstructionFollowing(model);
839
855
  lines.push(info(`Time: ${msHuman(instructions.elapsedMs)}`));
840
- if (instructions.score === "STRONG") {
841
- lines.push(ok(`JSON output valid with correct values (${instructions.score})`));
842
- } else if (instructions.score === "MODERATE") {
843
- lines.push(ok(`JSON output valid but some values incorrect (${instructions.score})`));
844
- } else if (instructions.score === "WEAK") {
845
- lines.push(warn(`Partial JSON compliance (${instructions.score})`));
846
- } else {
847
- lines.push(fail(`Failed to produce valid JSON (${instructions.score})`));
848
- }
856
+ reportInstructionScore(lines, instructions);
849
857
  lines.push(info(`Output: ${sanitizeForReport(instructions.output)}`));
850
858
  lines.push(section("TOOL SUPPORT DETECTION"));
851
859
  lines.push(info("Probing model for tool calling capability (native / ReAct / none)"));
@@ -966,18 +974,7 @@ function model_test_temp_default(pi) {
966
974
  await rateLimitDelay(lines);
967
975
  const reasoning = await testReasoningProvider(providerInfo, model);
968
976
  lines.push(info(`Time: ${msHuman(reasoning.elapsedMs)}`));
969
- if (reasoning.score === "STRONG") {
970
- lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct with clear reasoning (${reasoning.score})`));
971
- } else if (reasoning.score === "MODERATE") {
972
- lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct but weak reasoning (${reasoning.score})`));
973
- } else if (reasoning.score === "WEAK") {
974
- lines.push(fail(`Answer: ${reasoning.answer} \u2014 Reasoned but wrong answer (${reasoning.score})`));
975
- } else if (reasoning.score === "FAIL") {
976
- lines.push(fail(`Answer: ${reasoning.answer} \u2014 No reasoning detected (${reasoning.score})`));
977
- } else {
978
- const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." : truncate(reasoning.reasoning, 300);
979
- lines.push(fail(`Error: ${errMsg}`));
980
- }
977
+ reportReasoningScore(lines, reasoning);
981
978
  lines.push(info(`Response: ${sanitizeForReport(reasoning.reasoning)}`));
982
979
  lines.push(section("INSTRUCTION FOLLOWING TEST"));
983
980
  lines.push(info("Prompt: Respond with ONLY a JSON object with keys: name, can_count, sum (15+27), language"));
@@ -985,15 +982,7 @@ function model_test_temp_default(pi) {
985
982
  await rateLimitDelay(lines);
986
983
  const instructions = await testInstructionFollowingProvider(providerInfo, model);
987
984
  lines.push(info(`Time: ${msHuman(instructions.elapsedMs)}`));
988
- if (instructions.score === "STRONG") {
989
- lines.push(ok(`JSON output valid with correct values (${instructions.score})`));
990
- } else if (instructions.score === "MODERATE") {
991
- lines.push(ok(`JSON output valid but some values incorrect (${instructions.score})`));
992
- } else if (instructions.score === "WEAK") {
993
- lines.push(warn(`Partial JSON compliance (${instructions.score})`));
994
- } else {
995
- lines.push(fail(`Failed to produce valid JSON (${instructions.score})`));
996
- }
985
+ reportInstructionScore(lines, instructions);
997
986
  lines.push(info(`Output: ${sanitizeForReport(instructions.output)}`));
998
987
  lines.push(section("TOOL USAGE TEST"));
999
988
  lines.push(info(`Prompt: "What's the weather in Paris?" (with get_weather tool available)`));
@@ -1001,32 +990,7 @@ function model_test_temp_default(pi) {
1001
990
  await rateLimitDelay(lines);
1002
991
  const toolTest = await testToolUsageProvider(providerInfo, model);
1003
992
  lines.push(info(`Time: ${msHuman(toolTest.elapsedMs)}`));
1004
- if (toolTest.score === "STRONG") {
1005
- lines.push(ok(`Tool call: ${toolTest.toolCall} (${toolTest.score})`));
1006
- if (toolTest.response) {
1007
- lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
1008
- }
1009
- } else if (toolTest.score === "MODERATE") {
1010
- lines.push(ok(`Tool call: ${toolTest.toolCall} (${toolTest.score})`));
1011
- if (toolTest.response) {
1012
- lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
1013
- }
1014
- } else if (toolTest.score === "WEAK") {
1015
- lines.push(warn(`Tool call: ${toolTest.toolCall} (${toolTest.score}) \u2014 malformed call`));
1016
- if (toolTest.response) {
1017
- lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
1018
- }
1019
- } else if (toolTest.score === "FAIL") {
1020
- const hasResponse = toolTest.response && toolTest.response.trim().length > 0;
1021
- lines.push(fail(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${toolTest.score})`));
1022
- if (hasResponse) {
1023
- lines.push(info(`Text response: ${sanitizeForReport(toolTest.response)}`));
1024
- } else {
1025
- lines.push(info("Text response: (empty)"));
1026
- }
1027
- } else {
1028
- lines.push(fail(`Error: ${toolTest.toolCall}`));
1029
- }
993
+ reportToolScore(lines, toolTest);
1030
994
  lines.push(section("SKIPPED TESTS (OLLAMA-ONLY)"));
1031
995
  lines.push(warn("Thinking test \u2014 Ollama-specific think:true option and message.thinking field"));
1032
996
  lines.push(warn("ReAct parsing test \u2014 only relevant for Ollama models without native tool calling"));
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vtstech/pi-model-test",
3
- "version": "1.1.8",
3
+ "version": "1.1.9",
4
4
  "description": "Model benchmark/testing extension for Pi Coding Agent",
5
5
  "main": "model-test.js",
6
6
  "keywords": ["pi-extensions"],
@@ -14,7 +14,7 @@
14
14
  "url": "https://github.com/VTSTech/pi-coding-agent"
15
15
  },
16
16
  "dependencies": {
17
- "@vtstech/pi-shared": "1.1.8"
17
+ "@vtstech/pi-shared": "1.1.9"
18
18
  },
19
19
  "peerDependencies": {
20
20
  "@mariozechner/pi-coding-agent": ">=0.66"