@vtstech/pi-model-test 1.1.8 → 1.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/model-test.js +58 -94
- package/package.json +2 -2
package/model-test.js
CHANGED
|
@@ -10,6 +10,7 @@ import {
|
|
|
10
10
|
sanitizeForReport
|
|
11
11
|
} from "@vtstech/pi-shared/format";
|
|
12
12
|
import { getOllamaBaseUrl, detectModelFamily, readModelsJson, readModifyWriteModelsJson, fetchModelContextLength, detectProvider } from "@vtstech/pi-shared/ollama";
|
|
13
|
+
import { debugLog } from "@vtstech/pi-shared/debug";
|
|
13
14
|
import {
|
|
14
15
|
ALL_DIALECT_PATTERNS,
|
|
15
16
|
parseReactWithPatterns,
|
|
@@ -46,6 +47,57 @@ function model_test_temp_default(pi) {
|
|
|
46
47
|
await new Promise((r) => setTimeout(r, effectiveConfig.TEST_DELAY_MS));
|
|
47
48
|
}
|
|
48
49
|
}
|
|
50
|
+
/**
 * Append a single verdict line for a scored test to the report buffer.
 *
 * STRONG and MODERATE are wrapped with `ok`, WEAK with `warn` and FAIL with
 * `fail`. Any other score is treated as an error and emits `fallback`
 * wrapped with `fail`.
 *
 * @param {Array} lines - Report buffer; the new line is pushed onto it.
 * @param {string} score - Score label of the test result.
 * @param {Object} descriptions - Message text keyed by score label; a "*"
 *   entry is used when the specific label has no entry.
 * @param {string} fallback - Text for unrecognised scores, and for
 *   recognised scores that have no description at all.
 */
function reportScore(lines, score, descriptions, fallback) {
  // A recognised score with neither its own nor a wildcard description used
  // to render as a bare "(SCORE)". The caller's fallback carries the real
  // message, so consult it first and keep "(SCORE)" only as a last resort.
  const desc = descriptions[score] || descriptions["*"] || fallback || `(${score})`;
  switch (score) {
    case "STRONG":
    case "MODERATE":
      lines.push(ok(desc));
      break;
    case "WEAK":
      lines.push(warn(desc));
      break;
    case "FAIL":
      lines.push(fail(desc));
      break;
    default:
      // Not a score label at all: report the caller-supplied error text.
      lines.push(fail(fallback));
  }
}
|
|
62
|
+
/**
 * Report the verdict of a reasoning test.
 *
 * Builds one message per score label from `result.answer` and
 * `result.score`, plus an error text derived from `result.reasoning`, and
 * hands them to `reportScore`.
 *
 * @param {Array} lines - Report buffer passed through to `reportScore`.
 * @param {Object} result - Test result; `score`, `answer` and `reasoning`
 *   are read.
 */
function reportReasoningScore(lines, result) {
  // The error text is computed for every call, including successful ones, so
  // normalise `reasoning` first: a missing or non-string value must not throw.
  const rawReasoning = result.reasoning;
  let reasoningText;
  if (typeof rawReasoning === "string") {
    reasoningText = rawReasoning;
  } else {
    reasoningText = rawReasoning == null ? "" : String(rawReasoning);
  }

  // When the text contains an HTML document marker, keep only the first 100
  // characters of its first line; otherwise truncate it to 300.
  const looksLikeHtml = reasoningText.includes("<!DOCTYPE") || reasoningText.includes("<html");
  const errorText = looksLikeHtml
    ? `${reasoningText.split("\n")[0].slice(0, 100)}...`
    : truncate(reasoningText, 300);

  const descriptions = {
    STRONG: `Answer: ${result.answer} \u2014 Correct with clear reasoning (${result.score})`,
    MODERATE: `Answer: ${result.answer} \u2014 Correct but weak reasoning (${result.score})`,
    WEAK: `Answer: ${result.answer} \u2014 Reasoned but wrong answer (${result.score})`,
    FAIL: `Answer: ${result.answer} \u2014 No reasoning detected (${result.score})`
  };
  reportScore(lines, result.score, descriptions, `Error: ${errorText}`);
}
|
|
70
|
+
/**
 * Report the verdict of an instruction-following (JSON output) test.
 *
 * @param {Array} lines - Report buffer passed through to `reportScore`.
 * @param {Object} result - Test result; only `score` is read.
 */
function reportInstructionScore(lines, result) {
  const failureText = `Failed to produce valid JSON (${result.score})`;
  const descriptions = {
    STRONG: `JSON output valid with correct values (${result.score})`,
    MODERATE: `JSON output valid but some values incorrect (${result.score})`,
    WEAK: `Partial JSON compliance (${result.score})`,
    // reportScore takes the FAIL text from this map, not from the fallback
    // argument; without this entry a FAIL result was reported as "(FAIL)".
    FAIL: failureText
  };
  reportScore(lines, result.score, descriptions, failureText);
}
|
|
77
|
+
/**
 * Report the verdict of a tool-usage test, followed by the model's response
 * where one is shown.
 *
 * - STRONG / MODERATE: `ok` headline, then the raw response if truthy.
 * - WEAK: `warn` headline flagged as malformed, then the raw response if truthy.
 * - FAIL: `fail` headline saying whether the model answered in text or
 *   returned nothing, then the text response or "(empty)".
 * - anything else: a single `fail` line carrying `result.toolCall`.
 *
 * @param {Array} lines - Report buffer; lines are pushed onto it.
 * @param {Object} result - Test result; `score`, `toolCall` and `response`
 *   are read.
 */
function reportToolScore(lines, result) {
  const { score, toolCall, response } = result;

  switch (score) {
    case "STRONG":
    case "MODERATE":
    case "WEAK": {
      const headline = score === "WEAK"
        ? warn(`Tool call: ${toolCall} (${score}) \u2014 malformed call`)
        : ok(`Tool call: ${toolCall} (${score})`);
      lines.push(headline);
      if (response) {
        lines.push(info(`Raw response: ${sanitizeForReport(response)}`));
      }
      return;
    }
    case "FAIL": {
      // A whitespace-only response counts as empty.
      const hasText = response && response.trim().length > 0;
      lines.push(fail(`Tool call: none \u2014 ${hasText ? "model responded in text instead" : "model returned empty response"} (${score})`));
      lines.push(info(hasText ? `Text response: ${sanitizeForReport(response)}` : "Text response: (empty)"));
      return;
    }
    default:
      lines.push(fail(`Error: ${toolCall}`));
  }
}
|
|
49
101
|
function makeOllamaChatFn(useStreaming = true) {
|
|
50
102
|
return async (model, messages, _options) => {
|
|
51
103
|
const chatFn = useStreaming ? ollamaChatStream : ollamaChat;
|
|
@@ -741,18 +793,7 @@ function model_test_temp_default(pi) {
|
|
|
741
793
|
lines.push(info("Testing..."));
|
|
742
794
|
const reasoning = await testReasoning(model);
|
|
743
795
|
lines.push(info(`Time: ${msHuman(reasoning.elapsedMs)}`));
|
|
744
|
-
|
|
745
|
-
lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct with clear reasoning (${reasoning.score})`));
|
|
746
|
-
} else if (reasoning.score === "MODERATE") {
|
|
747
|
-
lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct but weak reasoning (${reasoning.score})`));
|
|
748
|
-
} else if (reasoning.score === "WEAK") {
|
|
749
|
-
lines.push(fail(`Answer: ${reasoning.answer} \u2014 Reasoned but wrong answer (${reasoning.score})`));
|
|
750
|
-
} else if (reasoning.score === "FAIL") {
|
|
751
|
-
lines.push(fail(`Answer: ${reasoning.answer} \u2014 No reasoning detected (${reasoning.score})`));
|
|
752
|
-
} else {
|
|
753
|
-
const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." : truncate(reasoning.reasoning, 300);
|
|
754
|
-
lines.push(fail(`Error: ${errMsg}`));
|
|
755
|
-
}
|
|
796
|
+
reportReasoningScore(lines, reasoning);
|
|
756
797
|
lines.push(info(`Response: ${sanitizeForReport(reasoning.reasoning)}`));
|
|
757
798
|
lines.push(section("THINKING TEST"));
|
|
758
799
|
lines.push(info('Prompt: "Multiply 37 by 43. Explain your reasoning step by step."'));
|
|
@@ -775,32 +816,7 @@ function model_test_temp_default(pi) {
|
|
|
775
816
|
await rateLimitDelay(lines);
|
|
776
817
|
const tools = await testToolUsage(model);
|
|
777
818
|
lines.push(info(`Time: ${msHuman(tools.elapsedMs)}`));
|
|
778
|
-
|
|
779
|
-
lines.push(ok(`Tool call: ${tools.toolCall} (${tools.score})`));
|
|
780
|
-
if (tools.response) {
|
|
781
|
-
lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
|
|
782
|
-
}
|
|
783
|
-
} else if (tools.score === "MODERATE") {
|
|
784
|
-
lines.push(ok(`Tool call: ${tools.toolCall} (${tools.score})`));
|
|
785
|
-
if (tools.response) {
|
|
786
|
-
lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
|
|
787
|
-
}
|
|
788
|
-
} else if (tools.score === "WEAK") {
|
|
789
|
-
lines.push(warn(`Tool call: ${tools.toolCall} (${tools.score}) \u2014 malformed call`));
|
|
790
|
-
if (tools.response) {
|
|
791
|
-
lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
|
|
792
|
-
}
|
|
793
|
-
} else if (tools.score === "FAIL") {
|
|
794
|
-
const hasResponse = tools.response && tools.response.trim().length > 0;
|
|
795
|
-
lines.push(fail(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${tools.score})`));
|
|
796
|
-
if (hasResponse) {
|
|
797
|
-
lines.push(info(`Text response: ${sanitizeForReport(tools.response)}`));
|
|
798
|
-
} else {
|
|
799
|
-
lines.push(info("Text response: (empty)"));
|
|
800
|
-
}
|
|
801
|
-
} else {
|
|
802
|
-
lines.push(fail(`Error: ${tools.toolCall}`));
|
|
803
|
-
}
|
|
819
|
+
reportToolScore(lines, tools);
|
|
804
820
|
lines.push(section("REACT PARSING TEST"));
|
|
805
821
|
lines.push(info(`Prompt: "What's the weather in Tokyo?" (ReAct format, no native tools)`));
|
|
806
822
|
lines.push(info("Testing..."));
|
|
@@ -837,15 +853,7 @@ function model_test_temp_default(pi) {
|
|
|
837
853
|
await rateLimitDelay(lines);
|
|
838
854
|
const instructions = await testInstructionFollowing(model);
|
|
839
855
|
lines.push(info(`Time: ${msHuman(instructions.elapsedMs)}`));
|
|
840
|
-
|
|
841
|
-
lines.push(ok(`JSON output valid with correct values (${instructions.score})`));
|
|
842
|
-
} else if (instructions.score === "MODERATE") {
|
|
843
|
-
lines.push(ok(`JSON output valid but some values incorrect (${instructions.score})`));
|
|
844
|
-
} else if (instructions.score === "WEAK") {
|
|
845
|
-
lines.push(warn(`Partial JSON compliance (${instructions.score})`));
|
|
846
|
-
} else {
|
|
847
|
-
lines.push(fail(`Failed to produce valid JSON (${instructions.score})`));
|
|
848
|
-
}
|
|
856
|
+
reportInstructionScore(lines, instructions);
|
|
849
857
|
lines.push(info(`Output: ${sanitizeForReport(instructions.output)}`));
|
|
850
858
|
lines.push(section("TOOL SUPPORT DETECTION"));
|
|
851
859
|
lines.push(info("Probing model for tool calling capability (native / ReAct / none)"));
|
|
@@ -966,18 +974,7 @@ function model_test_temp_default(pi) {
|
|
|
966
974
|
await rateLimitDelay(lines);
|
|
967
975
|
const reasoning = await testReasoningProvider(providerInfo, model);
|
|
968
976
|
lines.push(info(`Time: ${msHuman(reasoning.elapsedMs)}`));
|
|
969
|
-
|
|
970
|
-
lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct with clear reasoning (${reasoning.score})`));
|
|
971
|
-
} else if (reasoning.score === "MODERATE") {
|
|
972
|
-
lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct but weak reasoning (${reasoning.score})`));
|
|
973
|
-
} else if (reasoning.score === "WEAK") {
|
|
974
|
-
lines.push(fail(`Answer: ${reasoning.answer} \u2014 Reasoned but wrong answer (${reasoning.score})`));
|
|
975
|
-
} else if (reasoning.score === "FAIL") {
|
|
976
|
-
lines.push(fail(`Answer: ${reasoning.answer} \u2014 No reasoning detected (${reasoning.score})`));
|
|
977
|
-
} else {
|
|
978
|
-
const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." : truncate(reasoning.reasoning, 300);
|
|
979
|
-
lines.push(fail(`Error: ${errMsg}`));
|
|
980
|
-
}
|
|
977
|
+
reportReasoningScore(lines, reasoning);
|
|
981
978
|
lines.push(info(`Response: ${sanitizeForReport(reasoning.reasoning)}`));
|
|
982
979
|
lines.push(section("INSTRUCTION FOLLOWING TEST"));
|
|
983
980
|
lines.push(info("Prompt: Respond with ONLY a JSON object with keys: name, can_count, sum (15+27), language"));
|
|
@@ -985,15 +982,7 @@ function model_test_temp_default(pi) {
|
|
|
985
982
|
await rateLimitDelay(lines);
|
|
986
983
|
const instructions = await testInstructionFollowingProvider(providerInfo, model);
|
|
987
984
|
lines.push(info(`Time: ${msHuman(instructions.elapsedMs)}`));
|
|
988
|
-
|
|
989
|
-
lines.push(ok(`JSON output valid with correct values (${instructions.score})`));
|
|
990
|
-
} else if (instructions.score === "MODERATE") {
|
|
991
|
-
lines.push(ok(`JSON output valid but some values incorrect (${instructions.score})`));
|
|
992
|
-
} else if (instructions.score === "WEAK") {
|
|
993
|
-
lines.push(warn(`Partial JSON compliance (${instructions.score})`));
|
|
994
|
-
} else {
|
|
995
|
-
lines.push(fail(`Failed to produce valid JSON (${instructions.score})`));
|
|
996
|
-
}
|
|
985
|
+
reportInstructionScore(lines, instructions);
|
|
997
986
|
lines.push(info(`Output: ${sanitizeForReport(instructions.output)}`));
|
|
998
987
|
lines.push(section("TOOL USAGE TEST"));
|
|
999
988
|
lines.push(info(`Prompt: "What's the weather in Paris?" (with get_weather tool available)`));
|
|
@@ -1001,32 +990,7 @@ function model_test_temp_default(pi) {
|
|
|
1001
990
|
await rateLimitDelay(lines);
|
|
1002
991
|
const toolTest = await testToolUsageProvider(providerInfo, model);
|
|
1003
992
|
lines.push(info(`Time: ${msHuman(toolTest.elapsedMs)}`));
|
|
1004
|
-
|
|
1005
|
-
lines.push(ok(`Tool call: ${toolTest.toolCall} (${toolTest.score})`));
|
|
1006
|
-
if (toolTest.response) {
|
|
1007
|
-
lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
|
|
1008
|
-
}
|
|
1009
|
-
} else if (toolTest.score === "MODERATE") {
|
|
1010
|
-
lines.push(ok(`Tool call: ${toolTest.toolCall} (${toolTest.score})`));
|
|
1011
|
-
if (toolTest.response) {
|
|
1012
|
-
lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
|
|
1013
|
-
}
|
|
1014
|
-
} else if (toolTest.score === "WEAK") {
|
|
1015
|
-
lines.push(warn(`Tool call: ${toolTest.toolCall} (${toolTest.score}) \u2014 malformed call`));
|
|
1016
|
-
if (toolTest.response) {
|
|
1017
|
-
lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
|
|
1018
|
-
}
|
|
1019
|
-
} else if (toolTest.score === "FAIL") {
|
|
1020
|
-
const hasResponse = toolTest.response && toolTest.response.trim().length > 0;
|
|
1021
|
-
lines.push(fail(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${toolTest.score})`));
|
|
1022
|
-
if (hasResponse) {
|
|
1023
|
-
lines.push(info(`Text response: ${sanitizeForReport(toolTest.response)}`));
|
|
1024
|
-
} else {
|
|
1025
|
-
lines.push(info("Text response: (empty)"));
|
|
1026
|
-
}
|
|
1027
|
-
} else {
|
|
1028
|
-
lines.push(fail(`Error: ${toolTest.toolCall}`));
|
|
1029
|
-
}
|
|
993
|
+
reportToolScore(lines, toolTest);
|
|
1030
994
|
lines.push(section("SKIPPED TESTS (OLLAMA-ONLY)"));
|
|
1031
995
|
lines.push(warn("Thinking test \u2014 Ollama-specific think:true option and message.thinking field"));
|
|
1032
996
|
lines.push(warn("ReAct parsing test \u2014 only relevant for Ollama models without native tool calling"));
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vtstech/pi-model-test",
|
|
3
|
-
"version": "1.1.
|
|
3
|
+
"version": "1.1.9",
|
|
4
4
|
"description": "Model benchmark/testing extension for Pi Coding Agent",
|
|
5
5
|
"main": "model-test.js",
|
|
6
6
|
"keywords": ["pi-extensions"],
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
"url": "https://github.com/VTSTech/pi-coding-agent"
|
|
15
15
|
},
|
|
16
16
|
"dependencies": {
|
|
17
|
-
"@vtstech/pi-shared": "1.1.
|
|
17
|
+
"@vtstech/pi-shared": "1.1.9"
|
|
18
18
|
},
|
|
19
19
|
"peerDependencies": {
|
|
20
20
|
"@mariozechner/pi-coding-agent": ">=0.66"
|