@vtstech/pi-model-test 1.1.7 → 1.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/model-test.js +137 -249
- package/package.json +2 -2
package/model-test.js
CHANGED
|
@@ -9,10 +9,13 @@ import {
|
|
|
9
9
|
truncate,
|
|
10
10
|
sanitizeForReport
|
|
11
11
|
} from "@vtstech/pi-shared/format";
|
|
12
|
-
import { getOllamaBaseUrl, detectModelFamily, readModelsJson,
|
|
12
|
+
import { getOllamaBaseUrl, detectModelFamily, readModelsJson, readModifyWriteModelsJson, fetchModelContextLength, detectProvider } from "@vtstech/pi-shared/ollama";
|
|
13
|
+
import { debugLog } from "@vtstech/pi-shared/debug";
|
|
13
14
|
import {
|
|
14
15
|
ALL_DIALECT_PATTERNS,
|
|
15
|
-
parseReactWithPatterns
|
|
16
|
+
parseReactWithPatterns,
|
|
17
|
+
detectReactDialect,
|
|
18
|
+
extractBraceJson
|
|
16
19
|
} from "@vtstech/pi-shared/react-parser";
|
|
17
20
|
import {
|
|
18
21
|
CONFIG,
|
|
@@ -28,15 +31,71 @@ import {
|
|
|
28
31
|
testInstructionFollowingUnified,
|
|
29
32
|
TOOL_SUPPORT_CACHE_PATH
|
|
30
33
|
} from "@vtstech/pi-shared/model-test-utils";
|
|
34
|
+
import {
|
|
35
|
+
branding as sharedBranding,
|
|
36
|
+
formatTestSummary,
|
|
37
|
+
formatRecommendation
|
|
38
|
+
} from "@vtstech/pi-shared/test-report";
|
|
31
39
|
function model_test_temp_default(pi) {
|
|
32
40
|
const effectiveConfig = getEffectiveConfig();
|
|
33
41
|
function ollamaBase() {
|
|
34
42
|
return getOllamaBaseUrl();
|
|
35
43
|
}
|
|
36
44
|
/**
 * Pause between tests when a positive delay is configured.
 *
 * Reads `effectiveConfig.TEST_DELAY_MS` from the enclosing scope. When it is
 * greater than zero, an informational "Waiting ..." line is appended to
 * `lines` and the returned promise resolves after that many milliseconds.
 * Otherwise nothing is appended and the function returns immediately.
 *
 * @param {string[]} lines - Report lines collected so far; appended to in place.
 * @returns {Promise<void>}
 */
async function rateLimitDelay(lines) {
  const delayMs = effectiveConfig.TEST_DELAY_MS;
  // Written as a negated "> 0" so undefined/NaN skip the wait exactly like the original check.
  if (!(delayMs > 0)) {
    return;
  }
  const notice = info(`Waiting ${msHuman(delayMs)} to avoid rate limiting...`);
  lines.push(notice);
  await new Promise((resolve) => {
    setTimeout(resolve, delayMs);
  });
}
|
|
50
|
+
/**
 * Append one formatted result line for a test score.
 *
 * The line text is looked up in `descriptions` by score, then under the
 * wildcard key "*". STRONG and MODERATE are emitted through `ok`, WEAK through
 * `warn`, FAIL through `fail`. Any other score is treated as an error and
 * emits `fail(fallback)`.
 *
 * Fix: a FAIL score with no matching description used to print the bare
 * "(FAIL)" placeholder and ignore `fallback`, dropping the caller's failure
 * message. FAIL now uses `fallback` before the placeholder.
 *
 * @param {string[]} lines - Report lines collected so far; appended to in place.
 * @param {string} score - Score label, e.g. "STRONG", "MODERATE", "WEAK", "FAIL".
 * @param {Object<string, string>} descriptions - Line text keyed by score; "*" is a wildcard.
 * @param {string} fallback - Text used for unknown scores and for FAIL without a description.
 * @returns {void}
 */
function reportScore(lines, score, descriptions, fallback) {
  const described = descriptions[score] || descriptions["*"];
  const placeholder = `(${score})`;
  switch (score) {
    case "STRONG":
    case "MODERATE":
      lines.push(ok(described || placeholder));
      break;
    case "WEAK":
      lines.push(warn(described || placeholder));
      break;
    case "FAIL":
      // The caller's fallback is a failure message, so it is the right text here.
      lines.push(fail(described || fallback || placeholder));
      break;
    default:
      // Unknown score (e.g. an error state): always report the fallback.
      lines.push(fail(fallback));
  }
}
|
|
62
|
+
/**
 * Append the result line for the reasoning test.
 *
 * Builds one description per known score from `result.answer` and
 * `result.score` and hands them to `reportScore`. The error text is taken
 * from `result.reasoning`: if it contains "<!DOCTYPE" or "<html", only the
 * first 100 characters of its first line are kept (followed by "..."),
 * otherwise it is passed through `truncate(..., 300)`.
 *
 * Fix: the error text is built on every call, so a missing `reasoning` used to
 * throw even for STRONG results. It is now coerced to a string first.
 *
 * @param {string[]} lines - Report lines collected so far; appended to in place.
 * @param {{score: string, answer: *, reasoning: (string|undefined)}} result - Reasoning test result.
 * @returns {void}
 */
function reportReasoningScore(lines, result) {
  const { score, answer } = result;
  const detail = String(result.reasoning ?? "");
  const looksLikeHtml = detail.includes("<!DOCTYPE") || detail.includes("<html");
  const errorText = looksLikeHtml
    ? `${detail.split("\n")[0].slice(0, 100)}...`
    : truncate(detail, 300);
  reportScore(
    lines,
    score,
    {
      STRONG: `Answer: ${answer} \u2014 Correct with clear reasoning (${score})`,
      MODERATE: `Answer: ${answer} \u2014 Correct but weak reasoning (${score})`,
      WEAK: `Answer: ${answer} \u2014 Reasoned but wrong answer (${score})`,
      FAIL: `Answer: ${answer} \u2014 No reasoning detected (${score})`,
    },
    `Error: ${errorText}`,
  );
}
|
|
70
|
+
/**
 * Append the result line for the instruction-following (JSON output) test.
 *
 * Delegates to `reportScore` with one description per score. The same
 * "Failed to produce valid JSON" text is supplied both as the FAIL description
 * and as the fallback for unrecognised scores.
 *
 * Fix: no FAIL description was passed before, so a FAIL result was reported
 * with `reportScore`'s bare "(FAIL)" placeholder instead of the failure message.
 *
 * @param {string[]} lines - Report lines collected so far; appended to in place.
 * @param {{score: string}} result - Instruction-following test result.
 * @returns {void}
 */
function reportInstructionScore(lines, result) {
  const { score } = result;
  const failureText = `Failed to produce valid JSON (${score})`;
  reportScore(
    lines,
    score,
    {
      STRONG: `JSON output valid with correct values (${score})`,
      MODERATE: `JSON output valid but some values incorrect (${score})`,
      WEAK: `Partial JSON compliance (${score})`,
      FAIL: failureText,
    },
    failureText,
  );
}
|
|
77
|
+
/**
 * Append the result lines for the tool-usage test.
 *
 * - STRONG / MODERATE: an `ok` "Tool call" line, then the raw response if any.
 * - WEAK: a `warn` "Tool call ... malformed call" line, then the raw response if any.
 * - FAIL: a `fail` line saying whether the model answered in text or returned
 *   nothing, then the text response (or "(empty)").
 * - anything else: a single `fail` "Error" line carrying `result.toolCall`.
 *
 * The two parallel if/else chains on the same score (and the duplicated
 * `hasResponse` computation) are merged into one dispatch. The emitted lines
 * and their order are unchanged.
 *
 * @param {string[]} lines - Report lines collected so far; appended to in place.
 * @param {{score: string, toolCall: *, response: (string|undefined)}} result - Tool-usage test result.
 * @returns {void}
 */
function reportToolScore(lines, result) {
  const { score, toolCall, response } = result;
  switch (score) {
    case "STRONG":
    case "MODERATE":
    case "WEAK": {
      const headline =
        score === "WEAK"
          ? warn(`Tool call: ${toolCall} (${score}) \u2014 malformed call`)
          : ok(`Tool call: ${toolCall} (${score})`);
      lines.push(headline);
      if (response) {
        lines.push(info(`Raw response: ${sanitizeForReport(response)}`));
      }
      break;
    }
    case "FAIL": {
      // Whitespace-only output counts as an empty response.
      const hasResponse = Boolean(response && response.trim().length > 0);
      const reason = hasResponse ? "model responded in text instead" : "model returned empty response";
      lines.push(fail(`Tool call: none \u2014 ${reason} (${score})`));
      lines.push(
        hasResponse
          ? info(`Text response: ${sanitizeForReport(response)}`)
          : info("Text response: (empty)"),
      );
      break;
    }
    default:
      lines.push(fail(`Error: ${toolCall}`));
  }
}
|
|
42
101
|
function makeOllamaChatFn(useStreaming = true) {
|
|
@@ -195,7 +254,8 @@ function model_test_temp_default(pi) {
|
|
|
195
254
|
if (parsed.message?.content) messageContent += parsed.message.content;
|
|
196
255
|
if (parsed.message?.thinking) thinkingContent += parsed.message.thinking;
|
|
197
256
|
if (parsed.done) done = true;
|
|
198
|
-
} catch {
|
|
257
|
+
} catch (err) {
|
|
258
|
+
debugLog("model-test", "skipped malformed JSON chunk in streaming response", err);
|
|
199
259
|
}
|
|
200
260
|
}
|
|
201
261
|
}
|
|
@@ -392,22 +452,6 @@ function model_test_temp_default(pi) {
|
|
|
392
452
|
async function testToolUsageProvider(providerInfo, model) {
|
|
393
453
|
return testToolUsageUnified(makeProviderChatFn(providerInfo), model);
|
|
394
454
|
}
|
|
395
|
-
function extractBraceJson(raw) {
|
|
396
|
-
const jsonStart = raw.indexOf("{");
|
|
397
|
-
if (jsonStart === -1) return "";
|
|
398
|
-
let depth = 0, jsonEnd = -1;
|
|
399
|
-
for (let i = jsonStart; i < raw.length; i++) {
|
|
400
|
-
if (raw[i] === "{") depth++;
|
|
401
|
-
else if (raw[i] === "}") {
|
|
402
|
-
depth--;
|
|
403
|
-
if (depth === 0) {
|
|
404
|
-
jsonEnd = i;
|
|
405
|
-
break;
|
|
406
|
-
}
|
|
407
|
-
}
|
|
408
|
-
}
|
|
409
|
-
return jsonEnd !== -1 ? raw.slice(jsonStart, jsonEnd + 1) : "";
|
|
410
|
-
}
|
|
411
455
|
async function testReactParsing(model) {
|
|
412
456
|
const systemPrompt = [
|
|
413
457
|
"You are a helpful assistant with access to tools.",
|
|
@@ -451,41 +495,20 @@ function model_test_temp_default(pi) {
|
|
|
451
495
|
return { pass: false, score: "FAIL", toolCall: "empty response", thought: "", response: "", elapsedMs };
|
|
452
496
|
}
|
|
453
497
|
let parsedResult = null;
|
|
454
|
-
const
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
argsStr = extractBraceJson(result.raw);
|
|
466
|
-
} else {
|
|
467
|
-
argsStr = "";
|
|
468
|
-
}
|
|
469
|
-
parsedResult = { name: toolName, args: argsStr, thought: result.thought || "", dialect: result.dialect };
|
|
470
|
-
break;
|
|
471
|
-
}
|
|
472
|
-
}
|
|
473
|
-
} else {
|
|
474
|
-
for (const dp of ALL_DIALECT_PATTERNS) {
|
|
475
|
-
const result = parseReactWithPatterns(content, dp, true);
|
|
476
|
-
if (result) {
|
|
477
|
-
let argsStr;
|
|
478
|
-
const rawArgs = result.args ? JSON.stringify(result.args) : "";
|
|
479
|
-
if (rawArgs && rawArgs !== "{}") {
|
|
480
|
-
argsStr = rawArgs;
|
|
481
|
-
} else if (result.raw) {
|
|
482
|
-
argsStr = extractBraceJson(result.raw);
|
|
483
|
-
} else {
|
|
484
|
-
argsStr = "";
|
|
485
|
-
}
|
|
486
|
-
parsedResult = { name: result.name, args: argsStr, thought: result.thought || "", dialect: result.dialect };
|
|
487
|
-
break;
|
|
498
|
+
for (const dp of ALL_DIALECT_PATTERNS) {
|
|
499
|
+
const result = parseReactWithPatterns(content, dp, true);
|
|
500
|
+
if (result) {
|
|
501
|
+
let argsStr;
|
|
502
|
+
const rawArgs = result.args ? JSON.stringify(result.args) : "";
|
|
503
|
+
if (rawArgs && rawArgs !== "{}") {
|
|
504
|
+
argsStr = rawArgs;
|
|
505
|
+
} else if (result.raw) {
|
|
506
|
+
argsStr = extractBraceJson(result.raw);
|
|
507
|
+
} else {
|
|
508
|
+
argsStr = "";
|
|
488
509
|
}
|
|
510
|
+
parsedResult = { name: result.name, args: argsStr, thought: result.thought || "", dialect: result.dialect };
|
|
511
|
+
break;
|
|
489
512
|
}
|
|
490
513
|
}
|
|
491
514
|
if (parsedResult) {
|
|
@@ -576,7 +599,7 @@ function model_test_temp_default(pi) {
|
|
|
576
599
|
try {
|
|
577
600
|
const start = Date.now();
|
|
578
601
|
const controller = new AbortController();
|
|
579
|
-
const timeoutId = setTimeout(() => controller.abort(),
|
|
602
|
+
const timeoutId = setTimeout(() => controller.abort(), effectiveConfig.TOOL_SUPPORT_TIMEOUT_MS);
|
|
580
603
|
const res = await fetch(`${ollamaBase()}/api/chat`, {
|
|
581
604
|
method: "POST",
|
|
582
605
|
headers: { "Content-Type": "application/json" },
|
|
@@ -607,7 +630,8 @@ function model_test_temp_default(pi) {
|
|
|
607
630
|
try {
|
|
608
631
|
const args = typeof fn.arguments === "string" ? JSON.parse(fn.arguments) : fn.arguments || {};
|
|
609
632
|
argsStr = JSON.stringify(args);
|
|
610
|
-
} catch {
|
|
633
|
+
} catch (err) {
|
|
634
|
+
debugLog("model-test", "failed to parse tool call arguments", err);
|
|
611
635
|
argsStr = String(fn.arguments);
|
|
612
636
|
}
|
|
613
637
|
const level2 = "native";
|
|
@@ -619,41 +643,14 @@ function model_test_temp_default(pi) {
|
|
|
619
643
|
elapsedMs
|
|
620
644
|
};
|
|
621
645
|
}
|
|
622
|
-
const
|
|
623
|
-
|
|
624
|
-
/^\s*Action:\s*/im,
|
|
625
|
-
/^\s*Action Input:\s*/im,
|
|
626
|
-
/^\s*Thought:\s*/im,
|
|
627
|
-
/Action:\s*\w+/i,
|
|
628
|
-
/Action Input:\s*\{/i,
|
|
629
|
-
// Function dialect
|
|
630
|
-
/^\s*Function:\s*/im,
|
|
631
|
-
/^\s*Function Input:\s*/im,
|
|
632
|
-
/Function:\s*\w+/i,
|
|
633
|
-
// Tool dialect
|
|
634
|
-
/^\s*Tool:\s*/im,
|
|
635
|
-
/^\s*Tool Input:\s*/im,
|
|
636
|
-
/Tool:\s*\w+/i,
|
|
637
|
-
// Call dialect
|
|
638
|
-
/^\s*Call:\s*/im,
|
|
639
|
-
/^\s*Input:\s*/im,
|
|
640
|
-
/Call:\s*\w+/i
|
|
641
|
-
];
|
|
642
|
-
const matchedPatterns = [];
|
|
643
|
-
for (const p of reactPatterns) {
|
|
644
|
-
if (p.test(content)) matchedPatterns.push(p.source);
|
|
645
|
-
}
|
|
646
|
-
if (matchedPatterns.length > 0) {
|
|
647
|
-
let dialectName = "react";
|
|
648
|
-
if (/Function:/i.test(content)) dialectName = "function";
|
|
649
|
-
else if (/Tool:/i.test(content)) dialectName = "tool";
|
|
650
|
-
else if (/Call:/i.test(content)) dialectName = "call";
|
|
646
|
+
const detectedDialect = detectReactDialect(content);
|
|
647
|
+
if (detectedDialect) {
|
|
651
648
|
const level2 = "react";
|
|
652
649
|
cacheToolSupport(model, level2, family);
|
|
653
650
|
return {
|
|
654
651
|
level: level2,
|
|
655
652
|
cached: false,
|
|
656
|
-
evidence: `ReAct format detected (${
|
|
653
|
+
evidence: `ReAct format detected (${detectedDialect.name} dialect) in text response`,
|
|
657
654
|
elapsedMs
|
|
658
655
|
};
|
|
659
656
|
}
|
|
@@ -697,7 +694,8 @@ function model_test_temp_default(pi) {
|
|
|
697
694
|
if (!res.ok) return [];
|
|
698
695
|
const data = await res.json();
|
|
699
696
|
return (data.models || []).map((m) => m.name).filter(Boolean);
|
|
700
|
-
} catch {
|
|
697
|
+
} catch (err) {
|
|
698
|
+
debugLog("model-test", "failed to list Ollama models", err);
|
|
701
699
|
return [];
|
|
702
700
|
}
|
|
703
701
|
}
|
|
@@ -706,43 +704,44 @@ function model_test_temp_default(pi) {
|
|
|
706
704
|
}
|
|
707
705
|
/**
 * Set the `reasoning` flag of a model entry in models.json.
 *
 * The first entry whose `id` equals `model` across all providers is examined
 * inside a `readModifyWriteModelsJson` callback. The callback returns the
 * mutated config when the flag has to change and `null` otherwise.
 *
 * Fix: the outcome is now recorded inside the callback. Before, an entry that
 * already had the requested value was reported as "not found", and a
 * successful write was followed by a re-read that always matched the
 * "already ... no change" branch, so `updated: true` was never returned.
 *
 * NOTE(review): assumes `readModifyWriteModelsJson` invokes the callback
 * synchronously and returns a truthy value only when it wrote the file, as
 * the previous `if (!written)` check implied. Confirm against pi-shared.
 *
 * @param {string} model - Model id to look up.
 * @param {boolean} hasReasoning - Value to store in the entry's `reasoning` field.
 * @returns {{updated: boolean, message: string}} Whether the file changed, plus a report line.
 */
function updateModelsJsonReasoning(model, hasReasoning) {
  try {
    let outcome = "missing";
    const written = readModifyWriteModelsJson((config) => {
      for (const provider of Object.values(config.providers || {})) {
        const entry = (provider.models || []).find((m) => m.id === model);
        if (!entry) continue;
        if (entry.reasoning === hasReasoning) {
          outcome = "unchanged";
          return null; // nothing to write
        }
        entry.reasoning = hasReasoning;
        outcome = "updated";
        return config;
      }
      return null;
    });
    if (outcome === "missing") {
      return { updated: false, message: `${model} not found in models.json \u2014 skipped` };
    }
    if (outcome === "unchanged") {
      return { updated: false, message: `reasoning already "${hasReasoning}" for ${model} \u2014 no change` };
    }
    if (!written) {
      // The entry was modified in memory but the helper reported that nothing was persisted.
      return { updated: false, message: "Failed to update models.json: write was not performed" };
    }
    const action = hasReasoning ? "set reasoning: true" : "set reasoning: false";
    return { updated: true, message: `Updated ${model}: ${action}` };
  } catch (e) {
    return { updated: false, message: `Failed to update models.json: ${e.message}` };
  }
}
|
|
736
|
-
const branding = [
|
|
737
|
-
` \u26A1 Pi Model Benchmark v${EXTENSION_VERSION}`,
|
|
738
|
-
` Written by VTSTech`,
|
|
739
|
-
` GitHub: https://github.com/VTSTech`,
|
|
740
|
-
` Website: www.vts-tech.org`
|
|
741
|
-
].join("\n");
|
|
742
741
|
async function testModelOllama(model, providerInfo, ctx) {
|
|
743
742
|
const lines = [];
|
|
744
743
|
const totalStart = Date.now();
|
|
745
|
-
lines.push(
|
|
744
|
+
lines.push(sharedBranding);
|
|
746
745
|
lines.push(section(`MODEL: ${model}`));
|
|
747
746
|
lines.push(info("Provider: Ollama (local/remote)"));
|
|
748
747
|
const modelsJson = readModelsJson();
|
|
@@ -783,7 +782,8 @@ function model_test_temp_default(pi) {
|
|
|
783
782
|
modelModified = modDate ? modDate.toLocaleDateString() : "unknown";
|
|
784
783
|
}
|
|
785
784
|
}
|
|
786
|
-
} catch {
|
|
785
|
+
} catch (err) {
|
|
786
|
+
debugLog("model-test", "failed to fetch model metadata from /api/show", err);
|
|
787
787
|
}
|
|
788
788
|
const detectedFamily = detectModelFamily(model);
|
|
789
789
|
lines.push(info(`Size: ${modelSize} | Params: ${modelParams} | Quant: ${modelQuant}`));
|
|
@@ -793,18 +793,7 @@ function model_test_temp_default(pi) {
|
|
|
793
793
|
lines.push(info("Testing..."));
|
|
794
794
|
const reasoning = await testReasoning(model);
|
|
795
795
|
lines.push(info(`Time: ${msHuman(reasoning.elapsedMs)}`));
|
|
796
|
-
|
|
797
|
-
lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct with clear reasoning (${reasoning.score})`));
|
|
798
|
-
} else if (reasoning.score === "MODERATE") {
|
|
799
|
-
lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct but weak reasoning (${reasoning.score})`));
|
|
800
|
-
} else if (reasoning.score === "WEAK") {
|
|
801
|
-
lines.push(fail(`Answer: ${reasoning.answer} \u2014 Reasoned but wrong answer (${reasoning.score})`));
|
|
802
|
-
} else if (reasoning.score === "FAIL") {
|
|
803
|
-
lines.push(fail(`Answer: ${reasoning.answer} \u2014 No reasoning detected (${reasoning.score})`));
|
|
804
|
-
} else {
|
|
805
|
-
const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." : truncate(reasoning.reasoning, 300);
|
|
806
|
-
lines.push(fail(`Error: ${errMsg}`));
|
|
807
|
-
}
|
|
796
|
+
reportReasoningScore(lines, reasoning);
|
|
808
797
|
lines.push(info(`Response: ${sanitizeForReport(reasoning.reasoning)}`));
|
|
809
798
|
lines.push(section("THINKING TEST"));
|
|
810
799
|
lines.push(info('Prompt: "Multiply 37 by 43. Explain your reasoning step by step."'));
|
|
@@ -827,32 +816,7 @@ function model_test_temp_default(pi) {
|
|
|
827
816
|
await rateLimitDelay(lines);
|
|
828
817
|
const tools = await testToolUsage(model);
|
|
829
818
|
lines.push(info(`Time: ${msHuman(tools.elapsedMs)}`));
|
|
830
|
-
|
|
831
|
-
lines.push(ok(`Tool call: ${tools.toolCall} (${tools.score})`));
|
|
832
|
-
if (tools.response) {
|
|
833
|
-
lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
|
|
834
|
-
}
|
|
835
|
-
} else if (tools.score === "MODERATE") {
|
|
836
|
-
lines.push(ok(`Tool call: ${tools.toolCall} (${tools.score})`));
|
|
837
|
-
if (tools.response) {
|
|
838
|
-
lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
|
|
839
|
-
}
|
|
840
|
-
} else if (tools.score === "WEAK") {
|
|
841
|
-
lines.push(warn(`Tool call: ${tools.toolCall} (${tools.score}) \u2014 malformed call`));
|
|
842
|
-
if (tools.response) {
|
|
843
|
-
lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
|
|
844
|
-
}
|
|
845
|
-
} else if (tools.score === "FAIL") {
|
|
846
|
-
const hasResponse = tools.response && tools.response.trim().length > 0;
|
|
847
|
-
lines.push(fail(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${tools.score})`));
|
|
848
|
-
if (hasResponse) {
|
|
849
|
-
lines.push(info(`Text response: ${sanitizeForReport(tools.response)}`));
|
|
850
|
-
} else {
|
|
851
|
-
lines.push(info("Text response: (empty)"));
|
|
852
|
-
}
|
|
853
|
-
} else {
|
|
854
|
-
lines.push(fail(`Error: ${tools.toolCall}`));
|
|
855
|
-
}
|
|
819
|
+
reportToolScore(lines, tools);
|
|
856
820
|
lines.push(section("REACT PARSING TEST"));
|
|
857
821
|
lines.push(info(`Prompt: "What's the weather in Tokyo?" (ReAct format, no native tools)`));
|
|
858
822
|
lines.push(info("Testing..."));
|
|
@@ -889,15 +853,7 @@ function model_test_temp_default(pi) {
|
|
|
889
853
|
await rateLimitDelay(lines);
|
|
890
854
|
const instructions = await testInstructionFollowing(model);
|
|
891
855
|
lines.push(info(`Time: ${msHuman(instructions.elapsedMs)}`));
|
|
892
|
-
|
|
893
|
-
lines.push(ok(`JSON output valid with correct values (${instructions.score})`));
|
|
894
|
-
} else if (instructions.score === "MODERATE") {
|
|
895
|
-
lines.push(ok(`JSON output valid but some values incorrect (${instructions.score})`));
|
|
896
|
-
} else if (instructions.score === "WEAK") {
|
|
897
|
-
lines.push(warn(`Partial JSON compliance (${instructions.score})`));
|
|
898
|
-
} else {
|
|
899
|
-
lines.push(fail(`Failed to produce valid JSON (${instructions.score})`));
|
|
900
|
-
}
|
|
856
|
+
reportInstructionScore(lines, instructions);
|
|
901
857
|
lines.push(info(`Output: ${sanitizeForReport(instructions.output)}`));
|
|
902
858
|
lines.push(section("TOOL SUPPORT DETECTION"));
|
|
903
859
|
lines.push(info("Probing model for tool calling capability (native / ReAct / none)"));
|
|
@@ -930,11 +886,10 @@ function model_test_temp_default(pi) {
|
|
|
930
886
|
}
|
|
931
887
|
lines.push(info(`Evidence: ${toolSupport.evidence}`));
|
|
932
888
|
lines.push(info(`Cache: ${TOOL_SUPPORT_CACHE_PATH}`));
|
|
933
|
-
lines.push(section("SUMMARY"));
|
|
934
889
|
const totalMs = Date.now() - totalStart;
|
|
935
890
|
const toolPass = tools.score === "STRONG" || tools.score === "MODERATE";
|
|
936
891
|
const reactPass = react.score === "STRONG" || react.score === "MODERATE";
|
|
937
|
-
const
|
|
892
|
+
const ollamaTests = [
|
|
938
893
|
{ name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
|
|
939
894
|
{ name: "Thinking", pass: thinking.supported, score: thinking.supported ? "YES" : "NO" },
|
|
940
895
|
{ name: "Tool Usage", pass: toolPass, score: tools.score },
|
|
@@ -942,23 +897,10 @@ function model_test_temp_default(pi) {
|
|
|
942
897
|
{ name: "Instructions", pass: instructions.pass, score: instructions.score },
|
|
943
898
|
{ name: "Tool Support", pass: toolSupport.level === "native" || toolSupport.level === "react", score: toolSupport.level.toUpperCase() }
|
|
944
899
|
];
|
|
945
|
-
const passed =
|
|
946
|
-
const total =
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
}
|
|
950
|
-
lines.push(info(`Total time: ${msHuman(totalMs)}`));
|
|
951
|
-
lines.push(info(`Score: ${passed}/${total} tests passed`));
|
|
952
|
-
lines.push(section("RECOMMENDATION"));
|
|
953
|
-
if (passed === 6) {
|
|
954
|
-
lines.push(ok(`${model} is a STRONG model \u2014 full capability`));
|
|
955
|
-
} else if (passed >= 5) {
|
|
956
|
-
lines.push(ok(`${model} is a GOOD model \u2014 most capabilities work`));
|
|
957
|
-
} else if (passed >= 4) {
|
|
958
|
-
lines.push(warn(`${model} is USABLE \u2014 some capabilities are limited`));
|
|
959
|
-
} else {
|
|
960
|
-
lines.push(fail(`${model} is WEAK \u2014 limited capabilities for agent use`));
|
|
961
|
-
}
|
|
900
|
+
const passed = ollamaTests.filter((t) => t.pass).length;
|
|
901
|
+
const total = ollamaTests.length;
|
|
902
|
+
lines.push(...formatTestSummary(ollamaTests, totalMs));
|
|
903
|
+
lines.push(...formatRecommendation(model, passed, total));
|
|
962
904
|
try {
|
|
963
905
|
const historyEntry = {
|
|
964
906
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -985,14 +927,15 @@ function model_test_temp_default(pi) {
|
|
|
985
927
|
lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
|
|
986
928
|
}
|
|
987
929
|
}
|
|
988
|
-
} catch {
|
|
930
|
+
} catch (err) {
|
|
931
|
+
debugLog("model-test", "failed to save test history", err);
|
|
989
932
|
}
|
|
990
933
|
return lines.join("\n");
|
|
991
934
|
}
|
|
992
935
|
async function testModelProvider(providerInfo, model, ctx) {
|
|
993
936
|
const lines = [];
|
|
994
937
|
const totalStart = Date.now();
|
|
995
|
-
lines.push(
|
|
938
|
+
lines.push(sharedBranding);
|
|
996
939
|
lines.push(section(`MODEL: ${model}`));
|
|
997
940
|
lines.push(info(`Provider: ${providerInfo.name} (built-in)`));
|
|
998
941
|
lines.push(info(`API: ${providerInfo.apiMode || "openai-completions"}`));
|
|
@@ -1031,18 +974,7 @@ function model_test_temp_default(pi) {
|
|
|
1031
974
|
await rateLimitDelay(lines);
|
|
1032
975
|
const reasoning = await testReasoningProvider(providerInfo, model);
|
|
1033
976
|
lines.push(info(`Time: ${msHuman(reasoning.elapsedMs)}`));
|
|
1034
|
-
|
|
1035
|
-
lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct with clear reasoning (${reasoning.score})`));
|
|
1036
|
-
} else if (reasoning.score === "MODERATE") {
|
|
1037
|
-
lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct but weak reasoning (${reasoning.score})`));
|
|
1038
|
-
} else if (reasoning.score === "WEAK") {
|
|
1039
|
-
lines.push(fail(`Answer: ${reasoning.answer} \u2014 Reasoned but wrong answer (${reasoning.score})`));
|
|
1040
|
-
} else if (reasoning.score === "FAIL") {
|
|
1041
|
-
lines.push(fail(`Answer: ${reasoning.answer} \u2014 No reasoning detected (${reasoning.score})`));
|
|
1042
|
-
} else {
|
|
1043
|
-
const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." : truncate(reasoning.reasoning, 300);
|
|
1044
|
-
lines.push(fail(`Error: ${errMsg}`));
|
|
1045
|
-
}
|
|
977
|
+
reportReasoningScore(lines, reasoning);
|
|
1046
978
|
lines.push(info(`Response: ${sanitizeForReport(reasoning.reasoning)}`));
|
|
1047
979
|
lines.push(section("INSTRUCTION FOLLOWING TEST"));
|
|
1048
980
|
lines.push(info("Prompt: Respond with ONLY a JSON object with keys: name, can_count, sum (15+27), language"));
|
|
@@ -1050,15 +982,7 @@ function model_test_temp_default(pi) {
|
|
|
1050
982
|
await rateLimitDelay(lines);
|
|
1051
983
|
const instructions = await testInstructionFollowingProvider(providerInfo, model);
|
|
1052
984
|
lines.push(info(`Time: ${msHuman(instructions.elapsedMs)}`));
|
|
1053
|
-
|
|
1054
|
-
lines.push(ok(`JSON output valid with correct values (${instructions.score})`));
|
|
1055
|
-
} else if (instructions.score === "MODERATE") {
|
|
1056
|
-
lines.push(ok(`JSON output valid but some values incorrect (${instructions.score})`));
|
|
1057
|
-
} else if (instructions.score === "WEAK") {
|
|
1058
|
-
lines.push(warn(`Partial JSON compliance (${instructions.score})`));
|
|
1059
|
-
} else {
|
|
1060
|
-
lines.push(fail(`Failed to produce valid JSON (${instructions.score})`));
|
|
1061
|
-
}
|
|
985
|
+
reportInstructionScore(lines, instructions);
|
|
1062
986
|
lines.push(info(`Output: ${sanitizeForReport(instructions.output)}`));
|
|
1063
987
|
lines.push(section("TOOL USAGE TEST"));
|
|
1064
988
|
lines.push(info(`Prompt: "What's the weather in Paris?" (with get_weather tool available)`));
|
|
@@ -1066,62 +990,23 @@ function model_test_temp_default(pi) {
|
|
|
1066
990
|
await rateLimitDelay(lines);
|
|
1067
991
|
const toolTest = await testToolUsageProvider(providerInfo, model);
|
|
1068
992
|
lines.push(info(`Time: ${msHuman(toolTest.elapsedMs)}`));
|
|
1069
|
-
|
|
1070
|
-
lines.push(ok(`Tool call: ${toolTest.toolCall} (${toolTest.score})`));
|
|
1071
|
-
if (toolTest.response) {
|
|
1072
|
-
lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
|
|
1073
|
-
}
|
|
1074
|
-
} else if (toolTest.score === "MODERATE") {
|
|
1075
|
-
lines.push(ok(`Tool call: ${toolTest.toolCall} (${toolTest.score})`));
|
|
1076
|
-
if (toolTest.response) {
|
|
1077
|
-
lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
|
|
1078
|
-
}
|
|
1079
|
-
} else if (toolTest.score === "WEAK") {
|
|
1080
|
-
lines.push(warn(`Tool call: ${toolTest.toolCall} (${toolTest.score}) \u2014 malformed call`));
|
|
1081
|
-
if (toolTest.response) {
|
|
1082
|
-
lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
|
|
1083
|
-
}
|
|
1084
|
-
} else if (toolTest.score === "FAIL") {
|
|
1085
|
-
const hasResponse = toolTest.response && toolTest.response.trim().length > 0;
|
|
1086
|
-
lines.push(fail(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${toolTest.score})`));
|
|
1087
|
-
if (hasResponse) {
|
|
1088
|
-
lines.push(info(`Text response: ${sanitizeForReport(toolTest.response)}`));
|
|
1089
|
-
} else {
|
|
1090
|
-
lines.push(info("Text response: (empty)"));
|
|
1091
|
-
}
|
|
1092
|
-
} else {
|
|
1093
|
-
lines.push(fail(`Error: ${toolTest.toolCall}`));
|
|
1094
|
-
}
|
|
993
|
+
reportToolScore(lines, toolTest);
|
|
1095
994
|
lines.push(section("SKIPPED TESTS (OLLAMA-ONLY)"));
|
|
1096
995
|
lines.push(warn("Thinking test \u2014 Ollama-specific think:true option and message.thinking field"));
|
|
1097
996
|
lines.push(warn("ReAct parsing test \u2014 only relevant for Ollama models without native tool calling"));
|
|
1098
997
|
lines.push(warn("Tool support detection \u2014 Ollama-specific tool support cache"));
|
|
1099
998
|
lines.push(warn("Model metadata \u2014 Ollama-specific /api/tags endpoint"));
|
|
1100
|
-
lines.push(section("SUMMARY"));
|
|
1101
999
|
const totalMs = Date.now() - totalStart;
|
|
1102
|
-
const
|
|
1000
|
+
const providerTests = [
|
|
1103
1001
|
{ name: "Connectivity", pass: connectivity.pass, score: connectivity.pass ? "OK" : "FAIL" },
|
|
1104
1002
|
{ name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
|
|
1105
1003
|
{ name: "Instructions", pass: instructions.pass, score: instructions.score },
|
|
1106
1004
|
{ name: "Tool Usage", pass: toolTest.pass, score: toolTest.score }
|
|
1107
1005
|
];
|
|
1108
|
-
const passed =
|
|
1109
|
-
const total =
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
}
|
|
1113
|
-
lines.push(info(`Total time: ${msHuman(totalMs)}`));
|
|
1114
|
-
lines.push(info(`Score: ${passed}/${total} tests passed`));
|
|
1115
|
-
lines.push(section("RECOMMENDATION"));
|
|
1116
|
-
if (passed === 4) {
|
|
1117
|
-
lines.push(ok(`${model} is a STRONG model via ${providerInfo.name} \u2014 full capability`));
|
|
1118
|
-
} else if (passed >= 3) {
|
|
1119
|
-
lines.push(ok(`${model} is a GOOD model via ${providerInfo.name} \u2014 most capabilities work`));
|
|
1120
|
-
} else if (passed >= 2) {
|
|
1121
|
-
lines.push(warn(`${model} is USABLE via ${providerInfo.name} \u2014 some capabilities are limited`));
|
|
1122
|
-
} else {
|
|
1123
|
-
lines.push(fail(`${model} is WEAK via ${providerInfo.name} \u2014 limited capabilities for agent use`));
|
|
1124
|
-
}
|
|
1006
|
+
const passed = providerTests.filter((t) => t.pass).length;
|
|
1007
|
+
const total = providerTests.length;
|
|
1008
|
+
lines.push(...formatTestSummary(providerTests, totalMs));
|
|
1009
|
+
lines.push(...formatRecommendation(model, passed, total, providerInfo.name));
|
|
1125
1010
|
try {
|
|
1126
1011
|
const historyEntry = {
|
|
1127
1012
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -1148,7 +1033,8 @@ function model_test_temp_default(pi) {
|
|
|
1148
1033
|
lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
|
|
1149
1034
|
}
|
|
1150
1035
|
}
|
|
1151
|
-
} catch {
|
|
1036
|
+
} catch (err) {
|
|
1037
|
+
debugLog("model-test", "failed to save provider test history", err);
|
|
1152
1038
|
}
|
|
1153
1039
|
return lines.join("\n");
|
|
1154
1040
|
}
|
|
@@ -1168,7 +1054,8 @@ function model_test_temp_default(pi) {
|
|
|
1168
1054
|
try {
|
|
1169
1055
|
const models = await getOllamaModels();
|
|
1170
1056
|
return models.map((m) => ({ label: m, description: `Test ${m}` })).filter((m) => m.label.startsWith(prefix));
|
|
1171
|
-
} catch {
|
|
1057
|
+
} catch (err) {
|
|
1058
|
+
debugLog("model-test", "failed to get model completions", err);
|
|
1172
1059
|
return [];
|
|
1173
1060
|
}
|
|
1174
1061
|
},
|
|
@@ -1188,7 +1075,8 @@ function model_test_temp_default(pi) {
|
|
|
1188
1075
|
let models;
|
|
1189
1076
|
try {
|
|
1190
1077
|
models = await getOllamaModels();
|
|
1191
|
-
} catch {
|
|
1078
|
+
} catch (err) {
|
|
1079
|
+
debugLog("model-test", "failed to list Ollama models for --all", err);
|
|
1192
1080
|
ctx.ui.notify("Could not list Ollama models", "error");
|
|
1193
1081
|
return;
|
|
1194
1082
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vtstech/pi-model-test",
|
|
3
|
-
"version": "1.1.
|
|
3
|
+
"version": "1.1.9",
|
|
4
4
|
"description": "Model benchmark/testing extension for Pi Coding Agent",
|
|
5
5
|
"main": "model-test.js",
|
|
6
6
|
"keywords": ["pi-extensions"],
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
"url": "https://github.com/VTSTech/pi-coding-agent"
|
|
15
15
|
},
|
|
16
16
|
"dependencies": {
|
|
17
|
-
"@vtstech/pi-shared": "1.1.
|
|
17
|
+
"@vtstech/pi-shared": "1.1.9"
|
|
18
18
|
},
|
|
19
19
|
"peerDependencies": {
|
|
20
20
|
"@mariozechner/pi-coding-agent": ">=0.66"
|