@vtstech/pi-model-test 1.1.6 → 1.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -1
- package/model-test.js +199 -168
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -44,8 +44,11 @@ pi install "npm:@vtstech/pi-model-test"
|
|
|
44
44
|
|
|
45
45
|
- Auto-detects Ollama vs cloud provider (OpenRouter, Anthropic, Google, OpenAI, Groq, DeepSeek, Mistral, xAI, Together, Fireworks, Cohere)
|
|
46
46
|
- Uses native `fetch()` for all HTTP communication (no shell subprocess or curl dependency)
|
|
47
|
+
- **Streaming Ollama chat** — uses `/api/chat` with `stream: true` for earlier timeout detection and reduced memory
|
|
47
48
|
- Automatic remote Ollama URL resolution (reads from `models.json` on every call — picks up config changes immediately)
|
|
48
|
-
- Timeout resilience with
|
|
49
|
+
- Timeout resilience with exponential backoff retry on connection failures
|
|
50
|
+
- **Configurable test parameters** — override timeouts, delays, temperature via `~/.pi/agent/model-test-config.json`
|
|
51
|
+
- **Test history with regression detection** — tracks results at `~/.pi/agent/cache/model-test-history.json`, flags score degradation
|
|
49
52
|
- Rate limit delay between tests (configurable)
|
|
50
53
|
- Thinking model fallback (retries with `think: true`)
|
|
51
54
|
- Tool support cache (`~/.pi/agent/cache/tool_support.json`)
|
package/model-test.js
CHANGED
|
@@ -9,10 +9,12 @@ import {
|
|
|
9
9
|
truncate,
|
|
10
10
|
sanitizeForReport
|
|
11
11
|
} from "@vtstech/pi-shared/format";
|
|
12
|
-
import { getOllamaBaseUrl, detectModelFamily, readModelsJson,
|
|
12
|
+
import { getOllamaBaseUrl, detectModelFamily, readModelsJson, readModifyWriteModelsJson, fetchModelContextLength, detectProvider } from "@vtstech/pi-shared/ollama";
|
|
13
13
|
import {
|
|
14
14
|
ALL_DIALECT_PATTERNS,
|
|
15
|
-
parseReactWithPatterns
|
|
15
|
+
parseReactWithPatterns,
|
|
16
|
+
detectReactDialect,
|
|
17
|
+
extractBraceJson
|
|
16
18
|
} from "@vtstech/pi-shared/react-parser";
|
|
17
19
|
import {
|
|
18
20
|
CONFIG,
|
|
@@ -20,24 +22,34 @@ import {
|
|
|
20
22
|
scoreReasoning,
|
|
21
23
|
getCachedToolSupport,
|
|
22
24
|
cacheToolSupport,
|
|
25
|
+
getEffectiveConfig,
|
|
26
|
+
appendTestHistory,
|
|
27
|
+
detectRegression,
|
|
23
28
|
testToolUsageUnified,
|
|
24
29
|
testReasoningUnified,
|
|
25
30
|
testInstructionFollowingUnified,
|
|
26
31
|
TOOL_SUPPORT_CACHE_PATH
|
|
27
32
|
} from "@vtstech/pi-shared/model-test-utils";
|
|
33
|
+
import {
|
|
34
|
+
branding as sharedBranding,
|
|
35
|
+
formatTestSummary,
|
|
36
|
+
formatRecommendation
|
|
37
|
+
} from "@vtstech/pi-shared/test-report";
|
|
28
38
|
function model_test_temp_default(pi) {
|
|
39
|
+
const effectiveConfig = getEffectiveConfig();
|
|
29
40
|
function ollamaBase() {
|
|
30
41
|
return getOllamaBaseUrl();
|
|
31
42
|
}
|
|
32
43
|
async function rateLimitDelay(lines) {
|
|
33
|
-
if (
|
|
34
|
-
lines.push(info(`Waiting ${msHuman(
|
|
35
|
-
await new Promise((r) => setTimeout(r,
|
|
44
|
+
if (effectiveConfig.TEST_DELAY_MS > 0) {
|
|
45
|
+
lines.push(info(`Waiting ${msHuman(effectiveConfig.TEST_DELAY_MS)} to avoid rate limiting...`));
|
|
46
|
+
await new Promise((r) => setTimeout(r, effectiveConfig.TEST_DELAY_MS));
|
|
36
47
|
}
|
|
37
48
|
}
|
|
38
|
-
function makeOllamaChatFn() {
|
|
49
|
+
function makeOllamaChatFn(useStreaming = true) {
|
|
39
50
|
return async (model, messages, _options) => {
|
|
40
|
-
const
|
|
51
|
+
const chatFn = useStreaming ? ollamaChatStream : ollamaChat;
|
|
52
|
+
const result = await chatFn(model, messages);
|
|
41
53
|
return {
|
|
42
54
|
content: result.response?.message?.content || "",
|
|
43
55
|
elapsedMs: result.elapsedMs,
|
|
@@ -154,6 +166,69 @@ function model_test_temp_default(pi) {
|
|
|
154
166
|
}
|
|
155
167
|
throw new Error("Unreachable");
|
|
156
168
|
}
|
|
169
|
+
/**
 * Send a chat request to Ollama's `/api/chat` endpoint with `stream: true`
 * and assemble the streamed reply into a single response object.
 *
 * The response body is read as newline-delimited JSON. Network chunks do not
 * have to end on a line boundary, so the unterminated tail of every chunk is
 * buffered and completed by the next read instead of being parsed (and
 * discarded as malformed) on its own.
 *
 * @param {string} model - Model name sent in the request body.
 * @param {Array<object>} messages - Chat messages sent in the request body.
 * @param {object} [options] - Extra generation options; merged over the
 *   `num_predict` / `temperature` defaults taken from `CONFIG`.
 * @param {number} [timeoutMs] - Abort the request (including the streaming
 *   phase) after this many milliseconds.
 * @returns {Promise<{response: {message: {content: string, thinking: string, role: string}, done: boolean}, elapsedMs: number}>}
 * @throws {Error} On a non-OK HTTP status, a missing body, an empty reply,
 *   or when the timeout fires.
 */
async function ollamaChatStream(model, messages, options = {}, timeoutMs = CONFIG.DEFAULT_TIMEOUT_MS) {
  const body = {
    model,
    messages,
    stream: true,
    options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE, ...options }
  };
  const url = `${ollamaBase()}/api/chat`;
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
  const start = Date.now();
  let reader = null;
  try {
    const res = await fetch(url, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(body),
      signal: controller.signal
    });
    if (!res.ok) {
      const errorText = await res.text().catch(() => "unknown error");
      throw new Error(`Ollama API returned ${res.status}: ${truncate(errorText, 200)}`);
    }
    if (!res.body) {
      throw new Error("Ollama streaming response has no body");
    }
    let messageContent = "";
    let thinkingContent = "";
    let done = false;
    // Parse one complete NDJSON record and fold it into the accumulators.
    const consumeLine = (line) => {
      if (line.trim().length === 0) return;
      try {
        const parsed = JSON.parse(line);
        if (parsed.message?.content) messageContent += parsed.message.content;
        if (parsed.message?.thinking) thinkingContent += parsed.message.thinking;
        if (parsed.done) done = true;
      } catch (err) {
        debugLog("model-test", "skipped malformed JSON chunk in streaming response", err);
      }
    };
    reader = res.body.getReader();
    const decoder = new TextDecoder();
    // Text received so far that has not yet been terminated by "\n".
    let pending = "";
    while (!done) {
      const { value, done: streamDone } = await reader.read();
      if (streamDone) {
        // Flush any bytes the decoder was still holding for a multi-byte character.
        pending += decoder.decode();
        break;
      }
      pending += decoder.decode(value, { stream: true });
      const lines = pending.split("\n");
      // The last element is either "" (chunk ended on a newline) or a partial record.
      pending = lines.pop();
      for (const line of lines) {
        consumeLine(line);
      }
    }
    // A final record is allowed to arrive without a trailing newline.
    if (!done) {
      consumeLine(pending);
    }
    const elapsedMs = Date.now() - start;
    if (!messageContent.trim() && !thinkingContent.trim()) {
      throw new Error("Empty streaming response from Ollama");
    }
    const response = {
      message: {
        content: messageContent,
        thinking: thinkingContent,
        role: "assistant"
      },
      done: true
    };
    return { response, elapsedMs };
  } catch (e) {
    // Match on the name only: abort rejections are not guaranteed to be plain Error instances.
    if (e?.name === "AbortError") {
      throw new Error(`Ollama API timed out after ${msHuman(timeoutMs)}`, { cause: e });
    }
    throw e;
  } finally {
    clearTimeout(timeoutId);
    // Release the body on every exit path; a failed cancel is not actionable here.
    reader?.cancel().catch(() => {});
  }
}
|
|
157
232
|
async function providerChat(providerInfo, model, messages, options = {}) {
|
|
158
233
|
const { baseUrl, apiKey } = providerInfo;
|
|
159
234
|
const maxTokens = options.maxTokens ?? CONFIG.NUM_PREDICT;
|
|
@@ -368,73 +443,20 @@ function model_test_temp_default(pi) {
|
|
|
368
443
|
return { pass: false, score: "FAIL", toolCall: "empty response", thought: "", response: "", elapsedMs };
|
|
369
444
|
}
|
|
370
445
|
let parsedResult = null;
|
|
371
|
-
const
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
const jsonStart = result.raw.indexOf("{");
|
|
383
|
-
if (jsonStart !== -1) {
|
|
384
|
-
let depth = 0, jsonEnd = -1;
|
|
385
|
-
for (let i = jsonStart; i < result.raw.length; i++) {
|
|
386
|
-
if (result.raw[i] === "{") depth++;
|
|
387
|
-
else if (result.raw[i] === "}") {
|
|
388
|
-
depth--;
|
|
389
|
-
if (depth === 0) {
|
|
390
|
-
jsonEnd = i;
|
|
391
|
-
break;
|
|
392
|
-
}
|
|
393
|
-
}
|
|
394
|
-
}
|
|
395
|
-
argsStr = jsonEnd !== -1 ? result.raw.slice(jsonStart, jsonEnd + 1) : "";
|
|
396
|
-
} else {
|
|
397
|
-
argsStr = "";
|
|
398
|
-
}
|
|
399
|
-
} else {
|
|
400
|
-
argsStr = "";
|
|
401
|
-
}
|
|
402
|
-
parsedResult = { name: toolName, args: argsStr, thought: result.thought || "", dialect: result.dialect };
|
|
403
|
-
break;
|
|
404
|
-
}
|
|
405
|
-
}
|
|
406
|
-
} else {
|
|
407
|
-
for (const dp of ALL_DIALECT_PATTERNS) {
|
|
408
|
-
const result = parseReactWithPatterns(content, dp, true);
|
|
409
|
-
if (result) {
|
|
410
|
-
let argsStr;
|
|
411
|
-
const rawArgs = result.args ? JSON.stringify(result.args) : "";
|
|
412
|
-
if (rawArgs && rawArgs !== "{}") {
|
|
413
|
-
argsStr = rawArgs;
|
|
414
|
-
} else if (result.raw) {
|
|
415
|
-
const jsonStart = result.raw.indexOf("{");
|
|
416
|
-
if (jsonStart !== -1) {
|
|
417
|
-
let depth = 0, jsonEnd = -1;
|
|
418
|
-
for (let i = jsonStart; i < result.raw.length; i++) {
|
|
419
|
-
if (result.raw[i] === "{") depth++;
|
|
420
|
-
else if (result.raw[i] === "}") {
|
|
421
|
-
depth--;
|
|
422
|
-
if (depth === 0) {
|
|
423
|
-
jsonEnd = i;
|
|
424
|
-
break;
|
|
425
|
-
}
|
|
426
|
-
}
|
|
427
|
-
}
|
|
428
|
-
argsStr = jsonEnd !== -1 ? result.raw.slice(jsonStart, jsonEnd + 1) : "";
|
|
429
|
-
} else {
|
|
430
|
-
argsStr = "";
|
|
431
|
-
}
|
|
432
|
-
} else {
|
|
433
|
-
argsStr = "";
|
|
434
|
-
}
|
|
435
|
-
parsedResult = { name: result.name, args: argsStr, thought: result.thought || "", dialect: result.dialect };
|
|
436
|
-
break;
|
|
446
|
+
for (const dp of ALL_DIALECT_PATTERNS) {
|
|
447
|
+
const result = parseReactWithPatterns(content, dp, true);
|
|
448
|
+
if (result) {
|
|
449
|
+
let argsStr;
|
|
450
|
+
const rawArgs = result.args ? JSON.stringify(result.args) : "";
|
|
451
|
+
if (rawArgs && rawArgs !== "{}") {
|
|
452
|
+
argsStr = rawArgs;
|
|
453
|
+
} else if (result.raw) {
|
|
454
|
+
argsStr = extractBraceJson(result.raw);
|
|
455
|
+
} else {
|
|
456
|
+
argsStr = "";
|
|
437
457
|
}
|
|
458
|
+
parsedResult = { name: result.name, args: argsStr, thought: result.thought || "", dialect: result.dialect };
|
|
459
|
+
break;
|
|
438
460
|
}
|
|
439
461
|
}
|
|
440
462
|
if (parsedResult) {
|
|
@@ -525,7 +547,7 @@ function model_test_temp_default(pi) {
|
|
|
525
547
|
try {
|
|
526
548
|
const start = Date.now();
|
|
527
549
|
const controller = new AbortController();
|
|
528
|
-
const timeoutId = setTimeout(() => controller.abort(),
|
|
550
|
+
const timeoutId = setTimeout(() => controller.abort(), effectiveConfig.TOOL_SUPPORT_TIMEOUT_MS);
|
|
529
551
|
const res = await fetch(`${ollamaBase()}/api/chat`, {
|
|
530
552
|
method: "POST",
|
|
531
553
|
headers: { "Content-Type": "application/json" },
|
|
@@ -556,7 +578,8 @@ function model_test_temp_default(pi) {
|
|
|
556
578
|
try {
|
|
557
579
|
const args = typeof fn.arguments === "string" ? JSON.parse(fn.arguments) : fn.arguments || {};
|
|
558
580
|
argsStr = JSON.stringify(args);
|
|
559
|
-
} catch {
|
|
581
|
+
} catch (err) {
|
|
582
|
+
debugLog("model-test", "failed to parse tool call arguments", err);
|
|
560
583
|
argsStr = String(fn.arguments);
|
|
561
584
|
}
|
|
562
585
|
const level2 = "native";
|
|
@@ -568,41 +591,14 @@ function model_test_temp_default(pi) {
|
|
|
568
591
|
elapsedMs
|
|
569
592
|
};
|
|
570
593
|
}
|
|
571
|
-
const
|
|
572
|
-
|
|
573
|
-
/^\s*Action:\s*/im,
|
|
574
|
-
/^\s*Action Input:\s*/im,
|
|
575
|
-
/^\s*Thought:\s*/im,
|
|
576
|
-
/Action:\s*\w+/i,
|
|
577
|
-
/Action Input:\s*\{/i,
|
|
578
|
-
// Function dialect
|
|
579
|
-
/^\s*Function:\s*/im,
|
|
580
|
-
/^\s*Function Input:\s*/im,
|
|
581
|
-
/Function:\s*\w+/i,
|
|
582
|
-
// Tool dialect
|
|
583
|
-
/^\s*Tool:\s*/im,
|
|
584
|
-
/^\s*Tool Input:\s*/im,
|
|
585
|
-
/Tool:\s*\w+/i,
|
|
586
|
-
// Call dialect
|
|
587
|
-
/^\s*Call:\s*/im,
|
|
588
|
-
/^\s*Input:\s*/im,
|
|
589
|
-
/Call:\s*\w+/i
|
|
590
|
-
];
|
|
591
|
-
const matchedPatterns = [];
|
|
592
|
-
for (const p of reactPatterns) {
|
|
593
|
-
if (p.test(content)) matchedPatterns.push(p.source);
|
|
594
|
-
}
|
|
595
|
-
if (matchedPatterns.length > 0) {
|
|
596
|
-
let dialectName = "react";
|
|
597
|
-
if (/Function:/i.test(content)) dialectName = "function";
|
|
598
|
-
else if (/Tool:/i.test(content)) dialectName = "tool";
|
|
599
|
-
else if (/Call:/i.test(content)) dialectName = "call";
|
|
594
|
+
const detectedDialect = detectReactDialect(content);
|
|
595
|
+
if (detectedDialect) {
|
|
600
596
|
const level2 = "react";
|
|
601
597
|
cacheToolSupport(model, level2, family);
|
|
602
598
|
return {
|
|
603
599
|
level: level2,
|
|
604
600
|
cached: false,
|
|
605
|
-
evidence: `ReAct format detected (${
|
|
601
|
+
evidence: `ReAct format detected (${detectedDialect.name} dialect) in text response`,
|
|
606
602
|
elapsedMs
|
|
607
603
|
};
|
|
608
604
|
}
|
|
@@ -646,7 +642,8 @@ function model_test_temp_default(pi) {
|
|
|
646
642
|
if (!res.ok) return [];
|
|
647
643
|
const data = await res.json();
|
|
648
644
|
return (data.models || []).map((m) => m.name).filter(Boolean);
|
|
649
|
-
} catch {
|
|
645
|
+
} catch (err) {
|
|
646
|
+
debugLog("model-test", "failed to list Ollama models", err);
|
|
650
647
|
return [];
|
|
651
648
|
}
|
|
652
649
|
}
|
|
@@ -655,43 +652,44 @@ function model_test_temp_default(pi) {
|
|
|
655
652
|
}
|
|
656
653
|
/**
 * Persist a model's `reasoning` flag in models.json.
 *
 * The first entry whose `id` equals `model` (searching every provider) is
 * updated through `readModifyWriteModelsJson`. The outcome is recorded by the
 * mutator itself, so "model not found", "value already set" and "value
 * changed" are reported distinctly instead of being inferred from the
 * helper's return value or from a second read of the file.
 *
 * NOTE(review): assumes `readModifyWriteModelsJson` invokes the mutator
 * synchronously, skips the write when the mutator returns `null`, and returns
 * a truthy value when the file was written — confirm against pi-shared.
 *
 * @param {string} model - Model id to look up.
 * @param {boolean} hasReasoning - Value to store in the entry's `reasoning` field.
 * @returns {{updated: boolean, message: string}} Never throws; failures are
 *   reported through `message` with `updated: false`.
 */
function updateModelsJsonReasoning(model, hasReasoning) {
  try {
    let outcome = "missing";
    const written = readModifyWriteModelsJson((config) => {
      for (const provider of Object.values(config.providers || {})) {
        for (const m of provider.models || []) {
          if (m.id !== model) continue;
          if (m.reasoning === hasReasoning) {
            outcome = "unchanged";
            return null;
          }
          m.reasoning = hasReasoning;
          outcome = "changed";
          return config;
        }
      }
      return null;
    });
    if (outcome === "missing") {
      return { updated: false, message: `${model} not found in models.json \u2014 skipped` };
    }
    if (outcome === "unchanged") {
      return { updated: false, message: `reasoning already "${hasReasoning}" for ${model} \u2014 no change` };
    }
    if (!written) {
      // The mutator asked for a write but the helper reported none.
      return { updated: false, message: `Failed to update models.json: write was not applied` };
    }
    const action = hasReasoning ? "set reasoning: true" : "set reasoning: false";
    return { updated: true, message: `Updated ${model}: ${action}` };
  } catch (e) {
    return { updated: false, message: `Failed to update models.json: ${e.message}` };
  }
}
|
|
685
|
-
const branding = [
|
|
686
|
-
` \u26A1 Pi Model Benchmark v${EXTENSION_VERSION}`,
|
|
687
|
-
` Written by VTSTech`,
|
|
688
|
-
` GitHub: https://github.com/VTSTech`,
|
|
689
|
-
` Website: www.vts-tech.org`
|
|
690
|
-
].join("\n");
|
|
691
689
|
async function testModelOllama(model, providerInfo, ctx) {
|
|
692
690
|
const lines = [];
|
|
693
691
|
const totalStart = Date.now();
|
|
694
|
-
lines.push(
|
|
692
|
+
lines.push(sharedBranding);
|
|
695
693
|
lines.push(section(`MODEL: ${model}`));
|
|
696
694
|
lines.push(info("Provider: Ollama (local/remote)"));
|
|
697
695
|
const modelsJson = readModelsJson();
|
|
@@ -732,7 +730,8 @@ function model_test_temp_default(pi) {
|
|
|
732
730
|
modelModified = modDate ? modDate.toLocaleDateString() : "unknown";
|
|
733
731
|
}
|
|
734
732
|
}
|
|
735
|
-
} catch {
|
|
733
|
+
} catch (err) {
|
|
734
|
+
debugLog("model-test", "failed to fetch model metadata from /api/show", err);
|
|
736
735
|
}
|
|
737
736
|
const detectedFamily = detectModelFamily(model);
|
|
738
737
|
lines.push(info(`Size: ${modelSize} | Params: ${modelParams} | Quant: ${modelQuant}`));
|
|
@@ -879,11 +878,10 @@ function model_test_temp_default(pi) {
|
|
|
879
878
|
}
|
|
880
879
|
lines.push(info(`Evidence: ${toolSupport.evidence}`));
|
|
881
880
|
lines.push(info(`Cache: ${TOOL_SUPPORT_CACHE_PATH}`));
|
|
882
|
-
lines.push(section("SUMMARY"));
|
|
883
881
|
const totalMs = Date.now() - totalStart;
|
|
884
882
|
const toolPass = tools.score === "STRONG" || tools.score === "MODERATE";
|
|
885
883
|
const reactPass = react.score === "STRONG" || react.score === "MODERATE";
|
|
886
|
-
const
|
|
884
|
+
const ollamaTests = [
|
|
887
885
|
{ name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
|
|
888
886
|
{ name: "Thinking", pass: thinking.supported, score: thinking.supported ? "YES" : "NO" },
|
|
889
887
|
{ name: "Tool Usage", pass: toolPass, score: tools.score },
|
|
@@ -891,29 +889,45 @@ function model_test_temp_default(pi) {
|
|
|
891
889
|
{ name: "Instructions", pass: instructions.pass, score: instructions.score },
|
|
892
890
|
{ name: "Tool Support", pass: toolSupport.level === "native" || toolSupport.level === "react", score: toolSupport.level.toUpperCase() }
|
|
893
891
|
];
|
|
894
|
-
const passed =
|
|
895
|
-
const total =
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
892
|
+
const passed = ollamaTests.filter((t) => t.pass).length;
|
|
893
|
+
const total = ollamaTests.length;
|
|
894
|
+
lines.push(...formatTestSummary(ollamaTests, totalMs));
|
|
895
|
+
lines.push(...formatRecommendation(model, passed, total));
|
|
896
|
+
try {
|
|
897
|
+
const historyEntry = {
|
|
898
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
899
|
+
model,
|
|
900
|
+
providerKind: "ollama",
|
|
901
|
+
providerName: providerName || "ollama",
|
|
902
|
+
tests: {
|
|
903
|
+
reasoning: { score: reasoning.score, pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", answer: reasoning.answer },
|
|
904
|
+
thinking: { supported: thinking.supported },
|
|
905
|
+
toolUsage: { score: tools.score, pass: tools.score === "STRONG" || tools.score === "MODERATE", toolCall: tools.toolCall },
|
|
906
|
+
reactParsing: { score: react.score, pass: react.score === "STRONG" || react.score === "MODERATE", toolCall: react.toolCall, dialect: react.dialect },
|
|
907
|
+
instructionFollowing: { score: instructions.score, pass: instructions.pass },
|
|
908
|
+
toolSupport: { level: toolSupport.level, evidence: toolSupport.evidence }
|
|
909
|
+
},
|
|
910
|
+
passedCount: passed,
|
|
911
|
+
totalCount: total,
|
|
912
|
+
totalMs
|
|
913
|
+
};
|
|
914
|
+
appendTestHistory(historyEntry);
|
|
915
|
+
const regressions = detectRegression(model, historyEntry);
|
|
916
|
+
if (regressions.length > 0) {
|
|
917
|
+
lines.push(section("REGRESSION DETECTED"));
|
|
918
|
+
for (const reg of regressions) {
|
|
919
|
+
lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
|
|
920
|
+
}
|
|
921
|
+
}
|
|
922
|
+
} catch (err) {
|
|
923
|
+
debugLog("model-test", "failed to save test history", err);
|
|
910
924
|
}
|
|
911
925
|
return lines.join("\n");
|
|
912
926
|
}
|
|
913
927
|
async function testModelProvider(providerInfo, model, ctx) {
|
|
914
928
|
const lines = [];
|
|
915
929
|
const totalStart = Date.now();
|
|
916
|
-
lines.push(
|
|
930
|
+
lines.push(sharedBranding);
|
|
917
931
|
lines.push(section(`MODEL: ${model}`));
|
|
918
932
|
lines.push(info(`Provider: ${providerInfo.name} (built-in)`));
|
|
919
933
|
lines.push(info(`API: ${providerInfo.apiMode || "openai-completions"}`));
|
|
@@ -1018,30 +1032,45 @@ function model_test_temp_default(pi) {
|
|
|
1018
1032
|
lines.push(warn("ReAct parsing test \u2014 only relevant for Ollama models without native tool calling"));
|
|
1019
1033
|
lines.push(warn("Tool support detection \u2014 Ollama-specific tool support cache"));
|
|
1020
1034
|
lines.push(warn("Model metadata \u2014 Ollama-specific /api/tags endpoint"));
|
|
1021
|
-
lines.push(section("SUMMARY"));
|
|
1022
1035
|
const totalMs = Date.now() - totalStart;
|
|
1023
|
-
const
|
|
1036
|
+
const providerTests = [
|
|
1024
1037
|
{ name: "Connectivity", pass: connectivity.pass, score: connectivity.pass ? "OK" : "FAIL" },
|
|
1025
1038
|
{ name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
|
|
1026
1039
|
{ name: "Instructions", pass: instructions.pass, score: instructions.score },
|
|
1027
1040
|
{ name: "Tool Usage", pass: toolTest.pass, score: toolTest.score }
|
|
1028
1041
|
];
|
|
1029
|
-
const passed =
|
|
1030
|
-
const total =
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1042
|
+
const passed = providerTests.filter((t) => t.pass).length;
|
|
1043
|
+
const total = providerTests.length;
|
|
1044
|
+
lines.push(...formatTestSummary(providerTests, totalMs));
|
|
1045
|
+
lines.push(...formatRecommendation(model, passed, total, providerInfo.name));
|
|
1046
|
+
try {
|
|
1047
|
+
const historyEntry = {
|
|
1048
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1049
|
+
model,
|
|
1050
|
+
providerKind: "builtin",
|
|
1051
|
+
providerName: providerInfo.name,
|
|
1052
|
+
tests: {
|
|
1053
|
+
reasoning: { score: reasoning.score, pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", answer: reasoning.answer },
|
|
1054
|
+
thinking: { supported: false },
|
|
1055
|
+
toolUsage: { score: toolTest.score, pass: toolTest.pass, toolCall: toolTest.toolCall },
|
|
1056
|
+
reactParsing: { score: "SKIP", pass: false, toolCall: "n/a" },
|
|
1057
|
+
instructionFollowing: { score: instructions.score, pass: instructions.pass },
|
|
1058
|
+
toolSupport: { level: "native", evidence: "provider-native (not probed)" }
|
|
1059
|
+
},
|
|
1060
|
+
passedCount: passed,
|
|
1061
|
+
totalCount: total,
|
|
1062
|
+
totalMs
|
|
1063
|
+
};
|
|
1064
|
+
appendTestHistory(historyEntry);
|
|
1065
|
+
const regressions = detectRegression(model, historyEntry);
|
|
1066
|
+
if (regressions.length > 0) {
|
|
1067
|
+
lines.push(section("REGRESSION DETECTED"));
|
|
1068
|
+
for (const reg of regressions) {
|
|
1069
|
+
lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
|
|
1070
|
+
}
|
|
1071
|
+
}
|
|
1072
|
+
} catch (err) {
|
|
1073
|
+
debugLog("model-test", "failed to save provider test history", err);
|
|
1045
1074
|
}
|
|
1046
1075
|
return lines.join("\n");
|
|
1047
1076
|
}
|
|
@@ -1061,7 +1090,8 @@ function model_test_temp_default(pi) {
|
|
|
1061
1090
|
try {
|
|
1062
1091
|
const models = await getOllamaModels();
|
|
1063
1092
|
return models.map((m) => ({ label: m, description: `Test ${m}` })).filter((m) => m.label.startsWith(prefix));
|
|
1064
|
-
} catch {
|
|
1093
|
+
} catch (err) {
|
|
1094
|
+
debugLog("model-test", "failed to get model completions", err);
|
|
1065
1095
|
return [];
|
|
1066
1096
|
}
|
|
1067
1097
|
},
|
|
@@ -1081,7 +1111,8 @@ function model_test_temp_default(pi) {
|
|
|
1081
1111
|
let models;
|
|
1082
1112
|
try {
|
|
1083
1113
|
models = await getOllamaModels();
|
|
1084
|
-
} catch {
|
|
1114
|
+
} catch (err) {
|
|
1115
|
+
debugLog("model-test", "failed to list Ollama models for --all", err);
|
|
1085
1116
|
ctx.ui.notify("Could not list Ollama models", "error");
|
|
1086
1117
|
return;
|
|
1087
1118
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vtstech/pi-model-test",
|
|
3
|
-
"version": "1.1.
|
|
3
|
+
"version": "1.1.8",
|
|
4
4
|
"description": "Model benchmark/testing extension for Pi Coding Agent",
|
|
5
5
|
"main": "model-test.js",
|
|
6
6
|
"keywords": ["pi-extensions"],
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
"url": "https://github.com/VTSTech/pi-coding-agent"
|
|
15
15
|
},
|
|
16
16
|
"dependencies": {
|
|
17
|
-
"@vtstech/pi-shared": "1.1.
|
|
17
|
+
"@vtstech/pi-shared": "1.1.8"
|
|
18
18
|
},
|
|
19
19
|
"peerDependencies": {
|
|
20
20
|
"@mariozechner/pi-coding-agent": ">=0.66"
|