@vtstech/pi-model-test 1.1.7 → 1.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/model-test.js +137 -249
  2. package/package.json +2 -2
package/model-test.js CHANGED
@@ -9,10 +9,13 @@ import {
9
9
  truncate,
10
10
  sanitizeForReport
11
11
  } from "@vtstech/pi-shared/format";
12
- import { getOllamaBaseUrl, detectModelFamily, readModelsJson, writeModelsJson, fetchModelContextLength, EXTENSION_VERSION, detectProvider } from "@vtstech/pi-shared/ollama";
12
+ import { getOllamaBaseUrl, detectModelFamily, readModelsJson, readModifyWriteModelsJson, fetchModelContextLength, detectProvider } from "@vtstech/pi-shared/ollama";
13
+ import { debugLog } from "@vtstech/pi-shared/debug";
13
14
  import {
14
15
  ALL_DIALECT_PATTERNS,
15
- parseReactWithPatterns
16
+ parseReactWithPatterns,
17
+ detectReactDialect,
18
+ extractBraceJson
16
19
  } from "@vtstech/pi-shared/react-parser";
17
20
  import {
18
21
  CONFIG,
@@ -28,15 +31,71 @@ import {
28
31
  testInstructionFollowingUnified,
29
32
  TOOL_SUPPORT_CACHE_PATH
30
33
  } from "@vtstech/pi-shared/model-test-utils";
34
+ import {
35
+ branding as sharedBranding,
36
+ formatTestSummary,
37
+ formatRecommendation
38
+ } from "@vtstech/pi-shared/test-report";
31
39
  function model_test_temp_default(pi) {
32
40
  const effectiveConfig = getEffectiveConfig();
33
41
  function ollamaBase() {
34
42
  return getOllamaBaseUrl();
35
43
  }
36
44
  async function rateLimitDelay(lines) {
37
- if (CONFIG.TEST_DELAY_MS > 0) {
38
- lines.push(info(`Waiting ${msHuman(CONFIG.TEST_DELAY_MS)} to avoid rate limiting...`));
39
- await new Promise((r) => setTimeout(r, CONFIG.TEST_DELAY_MS));
45
+ if (effectiveConfig.TEST_DELAY_MS > 0) {
46
+ lines.push(info(`Waiting ${msHuman(effectiveConfig.TEST_DELAY_MS)} to avoid rate limiting...`));
47
+ await new Promise((r) => setTimeout(r, effectiveConfig.TEST_DELAY_MS));
48
+ }
49
+ }
50
/**
 * Append one severity-tagged line describing a scored test result.
 *
 * STRONG and MODERATE are reported through ok(), WEAK through warn() and
 * FAIL through fail(). Any other score is treated as an error and reported
 * through fail() using `fallback`.
 *
 * @param {string[]} lines - Report buffer; exactly one entry is pushed.
 * @param {string} score - Score label produced by the test.
 * @param {Object<string, string>} descriptions - Message per score label;
 *   an optional "*" key acts as a catch-all.
 * @param {string} fallback - Message for unrecognised scores, and for
 *   recognised scores that have no description of their own.
 */
function reportScore(lines, score, descriptions, fallback) {
  // Callers may omit a score (reportInstructionScore passes no FAIL entry).
  // Use their fallback text before degrading to a bare "(SCORE)" so the
  // failure explanation is not lost.
  const desc = descriptions[score] || descriptions["*"] || fallback || `(${score})`;
  switch (score) {
    case "STRONG":
    case "MODERATE":
      lines.push(ok(desc));
      break;
    case "WEAK":
      lines.push(warn(desc));
      break;
    case "FAIL":
      lines.push(fail(desc));
      break;
    default:
      // Unknown score label: the test itself errored, so report the fallback.
      lines.push(fail(fallback));
  }
}
62
/**
 * Report the outcome of the reasoning test via reportScore().
 *
 * @param {string[]} lines - Report buffer passed through to reportScore().
 * @param {{score: string, answer: *, reasoning: *}} result - Reasoning test
 *   result; `reasoning` carries the model text or, for unrecognised scores,
 *   the error text.
 */
function reportReasoningScore(lines, result) {
  // The fallback text is built on every call because call arguments are
  // evaluated eagerly. Coerce once so a missing `reasoning` cannot throw
  // while reporting an otherwise successful result.
  const reasoningText = String(result.reasoning ?? "");
  // An HTML error page is cut to its first line; anything else is truncated.
  const looksLikeHtml = reasoningText.includes("<!DOCTYPE") || reasoningText.includes("<html");
  const errorDetail = looksLikeHtml
    ? reasoningText.split("\n")[0].slice(0, 100) + "..."
    : truncate(reasoningText, 300);
  // NOTE(review): the removed 1.1.7 code reported WEAK reasoning through
  // fail(); reportScore() routes WEAK through warn() — confirm that is intended.
  reportScore(lines, result.score, {
    STRONG: `Answer: ${result.answer} \u2014 Correct with clear reasoning (${result.score})`,
    MODERATE: `Answer: ${result.answer} \u2014 Correct but weak reasoning (${result.score})`,
    WEAK: `Answer: ${result.answer} \u2014 Reasoned but wrong answer (${result.score})`,
    FAIL: `Answer: ${result.answer} \u2014 No reasoning detected (${result.score})`
  }, `Error: ${errorDetail}`);
}
70
/**
 * Report the outcome of the instruction-following (JSON output) test via
 * reportScore().
 *
 * @param {string[]} lines - Report buffer passed through to reportScore().
 * @param {{score: string}} result - Instruction test result.
 */
function reportInstructionScore(lines, result) {
  const failureText = `Failed to produce valid JSON (${result.score})`;
  reportScore(lines, result.score, {
    STRONG: `JSON output valid with correct values (${result.score})`,
    MODERATE: `JSON output valid but some values incorrect (${result.score})`,
    WEAK: `Partial JSON compliance (${result.score})`,
    // FAIL is looked up in this table by reportScore(); without an entry the
    // report shows a bare "(FAIL)" instead of the failure explanation.
    FAIL: failureText
  }, failureText);
}
77
/**
 * Report the outcome of the tool-usage test: one headline line, plus a
 * follow-up line showing the model's response where one applies.
 *
 * @param {string[]} lines - Report buffer; entries are pushed in order.
 * @param {{score: string, toolCall: *, response: *}} result - Tool test
 *   result. For unrecognised scores `toolCall` is reported as the error text.
 */
function reportToolScore(lines, result) {
  const { score, toolCall, response } = result;
  switch (score) {
    case "STRONG":
    case "MODERATE":
    case "WEAK": {
      // A call was produced; WEAK means it was malformed, so warn instead.
      if (score === "WEAK") {
        lines.push(warn(`Tool call: ${toolCall} (${score}) \u2014 malformed call`));
      } else {
        lines.push(ok(`Tool call: ${toolCall} (${score})`));
      }
      if (response) {
        lines.push(info(`Raw response: ${sanitizeForReport(response)}`));
      }
      break;
    }
    case "FAIL": {
      // Whitespace-only output counts as an empty response.
      const hasResponse = response && response.trim().length > 0;
      lines.push(fail(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${score})`));
      lines.push(info(hasResponse ? `Text response: ${sanitizeForReport(response)}` : "Text response: (empty)"));
      break;
    }
    default:
      // Unknown score: the test errored; no response line is shown.
      lines.push(fail(`Error: ${toolCall}`));
  }
}
42
101
  function makeOllamaChatFn(useStreaming = true) {
@@ -195,7 +254,8 @@ function model_test_temp_default(pi) {
195
254
  if (parsed.message?.content) messageContent += parsed.message.content;
196
255
  if (parsed.message?.thinking) thinkingContent += parsed.message.thinking;
197
256
  if (parsed.done) done = true;
198
- } catch {
257
+ } catch (err) {
258
+ debugLog("model-test", "skipped malformed JSON chunk in streaming response", err);
199
259
  }
200
260
  }
201
261
  }
@@ -392,22 +452,6 @@ function model_test_temp_default(pi) {
392
452
  async function testToolUsageProvider(providerInfo, model) {
393
453
  return testToolUsageUnified(makeProviderChatFn(providerInfo), model);
394
454
  }
395
- function extractBraceJson(raw) {
396
- const jsonStart = raw.indexOf("{");
397
- if (jsonStart === -1) return "";
398
- let depth = 0, jsonEnd = -1;
399
- for (let i = jsonStart; i < raw.length; i++) {
400
- if (raw[i] === "{") depth++;
401
- else if (raw[i] === "}") {
402
- depth--;
403
- if (depth === 0) {
404
- jsonEnd = i;
405
- break;
406
- }
407
- }
408
- }
409
- return jsonEnd !== -1 ? raw.slice(jsonStart, jsonEnd + 1) : "";
410
- }
411
455
  async function testReactParsing(model) {
412
456
  const systemPrompt = [
413
457
  "You are a helpful assistant with access to tools.",
@@ -451,41 +495,20 @@ function model_test_temp_default(pi) {
451
495
  return { pass: false, score: "FAIL", toolCall: "empty response", thought: "", response: "", elapsedMs };
452
496
  }
453
497
  let parsedResult = null;
454
- const sharedParser = pi._reactParser;
455
- if (sharedParser?.ALL_DIALECT_PATTERNS) {
456
- for (const dp of sharedParser.ALL_DIALECT_PATTERNS) {
457
- const result = sharedParser.parseReactWithPatterns(content, dp, true);
458
- if (result) {
459
- let toolName = result.name;
460
- let argsStr;
461
- const rawArgs = result.args ? JSON.stringify(result.args) : "";
462
- if (rawArgs && rawArgs !== "{}") {
463
- argsStr = rawArgs;
464
- } else if (result.raw) {
465
- argsStr = extractBraceJson(result.raw);
466
- } else {
467
- argsStr = "";
468
- }
469
- parsedResult = { name: toolName, args: argsStr, thought: result.thought || "", dialect: result.dialect };
470
- break;
471
- }
472
- }
473
- } else {
474
- for (const dp of ALL_DIALECT_PATTERNS) {
475
- const result = parseReactWithPatterns(content, dp, true);
476
- if (result) {
477
- let argsStr;
478
- const rawArgs = result.args ? JSON.stringify(result.args) : "";
479
- if (rawArgs && rawArgs !== "{}") {
480
- argsStr = rawArgs;
481
- } else if (result.raw) {
482
- argsStr = extractBraceJson(result.raw);
483
- } else {
484
- argsStr = "";
485
- }
486
- parsedResult = { name: result.name, args: argsStr, thought: result.thought || "", dialect: result.dialect };
487
- break;
498
+ for (const dp of ALL_DIALECT_PATTERNS) {
499
+ const result = parseReactWithPatterns(content, dp, true);
500
+ if (result) {
501
+ let argsStr;
502
+ const rawArgs = result.args ? JSON.stringify(result.args) : "";
503
+ if (rawArgs && rawArgs !== "{}") {
504
+ argsStr = rawArgs;
505
+ } else if (result.raw) {
506
+ argsStr = extractBraceJson(result.raw);
507
+ } else {
508
+ argsStr = "";
488
509
  }
510
+ parsedResult = { name: result.name, args: argsStr, thought: result.thought || "", dialect: result.dialect };
511
+ break;
489
512
  }
490
513
  }
491
514
  if (parsedResult) {
@@ -576,7 +599,7 @@ function model_test_temp_default(pi) {
576
599
  try {
577
600
  const start = Date.now();
578
601
  const controller = new AbortController();
579
- const timeoutId = setTimeout(() => controller.abort(), 13e4);
602
+ const timeoutId = setTimeout(() => controller.abort(), effectiveConfig.TOOL_SUPPORT_TIMEOUT_MS);
580
603
  const res = await fetch(`${ollamaBase()}/api/chat`, {
581
604
  method: "POST",
582
605
  headers: { "Content-Type": "application/json" },
@@ -607,7 +630,8 @@ function model_test_temp_default(pi) {
607
630
  try {
608
631
  const args = typeof fn.arguments === "string" ? JSON.parse(fn.arguments) : fn.arguments || {};
609
632
  argsStr = JSON.stringify(args);
610
- } catch {
633
+ } catch (err) {
634
+ debugLog("model-test", "failed to parse tool call arguments", err);
611
635
  argsStr = String(fn.arguments);
612
636
  }
613
637
  const level2 = "native";
@@ -619,41 +643,14 @@ function model_test_temp_default(pi) {
619
643
  elapsedMs
620
644
  };
621
645
  }
622
- const reactPatterns = [
623
- // Classic ReAct
624
- /^\s*Action:\s*/im,
625
- /^\s*Action Input:\s*/im,
626
- /^\s*Thought:\s*/im,
627
- /Action:\s*\w+/i,
628
- /Action Input:\s*\{/i,
629
- // Function dialect
630
- /^\s*Function:\s*/im,
631
- /^\s*Function Input:\s*/im,
632
- /Function:\s*\w+/i,
633
- // Tool dialect
634
- /^\s*Tool:\s*/im,
635
- /^\s*Tool Input:\s*/im,
636
- /Tool:\s*\w+/i,
637
- // Call dialect
638
- /^\s*Call:\s*/im,
639
- /^\s*Input:\s*/im,
640
- /Call:\s*\w+/i
641
- ];
642
- const matchedPatterns = [];
643
- for (const p of reactPatterns) {
644
- if (p.test(content)) matchedPatterns.push(p.source);
645
- }
646
- if (matchedPatterns.length > 0) {
647
- let dialectName = "react";
648
- if (/Function:/i.test(content)) dialectName = "function";
649
- else if (/Tool:/i.test(content)) dialectName = "tool";
650
- else if (/Call:/i.test(content)) dialectName = "call";
646
+ const detectedDialect = detectReactDialect(content);
647
+ if (detectedDialect) {
651
648
  const level2 = "react";
652
649
  cacheToolSupport(model, level2, family);
653
650
  return {
654
651
  level: level2,
655
652
  cached: false,
656
- evidence: `ReAct format detected (${dialectName} dialect) in text response`,
653
+ evidence: `ReAct format detected (${detectedDialect.name} dialect) in text response`,
657
654
  elapsedMs
658
655
  };
659
656
  }
@@ -697,7 +694,8 @@ function model_test_temp_default(pi) {
697
694
  if (!res.ok) return [];
698
695
  const data = await res.json();
699
696
  return (data.models || []).map((m) => m.name).filter(Boolean);
700
- } catch {
697
+ } catch (err) {
698
+ debugLog("model-test", "failed to list Ollama models", err);
701
699
  return [];
702
700
  }
703
701
  }
@@ -706,43 +704,44 @@ function model_test_temp_default(pi) {
706
704
  }
707
705
  function updateModelsJsonReasoning(model, hasReasoning) {
708
706
  try {
707
+ const written = readModifyWriteModelsJson((config2) => {
708
+ for (const provider of Object.values(config2.providers || {})) {
709
+ const models = provider.models || [];
710
+ for (const m of models) {
711
+ if (m.id === model) {
712
+ const current = m.reasoning;
713
+ if (current === hasReasoning) {
714
+ return null;
715
+ }
716
+ m.reasoning = hasReasoning;
717
+ return config2;
718
+ }
719
+ }
720
+ }
721
+ return null;
722
+ });
723
+ if (!written) {
724
+ return { updated: false, message: `${model} not found in models.json \u2014 skipped` };
725
+ }
709
726
  const config = readModelsJson();
710
- let updated = false;
711
727
  for (const provider of Object.values(config.providers || {})) {
712
728
  const models = provider.models || [];
713
729
  for (const m of models) {
714
- if (m.id === model) {
715
- const current = m.reasoning;
716
- if (current === hasReasoning) {
717
- return { updated: false, message: `reasoning already "${hasReasoning}" for ${model} \u2014 no change` };
718
- }
719
- m.reasoning = hasReasoning;
720
- updated = true;
721
- break;
730
+ if (m.id === model && m.reasoning === hasReasoning) {
731
+ return { updated: false, message: `reasoning already "${hasReasoning}" for ${model} \u2014 no change` };
722
732
  }
723
733
  }
724
- if (updated) break;
725
734
  }
726
- if (!updated) {
727
- return { updated: false, message: `${model} not found in models.json \u2014 skipped` };
728
- }
729
- writeModelsJson(config);
730
735
  const action = hasReasoning ? "set reasoning: true" : "set reasoning: false";
731
- return { updated: true, message: `\u2705 Updated ${model}: ${action}` };
736
+ return { updated: true, message: `Updated ${model}: ${action}` };
732
737
  } catch (e) {
733
738
  return { updated: false, message: `Failed to update models.json: ${e.message}` };
734
739
  }
735
740
  }
736
- const branding = [
737
- ` \u26A1 Pi Model Benchmark v${EXTENSION_VERSION}`,
738
- ` Written by VTSTech`,
739
- ` GitHub: https://github.com/VTSTech`,
740
- ` Website: www.vts-tech.org`
741
- ].join("\n");
742
741
  async function testModelOllama(model, providerInfo, ctx) {
743
742
  const lines = [];
744
743
  const totalStart = Date.now();
745
- lines.push(branding);
744
+ lines.push(sharedBranding);
746
745
  lines.push(section(`MODEL: ${model}`));
747
746
  lines.push(info("Provider: Ollama (local/remote)"));
748
747
  const modelsJson = readModelsJson();
@@ -783,7 +782,8 @@ function model_test_temp_default(pi) {
783
782
  modelModified = modDate ? modDate.toLocaleDateString() : "unknown";
784
783
  }
785
784
  }
786
- } catch {
785
+ } catch (err) {
786
+ debugLog("model-test", "failed to fetch model metadata from /api/show", err);
787
787
  }
788
788
  const detectedFamily = detectModelFamily(model);
789
789
  lines.push(info(`Size: ${modelSize} | Params: ${modelParams} | Quant: ${modelQuant}`));
@@ -793,18 +793,7 @@ function model_test_temp_default(pi) {
793
793
  lines.push(info("Testing..."));
794
794
  const reasoning = await testReasoning(model);
795
795
  lines.push(info(`Time: ${msHuman(reasoning.elapsedMs)}`));
796
- if (reasoning.score === "STRONG") {
797
- lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct with clear reasoning (${reasoning.score})`));
798
- } else if (reasoning.score === "MODERATE") {
799
- lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct but weak reasoning (${reasoning.score})`));
800
- } else if (reasoning.score === "WEAK") {
801
- lines.push(fail(`Answer: ${reasoning.answer} \u2014 Reasoned but wrong answer (${reasoning.score})`));
802
- } else if (reasoning.score === "FAIL") {
803
- lines.push(fail(`Answer: ${reasoning.answer} \u2014 No reasoning detected (${reasoning.score})`));
804
- } else {
805
- const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." : truncate(reasoning.reasoning, 300);
806
- lines.push(fail(`Error: ${errMsg}`));
807
- }
796
+ reportReasoningScore(lines, reasoning);
808
797
  lines.push(info(`Response: ${sanitizeForReport(reasoning.reasoning)}`));
809
798
  lines.push(section("THINKING TEST"));
810
799
  lines.push(info('Prompt: "Multiply 37 by 43. Explain your reasoning step by step."'));
@@ -827,32 +816,7 @@ function model_test_temp_default(pi) {
827
816
  await rateLimitDelay(lines);
828
817
  const tools = await testToolUsage(model);
829
818
  lines.push(info(`Time: ${msHuman(tools.elapsedMs)}`));
830
- if (tools.score === "STRONG") {
831
- lines.push(ok(`Tool call: ${tools.toolCall} (${tools.score})`));
832
- if (tools.response) {
833
- lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
834
- }
835
- } else if (tools.score === "MODERATE") {
836
- lines.push(ok(`Tool call: ${tools.toolCall} (${tools.score})`));
837
- if (tools.response) {
838
- lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
839
- }
840
- } else if (tools.score === "WEAK") {
841
- lines.push(warn(`Tool call: ${tools.toolCall} (${tools.score}) \u2014 malformed call`));
842
- if (tools.response) {
843
- lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
844
- }
845
- } else if (tools.score === "FAIL") {
846
- const hasResponse = tools.response && tools.response.trim().length > 0;
847
- lines.push(fail(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${tools.score})`));
848
- if (hasResponse) {
849
- lines.push(info(`Text response: ${sanitizeForReport(tools.response)}`));
850
- } else {
851
- lines.push(info("Text response: (empty)"));
852
- }
853
- } else {
854
- lines.push(fail(`Error: ${tools.toolCall}`));
855
- }
819
+ reportToolScore(lines, tools);
856
820
  lines.push(section("REACT PARSING TEST"));
857
821
  lines.push(info(`Prompt: "What's the weather in Tokyo?" (ReAct format, no native tools)`));
858
822
  lines.push(info("Testing..."));
@@ -889,15 +853,7 @@ function model_test_temp_default(pi) {
889
853
  await rateLimitDelay(lines);
890
854
  const instructions = await testInstructionFollowing(model);
891
855
  lines.push(info(`Time: ${msHuman(instructions.elapsedMs)}`));
892
- if (instructions.score === "STRONG") {
893
- lines.push(ok(`JSON output valid with correct values (${instructions.score})`));
894
- } else if (instructions.score === "MODERATE") {
895
- lines.push(ok(`JSON output valid but some values incorrect (${instructions.score})`));
896
- } else if (instructions.score === "WEAK") {
897
- lines.push(warn(`Partial JSON compliance (${instructions.score})`));
898
- } else {
899
- lines.push(fail(`Failed to produce valid JSON (${instructions.score})`));
900
- }
856
+ reportInstructionScore(lines, instructions);
901
857
  lines.push(info(`Output: ${sanitizeForReport(instructions.output)}`));
902
858
  lines.push(section("TOOL SUPPORT DETECTION"));
903
859
  lines.push(info("Probing model for tool calling capability (native / ReAct / none)"));
@@ -930,11 +886,10 @@ function model_test_temp_default(pi) {
930
886
  }
931
887
  lines.push(info(`Evidence: ${toolSupport.evidence}`));
932
888
  lines.push(info(`Cache: ${TOOL_SUPPORT_CACHE_PATH}`));
933
- lines.push(section("SUMMARY"));
934
889
  const totalMs = Date.now() - totalStart;
935
890
  const toolPass = tools.score === "STRONG" || tools.score === "MODERATE";
936
891
  const reactPass = react.score === "STRONG" || react.score === "MODERATE";
937
- const tests = [
892
+ const ollamaTests = [
938
893
  { name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
939
894
  { name: "Thinking", pass: thinking.supported, score: thinking.supported ? "YES" : "NO" },
940
895
  { name: "Tool Usage", pass: toolPass, score: tools.score },
@@ -942,23 +897,10 @@ function model_test_temp_default(pi) {
942
897
  { name: "Instructions", pass: instructions.pass, score: instructions.score },
943
898
  { name: "Tool Support", pass: toolSupport.level === "native" || toolSupport.level === "react", score: toolSupport.level.toUpperCase() }
944
899
  ];
945
- const passed = tests.filter((t) => t.pass).length;
946
- const total = tests.length;
947
- for (const t of tests) {
948
- lines.push(t.pass ? ok(`${t.name}: ${t.score}`) : fail(`${t.name}: ${t.score}`));
949
- }
950
- lines.push(info(`Total time: ${msHuman(totalMs)}`));
951
- lines.push(info(`Score: ${passed}/${total} tests passed`));
952
- lines.push(section("RECOMMENDATION"));
953
- if (passed === 6) {
954
- lines.push(ok(`${model} is a STRONG model \u2014 full capability`));
955
- } else if (passed >= 5) {
956
- lines.push(ok(`${model} is a GOOD model \u2014 most capabilities work`));
957
- } else if (passed >= 4) {
958
- lines.push(warn(`${model} is USABLE \u2014 some capabilities are limited`));
959
- } else {
960
- lines.push(fail(`${model} is WEAK \u2014 limited capabilities for agent use`));
961
- }
900
+ const passed = ollamaTests.filter((t) => t.pass).length;
901
+ const total = ollamaTests.length;
902
+ lines.push(...formatTestSummary(ollamaTests, totalMs));
903
+ lines.push(...formatRecommendation(model, passed, total));
962
904
  try {
963
905
  const historyEntry = {
964
906
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -985,14 +927,15 @@ function model_test_temp_default(pi) {
985
927
  lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
986
928
  }
987
929
  }
988
- } catch {
930
+ } catch (err) {
931
+ debugLog("model-test", "failed to save test history", err);
989
932
  }
990
933
  return lines.join("\n");
991
934
  }
992
935
  async function testModelProvider(providerInfo, model, ctx) {
993
936
  const lines = [];
994
937
  const totalStart = Date.now();
995
- lines.push(branding);
938
+ lines.push(sharedBranding);
996
939
  lines.push(section(`MODEL: ${model}`));
997
940
  lines.push(info(`Provider: ${providerInfo.name} (built-in)`));
998
941
  lines.push(info(`API: ${providerInfo.apiMode || "openai-completions"}`));
@@ -1031,18 +974,7 @@ function model_test_temp_default(pi) {
1031
974
  await rateLimitDelay(lines);
1032
975
  const reasoning = await testReasoningProvider(providerInfo, model);
1033
976
  lines.push(info(`Time: ${msHuman(reasoning.elapsedMs)}`));
1034
- if (reasoning.score === "STRONG") {
1035
- lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct with clear reasoning (${reasoning.score})`));
1036
- } else if (reasoning.score === "MODERATE") {
1037
- lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct but weak reasoning (${reasoning.score})`));
1038
- } else if (reasoning.score === "WEAK") {
1039
- lines.push(fail(`Answer: ${reasoning.answer} \u2014 Reasoned but wrong answer (${reasoning.score})`));
1040
- } else if (reasoning.score === "FAIL") {
1041
- lines.push(fail(`Answer: ${reasoning.answer} \u2014 No reasoning detected (${reasoning.score})`));
1042
- } else {
1043
- const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." : truncate(reasoning.reasoning, 300);
1044
- lines.push(fail(`Error: ${errMsg}`));
1045
- }
977
+ reportReasoningScore(lines, reasoning);
1046
978
  lines.push(info(`Response: ${sanitizeForReport(reasoning.reasoning)}`));
1047
979
  lines.push(section("INSTRUCTION FOLLOWING TEST"));
1048
980
  lines.push(info("Prompt: Respond with ONLY a JSON object with keys: name, can_count, sum (15+27), language"));
@@ -1050,15 +982,7 @@ function model_test_temp_default(pi) {
1050
982
  await rateLimitDelay(lines);
1051
983
  const instructions = await testInstructionFollowingProvider(providerInfo, model);
1052
984
  lines.push(info(`Time: ${msHuman(instructions.elapsedMs)}`));
1053
- if (instructions.score === "STRONG") {
1054
- lines.push(ok(`JSON output valid with correct values (${instructions.score})`));
1055
- } else if (instructions.score === "MODERATE") {
1056
- lines.push(ok(`JSON output valid but some values incorrect (${instructions.score})`));
1057
- } else if (instructions.score === "WEAK") {
1058
- lines.push(warn(`Partial JSON compliance (${instructions.score})`));
1059
- } else {
1060
- lines.push(fail(`Failed to produce valid JSON (${instructions.score})`));
1061
- }
985
+ reportInstructionScore(lines, instructions);
1062
986
  lines.push(info(`Output: ${sanitizeForReport(instructions.output)}`));
1063
987
  lines.push(section("TOOL USAGE TEST"));
1064
988
  lines.push(info(`Prompt: "What's the weather in Paris?" (with get_weather tool available)`));
@@ -1066,62 +990,23 @@ function model_test_temp_default(pi) {
1066
990
  await rateLimitDelay(lines);
1067
991
  const toolTest = await testToolUsageProvider(providerInfo, model);
1068
992
  lines.push(info(`Time: ${msHuman(toolTest.elapsedMs)}`));
1069
- if (toolTest.score === "STRONG") {
1070
- lines.push(ok(`Tool call: ${toolTest.toolCall} (${toolTest.score})`));
1071
- if (toolTest.response) {
1072
- lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
1073
- }
1074
- } else if (toolTest.score === "MODERATE") {
1075
- lines.push(ok(`Tool call: ${toolTest.toolCall} (${toolTest.score})`));
1076
- if (toolTest.response) {
1077
- lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
1078
- }
1079
- } else if (toolTest.score === "WEAK") {
1080
- lines.push(warn(`Tool call: ${toolTest.toolCall} (${toolTest.score}) \u2014 malformed call`));
1081
- if (toolTest.response) {
1082
- lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
1083
- }
1084
- } else if (toolTest.score === "FAIL") {
1085
- const hasResponse = toolTest.response && toolTest.response.trim().length > 0;
1086
- lines.push(fail(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${toolTest.score})`));
1087
- if (hasResponse) {
1088
- lines.push(info(`Text response: ${sanitizeForReport(toolTest.response)}`));
1089
- } else {
1090
- lines.push(info("Text response: (empty)"));
1091
- }
1092
- } else {
1093
- lines.push(fail(`Error: ${toolTest.toolCall}`));
1094
- }
993
+ reportToolScore(lines, toolTest);
1095
994
  lines.push(section("SKIPPED TESTS (OLLAMA-ONLY)"));
1096
995
  lines.push(warn("Thinking test \u2014 Ollama-specific think:true option and message.thinking field"));
1097
996
  lines.push(warn("ReAct parsing test \u2014 only relevant for Ollama models without native tool calling"));
1098
997
  lines.push(warn("Tool support detection \u2014 Ollama-specific tool support cache"));
1099
998
  lines.push(warn("Model metadata \u2014 Ollama-specific /api/tags endpoint"));
1100
- lines.push(section("SUMMARY"));
1101
999
  const totalMs = Date.now() - totalStart;
1102
- const tests = [
1000
+ const providerTests = [
1103
1001
  { name: "Connectivity", pass: connectivity.pass, score: connectivity.pass ? "OK" : "FAIL" },
1104
1002
  { name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
1105
1003
  { name: "Instructions", pass: instructions.pass, score: instructions.score },
1106
1004
  { name: "Tool Usage", pass: toolTest.pass, score: toolTest.score }
1107
1005
  ];
1108
- const passed = tests.filter((t) => t.pass).length;
1109
- const total = tests.length;
1110
- for (const t of tests) {
1111
- lines.push(t.pass ? ok(`${t.name}: ${t.score}`) : fail(`${t.name}: ${t.score}`));
1112
- }
1113
- lines.push(info(`Total time: ${msHuman(totalMs)}`));
1114
- lines.push(info(`Score: ${passed}/${total} tests passed`));
1115
- lines.push(section("RECOMMENDATION"));
1116
- if (passed === 4) {
1117
- lines.push(ok(`${model} is a STRONG model via ${providerInfo.name} \u2014 full capability`));
1118
- } else if (passed >= 3) {
1119
- lines.push(ok(`${model} is a GOOD model via ${providerInfo.name} \u2014 most capabilities work`));
1120
- } else if (passed >= 2) {
1121
- lines.push(warn(`${model} is USABLE via ${providerInfo.name} \u2014 some capabilities are limited`));
1122
- } else {
1123
- lines.push(fail(`${model} is WEAK via ${providerInfo.name} \u2014 limited capabilities for agent use`));
1124
- }
1006
+ const passed = providerTests.filter((t) => t.pass).length;
1007
+ const total = providerTests.length;
1008
+ lines.push(...formatTestSummary(providerTests, totalMs));
1009
+ lines.push(...formatRecommendation(model, passed, total, providerInfo.name));
1125
1010
  try {
1126
1011
  const historyEntry = {
1127
1012
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -1148,7 +1033,8 @@ function model_test_temp_default(pi) {
1148
1033
  lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
1149
1034
  }
1150
1035
  }
1151
- } catch {
1036
+ } catch (err) {
1037
+ debugLog("model-test", "failed to save provider test history", err);
1152
1038
  }
1153
1039
  return lines.join("\n");
1154
1040
  }
@@ -1168,7 +1054,8 @@ function model_test_temp_default(pi) {
1168
1054
  try {
1169
1055
  const models = await getOllamaModels();
1170
1056
  return models.map((m) => ({ label: m, description: `Test ${m}` })).filter((m) => m.label.startsWith(prefix));
1171
- } catch {
1057
+ } catch (err) {
1058
+ debugLog("model-test", "failed to get model completions", err);
1172
1059
  return [];
1173
1060
  }
1174
1061
  },
@@ -1188,7 +1075,8 @@ function model_test_temp_default(pi) {
1188
1075
  let models;
1189
1076
  try {
1190
1077
  models = await getOllamaModels();
1191
- } catch {
1078
+ } catch (err) {
1079
+ debugLog("model-test", "failed to list Ollama models for --all", err);
1192
1080
  ctx.ui.notify("Could not list Ollama models", "error");
1193
1081
  return;
1194
1082
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vtstech/pi-model-test",
3
- "version": "1.1.7",
3
+ "version": "1.1.9",
4
4
  "description": "Model benchmark/testing extension for Pi Coding Agent",
5
5
  "main": "model-test.js",
6
6
  "keywords": ["pi-extensions"],
@@ -14,7 +14,7 @@
14
14
  "url": "https://github.com/VTSTech/pi-coding-agent"
15
15
  },
16
16
  "dependencies": {
17
- "@vtstech/pi-shared": "1.1.7"
17
+ "@vtstech/pi-shared": "1.1.9"
18
18
  },
19
19
  "peerDependencies": {
20
20
  "@mariozechner/pi-coding-agent": ">=0.66"