@vtstech/pi-model-test 1.1.7 → 1.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/model-test.js +79 -155
  2. package/package.json +2 -2
package/model-test.js CHANGED
@@ -9,10 +9,12 @@ import {
9
9
  truncate,
10
10
  sanitizeForReport
11
11
  } from "@vtstech/pi-shared/format";
12
- import { getOllamaBaseUrl, detectModelFamily, readModelsJson, writeModelsJson, fetchModelContextLength, EXTENSION_VERSION, detectProvider } from "@vtstech/pi-shared/ollama";
12
+ import { getOllamaBaseUrl, detectModelFamily, readModelsJson, readModifyWriteModelsJson, fetchModelContextLength, detectProvider } from "@vtstech/pi-shared/ollama";
13
13
  import {
14
14
  ALL_DIALECT_PATTERNS,
15
- parseReactWithPatterns
15
+ parseReactWithPatterns,
16
+ detectReactDialect,
17
+ extractBraceJson
16
18
  } from "@vtstech/pi-shared/react-parser";
17
19
  import {
18
20
  CONFIG,
@@ -28,15 +30,20 @@ import {
28
30
  testInstructionFollowingUnified,
29
31
  TOOL_SUPPORT_CACHE_PATH
30
32
  } from "@vtstech/pi-shared/model-test-utils";
33
+ import {
34
+ branding as sharedBranding,
35
+ formatTestSummary,
36
+ formatRecommendation
37
+ } from "@vtstech/pi-shared/test-report";
31
38
  function model_test_temp_default(pi) {
32
39
  const effectiveConfig = getEffectiveConfig();
33
40
  function ollamaBase() {
34
41
  return getOllamaBaseUrl();
35
42
  }
36
43
  async function rateLimitDelay(lines) {
37
- if (CONFIG.TEST_DELAY_MS > 0) {
38
- lines.push(info(`Waiting ${msHuman(CONFIG.TEST_DELAY_MS)} to avoid rate limiting...`));
39
- await new Promise((r) => setTimeout(r, CONFIG.TEST_DELAY_MS));
44
+ if (effectiveConfig.TEST_DELAY_MS > 0) {
45
+ lines.push(info(`Waiting ${msHuman(effectiveConfig.TEST_DELAY_MS)} to avoid rate limiting...`));
46
+ await new Promise((r) => setTimeout(r, effectiveConfig.TEST_DELAY_MS));
40
47
  }
41
48
  }
42
49
  function makeOllamaChatFn(useStreaming = true) {
@@ -195,7 +202,8 @@ function model_test_temp_default(pi) {
195
202
  if (parsed.message?.content) messageContent += parsed.message.content;
196
203
  if (parsed.message?.thinking) thinkingContent += parsed.message.thinking;
197
204
  if (parsed.done) done = true;
198
- } catch {
205
+ } catch (err) {
206
+ debugLog("model-test", "skipped malformed JSON chunk in streaming response", err);
199
207
  }
200
208
  }
201
209
  }
@@ -392,22 +400,6 @@ function model_test_temp_default(pi) {
392
400
  async function testToolUsageProvider(providerInfo, model) {
393
401
  return testToolUsageUnified(makeProviderChatFn(providerInfo), model);
394
402
  }
395
- function extractBraceJson(raw) {
396
- const jsonStart = raw.indexOf("{");
397
- if (jsonStart === -1) return "";
398
- let depth = 0, jsonEnd = -1;
399
- for (let i = jsonStart; i < raw.length; i++) {
400
- if (raw[i] === "{") depth++;
401
- else if (raw[i] === "}") {
402
- depth--;
403
- if (depth === 0) {
404
- jsonEnd = i;
405
- break;
406
- }
407
- }
408
- }
409
- return jsonEnd !== -1 ? raw.slice(jsonStart, jsonEnd + 1) : "";
410
- }
411
403
  async function testReactParsing(model) {
412
404
  const systemPrompt = [
413
405
  "You are a helpful assistant with access to tools.",
@@ -451,41 +443,20 @@ function model_test_temp_default(pi) {
451
443
  return { pass: false, score: "FAIL", toolCall: "empty response", thought: "", response: "", elapsedMs };
452
444
  }
453
445
  let parsedResult = null;
454
- const sharedParser = pi._reactParser;
455
- if (sharedParser?.ALL_DIALECT_PATTERNS) {
456
- for (const dp of sharedParser.ALL_DIALECT_PATTERNS) {
457
- const result = sharedParser.parseReactWithPatterns(content, dp, true);
458
- if (result) {
459
- let toolName = result.name;
460
- let argsStr;
461
- const rawArgs = result.args ? JSON.stringify(result.args) : "";
462
- if (rawArgs && rawArgs !== "{}") {
463
- argsStr = rawArgs;
464
- } else if (result.raw) {
465
- argsStr = extractBraceJson(result.raw);
466
- } else {
467
- argsStr = "";
468
- }
469
- parsedResult = { name: toolName, args: argsStr, thought: result.thought || "", dialect: result.dialect };
470
- break;
471
- }
472
- }
473
- } else {
474
- for (const dp of ALL_DIALECT_PATTERNS) {
475
- const result = parseReactWithPatterns(content, dp, true);
476
- if (result) {
477
- let argsStr;
478
- const rawArgs = result.args ? JSON.stringify(result.args) : "";
479
- if (rawArgs && rawArgs !== "{}") {
480
- argsStr = rawArgs;
481
- } else if (result.raw) {
482
- argsStr = extractBraceJson(result.raw);
483
- } else {
484
- argsStr = "";
485
- }
486
- parsedResult = { name: result.name, args: argsStr, thought: result.thought || "", dialect: result.dialect };
487
- break;
446
+ for (const dp of ALL_DIALECT_PATTERNS) {
447
+ const result = parseReactWithPatterns(content, dp, true);
448
+ if (result) {
449
+ let argsStr;
450
+ const rawArgs = result.args ? JSON.stringify(result.args) : "";
451
+ if (rawArgs && rawArgs !== "{}") {
452
+ argsStr = rawArgs;
453
+ } else if (result.raw) {
454
+ argsStr = extractBraceJson(result.raw);
455
+ } else {
456
+ argsStr = "";
488
457
  }
458
+ parsedResult = { name: result.name, args: argsStr, thought: result.thought || "", dialect: result.dialect };
459
+ break;
489
460
  }
490
461
  }
491
462
  if (parsedResult) {
@@ -576,7 +547,7 @@ function model_test_temp_default(pi) {
576
547
  try {
577
548
  const start = Date.now();
578
549
  const controller = new AbortController();
579
- const timeoutId = setTimeout(() => controller.abort(), 13e4);
550
+ const timeoutId = setTimeout(() => controller.abort(), effectiveConfig.TOOL_SUPPORT_TIMEOUT_MS);
580
551
  const res = await fetch(`${ollamaBase()}/api/chat`, {
581
552
  method: "POST",
582
553
  headers: { "Content-Type": "application/json" },
@@ -607,7 +578,8 @@ function model_test_temp_default(pi) {
607
578
  try {
608
579
  const args = typeof fn.arguments === "string" ? JSON.parse(fn.arguments) : fn.arguments || {};
609
580
  argsStr = JSON.stringify(args);
610
- } catch {
581
+ } catch (err) {
582
+ debugLog("model-test", "failed to parse tool call arguments", err);
611
583
  argsStr = String(fn.arguments);
612
584
  }
613
585
  const level2 = "native";
@@ -619,41 +591,14 @@ function model_test_temp_default(pi) {
619
591
  elapsedMs
620
592
  };
621
593
  }
622
- const reactPatterns = [
623
- // Classic ReAct
624
- /^\s*Action:\s*/im,
625
- /^\s*Action Input:\s*/im,
626
- /^\s*Thought:\s*/im,
627
- /Action:\s*\w+/i,
628
- /Action Input:\s*\{/i,
629
- // Function dialect
630
- /^\s*Function:\s*/im,
631
- /^\s*Function Input:\s*/im,
632
- /Function:\s*\w+/i,
633
- // Tool dialect
634
- /^\s*Tool:\s*/im,
635
- /^\s*Tool Input:\s*/im,
636
- /Tool:\s*\w+/i,
637
- // Call dialect
638
- /^\s*Call:\s*/im,
639
- /^\s*Input:\s*/im,
640
- /Call:\s*\w+/i
641
- ];
642
- const matchedPatterns = [];
643
- for (const p of reactPatterns) {
644
- if (p.test(content)) matchedPatterns.push(p.source);
645
- }
646
- if (matchedPatterns.length > 0) {
647
- let dialectName = "react";
648
- if (/Function:/i.test(content)) dialectName = "function";
649
- else if (/Tool:/i.test(content)) dialectName = "tool";
650
- else if (/Call:/i.test(content)) dialectName = "call";
594
+ const detectedDialect = detectReactDialect(content);
595
+ if (detectedDialect) {
651
596
  const level2 = "react";
652
597
  cacheToolSupport(model, level2, family);
653
598
  return {
654
599
  level: level2,
655
600
  cached: false,
656
- evidence: `ReAct format detected (${dialectName} dialect) in text response`,
601
+ evidence: `ReAct format detected (${detectedDialect.name} dialect) in text response`,
657
602
  elapsedMs
658
603
  };
659
604
  }
@@ -697,7 +642,8 @@ function model_test_temp_default(pi) {
697
642
  if (!res.ok) return [];
698
643
  const data = await res.json();
699
644
  return (data.models || []).map((m) => m.name).filter(Boolean);
700
- } catch {
645
+ } catch (err) {
646
+ debugLog("model-test", "failed to list Ollama models", err);
701
647
  return [];
702
648
  }
703
649
  }
@@ -706,43 +652,44 @@ function model_test_temp_default(pi) {
706
652
  }
707
653
  function updateModelsJsonReasoning(model, hasReasoning) {
708
654
  try {
655
+ const written = readModifyWriteModelsJson((config2) => {
656
+ for (const provider of Object.values(config2.providers || {})) {
657
+ const models = provider.models || [];
658
+ for (const m of models) {
659
+ if (m.id === model) {
660
+ const current = m.reasoning;
661
+ if (current === hasReasoning) {
662
+ return null;
663
+ }
664
+ m.reasoning = hasReasoning;
665
+ return config2;
666
+ }
667
+ }
668
+ }
669
+ return null;
670
+ });
671
+ if (!written) {
672
+ return { updated: false, message: `${model} not found in models.json \u2014 skipped` };
673
+ }
709
674
  const config = readModelsJson();
710
- let updated = false;
711
675
  for (const provider of Object.values(config.providers || {})) {
712
676
  const models = provider.models || [];
713
677
  for (const m of models) {
714
- if (m.id === model) {
715
- const current = m.reasoning;
716
- if (current === hasReasoning) {
717
- return { updated: false, message: `reasoning already "${hasReasoning}" for ${model} \u2014 no change` };
718
- }
719
- m.reasoning = hasReasoning;
720
- updated = true;
721
- break;
678
+ if (m.id === model && m.reasoning === hasReasoning) {
679
+ return { updated: false, message: `reasoning already "${hasReasoning}" for ${model} \u2014 no change` };
722
680
  }
723
681
  }
724
- if (updated) break;
725
682
  }
726
- if (!updated) {
727
- return { updated: false, message: `${model} not found in models.json \u2014 skipped` };
728
- }
729
- writeModelsJson(config);
730
683
  const action = hasReasoning ? "set reasoning: true" : "set reasoning: false";
731
- return { updated: true, message: `\u2705 Updated ${model}: ${action}` };
684
+ return { updated: true, message: `Updated ${model}: ${action}` };
732
685
  } catch (e) {
733
686
  return { updated: false, message: `Failed to update models.json: ${e.message}` };
734
687
  }
735
688
  }
736
- const branding = [
737
- ` \u26A1 Pi Model Benchmark v${EXTENSION_VERSION}`,
738
- ` Written by VTSTech`,
739
- ` GitHub: https://github.com/VTSTech`,
740
- ` Website: www.vts-tech.org`
741
- ].join("\n");
742
689
  async function testModelOllama(model, providerInfo, ctx) {
743
690
  const lines = [];
744
691
  const totalStart = Date.now();
745
- lines.push(branding);
692
+ lines.push(sharedBranding);
746
693
  lines.push(section(`MODEL: ${model}`));
747
694
  lines.push(info("Provider: Ollama (local/remote)"));
748
695
  const modelsJson = readModelsJson();
@@ -783,7 +730,8 @@ function model_test_temp_default(pi) {
783
730
  modelModified = modDate ? modDate.toLocaleDateString() : "unknown";
784
731
  }
785
732
  }
786
- } catch {
733
+ } catch (err) {
734
+ debugLog("model-test", "failed to fetch model metadata from /api/show", err);
787
735
  }
788
736
  const detectedFamily = detectModelFamily(model);
789
737
  lines.push(info(`Size: ${modelSize} | Params: ${modelParams} | Quant: ${modelQuant}`));
@@ -930,11 +878,10 @@ function model_test_temp_default(pi) {
930
878
  }
931
879
  lines.push(info(`Evidence: ${toolSupport.evidence}`));
932
880
  lines.push(info(`Cache: ${TOOL_SUPPORT_CACHE_PATH}`));
933
- lines.push(section("SUMMARY"));
934
881
  const totalMs = Date.now() - totalStart;
935
882
  const toolPass = tools.score === "STRONG" || tools.score === "MODERATE";
936
883
  const reactPass = react.score === "STRONG" || react.score === "MODERATE";
937
- const tests = [
884
+ const ollamaTests = [
938
885
  { name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
939
886
  { name: "Thinking", pass: thinking.supported, score: thinking.supported ? "YES" : "NO" },
940
887
  { name: "Tool Usage", pass: toolPass, score: tools.score },
@@ -942,23 +889,10 @@ function model_test_temp_default(pi) {
942
889
  { name: "Instructions", pass: instructions.pass, score: instructions.score },
943
890
  { name: "Tool Support", pass: toolSupport.level === "native" || toolSupport.level === "react", score: toolSupport.level.toUpperCase() }
944
891
  ];
945
- const passed = tests.filter((t) => t.pass).length;
946
- const total = tests.length;
947
- for (const t of tests) {
948
- lines.push(t.pass ? ok(`${t.name}: ${t.score}`) : fail(`${t.name}: ${t.score}`));
949
- }
950
- lines.push(info(`Total time: ${msHuman(totalMs)}`));
951
- lines.push(info(`Score: ${passed}/${total} tests passed`));
952
- lines.push(section("RECOMMENDATION"));
953
- if (passed === 6) {
954
- lines.push(ok(`${model} is a STRONG model \u2014 full capability`));
955
- } else if (passed >= 5) {
956
- lines.push(ok(`${model} is a GOOD model \u2014 most capabilities work`));
957
- } else if (passed >= 4) {
958
- lines.push(warn(`${model} is USABLE \u2014 some capabilities are limited`));
959
- } else {
960
- lines.push(fail(`${model} is WEAK \u2014 limited capabilities for agent use`));
961
- }
892
+ const passed = ollamaTests.filter((t) => t.pass).length;
893
+ const total = ollamaTests.length;
894
+ lines.push(...formatTestSummary(ollamaTests, totalMs));
895
+ lines.push(...formatRecommendation(model, passed, total));
962
896
  try {
963
897
  const historyEntry = {
964
898
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -985,14 +919,15 @@ function model_test_temp_default(pi) {
985
919
  lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
986
920
  }
987
921
  }
988
- } catch {
922
+ } catch (err) {
923
+ debugLog("model-test", "failed to save test history", err);
989
924
  }
990
925
  return lines.join("\n");
991
926
  }
992
927
  async function testModelProvider(providerInfo, model, ctx) {
993
928
  const lines = [];
994
929
  const totalStart = Date.now();
995
- lines.push(branding);
930
+ lines.push(sharedBranding);
996
931
  lines.push(section(`MODEL: ${model}`));
997
932
  lines.push(info(`Provider: ${providerInfo.name} (built-in)`));
998
933
  lines.push(info(`API: ${providerInfo.apiMode || "openai-completions"}`));
@@ -1097,31 +1032,17 @@ function model_test_temp_default(pi) {
1097
1032
  lines.push(warn("ReAct parsing test \u2014 only relevant for Ollama models without native tool calling"));
1098
1033
  lines.push(warn("Tool support detection \u2014 Ollama-specific tool support cache"));
1099
1034
  lines.push(warn("Model metadata \u2014 Ollama-specific /api/tags endpoint"));
1100
- lines.push(section("SUMMARY"));
1101
1035
  const totalMs = Date.now() - totalStart;
1102
- const tests = [
1036
+ const providerTests = [
1103
1037
  { name: "Connectivity", pass: connectivity.pass, score: connectivity.pass ? "OK" : "FAIL" },
1104
1038
  { name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
1105
1039
  { name: "Instructions", pass: instructions.pass, score: instructions.score },
1106
1040
  { name: "Tool Usage", pass: toolTest.pass, score: toolTest.score }
1107
1041
  ];
1108
- const passed = tests.filter((t) => t.pass).length;
1109
- const total = tests.length;
1110
- for (const t of tests) {
1111
- lines.push(t.pass ? ok(`${t.name}: ${t.score}`) : fail(`${t.name}: ${t.score}`));
1112
- }
1113
- lines.push(info(`Total time: ${msHuman(totalMs)}`));
1114
- lines.push(info(`Score: ${passed}/${total} tests passed`));
1115
- lines.push(section("RECOMMENDATION"));
1116
- if (passed === 4) {
1117
- lines.push(ok(`${model} is a STRONG model via ${providerInfo.name} \u2014 full capability`));
1118
- } else if (passed >= 3) {
1119
- lines.push(ok(`${model} is a GOOD model via ${providerInfo.name} \u2014 most capabilities work`));
1120
- } else if (passed >= 2) {
1121
- lines.push(warn(`${model} is USABLE via ${providerInfo.name} \u2014 some capabilities are limited`));
1122
- } else {
1123
- lines.push(fail(`${model} is WEAK via ${providerInfo.name} \u2014 limited capabilities for agent use`));
1124
- }
1042
+ const passed = providerTests.filter((t) => t.pass).length;
1043
+ const total = providerTests.length;
1044
+ lines.push(...formatTestSummary(providerTests, totalMs));
1045
+ lines.push(...formatRecommendation(model, passed, total, providerInfo.name));
1125
1046
  try {
1126
1047
  const historyEntry = {
1127
1048
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -1148,7 +1069,8 @@ function model_test_temp_default(pi) {
1148
1069
  lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
1149
1070
  }
1150
1071
  }
1151
- } catch {
1072
+ } catch (err) {
1073
+ debugLog("model-test", "failed to save provider test history", err);
1152
1074
  }
1153
1075
  return lines.join("\n");
1154
1076
  }
@@ -1168,7 +1090,8 @@ function model_test_temp_default(pi) {
1168
1090
  try {
1169
1091
  const models = await getOllamaModels();
1170
1092
  return models.map((m) => ({ label: m, description: `Test ${m}` })).filter((m) => m.label.startsWith(prefix));
1171
- } catch {
1093
+ } catch (err) {
1094
+ debugLog("model-test", "failed to get model completions", err);
1172
1095
  return [];
1173
1096
  }
1174
1097
  },
@@ -1188,7 +1111,8 @@ function model_test_temp_default(pi) {
1188
1111
  let models;
1189
1112
  try {
1190
1113
  models = await getOllamaModels();
1191
- } catch {
1114
+ } catch (err) {
1115
+ debugLog("model-test", "failed to list Ollama models for --all", err);
1192
1116
  ctx.ui.notify("Could not list Ollama models", "error");
1193
1117
  return;
1194
1118
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vtstech/pi-model-test",
3
- "version": "1.1.7",
3
+ "version": "1.1.8",
4
4
  "description": "Model benchmark/testing extension for Pi Coding Agent",
5
5
  "main": "model-test.js",
6
6
  "keywords": ["pi-extensions"],
@@ -14,7 +14,7 @@
14
14
  "url": "https://github.com/VTSTech/pi-coding-agent"
15
15
  },
16
16
  "dependencies": {
17
- "@vtstech/pi-shared": "1.1.7"
17
+ "@vtstech/pi-shared": "1.1.8"
18
18
  },
19
19
  "peerDependencies": {
20
20
  "@mariozechner/pi-coding-agent": ">=0.66"