@vtstech/pi-model-test 1.1.6 → 1.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +4 -1
  2. package/model-test.js +199 -168
  3. package/package.json +2 -2
package/README.md CHANGED
@@ -44,8 +44,11 @@ pi install "npm:@vtstech/pi-model-test"
44
44
 
45
45
  - Auto-detects Ollama vs cloud provider (OpenRouter, Anthropic, Google, OpenAI, Groq, DeepSeek, Mistral, xAI, Together, Fireworks, Cohere)
46
46
  - Uses native `fetch()` for all HTTP communication (no shell subprocess or curl dependency)
47
+ - **Streaming Ollama chat** — uses `/api/chat` with `stream: true` for earlier timeout detection and reduced memory
47
48
  - Automatic remote Ollama URL resolution (reads from `models.json` on every call — picks up config changes immediately)
48
- - Timeout resilience with auto-retry on empty responses
49
+ - Timeout resilience with exponential backoff retry on connection failures
50
+ - **Configurable test parameters** — override timeouts, delays, temperature via `~/.pi/agent/model-test-config.json`
51
+ - **Test history with regression detection** — tracks results at `~/.pi/agent/cache/model-test-history.json`, flags score degradation
49
52
  - Rate limit delay between tests (configurable)
50
53
  - Thinking model fallback (retries with `think: true`)
51
54
  - Tool support cache (`~/.pi/agent/cache/tool_support.json`)
package/model-test.js CHANGED
@@ -9,10 +9,12 @@ import {
9
9
  truncate,
10
10
  sanitizeForReport
11
11
  } from "@vtstech/pi-shared/format";
12
- import { getOllamaBaseUrl, detectModelFamily, readModelsJson, writeModelsJson, fetchModelContextLength, EXTENSION_VERSION, detectProvider } from "@vtstech/pi-shared/ollama";
12
+ import { getOllamaBaseUrl, detectModelFamily, readModelsJson, readModifyWriteModelsJson, fetchModelContextLength, detectProvider } from "@vtstech/pi-shared/ollama";
13
13
  import {
14
14
  ALL_DIALECT_PATTERNS,
15
- parseReactWithPatterns
15
+ parseReactWithPatterns,
16
+ detectReactDialect,
17
+ extractBraceJson
16
18
  } from "@vtstech/pi-shared/react-parser";
17
19
  import {
18
20
  CONFIG,
@@ -20,24 +22,34 @@ import {
20
22
  scoreReasoning,
21
23
  getCachedToolSupport,
22
24
  cacheToolSupport,
25
+ getEffectiveConfig,
26
+ appendTestHistory,
27
+ detectRegression,
23
28
  testToolUsageUnified,
24
29
  testReasoningUnified,
25
30
  testInstructionFollowingUnified,
26
31
  TOOL_SUPPORT_CACHE_PATH
27
32
  } from "@vtstech/pi-shared/model-test-utils";
33
+ import {
34
+ branding as sharedBranding,
35
+ formatTestSummary,
36
+ formatRecommendation
37
+ } from "@vtstech/pi-shared/test-report";
28
38
  function model_test_temp_default(pi) {
39
+ const effectiveConfig = getEffectiveConfig();
29
40
  function ollamaBase() {
30
41
  return getOllamaBaseUrl();
31
42
  }
32
43
  async function rateLimitDelay(lines) {
33
- if (CONFIG.TEST_DELAY_MS > 0) {
34
- lines.push(info(`Waiting ${msHuman(CONFIG.TEST_DELAY_MS)} to avoid rate limiting...`));
35
- await new Promise((r) => setTimeout(r, CONFIG.TEST_DELAY_MS));
44
+ if (effectiveConfig.TEST_DELAY_MS > 0) {
45
+ lines.push(info(`Waiting ${msHuman(effectiveConfig.TEST_DELAY_MS)} to avoid rate limiting...`));
46
+ await new Promise((r) => setTimeout(r, effectiveConfig.TEST_DELAY_MS));
36
47
  }
37
48
  }
38
- function makeOllamaChatFn() {
49
+ function makeOllamaChatFn(useStreaming = true) {
39
50
  return async (model, messages, _options) => {
40
- const result = await ollamaChat(model, messages);
51
+ const chatFn = useStreaming ? ollamaChatStream : ollamaChat;
52
+ const result = await chatFn(model, messages);
41
53
  return {
42
54
  content: result.response?.message?.content || "",
43
55
  elapsedMs: result.elapsedMs,
@@ -154,6 +166,69 @@ function model_test_temp_default(pi) {
154
166
  }
155
167
  throw new Error("Unreachable");
156
168
  }
169
+ async function ollamaChatStream(model, messages, options = {}, timeoutMs = CONFIG.DEFAULT_TIMEOUT_MS) {
170
+ const body = { model, messages, stream: true, options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE, ...options } };
171
+ const url = `${ollamaBase()}/api/chat`;
172
+ const controller = new AbortController();
173
+ const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
174
+ const start = Date.now();
175
+ try {
176
+ const res = await fetch(url, {
177
+ method: "POST",
178
+ headers: { "Content-Type": "application/json" },
179
+ body: JSON.stringify(body),
180
+ signal: controller.signal
181
+ });
182
+ if (!res.ok) {
183
+ const errorText = await res.text().catch(() => "unknown error");
184
+ throw new Error(`Ollama API returned ${res.status}: ${truncate(errorText, 200)}`);
185
+ }
186
+ if (!res.body) {
187
+ throw new Error("Ollama streaming response has no body");
188
+ }
189
+ let messageContent = "";
190
+ let thinkingContent = "";
191
+ let done = false;
192
+ const reader = res.body.getReader();
193
+ const decoder = new TextDecoder();
194
+ while (!done) {
195
+ const { value, done: streamDone } = await reader.read();
196
+ if (streamDone) break;
197
+ const chunk = decoder.decode(value, { stream: true });
198
+ const lines = chunk.split("\n").filter((line) => line.trim().length > 0);
199
+ for (const line of lines) {
200
+ try {
201
+ const parsed = JSON.parse(line);
202
+ if (parsed.message?.content) messageContent += parsed.message.content;
203
+ if (parsed.message?.thinking) thinkingContent += parsed.message.thinking;
204
+ if (parsed.done) done = true;
205
+ } catch (err) {
206
+ debugLog("model-test", "skipped malformed JSON chunk in streaming response", err);
207
+ }
208
+ }
209
+ }
210
+ const elapsedMs = Date.now() - start;
211
+ if (!messageContent.trim() && !thinkingContent.trim()) {
212
+ throw new Error("Empty streaming response from Ollama");
213
+ }
214
+ const response = {
215
+ message: {
216
+ content: messageContent,
217
+ thinking: thinkingContent,
218
+ role: "assistant"
219
+ },
220
+ done: true
221
+ };
222
+ return { response, elapsedMs };
223
+ } catch (e) {
224
+ if (e instanceof Error && e.name === "AbortError") {
225
+ throw new Error(`Ollama API timed out after ${msHuman(timeoutMs)}`);
226
+ }
227
+ throw e;
228
+ } finally {
229
+ clearTimeout(timeoutId);
230
+ }
231
+ }
157
232
  async function providerChat(providerInfo, model, messages, options = {}) {
158
233
  const { baseUrl, apiKey } = providerInfo;
159
234
  const maxTokens = options.maxTokens ?? CONFIG.NUM_PREDICT;
@@ -368,73 +443,20 @@ function model_test_temp_default(pi) {
368
443
  return { pass: false, score: "FAIL", toolCall: "empty response", thought: "", response: "", elapsedMs };
369
444
  }
370
445
  let parsedResult = null;
371
- const sharedParser = pi._reactParser;
372
- if (sharedParser?.ALL_DIALECT_PATTERNS) {
373
- for (const dp of sharedParser.ALL_DIALECT_PATTERNS) {
374
- const result = sharedParser.parseReactWithPatterns(content, dp, true);
375
- if (result) {
376
- let toolName = result.name;
377
- let argsStr;
378
- const rawArgs = result.args ? JSON.stringify(result.args) : "";
379
- if (rawArgs && rawArgs !== "{}") {
380
- argsStr = rawArgs;
381
- } else if (result.raw) {
382
- const jsonStart = result.raw.indexOf("{");
383
- if (jsonStart !== -1) {
384
- let depth = 0, jsonEnd = -1;
385
- for (let i = jsonStart; i < result.raw.length; i++) {
386
- if (result.raw[i] === "{") depth++;
387
- else if (result.raw[i] === "}") {
388
- depth--;
389
- if (depth === 0) {
390
- jsonEnd = i;
391
- break;
392
- }
393
- }
394
- }
395
- argsStr = jsonEnd !== -1 ? result.raw.slice(jsonStart, jsonEnd + 1) : "";
396
- } else {
397
- argsStr = "";
398
- }
399
- } else {
400
- argsStr = "";
401
- }
402
- parsedResult = { name: toolName, args: argsStr, thought: result.thought || "", dialect: result.dialect };
403
- break;
404
- }
405
- }
406
- } else {
407
- for (const dp of ALL_DIALECT_PATTERNS) {
408
- const result = parseReactWithPatterns(content, dp, true);
409
- if (result) {
410
- let argsStr;
411
- const rawArgs = result.args ? JSON.stringify(result.args) : "";
412
- if (rawArgs && rawArgs !== "{}") {
413
- argsStr = rawArgs;
414
- } else if (result.raw) {
415
- const jsonStart = result.raw.indexOf("{");
416
- if (jsonStart !== -1) {
417
- let depth = 0, jsonEnd = -1;
418
- for (let i = jsonStart; i < result.raw.length; i++) {
419
- if (result.raw[i] === "{") depth++;
420
- else if (result.raw[i] === "}") {
421
- depth--;
422
- if (depth === 0) {
423
- jsonEnd = i;
424
- break;
425
- }
426
- }
427
- }
428
- argsStr = jsonEnd !== -1 ? result.raw.slice(jsonStart, jsonEnd + 1) : "";
429
- } else {
430
- argsStr = "";
431
- }
432
- } else {
433
- argsStr = "";
434
- }
435
- parsedResult = { name: result.name, args: argsStr, thought: result.thought || "", dialect: result.dialect };
436
- break;
446
+ for (const dp of ALL_DIALECT_PATTERNS) {
447
+ const result = parseReactWithPatterns(content, dp, true);
448
+ if (result) {
449
+ let argsStr;
450
+ const rawArgs = result.args ? JSON.stringify(result.args) : "";
451
+ if (rawArgs && rawArgs !== "{}") {
452
+ argsStr = rawArgs;
453
+ } else if (result.raw) {
454
+ argsStr = extractBraceJson(result.raw);
455
+ } else {
456
+ argsStr = "";
437
457
  }
458
+ parsedResult = { name: result.name, args: argsStr, thought: result.thought || "", dialect: result.dialect };
459
+ break;
438
460
  }
439
461
  }
440
462
  if (parsedResult) {
@@ -525,7 +547,7 @@ function model_test_temp_default(pi) {
525
547
  try {
526
548
  const start = Date.now();
527
549
  const controller = new AbortController();
528
- const timeoutId = setTimeout(() => controller.abort(), 13e4);
550
+ const timeoutId = setTimeout(() => controller.abort(), effectiveConfig.TOOL_SUPPORT_TIMEOUT_MS);
529
551
  const res = await fetch(`${ollamaBase()}/api/chat`, {
530
552
  method: "POST",
531
553
  headers: { "Content-Type": "application/json" },
@@ -556,7 +578,8 @@ function model_test_temp_default(pi) {
556
578
  try {
557
579
  const args = typeof fn.arguments === "string" ? JSON.parse(fn.arguments) : fn.arguments || {};
558
580
  argsStr = JSON.stringify(args);
559
- } catch {
581
+ } catch (err) {
582
+ debugLog("model-test", "failed to parse tool call arguments", err);
560
583
  argsStr = String(fn.arguments);
561
584
  }
562
585
  const level2 = "native";
@@ -568,41 +591,14 @@ function model_test_temp_default(pi) {
568
591
  elapsedMs
569
592
  };
570
593
  }
571
- const reactPatterns = [
572
- // Classic ReAct
573
- /^\s*Action:\s*/im,
574
- /^\s*Action Input:\s*/im,
575
- /^\s*Thought:\s*/im,
576
- /Action:\s*\w+/i,
577
- /Action Input:\s*\{/i,
578
- // Function dialect
579
- /^\s*Function:\s*/im,
580
- /^\s*Function Input:\s*/im,
581
- /Function:\s*\w+/i,
582
- // Tool dialect
583
- /^\s*Tool:\s*/im,
584
- /^\s*Tool Input:\s*/im,
585
- /Tool:\s*\w+/i,
586
- // Call dialect
587
- /^\s*Call:\s*/im,
588
- /^\s*Input:\s*/im,
589
- /Call:\s*\w+/i
590
- ];
591
- const matchedPatterns = [];
592
- for (const p of reactPatterns) {
593
- if (p.test(content)) matchedPatterns.push(p.source);
594
- }
595
- if (matchedPatterns.length > 0) {
596
- let dialectName = "react";
597
- if (/Function:/i.test(content)) dialectName = "function";
598
- else if (/Tool:/i.test(content)) dialectName = "tool";
599
- else if (/Call:/i.test(content)) dialectName = "call";
594
+ const detectedDialect = detectReactDialect(content);
595
+ if (detectedDialect) {
600
596
  const level2 = "react";
601
597
  cacheToolSupport(model, level2, family);
602
598
  return {
603
599
  level: level2,
604
600
  cached: false,
605
- evidence: `ReAct format detected (${dialectName} dialect) in text response`,
601
+ evidence: `ReAct format detected (${detectedDialect.name} dialect) in text response`,
606
602
  elapsedMs
607
603
  };
608
604
  }
@@ -646,7 +642,8 @@ function model_test_temp_default(pi) {
646
642
  if (!res.ok) return [];
647
643
  const data = await res.json();
648
644
  return (data.models || []).map((m) => m.name).filter(Boolean);
649
- } catch {
645
+ } catch (err) {
646
+ debugLog("model-test", "failed to list Ollama models", err);
650
647
  return [];
651
648
  }
652
649
  }
@@ -655,43 +652,44 @@ function model_test_temp_default(pi) {
655
652
  }
656
653
  function updateModelsJsonReasoning(model, hasReasoning) {
657
654
  try {
655
+ const written = readModifyWriteModelsJson((config2) => {
656
+ for (const provider of Object.values(config2.providers || {})) {
657
+ const models = provider.models || [];
658
+ for (const m of models) {
659
+ if (m.id === model) {
660
+ const current = m.reasoning;
661
+ if (current === hasReasoning) {
662
+ return null;
663
+ }
664
+ m.reasoning = hasReasoning;
665
+ return config2;
666
+ }
667
+ }
668
+ }
669
+ return null;
670
+ });
671
+ if (!written) {
672
+ return { updated: false, message: `${model} not found in models.json \u2014 skipped` };
673
+ }
658
674
  const config = readModelsJson();
659
- let updated = false;
660
675
  for (const provider of Object.values(config.providers || {})) {
661
676
  const models = provider.models || [];
662
677
  for (const m of models) {
663
- if (m.id === model) {
664
- const current = m.reasoning;
665
- if (current === hasReasoning) {
666
- return { updated: false, message: `reasoning already "${hasReasoning}" for ${model} \u2014 no change` };
667
- }
668
- m.reasoning = hasReasoning;
669
- updated = true;
670
- break;
678
+ if (m.id === model && m.reasoning === hasReasoning) {
679
+ return { updated: false, message: `reasoning already "${hasReasoning}" for ${model} \u2014 no change` };
671
680
  }
672
681
  }
673
- if (updated) break;
674
- }
675
- if (!updated) {
676
- return { updated: false, message: `${model} not found in models.json \u2014 skipped` };
677
682
  }
678
- writeModelsJson(config);
679
683
  const action = hasReasoning ? "set reasoning: true" : "set reasoning: false";
680
- return { updated: true, message: `\u2705 Updated ${model}: ${action}` };
684
+ return { updated: true, message: `Updated ${model}: ${action}` };
681
685
  } catch (e) {
682
686
  return { updated: false, message: `Failed to update models.json: ${e.message}` };
683
687
  }
684
688
  }
685
- const branding = [
686
- ` \u26A1 Pi Model Benchmark v${EXTENSION_VERSION}`,
687
- ` Written by VTSTech`,
688
- ` GitHub: https://github.com/VTSTech`,
689
- ` Website: www.vts-tech.org`
690
- ].join("\n");
691
689
  async function testModelOllama(model, providerInfo, ctx) {
692
690
  const lines = [];
693
691
  const totalStart = Date.now();
694
- lines.push(branding);
692
+ lines.push(sharedBranding);
695
693
  lines.push(section(`MODEL: ${model}`));
696
694
  lines.push(info("Provider: Ollama (local/remote)"));
697
695
  const modelsJson = readModelsJson();
@@ -732,7 +730,8 @@ function model_test_temp_default(pi) {
732
730
  modelModified = modDate ? modDate.toLocaleDateString() : "unknown";
733
731
  }
734
732
  }
735
- } catch {
733
+ } catch (err) {
734
+ debugLog("model-test", "failed to fetch model metadata from /api/show", err);
736
735
  }
737
736
  const detectedFamily = detectModelFamily(model);
738
737
  lines.push(info(`Size: ${modelSize} | Params: ${modelParams} | Quant: ${modelQuant}`));
@@ -879,11 +878,10 @@ function model_test_temp_default(pi) {
879
878
  }
880
879
  lines.push(info(`Evidence: ${toolSupport.evidence}`));
881
880
  lines.push(info(`Cache: ${TOOL_SUPPORT_CACHE_PATH}`));
882
- lines.push(section("SUMMARY"));
883
881
  const totalMs = Date.now() - totalStart;
884
882
  const toolPass = tools.score === "STRONG" || tools.score === "MODERATE";
885
883
  const reactPass = react.score === "STRONG" || react.score === "MODERATE";
886
- const tests = [
884
+ const ollamaTests = [
887
885
  { name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
888
886
  { name: "Thinking", pass: thinking.supported, score: thinking.supported ? "YES" : "NO" },
889
887
  { name: "Tool Usage", pass: toolPass, score: tools.score },
@@ -891,29 +889,45 @@ function model_test_temp_default(pi) {
891
889
  { name: "Instructions", pass: instructions.pass, score: instructions.score },
892
890
  { name: "Tool Support", pass: toolSupport.level === "native" || toolSupport.level === "react", score: toolSupport.level.toUpperCase() }
893
891
  ];
894
- const passed = tests.filter((t) => t.pass).length;
895
- const total = tests.length;
896
- for (const t of tests) {
897
- lines.push(t.pass ? ok(`${t.name}: ${t.score}`) : fail(`${t.name}: ${t.score}`));
898
- }
899
- lines.push(info(`Total time: ${msHuman(totalMs)}`));
900
- lines.push(info(`Score: ${passed}/${total} tests passed`));
901
- lines.push(section("RECOMMENDATION"));
902
- if (passed === 6) {
903
- lines.push(ok(`${model} is a STRONG model \u2014 full capability`));
904
- } else if (passed >= 5) {
905
- lines.push(ok(`${model} is a GOOD model \u2014 most capabilities work`));
906
- } else if (passed >= 4) {
907
- lines.push(warn(`${model} is USABLE \u2014 some capabilities are limited`));
908
- } else {
909
- lines.push(fail(`${model} is WEAK \u2014 limited capabilities for agent use`));
892
+ const passed = ollamaTests.filter((t) => t.pass).length;
893
+ const total = ollamaTests.length;
894
+ lines.push(...formatTestSummary(ollamaTests, totalMs));
895
+ lines.push(...formatRecommendation(model, passed, total));
896
+ try {
897
+ const historyEntry = {
898
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
899
+ model,
900
+ providerKind: "ollama",
901
+ providerName: providerName || "ollama",
902
+ tests: {
903
+ reasoning: { score: reasoning.score, pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", answer: reasoning.answer },
904
+ thinking: { supported: thinking.supported },
905
+ toolUsage: { score: tools.score, pass: tools.score === "STRONG" || tools.score === "MODERATE", toolCall: tools.toolCall },
906
+ reactParsing: { score: react.score, pass: react.score === "STRONG" || react.score === "MODERATE", toolCall: react.toolCall, dialect: react.dialect },
907
+ instructionFollowing: { score: instructions.score, pass: instructions.pass },
908
+ toolSupport: { level: toolSupport.level, evidence: toolSupport.evidence }
909
+ },
910
+ passedCount: passed,
911
+ totalCount: total,
912
+ totalMs
913
+ };
914
+ appendTestHistory(historyEntry);
915
+ const regressions = detectRegression(model, historyEntry);
916
+ if (regressions.length > 0) {
917
+ lines.push(section("REGRESSION DETECTED"));
918
+ for (const reg of regressions) {
919
+ lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
920
+ }
921
+ }
922
+ } catch (err) {
923
+ debugLog("model-test", "failed to save test history", err);
910
924
  }
911
925
  return lines.join("\n");
912
926
  }
913
927
  async function testModelProvider(providerInfo, model, ctx) {
914
928
  const lines = [];
915
929
  const totalStart = Date.now();
916
- lines.push(branding);
930
+ lines.push(sharedBranding);
917
931
  lines.push(section(`MODEL: ${model}`));
918
932
  lines.push(info(`Provider: ${providerInfo.name} (built-in)`));
919
933
  lines.push(info(`API: ${providerInfo.apiMode || "openai-completions"}`));
@@ -1018,30 +1032,45 @@ function model_test_temp_default(pi) {
1018
1032
  lines.push(warn("ReAct parsing test \u2014 only relevant for Ollama models without native tool calling"));
1019
1033
  lines.push(warn("Tool support detection \u2014 Ollama-specific tool support cache"));
1020
1034
  lines.push(warn("Model metadata \u2014 Ollama-specific /api/tags endpoint"));
1021
- lines.push(section("SUMMARY"));
1022
1035
  const totalMs = Date.now() - totalStart;
1023
- const tests = [
1036
+ const providerTests = [
1024
1037
  { name: "Connectivity", pass: connectivity.pass, score: connectivity.pass ? "OK" : "FAIL" },
1025
1038
  { name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
1026
1039
  { name: "Instructions", pass: instructions.pass, score: instructions.score },
1027
1040
  { name: "Tool Usage", pass: toolTest.pass, score: toolTest.score }
1028
1041
  ];
1029
- const passed = tests.filter((t) => t.pass).length;
1030
- const total = tests.length;
1031
- for (const t of tests) {
1032
- lines.push(t.pass ? ok(`${t.name}: ${t.score}`) : fail(`${t.name}: ${t.score}`));
1033
- }
1034
- lines.push(info(`Total time: ${msHuman(totalMs)}`));
1035
- lines.push(info(`Score: ${passed}/${total} tests passed`));
1036
- lines.push(section("RECOMMENDATION"));
1037
- if (passed === 4) {
1038
- lines.push(ok(`${model} is a STRONG model via ${providerInfo.name} \u2014 full capability`));
1039
- } else if (passed >= 3) {
1040
- lines.push(ok(`${model} is a GOOD model via ${providerInfo.name} \u2014 most capabilities work`));
1041
- } else if (passed >= 2) {
1042
- lines.push(warn(`${model} is USABLE via ${providerInfo.name} \u2014 some capabilities are limited`));
1043
- } else {
1044
- lines.push(fail(`${model} is WEAK via ${providerInfo.name} \u2014 limited capabilities for agent use`));
1042
+ const passed = providerTests.filter((t) => t.pass).length;
1043
+ const total = providerTests.length;
1044
+ lines.push(...formatTestSummary(providerTests, totalMs));
1045
+ lines.push(...formatRecommendation(model, passed, total, providerInfo.name));
1046
+ try {
1047
+ const historyEntry = {
1048
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
1049
+ model,
1050
+ providerKind: "builtin",
1051
+ providerName: providerInfo.name,
1052
+ tests: {
1053
+ reasoning: { score: reasoning.score, pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", answer: reasoning.answer },
1054
+ thinking: { supported: false },
1055
+ toolUsage: { score: toolTest.score, pass: toolTest.pass, toolCall: toolTest.toolCall },
1056
+ reactParsing: { score: "SKIP", pass: false, toolCall: "n/a" },
1057
+ instructionFollowing: { score: instructions.score, pass: instructions.pass },
1058
+ toolSupport: { level: "native", evidence: "provider-native (not probed)" }
1059
+ },
1060
+ passedCount: passed,
1061
+ totalCount: total,
1062
+ totalMs
1063
+ };
1064
+ appendTestHistory(historyEntry);
1065
+ const regressions = detectRegression(model, historyEntry);
1066
+ if (regressions.length > 0) {
1067
+ lines.push(section("REGRESSION DETECTED"));
1068
+ for (const reg of regressions) {
1069
+ lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
1070
+ }
1071
+ }
1072
+ } catch (err) {
1073
+ debugLog("model-test", "failed to save provider test history", err);
1045
1074
  }
1046
1075
  return lines.join("\n");
1047
1076
  }
@@ -1061,7 +1090,8 @@ function model_test_temp_default(pi) {
1061
1090
  try {
1062
1091
  const models = await getOllamaModels();
1063
1092
  return models.map((m) => ({ label: m, description: `Test ${m}` })).filter((m) => m.label.startsWith(prefix));
1064
- } catch {
1093
+ } catch (err) {
1094
+ debugLog("model-test", "failed to get model completions", err);
1065
1095
  return [];
1066
1096
  }
1067
1097
  },
@@ -1081,7 +1111,8 @@ function model_test_temp_default(pi) {
1081
1111
  let models;
1082
1112
  try {
1083
1113
  models = await getOllamaModels();
1084
- } catch {
1114
+ } catch (err) {
1115
+ debugLog("model-test", "failed to list Ollama models for --all", err);
1085
1116
  ctx.ui.notify("Could not list Ollama models", "error");
1086
1117
  return;
1087
1118
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vtstech/pi-model-test",
3
- "version": "1.1.6",
3
+ "version": "1.1.8",
4
4
  "description": "Model benchmark/testing extension for Pi Coding Agent",
5
5
  "main": "model-test.js",
6
6
  "keywords": ["pi-extensions"],
@@ -14,7 +14,7 @@
14
14
  "url": "https://github.com/VTSTech/pi-coding-agent"
15
15
  },
16
16
  "dependencies": {
17
- "@vtstech/pi-shared": "1.1.6"
17
+ "@vtstech/pi-shared": "1.1.8"
18
18
  },
19
19
  "peerDependencies": {
20
20
  "@mariozechner/pi-coding-agent": ">=0.66"