@vtstech/pi-model-test 1.1.5 → 1.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +4 -1
  2. package/model-test.js +143 -36
  3. package/package.json +2 -2
package/README.md CHANGED
@@ -44,8 +44,11 @@ pi install "npm:@vtstech/pi-model-test"
44
44
 
45
45
  - Auto-detects Ollama vs cloud provider (OpenRouter, Anthropic, Google, OpenAI, Groq, DeepSeek, Mistral, xAI, Together, Fireworks, Cohere)
46
46
  - Uses native `fetch()` for all HTTP communication (no shell subprocess or curl dependency)
47
+ - **Streaming Ollama chat** — uses `/api/chat` with `stream: true` for earlier timeout detection and reduced memory usage
47
48
  - Automatic remote Ollama URL resolution (reads from `models.json` on every call — picks up config changes immediately)
48
- - Timeout resilience with auto-retry on empty responses
49
+ - Timeout resilience with exponential backoff retry on connection failures
50
+ - **Configurable test parameters** — override timeouts, delays, temperature via `~/.pi/agent/model-test-config.json`
51
+ - **Test history with regression detection** — tracks results at `~/.pi/agent/cache/model-test-history.json`, flags score degradation
49
52
  - Rate limit delay between tests (configurable)
50
53
  - Thinking model fallback (retries with `think: true`)
51
54
  - Tool support cache (`~/.pi/agent/cache/tool_support.json`)
package/model-test.js CHANGED
@@ -20,12 +20,16 @@ import {
20
20
  scoreReasoning,
21
21
  getCachedToolSupport,
22
22
  cacheToolSupport,
23
+ getEffectiveConfig,
24
+ appendTestHistory,
25
+ detectRegression,
23
26
  testToolUsageUnified,
24
27
  testReasoningUnified,
25
28
  testInstructionFollowingUnified,
26
29
  TOOL_SUPPORT_CACHE_PATH
27
30
  } from "@vtstech/pi-shared/model-test-utils";
28
31
  function model_test_temp_default(pi) {
32
+ const effectiveConfig = getEffectiveConfig();
29
33
  function ollamaBase() {
30
34
  return getOllamaBaseUrl();
31
35
  }
@@ -35,9 +39,10 @@ function model_test_temp_default(pi) {
35
39
  await new Promise((r) => setTimeout(r, CONFIG.TEST_DELAY_MS));
36
40
  }
37
41
  }
38
- function makeOllamaChatFn() {
42
+ function makeOllamaChatFn(useStreaming = true) {
39
43
  return async (model, messages, _options) => {
40
- const result = await ollamaChat(model, messages);
44
+ const chatFn = useStreaming ? ollamaChatStream : ollamaChat;
45
+ const result = await chatFn(model, messages);
41
46
  return {
42
47
  content: result.response?.message?.content || "",
43
48
  elapsedMs: result.elapsedMs,
@@ -154,6 +159,68 @@ function model_test_temp_default(pi) {
154
159
  }
155
160
  throw new Error("Unreachable");
156
161
  }
162
+ async function ollamaChatStream(model, messages, options = {}, timeoutMs = CONFIG.DEFAULT_TIMEOUT_MS) {
163
+ const body = { model, messages, stream: true, options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE, ...options } };
164
+ const url = `${ollamaBase()}/api/chat`;
165
+ const controller = new AbortController();
166
+ const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
167
+ const start = Date.now();
168
+ try {
169
+ const res = await fetch(url, {
170
+ method: "POST",
171
+ headers: { "Content-Type": "application/json" },
172
+ body: JSON.stringify(body),
173
+ signal: controller.signal
174
+ });
175
+ if (!res.ok) {
176
+ const errorText = await res.text().catch(() => "unknown error");
177
+ throw new Error(`Ollama API returned ${res.status}: ${truncate(errorText, 200)}`);
178
+ }
179
+ if (!res.body) {
180
+ throw new Error("Ollama streaming response has no body");
181
+ }
182
+ let messageContent = "";
183
+ let thinkingContent = "";
184
+ let done = false;
185
+ const reader = res.body.getReader();
186
+ const decoder = new TextDecoder();
187
+ while (!done) {
188
+ const { value, done: streamDone } = await reader.read();
189
+ if (streamDone) break;
190
+ const chunk = decoder.decode(value, { stream: true });
191
+ const lines = chunk.split("\n").filter((line) => line.trim().length > 0);
192
+ for (const line of lines) {
193
+ try {
194
+ const parsed = JSON.parse(line);
195
+ if (parsed.message?.content) messageContent += parsed.message.content;
196
+ if (parsed.message?.thinking) thinkingContent += parsed.message.thinking;
197
+ if (parsed.done) done = true;
198
+ } catch {
199
+ }
200
+ }
201
+ }
202
+ const elapsedMs = Date.now() - start;
203
+ if (!messageContent.trim() && !thinkingContent.trim()) {
204
+ throw new Error("Empty streaming response from Ollama");
205
+ }
206
+ const response = {
207
+ message: {
208
+ content: messageContent,
209
+ thinking: thinkingContent,
210
+ role: "assistant"
211
+ },
212
+ done: true
213
+ };
214
+ return { response, elapsedMs };
215
+ } catch (e) {
216
+ if (e instanceof Error && e.name === "AbortError") {
217
+ throw new Error(`Ollama API timed out after ${msHuman(timeoutMs)}`);
218
+ }
219
+ throw e;
220
+ } finally {
221
+ clearTimeout(timeoutId);
222
+ }
223
+ }
157
224
  async function providerChat(providerInfo, model, messages, options = {}) {
158
225
  const { baseUrl, apiKey } = providerInfo;
159
226
  const maxTokens = options.maxTokens ?? CONFIG.NUM_PREDICT;
@@ -325,6 +392,22 @@ function model_test_temp_default(pi) {
325
392
  async function testToolUsageProvider(providerInfo, model) {
326
393
  return testToolUsageUnified(makeProviderChatFn(providerInfo), model);
327
394
  }
395
+ function extractBraceJson(raw) {
396
+ const jsonStart = raw.indexOf("{");
397
+ if (jsonStart === -1) return "";
398
+ let depth = 0, jsonEnd = -1;
399
+ for (let i = jsonStart; i < raw.length; i++) {
400
+ if (raw[i] === "{") depth++;
401
+ else if (raw[i] === "}") {
402
+ depth--;
403
+ if (depth === 0) {
404
+ jsonEnd = i;
405
+ break;
406
+ }
407
+ }
408
+ }
409
+ return jsonEnd !== -1 ? raw.slice(jsonStart, jsonEnd + 1) : "";
410
+ }
328
411
  async function testReactParsing(model) {
329
412
  const systemPrompt = [
330
413
  "You are a helpful assistant with access to tools.",
@@ -379,23 +462,7 @@ function model_test_temp_default(pi) {
379
462
  if (rawArgs && rawArgs !== "{}") {
380
463
  argsStr = rawArgs;
381
464
  } else if (result.raw) {
382
- const jsonStart = result.raw.indexOf("{");
383
- if (jsonStart !== -1) {
384
- let depth = 0, jsonEnd = -1;
385
- for (let i = jsonStart; i < result.raw.length; i++) {
386
- if (result.raw[i] === "{") depth++;
387
- else if (result.raw[i] === "}") {
388
- depth--;
389
- if (depth === 0) {
390
- jsonEnd = i;
391
- break;
392
- }
393
- }
394
- }
395
- argsStr = jsonEnd !== -1 ? result.raw.slice(jsonStart, jsonEnd + 1) : "";
396
- } else {
397
- argsStr = "";
398
- }
465
+ argsStr = extractBraceJson(result.raw);
399
466
  } else {
400
467
  argsStr = "";
401
468
  }
@@ -412,23 +479,7 @@ function model_test_temp_default(pi) {
412
479
  if (rawArgs && rawArgs !== "{}") {
413
480
  argsStr = rawArgs;
414
481
  } else if (result.raw) {
415
- const jsonStart = result.raw.indexOf("{");
416
- if (jsonStart !== -1) {
417
- let depth = 0, jsonEnd = -1;
418
- for (let i = jsonStart; i < result.raw.length; i++) {
419
- if (result.raw[i] === "{") depth++;
420
- else if (result.raw[i] === "}") {
421
- depth--;
422
- if (depth === 0) {
423
- jsonEnd = i;
424
- break;
425
- }
426
- }
427
- }
428
- argsStr = jsonEnd !== -1 ? result.raw.slice(jsonStart, jsonEnd + 1) : "";
429
- } else {
430
- argsStr = "";
431
- }
482
+ argsStr = extractBraceJson(result.raw);
432
483
  } else {
433
484
  argsStr = "";
434
485
  }
@@ -908,6 +959,34 @@ function model_test_temp_default(pi) {
908
959
  } else {
909
960
  lines.push(fail(`${model} is WEAK \u2014 limited capabilities for agent use`));
910
961
  }
962
+ try {
963
+ const historyEntry = {
964
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
965
+ model,
966
+ providerKind: "ollama",
967
+ providerName: providerName || "ollama",
968
+ tests: {
969
+ reasoning: { score: reasoning.score, pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", answer: reasoning.answer },
970
+ thinking: { supported: thinking.supported },
971
+ toolUsage: { score: tools.score, pass: tools.score === "STRONG" || tools.score === "MODERATE", toolCall: tools.toolCall },
972
+ reactParsing: { score: react.score, pass: react.score === "STRONG" || react.score === "MODERATE", toolCall: react.toolCall, dialect: react.dialect },
973
+ instructionFollowing: { score: instructions.score, pass: instructions.pass },
974
+ toolSupport: { level: toolSupport.level, evidence: toolSupport.evidence }
975
+ },
976
+ passedCount: passed,
977
+ totalCount: total,
978
+ totalMs
979
+ };
980
+ appendTestHistory(historyEntry);
981
+ const regressions = detectRegression(model, historyEntry);
982
+ if (regressions.length > 0) {
983
+ lines.push(section("REGRESSION DETECTED"));
984
+ for (const reg of regressions) {
985
+ lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
986
+ }
987
+ }
988
+ } catch {
989
+ }
911
990
  return lines.join("\n");
912
991
  }
913
992
  async function testModelProvider(providerInfo, model, ctx) {
@@ -1043,6 +1122,34 @@ function model_test_temp_default(pi) {
1043
1122
  } else {
1044
1123
  lines.push(fail(`${model} is WEAK via ${providerInfo.name} \u2014 limited capabilities for agent use`));
1045
1124
  }
1125
+ try {
1126
+ const historyEntry = {
1127
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
1128
+ model,
1129
+ providerKind: "builtin",
1130
+ providerName: providerInfo.name,
1131
+ tests: {
1132
+ reasoning: { score: reasoning.score, pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", answer: reasoning.answer },
1133
+ thinking: { supported: false },
1134
+ toolUsage: { score: toolTest.score, pass: toolTest.pass, toolCall: toolTest.toolCall },
1135
+ reactParsing: { score: "SKIP", pass: false, toolCall: "n/a" },
1136
+ instructionFollowing: { score: instructions.score, pass: instructions.pass },
1137
+ toolSupport: { level: "native", evidence: "provider-native (not probed)" }
1138
+ },
1139
+ passedCount: passed,
1140
+ totalCount: total,
1141
+ totalMs
1142
+ };
1143
+ appendTestHistory(historyEntry);
1144
+ const regressions = detectRegression(model, historyEntry);
1145
+ if (regressions.length > 0) {
1146
+ lines.push(section("REGRESSION DETECTED"));
1147
+ for (const reg of regressions) {
1148
+ lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
1149
+ }
1150
+ }
1151
+ } catch {
1152
+ }
1046
1153
  return lines.join("\n");
1047
1154
  }
1048
1155
  async function testModel(model, ctx) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vtstech/pi-model-test",
3
- "version": "1.1.5",
3
+ "version": "1.1.7",
4
4
  "description": "Model benchmark/testing extension for Pi Coding Agent",
5
5
  "main": "model-test.js",
6
6
  "keywords": ["pi-extensions"],
@@ -14,7 +14,7 @@
14
14
  "url": "https://github.com/VTSTech/pi-coding-agent"
15
15
  },
16
16
  "dependencies": {
17
- "@vtstech/pi-shared": "1.1.5"
17
+ "@vtstech/pi-shared": "1.1.7"
18
18
  },
19
19
  "peerDependencies": {
20
20
  "@mariozechner/pi-coding-agent": ">=0.66"