@vtstech/pi-model-test 1.1.6 → 1.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -1
- package/model-test.js +143 -36
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -44,8 +44,11 @@ pi install "npm:@vtstech/pi-model-test"
|
|
|
44
44
|
|
|
45
45
|
- Auto-detects Ollama vs cloud provider (OpenRouter, Anthropic, Google, OpenAI, Groq, DeepSeek, Mistral, xAI, Together, Fireworks, Cohere)
|
|
46
46
|
- Uses native `fetch()` for all HTTP communication (no shell subprocess or curl dependency)
|
|
47
|
+
- **Streaming Ollama chat** — uses `/api/chat` with `stream: true` for earlier timeout detection and reduced memory
|
|
47
48
|
- Automatic remote Ollama URL resolution (reads from `models.json` on every call — picks up config changes immediately)
|
|
48
|
-
- Timeout resilience with
|
|
49
|
+
- Timeout resilience with exponential backoff retry on connection failures
|
|
50
|
+
- **Configurable test parameters** — override timeouts, delays, temperature via `~/.pi/agent/model-test-config.json`
|
|
51
|
+
- **Test history with regression detection** — tracks results at `~/.pi/agent/cache/model-test-history.json`, flags score degradation
|
|
49
52
|
- Rate limit delay between tests (configurable)
|
|
50
53
|
- Thinking model fallback (retries with `think: true`)
|
|
51
54
|
- Tool support cache (`~/.pi/agent/cache/tool_support.json`)
|
package/model-test.js
CHANGED
|
@@ -20,12 +20,16 @@ import {
|
|
|
20
20
|
scoreReasoning,
|
|
21
21
|
getCachedToolSupport,
|
|
22
22
|
cacheToolSupport,
|
|
23
|
+
getEffectiveConfig,
|
|
24
|
+
appendTestHistory,
|
|
25
|
+
detectRegression,
|
|
23
26
|
testToolUsageUnified,
|
|
24
27
|
testReasoningUnified,
|
|
25
28
|
testInstructionFollowingUnified,
|
|
26
29
|
TOOL_SUPPORT_CACHE_PATH
|
|
27
30
|
} from "@vtstech/pi-shared/model-test-utils";
|
|
28
31
|
function model_test_temp_default(pi) {
|
|
32
|
+
const effectiveConfig = getEffectiveConfig();
|
|
29
33
|
function ollamaBase() {
|
|
30
34
|
return getOllamaBaseUrl();
|
|
31
35
|
}
|
|
@@ -35,9 +39,10 @@ function model_test_temp_default(pi) {
|
|
|
35
39
|
await new Promise((r) => setTimeout(r, CONFIG.TEST_DELAY_MS));
|
|
36
40
|
}
|
|
37
41
|
}
|
|
38
|
-
function makeOllamaChatFn() {
|
|
42
|
+
function makeOllamaChatFn(useStreaming = true) {
|
|
39
43
|
return async (model, messages, _options) => {
|
|
40
|
-
const
|
|
44
|
+
const chatFn = useStreaming ? ollamaChatStream : ollamaChat;
|
|
45
|
+
const result = await chatFn(model, messages);
|
|
41
46
|
return {
|
|
42
47
|
content: result.response?.message?.content || "",
|
|
43
48
|
elapsedMs: result.elapsedMs,
|
|
@@ -154,6 +159,68 @@ function model_test_temp_default(pi) {
|
|
|
154
159
|
}
|
|
155
160
|
throw new Error("Unreachable");
|
|
156
161
|
}
|
|
162
|
+
/**
 * Sends a chat request to Ollama's `/api/chat` endpoint with `stream: true`
 * and folds the newline-delimited JSON chunks into one aggregated response.
 *
 * @param {string} model - Model name placed in the request body.
 * @param {Array<object>} messages - Chat messages placed in the request body.
 * @param {object} [options={}] - Extra Ollama `options`; these override the
 *   `num_predict` / `temperature` defaults taken from `CONFIG`.
 * @param {number} [timeoutMs=CONFIG.DEFAULT_TIMEOUT_MS] - Milliseconds before
 *   the request is aborted.
 * @returns {Promise<{response: {message: {content: string, thinking: string, role: string}, done: boolean}, elapsedMs: number}>}
 *   The concatenated content/thinking text and the wall-clock duration.
 * @throws {Error} On a non-OK HTTP status, a missing response body, an empty
 *   aggregated response, or a timeout (reported as "Ollama API timed out after …").
 */
async function ollamaChatStream(model, messages, options = {}, timeoutMs = CONFIG.DEFAULT_TIMEOUT_MS) {
  const body = { model, messages, stream: true, options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE, ...options } };
  const url = `${ollamaBase()}/api/chat`;
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
  const start = Date.now();
  try {
    const res = await fetch(url, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(body),
      signal: controller.signal
    });
    if (!res.ok) {
      const errorText = await res.text().catch(() => "unknown error");
      throw new Error(`Ollama API returned ${res.status}: ${truncate(errorText, 200)}`);
    }
    if (!res.body) {
      throw new Error("Ollama streaming response has no body");
    }
    let messageContent = "";
    let thinkingContent = "";
    let done = false;
    // Holds the trailing partial line of the previous chunk. Network chunks
    // are not aligned to line boundaries, so a JSON object may be split
    // across two reads and must be reassembled before parsing.
    let pending = "";
    const consumeLine = (line) => {
      if (line.trim().length === 0) return;
      try {
        const parsed = JSON.parse(line);
        if (parsed.message?.content) messageContent += parsed.message.content;
        if (parsed.message?.thinking) thinkingContent += parsed.message.thinking;
        if (parsed.done) done = true;
      } catch {
        // Best-effort: a line that is not valid JSON is skipped rather than
        // failing the whole request.
      }
    };
    const reader = res.body.getReader();
    const decoder = new TextDecoder();
    try {
      while (!done) {
        const { value, done: streamDone } = await reader.read();
        if (streamDone) break;
        pending += decoder.decode(value, { stream: true });
        const lines = pending.split("\n");
        // The last element is either "" (chunk ended on a newline) or an
        // incomplete line that the next chunk will finish.
        pending = lines.pop() ?? "";
        for (const line of lines) consumeLine(line);
      }
      // Flush bytes still buffered in the decoder plus a final line that was
      // not newline-terminated.
      pending += decoder.decode();
      consumeLine(pending);
    } finally {
      // Release the body when the loop exits before the stream is exhausted
      // (e.g. a `done: true` line was seen). A rejection from cancel() is
      // irrelevant here, so it is deliberately ignored.
      reader.cancel().catch(() => {});
    }
    const elapsedMs = Date.now() - start;
    if (!messageContent.trim() && !thinkingContent.trim()) {
      throw new Error("Empty streaming response from Ollama");
    }
    const response = {
      message: {
        content: messageContent,
        thinking: thinkingContent,
        role: "assistant"
      },
      done: true
    };
    return { response, elapsedMs };
  } catch (e) {
    // fetch()/read() reject with an AbortError when the timeout fires;
    // translate that into a human-readable timeout message.
    if (e instanceof Error && e.name === "AbortError") {
      throw new Error(`Ollama API timed out after ${msHuman(timeoutMs)}`);
    }
    throw e;
  } finally {
    clearTimeout(timeoutId);
  }
}
|
|
157
224
|
async function providerChat(providerInfo, model, messages, options = {}) {
|
|
158
225
|
const { baseUrl, apiKey } = providerInfo;
|
|
159
226
|
const maxTokens = options.maxTokens ?? CONFIG.NUM_PREDICT;
|
|
@@ -325,6 +392,22 @@ function model_test_temp_default(pi) {
|
|
|
325
392
|
async function testToolUsageProvider(providerInfo, model) {
|
|
326
393
|
return testToolUsageUnified(makeProviderChatFn(providerInfo), model);
|
|
327
394
|
}
|
|
395
|
+
/**
 * Extracts the first balanced `{ … }` object substring from free-form text.
 *
 * Scanning starts at the first `{` and tracks nesting depth. Braces inside
 * double-quoted string literals (including after backslash escapes) are
 * ignored, so a value such as `"a } b"` does not end the object early.
 *
 * @param {string} raw - Text that may contain a JSON object.
 * @returns {string} The balanced object substring, or `""` when there is no
 *   `{` or the object is never closed.
 */
function extractBraceJson(raw) {
  const jsonStart = raw.indexOf("{");
  if (jsonStart === -1) return "";
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = jsonStart; i < raw.length; i++) {
    const ch = raw[i];
    if (inString) {
      // Inside a string only an unescaped double quote is significant.
      if (escaped) escaped = false;
      else if (ch === "\\") escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') {
      inString = true;
    } else if (ch === "{") {
      depth++;
    } else if (ch === "}") {
      depth--;
      if (depth === 0) return raw.slice(jsonStart, i + 1);
    }
  }
  // The opening brace was never balanced.
  return "";
}
|
|
328
411
|
async function testReactParsing(model) {
|
|
329
412
|
const systemPrompt = [
|
|
330
413
|
"You are a helpful assistant with access to tools.",
|
|
@@ -379,23 +462,7 @@ function model_test_temp_default(pi) {
|
|
|
379
462
|
if (rawArgs && rawArgs !== "{}") {
|
|
380
463
|
argsStr = rawArgs;
|
|
381
464
|
} else if (result.raw) {
|
|
382
|
-
|
|
383
|
-
if (jsonStart !== -1) {
|
|
384
|
-
let depth = 0, jsonEnd = -1;
|
|
385
|
-
for (let i = jsonStart; i < result.raw.length; i++) {
|
|
386
|
-
if (result.raw[i] === "{") depth++;
|
|
387
|
-
else if (result.raw[i] === "}") {
|
|
388
|
-
depth--;
|
|
389
|
-
if (depth === 0) {
|
|
390
|
-
jsonEnd = i;
|
|
391
|
-
break;
|
|
392
|
-
}
|
|
393
|
-
}
|
|
394
|
-
}
|
|
395
|
-
argsStr = jsonEnd !== -1 ? result.raw.slice(jsonStart, jsonEnd + 1) : "";
|
|
396
|
-
} else {
|
|
397
|
-
argsStr = "";
|
|
398
|
-
}
|
|
465
|
+
argsStr = extractBraceJson(result.raw);
|
|
399
466
|
} else {
|
|
400
467
|
argsStr = "";
|
|
401
468
|
}
|
|
@@ -412,23 +479,7 @@ function model_test_temp_default(pi) {
|
|
|
412
479
|
if (rawArgs && rawArgs !== "{}") {
|
|
413
480
|
argsStr = rawArgs;
|
|
414
481
|
} else if (result.raw) {
|
|
415
|
-
|
|
416
|
-
if (jsonStart !== -1) {
|
|
417
|
-
let depth = 0, jsonEnd = -1;
|
|
418
|
-
for (let i = jsonStart; i < result.raw.length; i++) {
|
|
419
|
-
if (result.raw[i] === "{") depth++;
|
|
420
|
-
else if (result.raw[i] === "}") {
|
|
421
|
-
depth--;
|
|
422
|
-
if (depth === 0) {
|
|
423
|
-
jsonEnd = i;
|
|
424
|
-
break;
|
|
425
|
-
}
|
|
426
|
-
}
|
|
427
|
-
}
|
|
428
|
-
argsStr = jsonEnd !== -1 ? result.raw.slice(jsonStart, jsonEnd + 1) : "";
|
|
429
|
-
} else {
|
|
430
|
-
argsStr = "";
|
|
431
|
-
}
|
|
482
|
+
argsStr = extractBraceJson(result.raw);
|
|
432
483
|
} else {
|
|
433
484
|
argsStr = "";
|
|
434
485
|
}
|
|
@@ -908,6 +959,34 @@ function model_test_temp_default(pi) {
|
|
|
908
959
|
} else {
|
|
909
960
|
lines.push(fail(`${model} is WEAK \u2014 limited capabilities for agent use`));
|
|
910
961
|
}
|
|
962
|
+
try {
|
|
963
|
+
const historyEntry = {
|
|
964
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
965
|
+
model,
|
|
966
|
+
providerKind: "ollama",
|
|
967
|
+
providerName: providerName || "ollama",
|
|
968
|
+
tests: {
|
|
969
|
+
reasoning: { score: reasoning.score, pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", answer: reasoning.answer },
|
|
970
|
+
thinking: { supported: thinking.supported },
|
|
971
|
+
toolUsage: { score: tools.score, pass: tools.score === "STRONG" || tools.score === "MODERATE", toolCall: tools.toolCall },
|
|
972
|
+
reactParsing: { score: react.score, pass: react.score === "STRONG" || react.score === "MODERATE", toolCall: react.toolCall, dialect: react.dialect },
|
|
973
|
+
instructionFollowing: { score: instructions.score, pass: instructions.pass },
|
|
974
|
+
toolSupport: { level: toolSupport.level, evidence: toolSupport.evidence }
|
|
975
|
+
},
|
|
976
|
+
passedCount: passed,
|
|
977
|
+
totalCount: total,
|
|
978
|
+
totalMs
|
|
979
|
+
};
|
|
980
|
+
appendTestHistory(historyEntry);
|
|
981
|
+
const regressions = detectRegression(model, historyEntry);
|
|
982
|
+
if (regressions.length > 0) {
|
|
983
|
+
lines.push(section("REGRESSION DETECTED"));
|
|
984
|
+
for (const reg of regressions) {
|
|
985
|
+
lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
|
|
986
|
+
}
|
|
987
|
+
}
|
|
988
|
+
} catch {
|
|
989
|
+
}
|
|
911
990
|
return lines.join("\n");
|
|
912
991
|
}
|
|
913
992
|
async function testModelProvider(providerInfo, model, ctx) {
|
|
@@ -1043,6 +1122,34 @@ function model_test_temp_default(pi) {
|
|
|
1043
1122
|
} else {
|
|
1044
1123
|
lines.push(fail(`${model} is WEAK via ${providerInfo.name} \u2014 limited capabilities for agent use`));
|
|
1045
1124
|
}
|
|
1125
|
+
try {
|
|
1126
|
+
const historyEntry = {
|
|
1127
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1128
|
+
model,
|
|
1129
|
+
providerKind: "builtin",
|
|
1130
|
+
providerName: providerInfo.name,
|
|
1131
|
+
tests: {
|
|
1132
|
+
reasoning: { score: reasoning.score, pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", answer: reasoning.answer },
|
|
1133
|
+
thinking: { supported: false },
|
|
1134
|
+
toolUsage: { score: toolTest.score, pass: toolTest.pass, toolCall: toolTest.toolCall },
|
|
1135
|
+
reactParsing: { score: "SKIP", pass: false, toolCall: "n/a" },
|
|
1136
|
+
instructionFollowing: { score: instructions.score, pass: instructions.pass },
|
|
1137
|
+
toolSupport: { level: "native", evidence: "provider-native (not probed)" }
|
|
1138
|
+
},
|
|
1139
|
+
passedCount: passed,
|
|
1140
|
+
totalCount: total,
|
|
1141
|
+
totalMs
|
|
1142
|
+
};
|
|
1143
|
+
appendTestHistory(historyEntry);
|
|
1144
|
+
const regressions = detectRegression(model, historyEntry);
|
|
1145
|
+
if (regressions.length > 0) {
|
|
1146
|
+
lines.push(section("REGRESSION DETECTED"));
|
|
1147
|
+
for (const reg of regressions) {
|
|
1148
|
+
lines.push(warn(`${reg.test}: ${reg.previous} \u2192 ${reg.current}`));
|
|
1149
|
+
}
|
|
1150
|
+
}
|
|
1151
|
+
} catch {
|
|
1152
|
+
}
|
|
1046
1153
|
return lines.join("\n");
|
|
1047
1154
|
}
|
|
1048
1155
|
async function testModel(model, ctx) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vtstech/pi-model-test",
|
|
3
|
-
"version": "1.1.6",
|
|
3
|
+
"version": "1.1.7",
|
|
4
4
|
"description": "Model benchmark/testing extension for Pi Coding Agent",
|
|
5
5
|
"main": "model-test.js",
|
|
6
6
|
"keywords": ["pi-extensions"],
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
"url": "https://github.com/VTSTech/pi-coding-agent"
|
|
15
15
|
},
|
|
16
16
|
"dependencies": {
|
|
17
|
-
"@vtstech/pi-shared": "1.1.
|
|
17
|
+
"@vtstech/pi-shared": "1.1.7"
|
|
18
18
|
},
|
|
19
19
|
"peerDependencies": {
|
|
20
20
|
"@mariozechner/pi-coding-agent": ">=0.66"
|