@vtstech/pi-model-test 1.0.3 → 1.0.4-1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +61 -0
- package/model-test.js +169 -190
- package/package.json +3 -2
package/README.md
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# @vtstech/pi-model-test
|
|
2
|
+
|
|
3
|
+
Model benchmark extension for the [Pi Coding Agent](https://github.com/badlogic/pi-mono).
|
|
4
|
+
|
|
5
|
+
Test any model for reasoning, tool usage, and instruction following — works with Ollama and cloud providers.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pi install "npm:@vtstech/pi-model-test"
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Commands
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
/model-test              Test current Pi model (auto-detects provider)
|
|
17
|
+
/model-test qwen3:0.6b   Test a specific Ollama model
|
|
18
|
+
/model-test --all        Test every Ollama model
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Test Suites
|
|
22
|
+
|
|
23
|
+
### Ollama (6 tests)
|
|
24
|
+
|
|
25
|
+
| Test | Scoring |
|
|
26
|
+
|------|---------|
|
|
27
|
+
| Reasoning (snail puzzle) | STRONG / MODERATE / WEAK / FAIL |
|
|
28
|
+
| Thinking token support | SUPPORTED / NOT SUPPORTED |
|
|
29
|
+
| Tool usage (native + text) | STRONG / MODERATE / WEAK / FAIL |
|
|
30
|
+
| ReAct parsing | STRONG / MODERATE / WEAK / FAIL |
|
|
31
|
+
| Instruction following (JSON) | STRONG / MODERATE / WEAK / FAIL |
|
|
32
|
+
| Tool support detection | NATIVE / REACT / NONE |
|
|
33
|
+
|
|
34
|
+
### Cloud Providers (4 tests)
|
|
35
|
+
|
|
36
|
+
| Test | Scoring |
|
|
37
|
+
|------|---------|
|
|
38
|
+
| Connectivity | OK / FAIL |
|
|
39
|
+
| Reasoning | STRONG / MODERATE / WEAK / FAIL |
|
|
40
|
+
| Instruction following | STRONG / MODERATE / WEAK / FAIL |
|
|
41
|
+
| Tool usage (function calling) | STRONG / MODERATE / WEAK / FAIL |
|
|
42
|
+
|
|
43
|
+
## Features
|
|
44
|
+
|
|
45
|
+
- Auto-detects Ollama vs cloud provider (OpenRouter, Anthropic, Google, OpenAI, Groq, DeepSeek, Mistral, xAI, Together, Fireworks, Cohere)
|
|
46
|
+
- Automatic remote Ollama URL resolution
|
|
47
|
+
- Timeout resilience with auto-retry on empty responses
|
|
48
|
+
- Rate limit delay between tests (configurable)
|
|
49
|
+
- Thinking model fallback (retries with `think: true`)
|
|
50
|
+
- Tool support cache (`~/.pi/agent/cache/tool_support.json`)
|
|
51
|
+
- JSON repair for truncated output
|
|
52
|
+
- Tab-completion for model names
|
|
53
|
+
|
|
54
|
+
## Links
|
|
55
|
+
|
|
56
|
+
- [Full Documentation](https://github.com/VTSTech/pi-coding-agent#model-benchmark-model-testts)
|
|
57
|
+
- [Changelog](https://github.com/VTSTech/pi-coding-agent/blob/main/CHANGELOG.md)
|
|
58
|
+
|
|
59
|
+
## License
|
|
60
|
+
|
|
61
|
+
MIT — [VTSTech](https://www.vts-tech.org)
|
package/model-test.js
CHANGED
|
@@ -1,42 +1,18 @@
|
|
|
1
|
-
var __create = Object.create;
|
|
2
|
-
var __defProp = Object.defineProperty;
|
|
3
|
-
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
-
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
-
var __getProtoOf = Object.getPrototypeOf;
|
|
6
|
-
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
7
|
-
var __export = (target, all) => {
|
|
8
|
-
for (var name in all)
|
|
9
|
-
__defProp(target, name, { get: all[name], enumerable: true });
|
|
10
|
-
};
|
|
11
|
-
var __copyProps = (to, from, except, desc) => {
|
|
12
|
-
if (from && typeof from === "object" || typeof from === "function") {
|
|
13
|
-
for (let key of __getOwnPropNames(from))
|
|
14
|
-
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
15
|
-
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
16
|
-
}
|
|
17
|
-
return to;
|
|
18
|
-
};
|
|
19
|
-
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
20
|
-
// If the importer is in node compatibility mode or this is not an ESM
|
|
21
|
-
// file that has been converted to a CommonJS file using a Babel-
|
|
22
|
-
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
23
|
-
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
24
|
-
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
25
|
-
mod
|
|
26
|
-
));
|
|
27
|
-
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
28
|
-
|
|
29
1
|
// .build-npm/model-test/model-test.temp.ts
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
2
|
+
import * as fs from "node:fs";
|
|
3
|
+
import * as os from "node:os";
|
|
4
|
+
import * as path from "node:path";
|
|
5
|
+
import {
|
|
6
|
+
section,
|
|
7
|
+
ok,
|
|
8
|
+
fail,
|
|
9
|
+
warn,
|
|
10
|
+
info,
|
|
11
|
+
msHuman,
|
|
12
|
+
truncate,
|
|
13
|
+
sanitizeForReport
|
|
14
|
+
} from "@vtstech/pi-shared/format";
|
|
15
|
+
import { getOllamaBaseUrl, detectModelFamily, readModelsJson } from "@vtstech/pi-shared/ollama";
|
|
40
16
|
var BUILTIN_PROVIDERS = {
|
|
41
17
|
openrouter: { api: "openai-completions", baseUrl: "https://openrouter.ai/api/v1", envKey: "OPENROUTER_API_KEY" },
|
|
42
18
|
anthropic: { api: "anthropic-messages", baseUrl: "https://api.anthropic.com/v1", envKey: "ANTHROPIC_API_KEY" },
|
|
@@ -55,7 +31,7 @@ function detectProvider(ctx) {
|
|
|
55
31
|
if (!model) return { kind: "unknown", name: "none" };
|
|
56
32
|
const providerName = model.provider || "";
|
|
57
33
|
if (!providerName) return { kind: "unknown", name: "none" };
|
|
58
|
-
const modelsJson =
|
|
34
|
+
const modelsJson = readModelsJson();
|
|
59
35
|
const userProviderCfg = (modelsJson.providers || {})[providerName];
|
|
60
36
|
if (userProviderCfg) {
|
|
61
37
|
const baseUrl = userProviderCfg.baseUrl || "";
|
|
@@ -169,10 +145,10 @@ function cacheToolSupport(model, support, family) {
|
|
|
169
145
|
writeToolSupportCache(cache);
|
|
170
146
|
}
|
|
171
147
|
function model_test_temp_default(pi) {
|
|
172
|
-
const OLLAMA_BASE =
|
|
148
|
+
const OLLAMA_BASE = getOllamaBaseUrl();
|
|
173
149
|
async function rateLimitDelay(lines) {
|
|
174
150
|
if (CONFIG.TEST_DELAY_MS > 0) {
|
|
175
|
-
lines.push(
|
|
151
|
+
lines.push(info(`Waiting ${msHuman(CONFIG.TEST_DELAY_MS)} to avoid rate limiting...`));
|
|
176
152
|
await new Promise((r) => setTimeout(r, CONFIG.TEST_DELAY_MS));
|
|
177
153
|
}
|
|
178
154
|
}
|
|
@@ -254,7 +230,7 @@ function model_test_temp_default(pi) {
|
|
|
254
230
|
const elapsedMs = Date.now() - start;
|
|
255
231
|
if (!res.ok) {
|
|
256
232
|
const errorText = await res.text().catch(() => "unknown error");
|
|
257
|
-
throw new Error(`API returned ${res.status}: ${
|
|
233
|
+
throw new Error(`API returned ${res.status}: ${truncate(errorText, 200)}`);
|
|
258
234
|
}
|
|
259
235
|
const data = await res.json();
|
|
260
236
|
const choice = data.choices?.[0];
|
|
@@ -270,7 +246,7 @@ function model_test_temp_default(pi) {
|
|
|
270
246
|
} catch (e) {
|
|
271
247
|
const elapsedMs = Date.now() - start;
|
|
272
248
|
if (e.name === "AbortError") {
|
|
273
|
-
throw new Error(`Provider API timed out after ${
|
|
249
|
+
throw new Error(`Provider API timed out after ${msHuman(elapsedMs)}`);
|
|
274
250
|
}
|
|
275
251
|
throw e;
|
|
276
252
|
} finally {
|
|
@@ -915,7 +891,7 @@ The JSON object must have exactly these 4 keys:
|
|
|
915
891
|
}
|
|
916
892
|
}
|
|
917
893
|
if (!parsed) {
|
|
918
|
-
return { pass: false, score: "FAIL", output:
|
|
894
|
+
return { pass: false, score: "FAIL", output: sanitizeForReport(msg), elapsedMs };
|
|
919
895
|
}
|
|
920
896
|
const hasKeys = parsed.name && parsed.can_count !== void 0 && parsed.sum !== void 0 && parsed.language;
|
|
921
897
|
const correctSum = parsed.sum === 42;
|
|
@@ -974,7 +950,7 @@ The JSON object must have exactly these 4 keys:
|
|
|
974
950
|
}
|
|
975
951
|
}
|
|
976
952
|
if (!parsed) {
|
|
977
|
-
return { pass: false, score: "FAIL", output:
|
|
953
|
+
return { pass: false, score: "FAIL", output: sanitizeForReport(msg), elapsedMs: result.elapsedMs };
|
|
978
954
|
}
|
|
979
955
|
const hasKeys = parsed.name && parsed.can_count !== void 0 && parsed.sum !== void 0 && parsed.language;
|
|
980
956
|
const correctSum = parsed.sum === 42;
|
|
@@ -1061,7 +1037,7 @@ The JSON object must have exactly these 4 keys:
|
|
|
1061
1037
|
const detail = result.stderr?.trim() || result.stdout?.trim() || "empty response";
|
|
1062
1038
|
const level2 = "none";
|
|
1063
1039
|
cacheToolSupport(model, level2, family);
|
|
1064
|
-
return { level: level2, cached: false, evidence: `API error: ${
|
|
1040
|
+
return { level: level2, cached: false, evidence: `API error: ${truncate(detail, 100)}`, elapsedMs };
|
|
1065
1041
|
}
|
|
1066
1042
|
const parsed = JSON.parse(result.stdout);
|
|
1067
1043
|
const toolCalls = parsed?.message?.tool_calls;
|
|
@@ -1133,7 +1109,7 @@ The JSON object must have exactly these 4 keys:
|
|
|
1133
1109
|
}
|
|
1134
1110
|
const level = "none";
|
|
1135
1111
|
cacheToolSupport(model, level, family);
|
|
1136
|
-
const cleanContent =
|
|
1112
|
+
const cleanContent = truncate(strippedContent, 150);
|
|
1137
1113
|
const evidenceDetail = hasTextToolSignal ? `no structured tool calling (text mentions tool: ${cleanContent})` : `no tool calling patterns (text: ${cleanContent})`;
|
|
1138
1114
|
return { level, cached: false, evidence: evidenceDetail, elapsedMs };
|
|
1139
1115
|
} catch (e) {
|
|
@@ -1200,8 +1176,8 @@ The JSON object must have exactly these 4 keys:
|
|
|
1200
1176
|
const lines = [];
|
|
1201
1177
|
const totalStart = Date.now();
|
|
1202
1178
|
lines.push(branding);
|
|
1203
|
-
lines.push(
|
|
1204
|
-
lines.push(
|
|
1179
|
+
lines.push(section(`MODEL: ${model}`));
|
|
1180
|
+
lines.push(info("Provider: Ollama (local/remote)"));
|
|
1205
1181
|
let modelSize = "unknown";
|
|
1206
1182
|
let modelFamily = "unknown";
|
|
1207
1183
|
let modelParams = "unknown";
|
|
@@ -1227,125 +1203,125 @@ The JSON object must have exactly these 4 keys:
|
|
|
1227
1203
|
}
|
|
1228
1204
|
} catch {
|
|
1229
1205
|
}
|
|
1230
|
-
const detectedFamily =
|
|
1231
|
-
lines.push(
|
|
1232
|
-
lines.push(
|
|
1233
|
-
lines.push(
|
|
1234
|
-
lines.push(
|
|
1235
|
-
lines.push(
|
|
1206
|
+
const detectedFamily = detectModelFamily(model);
|
|
1207
|
+
lines.push(info(`Size: ${modelSize} | Params: ${modelParams} | Quant: ${modelQuant}`));
|
|
1208
|
+
lines.push(info(`Family: ${modelFamily} | Detected: ${detectedFamily} | Modified: ${modelModified}`));
|
|
1209
|
+
lines.push(section("REASONING TEST"));
|
|
1210
|
+
lines.push(info("Prompt: A snail climbs 3ft up a wall each day, slides 2ft back each night. Wall is 10ft. How many days?"));
|
|
1211
|
+
lines.push(info("Testing..."));
|
|
1236
1212
|
const reasoning = await testReasoning(model);
|
|
1237
|
-
lines.push(
|
|
1213
|
+
lines.push(info(`Time: ${msHuman(reasoning.elapsedMs)}`));
|
|
1238
1214
|
if (reasoning.score === "STRONG") {
|
|
1239
|
-
lines.push(
|
|
1215
|
+
lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct with clear reasoning (${reasoning.score})`));
|
|
1240
1216
|
} else if (reasoning.score === "MODERATE") {
|
|
1241
|
-
lines.push(
|
|
1217
|
+
lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct but weak reasoning (${reasoning.score})`));
|
|
1242
1218
|
} else if (reasoning.score === "WEAK") {
|
|
1243
|
-
lines.push(
|
|
1219
|
+
lines.push(fail(`Answer: ${reasoning.answer} \u2014 Reasoned but wrong answer (${reasoning.score})`));
|
|
1244
1220
|
} else if (reasoning.score === "FAIL") {
|
|
1245
|
-
lines.push(
|
|
1221
|
+
lines.push(fail(`Answer: ${reasoning.answer} \u2014 No reasoning detected (${reasoning.score})`));
|
|
1246
1222
|
} else {
|
|
1247
|
-
const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." :
|
|
1248
|
-
lines.push(
|
|
1223
|
+
const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." : truncate(reasoning.reasoning, 300);
|
|
1224
|
+
lines.push(fail(`Error: ${errMsg}`));
|
|
1249
1225
|
}
|
|
1250
|
-
lines.push(
|
|
1251
|
-
lines.push(
|
|
1252
|
-
lines.push(
|
|
1226
|
+
lines.push(info(`Response: ${sanitizeForReport(reasoning.reasoning)}`));
|
|
1227
|
+
lines.push(section("THINKING TEST"));
|
|
1228
|
+
lines.push(info('Prompt: "Multiply 37 by 43. Explain your reasoning step by step."'));
|
|
1253
1229
|
await rateLimitDelay(lines);
|
|
1254
1230
|
const thinking = await testThinking(model);
|
|
1255
|
-
lines.push(
|
|
1231
|
+
lines.push(info(`Time: ${msHuman(thinking.elapsedMs)}`));
|
|
1256
1232
|
if (thinking.supported) {
|
|
1257
|
-
lines.push(
|
|
1258
|
-
lines.push(
|
|
1233
|
+
lines.push(ok(`Thinking/reasoning tokens: SUPPORTED`));
|
|
1234
|
+
lines.push(info(`Thinking content: ${sanitizeForReport(thinking.thinkingContent)}`));
|
|
1259
1235
|
} else {
|
|
1260
|
-
lines.push(
|
|
1236
|
+
lines.push(fail(`Thinking/reasoning tokens: NOT SUPPORTED`));
|
|
1261
1237
|
}
|
|
1262
|
-
lines.push(
|
|
1263
|
-
lines.push(
|
|
1238
|
+
lines.push(info(`Answer output: ${sanitizeForReport(thinking.answerContent)}`));
|
|
1239
|
+
lines.push(section("MODELS.JSON SYNC"));
|
|
1264
1240
|
const reasoningUpdate = updateModelsJsonReasoning(model, thinking.supported);
|
|
1265
|
-
lines.push(
|
|
1266
|
-
lines.push(
|
|
1267
|
-
lines.push(
|
|
1268
|
-
lines.push(
|
|
1241
|
+
lines.push(info(reasoningUpdate.message));
|
|
1242
|
+
lines.push(section("TOOL USAGE TEST"));
|
|
1243
|
+
lines.push(info(`Prompt: "What's the weather in Paris?" (with get_weather tool available)`));
|
|
1244
|
+
lines.push(info("Testing..."));
|
|
1269
1245
|
await rateLimitDelay(lines);
|
|
1270
1246
|
const tools = await testToolUsage(model);
|
|
1271
|
-
lines.push(
|
|
1247
|
+
lines.push(info(`Time: ${msHuman(tools.elapsedMs)}`));
|
|
1272
1248
|
if (tools.score === "STRONG") {
|
|
1273
|
-
lines.push(
|
|
1249
|
+
lines.push(ok(`Tool call: ${tools.toolCall} (${tools.score})`));
|
|
1274
1250
|
if (tools.response) {
|
|
1275
|
-
lines.push(
|
|
1251
|
+
lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
|
|
1276
1252
|
}
|
|
1277
1253
|
} else if (tools.score === "MODERATE") {
|
|
1278
|
-
lines.push(
|
|
1254
|
+
lines.push(ok(`Tool call: ${tools.toolCall} (${tools.score})`));
|
|
1279
1255
|
if (tools.response) {
|
|
1280
|
-
lines.push(
|
|
1256
|
+
lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
|
|
1281
1257
|
}
|
|
1282
1258
|
} else if (tools.score === "WEAK") {
|
|
1283
|
-
lines.push(
|
|
1259
|
+
lines.push(warn(`Tool call: ${tools.toolCall} (${tools.score}) \u2014 malformed call`));
|
|
1284
1260
|
if (tools.response) {
|
|
1285
|
-
lines.push(
|
|
1261
|
+
lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
|
|
1286
1262
|
}
|
|
1287
1263
|
} else if (tools.score === "FAIL") {
|
|
1288
1264
|
const hasResponse = tools.response && tools.response.trim().length > 0;
|
|
1289
|
-
lines.push(
|
|
1265
|
+
lines.push(fail(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${tools.score})`));
|
|
1290
1266
|
if (hasResponse) {
|
|
1291
|
-
lines.push(
|
|
1267
|
+
lines.push(info(`Text response: ${sanitizeForReport(tools.response)}`));
|
|
1292
1268
|
} else {
|
|
1293
|
-
lines.push(
|
|
1269
|
+
lines.push(info("Text response: (empty)"));
|
|
1294
1270
|
}
|
|
1295
1271
|
} else {
|
|
1296
|
-
lines.push(
|
|
1272
|
+
lines.push(fail(`Error: ${tools.toolCall}`));
|
|
1297
1273
|
}
|
|
1298
|
-
lines.push(
|
|
1299
|
-
lines.push(
|
|
1300
|
-
lines.push(
|
|
1274
|
+
lines.push(section("REACT PARSING TEST"));
|
|
1275
|
+
lines.push(info(`Prompt: "What's the weather in Tokyo?" (ReAct format, no native tools)`));
|
|
1276
|
+
lines.push(info("Testing..."));
|
|
1301
1277
|
await rateLimitDelay(lines);
|
|
1302
1278
|
const react = await testReactParsing(model);
|
|
1303
|
-
lines.push(
|
|
1279
|
+
lines.push(info(`Time: ${msHuman(react.elapsedMs)}`));
|
|
1304
1280
|
if (react.score === "STRONG") {
|
|
1305
|
-
lines.push(
|
|
1281
|
+
lines.push(ok(`ReAct parsed: ${react.toolCall} (${react.score})`));
|
|
1306
1282
|
if (react.thought) {
|
|
1307
|
-
lines.push(
|
|
1283
|
+
lines.push(info(`Thought: ${sanitizeForReport(react.thought)}`));
|
|
1308
1284
|
}
|
|
1309
1285
|
} else if (react.score === "MODERATE") {
|
|
1310
|
-
lines.push(
|
|
1286
|
+
lines.push(ok(`ReAct parsed: ${react.toolCall} (${react.score})`));
|
|
1311
1287
|
if (react.thought) {
|
|
1312
|
-
lines.push(
|
|
1288
|
+
lines.push(info(`Thought: ${sanitizeForReport(react.thought)}`));
|
|
1313
1289
|
}
|
|
1314
1290
|
} else if (react.score === "WEAK") {
|
|
1315
|
-
lines.push(
|
|
1291
|
+
lines.push(warn(`ReAct parsed: ${react.toolCall} (${react.score}) \u2014 wrong tool or malformed args`));
|
|
1316
1292
|
if (react.thought) {
|
|
1317
|
-
lines.push(
|
|
1293
|
+
lines.push(info(`Thought: ${sanitizeForReport(react.thought)}`));
|
|
1318
1294
|
}
|
|
1319
1295
|
} else if (react.score === "FAIL") {
|
|
1320
|
-
lines.push(
|
|
1296
|
+
lines.push(fail(`ReAct parsing: ${react.toolCall} (${react.score})`));
|
|
1321
1297
|
if (react.response) {
|
|
1322
|
-
lines.push(
|
|
1298
|
+
lines.push(info(`Response: ${sanitizeForReport(react.response)}`));
|
|
1323
1299
|
}
|
|
1324
1300
|
} else {
|
|
1325
|
-
lines.push(
|
|
1301
|
+
lines.push(fail(`Error: ${react.toolCall}`));
|
|
1326
1302
|
}
|
|
1327
|
-
lines.push(
|
|
1328
|
-
lines.push(
|
|
1329
|
-
lines.push(
|
|
1303
|
+
lines.push(section("INSTRUCTION FOLLOWING TEST"));
|
|
1304
|
+
lines.push(info("Prompt: Respond with ONLY a JSON object with keys: name, can_count, sum (15+27), language"));
|
|
1305
|
+
lines.push(info("Testing..."));
|
|
1330
1306
|
await rateLimitDelay(lines);
|
|
1331
1307
|
const instructions = await testInstructionFollowing(model);
|
|
1332
|
-
lines.push(
|
|
1308
|
+
lines.push(info(`Time: ${msHuman(instructions.elapsedMs)}`));
|
|
1333
1309
|
if (instructions.score === "STRONG") {
|
|
1334
|
-
lines.push(
|
|
1310
|
+
lines.push(ok(`JSON output valid with correct values (${instructions.score})`));
|
|
1335
1311
|
} else if (instructions.score === "MODERATE") {
|
|
1336
|
-
lines.push(
|
|
1312
|
+
lines.push(ok(`JSON output valid but some values incorrect (${instructions.score})`));
|
|
1337
1313
|
} else if (instructions.score === "WEAK") {
|
|
1338
|
-
lines.push(
|
|
1314
|
+
lines.push(warn(`Partial JSON compliance (${instructions.score})`));
|
|
1339
1315
|
} else {
|
|
1340
|
-
lines.push(
|
|
1316
|
+
lines.push(fail(`Failed to produce valid JSON (${instructions.score})`));
|
|
1341
1317
|
}
|
|
1342
|
-
lines.push(
|
|
1343
|
-
lines.push(
|
|
1344
|
-
lines.push(
|
|
1345
|
-
lines.push(
|
|
1318
|
+
lines.push(info(`Output: ${sanitizeForReport(instructions.output)}`));
|
|
1319
|
+
lines.push(section("TOOL SUPPORT DETECTION"));
|
|
1320
|
+
lines.push(info("Probing model for tool calling capability (native / ReAct / none)"));
|
|
1321
|
+
lines.push(info("Testing..."));
|
|
1346
1322
|
await rateLimitDelay(lines);
|
|
1347
1323
|
const toolSupport = await testToolSupport(model, detectedFamily);
|
|
1348
|
-
lines.push(
|
|
1324
|
+
lines.push(info(`Time: ${msHuman(toolSupport.elapsedMs)}`));
|
|
1349
1325
|
const supportLabel = (level) => {
|
|
1350
1326
|
switch (level) {
|
|
1351
1327
|
case "native":
|
|
@@ -1359,19 +1335,19 @@ The JSON object must have exactly these 4 keys:
|
|
|
1359
1335
|
}
|
|
1360
1336
|
};
|
|
1361
1337
|
if (toolSupport.cached) {
|
|
1362
|
-
lines.push(
|
|
1338
|
+
lines.push(info(`Result: ${supportLabel(toolSupport.level)} \u2014 from cache`));
|
|
1363
1339
|
} else {
|
|
1364
1340
|
if (toolSupport.level === "native") {
|
|
1365
|
-
lines.push(
|
|
1341
|
+
lines.push(ok(`Tool support: ${supportLabel(toolSupport.level)}`));
|
|
1366
1342
|
} else if (toolSupport.level === "react") {
|
|
1367
|
-
lines.push(
|
|
1343
|
+
lines.push(ok(`Tool support: ${supportLabel(toolSupport.level)}`));
|
|
1368
1344
|
} else {
|
|
1369
|
-
lines.push(
|
|
1345
|
+
lines.push(warn(`Tool support: ${supportLabel(toolSupport.level)}`));
|
|
1370
1346
|
}
|
|
1371
1347
|
}
|
|
1372
|
-
lines.push(
|
|
1373
|
-
lines.push(
|
|
1374
|
-
lines.push(
|
|
1348
|
+
lines.push(info(`Evidence: ${toolSupport.evidence}`));
|
|
1349
|
+
lines.push(info(`Cache: ${TOOL_SUPPORT_CACHE_PATH}`));
|
|
1350
|
+
lines.push(section("SUMMARY"));
|
|
1375
1351
|
const totalMs = Date.now() - totalStart;
|
|
1376
1352
|
const tests = [
|
|
1377
1353
|
{ name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
|
|
@@ -1384,19 +1360,19 @@ The JSON object must have exactly these 4 keys:
|
|
|
1384
1360
|
const passed = tests.filter((t) => t.pass).length;
|
|
1385
1361
|
const total = tests.length;
|
|
1386
1362
|
for (const t of tests) {
|
|
1387
|
-
lines.push(t.pass ?
|
|
1363
|
+
lines.push(t.pass ? ok(`${t.name}: ${t.score}`) : fail(`${t.name}: ${t.score}`));
|
|
1388
1364
|
}
|
|
1389
|
-
lines.push(
|
|
1390
|
-
lines.push(
|
|
1391
|
-
lines.push(
|
|
1365
|
+
lines.push(info(`Total time: ${msHuman(totalMs)}`));
|
|
1366
|
+
lines.push(info(`Score: ${passed}/${total} tests passed`));
|
|
1367
|
+
lines.push(section("RECOMMENDATION"));
|
|
1392
1368
|
if (passed === 6) {
|
|
1393
|
-
lines.push(
|
|
1369
|
+
lines.push(ok(`${model} is a STRONG model \u2014 full capability`));
|
|
1394
1370
|
} else if (passed >= 5) {
|
|
1395
|
-
lines.push(
|
|
1371
|
+
lines.push(ok(`${model} is a GOOD model \u2014 most capabilities work`));
|
|
1396
1372
|
} else if (passed >= 4) {
|
|
1397
|
-
lines.push(
|
|
1373
|
+
lines.push(warn(`${model} is USABLE \u2014 some capabilities are limited`));
|
|
1398
1374
|
} else {
|
|
1399
|
-
lines.push(
|
|
1375
|
+
lines.push(fail(`${model} is WEAK \u2014 limited capabilities for agent use`));
|
|
1400
1376
|
}
|
|
1401
1377
|
return lines.join("\n");
|
|
1402
1378
|
}
|
|
@@ -1404,106 +1380,106 @@ The JSON object must have exactly these 4 keys:
|
|
|
1404
1380
|
const lines = [];
|
|
1405
1381
|
const totalStart = Date.now();
|
|
1406
1382
|
lines.push(branding);
|
|
1407
|
-
lines.push(
|
|
1408
|
-
lines.push(
|
|
1409
|
-
lines.push(
|
|
1410
|
-
lines.push(
|
|
1383
|
+
lines.push(section(`MODEL: ${model}`));
|
|
1384
|
+
lines.push(info(`Provider: ${providerInfo.name} (built-in)`));
|
|
1385
|
+
lines.push(info(`API: ${providerInfo.apiMode || "openai-completions"}`));
|
|
1386
|
+
lines.push(info(`Base URL: ${providerInfo.baseUrl || "unknown"}`));
|
|
1411
1387
|
if (providerInfo.apiKey) {
|
|
1412
|
-
lines.push(
|
|
1388
|
+
lines.push(info(`API Key: ****${providerInfo.apiKey.slice(-4)}`));
|
|
1413
1389
|
} else {
|
|
1414
|
-
lines.push(
|
|
1390
|
+
lines.push(warn(`API Key: NOT SET (${providerInfo.envKey || "env var not found"})`));
|
|
1415
1391
|
}
|
|
1416
|
-
lines.push(
|
|
1417
|
-
lines.push(
|
|
1392
|
+
lines.push(section("CONNECTIVITY TEST"));
|
|
1393
|
+
lines.push(info("Sending minimal request to verify API reachability and key validity..."));
|
|
1418
1394
|
const connectivity = await testConnectivity(providerInfo, model);
|
|
1419
|
-
lines.push(
|
|
1395
|
+
lines.push(info(`Time: ${msHuman(connectivity.elapsedMs)}`));
|
|
1420
1396
|
if (connectivity.pass) {
|
|
1421
|
-
lines.push(
|
|
1397
|
+
lines.push(ok(`API reachable and authenticated`));
|
|
1422
1398
|
} else {
|
|
1423
1399
|
if (!connectivity.reachable) {
|
|
1424
|
-
lines.push(
|
|
1400
|
+
lines.push(fail(`API not reachable: ${connectivity.error || "unknown error"}`));
|
|
1425
1401
|
} else if (!connectivity.authValid) {
|
|
1426
|
-
lines.push(
|
|
1402
|
+
lines.push(fail(`Authentication failed: ${connectivity.error || "invalid or missing API key"}`));
|
|
1427
1403
|
} else {
|
|
1428
|
-
lines.push(
|
|
1404
|
+
lines.push(fail(`Connectivity error: ${connectivity.error || "unknown"}`));
|
|
1429
1405
|
}
|
|
1430
|
-
lines.push(
|
|
1431
|
-
lines.push(
|
|
1406
|
+
lines.push(warn("Skipping remaining tests \u2014 fix connectivity first"));
|
|
1407
|
+
lines.push(info("Tip: Check your API key is set correctly and the provider endpoint is accessible"));
|
|
1432
1408
|
return lines.join("\n");
|
|
1433
1409
|
}
|
|
1434
|
-
lines.push(
|
|
1435
|
-
lines.push(
|
|
1436
|
-
lines.push(
|
|
1410
|
+
lines.push(section("REASONING TEST"));
|
|
1411
|
+
lines.push(info("Prompt: A snail climbs 3ft up a wall each day, slides 2ft back each night. Wall is 10ft. How many days?"));
|
|
1412
|
+
lines.push(info("Testing..."));
|
|
1437
1413
|
await rateLimitDelay(lines);
|
|
1438
1414
|
const reasoning = await testReasoningProvider(providerInfo, model);
|
|
1439
|
-
lines.push(
|
|
1415
|
+
lines.push(info(`Time: ${msHuman(reasoning.elapsedMs)}`));
|
|
1440
1416
|
if (reasoning.score === "STRONG") {
|
|
1441
|
-
lines.push(
|
|
1417
|
+
lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct with clear reasoning (${reasoning.score})`));
|
|
1442
1418
|
} else if (reasoning.score === "MODERATE") {
|
|
1443
|
-
lines.push(
|
|
1419
|
+
lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct but weak reasoning (${reasoning.score})`));
|
|
1444
1420
|
} else if (reasoning.score === "WEAK") {
|
|
1445
|
-
lines.push(
|
|
1421
|
+
lines.push(fail(`Answer: ${reasoning.answer} \u2014 Reasoned but wrong answer (${reasoning.score})`));
|
|
1446
1422
|
} else if (reasoning.score === "FAIL") {
|
|
1447
|
-
lines.push(
|
|
1423
|
+
lines.push(fail(`Answer: ${reasoning.answer} \u2014 No reasoning detected (${reasoning.score})`));
|
|
1448
1424
|
} else {
|
|
1449
|
-
const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." :
|
|
1450
|
-
lines.push(
|
|
1425
|
+
const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." : truncate(reasoning.reasoning, 300);
|
|
1426
|
+
lines.push(fail(`Error: ${errMsg}`));
|
|
1451
1427
|
}
|
|
1452
|
-
lines.push(
|
|
1453
|
-
lines.push(
|
|
1454
|
-
lines.push(
|
|
1455
|
-
lines.push(
|
|
1428
|
+
lines.push(info(`Response: ${sanitizeForReport(reasoning.reasoning)}`));
|
|
1429
|
+
lines.push(section("INSTRUCTION FOLLOWING TEST"));
|
|
1430
|
+
lines.push(info("Prompt: Respond with ONLY a JSON object with keys: name, can_count, sum (15+27), language"));
|
|
1431
|
+
lines.push(info("Testing..."));
|
|
1456
1432
|
await rateLimitDelay(lines);
|
|
1457
1433
|
const instructions = await testInstructionFollowingProvider(providerInfo, model);
|
|
1458
|
-
lines.push(
|
|
1434
|
+
lines.push(info(`Time: ${msHuman(instructions.elapsedMs)}`));
|
|
1459
1435
|
if (instructions.score === "STRONG") {
|
|
1460
|
-
lines.push(
|
|
1436
|
+
lines.push(ok(`JSON output valid with correct values (${instructions.score})`));
|
|
1461
1437
|
} else if (instructions.score === "MODERATE") {
|
|
1462
|
-
lines.push(
|
|
1438
|
+
lines.push(ok(`JSON output valid but some values incorrect (${instructions.score})`));
|
|
1463
1439
|
} else if (instructions.score === "WEAK") {
|
|
1464
|
-
lines.push(
|
|
1440
|
+
lines.push(warn(`Partial JSON compliance (${instructions.score})`));
|
|
1465
1441
|
} else {
|
|
1466
|
-
lines.push(
|
|
1442
|
+
lines.push(fail(`Failed to produce valid JSON (${instructions.score})`));
|
|
1467
1443
|
}
|
|
1468
|
-
lines.push(
|
|
1469
|
-
lines.push(
|
|
1470
|
-
lines.push(
|
|
1471
|
-
lines.push(
|
|
1444
|
+
lines.push(info(`Output: ${sanitizeForReport(instructions.output)}`));
|
|
1445
|
+
lines.push(section("TOOL USAGE TEST"));
|
|
1446
|
+
lines.push(info(`Prompt: "What's the weather in Paris?" (with get_weather tool available)`));
|
|
1447
|
+
lines.push(info("Testing..."));
|
|
1472
1448
|
await rateLimitDelay(lines);
|
|
1473
1449
|
const toolTest = await testToolUsageProvider(providerInfo, model);
|
|
1474
|
-
lines.push(
|
|
1450
|
+
lines.push(info(`Time: ${msHuman(toolTest.elapsedMs)}`));
|
|
1475
1451
|
if (toolTest.score === "STRONG") {
|
|
1476
|
-
lines.push(
|
|
1452
|
+
lines.push(ok(`Tool call: ${toolTest.toolCall} (${toolTest.score})`));
|
|
1477
1453
|
if (toolTest.response) {
|
|
1478
|
-
lines.push(
|
|
1454
|
+
lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
|
|
1479
1455
|
}
|
|
1480
1456
|
} else if (toolTest.score === "MODERATE") {
|
|
1481
|
-
lines.push(
|
|
1457
|
+
lines.push(ok(`Tool call: ${toolTest.toolCall} (${toolTest.score})`));
|
|
1482
1458
|
if (toolTest.response) {
|
|
1483
|
-
lines.push(
|
|
1459
|
+
lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
|
|
1484
1460
|
}
|
|
1485
1461
|
} else if (toolTest.score === "WEAK") {
|
|
1486
|
-
lines.push(
|
|
1462
|
+
lines.push(warn(`Tool call: ${toolTest.toolCall} (${toolTest.score}) \u2014 malformed call`));
|
|
1487
1463
|
if (toolTest.response) {
|
|
1488
|
-
lines.push(
|
|
1464
|
+
lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
|
|
1489
1465
|
}
|
|
1490
1466
|
} else if (toolTest.score === "FAIL") {
|
|
1491
1467
|
const hasResponse = toolTest.response && toolTest.response.trim().length > 0;
|
|
1492
|
-
lines.push(
|
|
1468
|
+
lines.push(fail(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${toolTest.score})`));
|
|
1493
1469
|
if (hasResponse) {
|
|
1494
|
-
lines.push(
|
|
1470
|
+
lines.push(info(`Text response: ${sanitizeForReport(toolTest.response)}`));
|
|
1495
1471
|
} else {
|
|
1496
|
-
lines.push(
|
|
1472
|
+
lines.push(info("Text response: (empty)"));
|
|
1497
1473
|
}
|
|
1498
1474
|
} else {
|
|
1499
|
-
lines.push(
|
|
1475
|
+
lines.push(fail(`Error: ${toolTest.toolCall}`));
|
|
1500
1476
|
}
|
|
1501
|
-
lines.push(
|
|
1502
|
-
lines.push(
|
|
1503
|
-
lines.push(
|
|
1504
|
-
lines.push(
|
|
1505
|
-
lines.push(
|
|
1506
|
-
lines.push(
|
|
1477
|
+
lines.push(section("SKIPPED TESTS (OLLAMA-ONLY)"));
|
|
1478
|
+
lines.push(warn("Thinking test \u2014 Ollama-specific think:true option and message.thinking field"));
|
|
1479
|
+
lines.push(warn("ReAct parsing test \u2014 only relevant for Ollama models without native tool calling"));
|
|
1480
|
+
lines.push(warn("Tool support detection \u2014 Ollama-specific tool support cache"));
|
|
1481
|
+
lines.push(warn("Model metadata \u2014 Ollama-specific /api/tags endpoint"));
|
|
1482
|
+
lines.push(section("SUMMARY"));
|
|
1507
1483
|
const totalMs = Date.now() - totalStart;
|
|
1508
1484
|
const tests = [
|
|
1509
1485
|
{ name: "Connectivity", pass: connectivity.pass, score: connectivity.pass ? "OK" : "FAIL" },
|
|
@@ -1514,19 +1490,19 @@ The JSON object must have exactly these 4 keys:
|
|
|
1514
1490
|
const passed = tests.filter((t) => t.pass).length;
|
|
1515
1491
|
const total = tests.length;
|
|
1516
1492
|
for (const t of tests) {
|
|
1517
|
-
lines.push(t.pass ?
|
|
1493
|
+
lines.push(t.pass ? ok(`${t.name}: ${t.score}`) : fail(`${t.name}: ${t.score}`));
|
|
1518
1494
|
}
|
|
1519
|
-
lines.push(
|
|
1520
|
-
lines.push(
|
|
1521
|
-
lines.push(
|
|
1495
|
+
lines.push(info(`Total time: ${msHuman(totalMs)}`));
|
|
1496
|
+
lines.push(info(`Score: ${passed}/${total} tests passed`));
|
|
1497
|
+
lines.push(section("RECOMMENDATION"));
|
|
1522
1498
|
if (passed === 4) {
|
|
1523
|
-
lines.push(
|
|
1499
|
+
lines.push(ok(`${model} is a STRONG model via ${providerInfo.name} \u2014 full capability`));
|
|
1524
1500
|
} else if (passed >= 3) {
|
|
1525
|
-
lines.push(
|
|
1501
|
+
lines.push(ok(`${model} is a GOOD model via ${providerInfo.name} \u2014 most capabilities work`));
|
|
1526
1502
|
} else if (passed >= 2) {
|
|
1527
|
-
lines.push(
|
|
1503
|
+
lines.push(warn(`${model} is USABLE via ${providerInfo.name} \u2014 some capabilities are limited`));
|
|
1528
1504
|
} else {
|
|
1529
|
-
lines.push(
|
|
1505
|
+
lines.push(fail(`${model} is WEAK via ${providerInfo.name} \u2014 limited capabilities for agent use`));
|
|
1530
1506
|
}
|
|
1531
1507
|
return lines.join("\n");
|
|
1532
1508
|
}
|
|
@@ -1647,3 +1623,6 @@ The JSON object must have exactly these 4 keys:
|
|
|
1647
1623
|
}
|
|
1648
1624
|
});
|
|
1649
1625
|
}
|
|
1626
|
+
export {
|
|
1627
|
+
model_test_temp_default as default
|
|
1628
|
+
};
|
package/package.json
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vtstech/pi-model-test",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.4-1",
|
|
4
4
|
"description": "Model benchmark/testing extension for Pi Coding Agent",
|
|
5
5
|
"main": "model-test.js",
|
|
6
6
|
"keywords": ["pi-package", "pi-extensions"],
|
|
7
7
|
"license": "MIT",
|
|
8
8
|
"access": "public",
|
|
9
|
+
"type": "module",
|
|
9
10
|
"author": "VTSTech",
|
|
10
11
|
"homepage": "https://www.vts-tech.org",
|
|
11
12
|
"repository": {
|
|
@@ -13,7 +14,7 @@
|
|
|
13
14
|
"url": "https://github.com/VTSTech/pi-coding-agent"
|
|
14
15
|
},
|
|
15
16
|
"dependencies": {
|
|
16
|
-
"@vtstech/pi-shared": "1.0.
|
|
17
|
+
"@vtstech/pi-shared": "1.0.4-1"
|
|
17
18
|
},
|
|
18
19
|
"peerDependencies": {
|
|
19
20
|
"@mariozechner/pi-coding-agent": ">=0.66"
|