@vtstech/pi-model-test 1.0.3 → 1.0.4-1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +61 -0
  2. package/model-test.js +169 -190
  3. package/package.json +3 -2
package/README.md ADDED
@@ -0,0 +1,61 @@
1
+ # @vtstech/pi-model-test
2
+
3
+ Model benchmark extension for the [Pi Coding Agent](https://github.com/badlogic/pi-mono).
4
+
5
+ Test any model for reasoning, tool usage, and instruction following — works with Ollama and cloud providers.
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pi install "npm:@vtstech/pi-model-test"
11
+ ```
12
+
13
+ ## Commands
14
+
15
+ ```bash
16
+ /model-test              # Test current Pi model (auto-detects provider)
17
+ /model-test qwen3:0.6b   # Test a specific Ollama model
18
+ /model-test --all        # Test every Ollama model
19
+ ```
20
+
21
+ ## Test Suites
22
+
23
+ ### Ollama (6 tests)
24
+
25
+ | Test | Scoring |
26
+ |------|---------|
27
+ | Reasoning (snail puzzle) | STRONG / MODERATE / WEAK / FAIL |
28
+ | Thinking token support | SUPPORTED / NOT SUPPORTED |
29
+ | Tool usage (native + text) | STRONG / MODERATE / WEAK / FAIL |
30
+ | ReAct parsing | STRONG / MODERATE / WEAK / FAIL |
31
+ | Instruction following (JSON) | STRONG / MODERATE / WEAK / FAIL |
32
+ | Tool support detection | NATIVE / REACT / NONE |
33
+
34
+ ### Cloud Providers (4 tests)
35
+
36
+ | Test | Scoring |
37
+ |------|---------|
38
+ | Connectivity | OK / FAIL |
39
+ | Reasoning | STRONG / MODERATE / WEAK / FAIL |
40
+ | Instruction following | STRONG / MODERATE / WEAK / FAIL |
41
+ | Tool usage (function calling) | STRONG / MODERATE / WEAK / FAIL |
42
+
43
+ ## Features
44
+
45
+ - Auto-detects Ollama vs cloud provider (OpenRouter, Anthropic, Google, OpenAI, Groq, DeepSeek, Mistral, xAI, Together, Fireworks, Cohere)
46
+ - Automatic remote Ollama URL resolution
47
+ - Timeout resilience with auto-retry on empty responses
48
+ - Rate limit delay between tests (configurable)
49
+ - Thinking model fallback (retries with `think: true`)
50
+ - Tool support cache (`~/.pi/agent/cache/tool_support.json`)
51
+ - JSON repair for truncated output
52
+ - Tab-completion for model names
53
+
54
+ ## Links
55
+
56
+ - [Full Documentation](https://github.com/VTSTech/pi-coding-agent#model-benchmark-model-testts)
57
+ - [Changelog](https://github.com/VTSTech/pi-coding-agent/blob/main/CHANGELOG.md)
58
+
59
+ ## License
60
+
61
+ MIT — [VTSTech](https://www.vts-tech.org)
package/model-test.js CHANGED
@@ -1,42 +1,18 @@
1
- var __create = Object.create;
2
- var __defProp = Object.defineProperty;
3
- var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
- var __getOwnPropNames = Object.getOwnPropertyNames;
5
- var __getProtoOf = Object.getPrototypeOf;
6
- var __hasOwnProp = Object.prototype.hasOwnProperty;
7
- var __export = (target, all) => {
8
- for (var name in all)
9
- __defProp(target, name, { get: all[name], enumerable: true });
10
- };
11
- var __copyProps = (to, from, except, desc) => {
12
- if (from && typeof from === "object" || typeof from === "function") {
13
- for (let key of __getOwnPropNames(from))
14
- if (!__hasOwnProp.call(to, key) && key !== except)
15
- __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
16
- }
17
- return to;
18
- };
19
- var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
20
- // If the importer is in node compatibility mode or this is not an ESM
21
- // file that has been converted to a CommonJS file using a Babel-
22
- // compatible transform (i.e. "__esModule" has not been set), then set
23
- // "default" to the CommonJS "module.exports" for node compatibility.
24
- isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
25
- mod
26
- ));
27
- var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
28
-
29
1
  // .build-npm/model-test/model-test.temp.ts
30
- var model_test_temp_exports = {};
31
- __export(model_test_temp_exports, {
32
- default: () => model_test_temp_default
33
- });
34
- module.exports = __toCommonJS(model_test_temp_exports);
35
- var fs = __toESM(require("node:fs"));
36
- var os = __toESM(require("node:os"));
37
- var path = __toESM(require("node:path"));
38
- var import_format = require("@vtstech/pi-shared/format");
39
- var import_ollama = require("@vtstech/pi-shared/ollama");
2
+ import * as fs from "node:fs";
3
+ import * as os from "node:os";
4
+ import * as path from "node:path";
5
+ import {
6
+ section,
7
+ ok,
8
+ fail,
9
+ warn,
10
+ info,
11
+ msHuman,
12
+ truncate,
13
+ sanitizeForReport
14
+ } from "@vtstech/pi-shared/format";
15
+ import { getOllamaBaseUrl, detectModelFamily, readModelsJson } from "@vtstech/pi-shared/ollama";
40
16
  var BUILTIN_PROVIDERS = {
41
17
  openrouter: { api: "openai-completions", baseUrl: "https://openrouter.ai/api/v1", envKey: "OPENROUTER_API_KEY" },
42
18
  anthropic: { api: "anthropic-messages", baseUrl: "https://api.anthropic.com/v1", envKey: "ANTHROPIC_API_KEY" },
@@ -55,7 +31,7 @@ function detectProvider(ctx) {
55
31
  if (!model) return { kind: "unknown", name: "none" };
56
32
  const providerName = model.provider || "";
57
33
  if (!providerName) return { kind: "unknown", name: "none" };
58
- const modelsJson = (0, import_ollama.readModelsJson)();
34
+ const modelsJson = readModelsJson();
59
35
  const userProviderCfg = (modelsJson.providers || {})[providerName];
60
36
  if (userProviderCfg) {
61
37
  const baseUrl = userProviderCfg.baseUrl || "";
@@ -169,10 +145,10 @@ function cacheToolSupport(model, support, family) {
169
145
  writeToolSupportCache(cache);
170
146
  }
171
147
  function model_test_temp_default(pi) {
172
- const OLLAMA_BASE = (0, import_ollama.getOllamaBaseUrl)();
148
+ const OLLAMA_BASE = getOllamaBaseUrl();
173
149
  async function rateLimitDelay(lines) {
174
150
  if (CONFIG.TEST_DELAY_MS > 0) {
175
- lines.push((0, import_format.info)(`Waiting ${(0, import_format.msHuman)(CONFIG.TEST_DELAY_MS)} to avoid rate limiting...`));
151
+ lines.push(info(`Waiting ${msHuman(CONFIG.TEST_DELAY_MS)} to avoid rate limiting...`));
176
152
  await new Promise((r) => setTimeout(r, CONFIG.TEST_DELAY_MS));
177
153
  }
178
154
  }
@@ -254,7 +230,7 @@ function model_test_temp_default(pi) {
254
230
  const elapsedMs = Date.now() - start;
255
231
  if (!res.ok) {
256
232
  const errorText = await res.text().catch(() => "unknown error");
257
- throw new Error(`API returned ${res.status}: ${(0, import_format.truncate)(errorText, 200)}`);
233
+ throw new Error(`API returned ${res.status}: ${truncate(errorText, 200)}`);
258
234
  }
259
235
  const data = await res.json();
260
236
  const choice = data.choices?.[0];
@@ -270,7 +246,7 @@ function model_test_temp_default(pi) {
270
246
  } catch (e) {
271
247
  const elapsedMs = Date.now() - start;
272
248
  if (e.name === "AbortError") {
273
- throw new Error(`Provider API timed out after ${(0, import_format.msHuman)(elapsedMs)}`);
249
+ throw new Error(`Provider API timed out after ${msHuman(elapsedMs)}`);
274
250
  }
275
251
  throw e;
276
252
  } finally {
@@ -915,7 +891,7 @@ The JSON object must have exactly these 4 keys:
915
891
  }
916
892
  }
917
893
  if (!parsed) {
918
- return { pass: false, score: "FAIL", output: (0, import_format.sanitizeForReport)(msg), elapsedMs };
894
+ return { pass: false, score: "FAIL", output: sanitizeForReport(msg), elapsedMs };
919
895
  }
920
896
  const hasKeys = parsed.name && parsed.can_count !== void 0 && parsed.sum !== void 0 && parsed.language;
921
897
  const correctSum = parsed.sum === 42;
@@ -974,7 +950,7 @@ The JSON object must have exactly these 4 keys:
974
950
  }
975
951
  }
976
952
  if (!parsed) {
977
- return { pass: false, score: "FAIL", output: (0, import_format.sanitizeForReport)(msg), elapsedMs: result.elapsedMs };
953
+ return { pass: false, score: "FAIL", output: sanitizeForReport(msg), elapsedMs: result.elapsedMs };
978
954
  }
979
955
  const hasKeys = parsed.name && parsed.can_count !== void 0 && parsed.sum !== void 0 && parsed.language;
980
956
  const correctSum = parsed.sum === 42;
@@ -1061,7 +1037,7 @@ The JSON object must have exactly these 4 keys:
1061
1037
  const detail = result.stderr?.trim() || result.stdout?.trim() || "empty response";
1062
1038
  const level2 = "none";
1063
1039
  cacheToolSupport(model, level2, family);
1064
- return { level: level2, cached: false, evidence: `API error: ${(0, import_format.truncate)(detail, 100)}`, elapsedMs };
1040
+ return { level: level2, cached: false, evidence: `API error: ${truncate(detail, 100)}`, elapsedMs };
1065
1041
  }
1066
1042
  const parsed = JSON.parse(result.stdout);
1067
1043
  const toolCalls = parsed?.message?.tool_calls;
@@ -1133,7 +1109,7 @@ The JSON object must have exactly these 4 keys:
1133
1109
  }
1134
1110
  const level = "none";
1135
1111
  cacheToolSupport(model, level, family);
1136
- const cleanContent = (0, import_format.truncate)(strippedContent, 150);
1112
+ const cleanContent = truncate(strippedContent, 150);
1137
1113
  const evidenceDetail = hasTextToolSignal ? `no structured tool calling (text mentions tool: ${cleanContent})` : `no tool calling patterns (text: ${cleanContent})`;
1138
1114
  return { level, cached: false, evidence: evidenceDetail, elapsedMs };
1139
1115
  } catch (e) {
@@ -1200,8 +1176,8 @@ The JSON object must have exactly these 4 keys:
1200
1176
  const lines = [];
1201
1177
  const totalStart = Date.now();
1202
1178
  lines.push(branding);
1203
- lines.push((0, import_format.section)(`MODEL: ${model}`));
1204
- lines.push((0, import_format.info)("Provider: Ollama (local/remote)"));
1179
+ lines.push(section(`MODEL: ${model}`));
1180
+ lines.push(info("Provider: Ollama (local/remote)"));
1205
1181
  let modelSize = "unknown";
1206
1182
  let modelFamily = "unknown";
1207
1183
  let modelParams = "unknown";
@@ -1227,125 +1203,125 @@ The JSON object must have exactly these 4 keys:
1227
1203
  }
1228
1204
  } catch {
1229
1205
  }
1230
- const detectedFamily = (0, import_ollama.detectModelFamily)(model);
1231
- lines.push((0, import_format.info)(`Size: ${modelSize} | Params: ${modelParams} | Quant: ${modelQuant}`));
1232
- lines.push((0, import_format.info)(`Family: ${modelFamily} | Detected: ${detectedFamily} | Modified: ${modelModified}`));
1233
- lines.push((0, import_format.section)("REASONING TEST"));
1234
- lines.push((0, import_format.info)("Prompt: A snail climbs 3ft up a wall each day, slides 2ft back each night. Wall is 10ft. How many days?"));
1235
- lines.push((0, import_format.info)("Testing..."));
1206
+ const detectedFamily = detectModelFamily(model);
1207
+ lines.push(info(`Size: ${modelSize} | Params: ${modelParams} | Quant: ${modelQuant}`));
1208
+ lines.push(info(`Family: ${modelFamily} | Detected: ${detectedFamily} | Modified: ${modelModified}`));
1209
+ lines.push(section("REASONING TEST"));
1210
+ lines.push(info("Prompt: A snail climbs 3ft up a wall each day, slides 2ft back each night. Wall is 10ft. How many days?"));
1211
+ lines.push(info("Testing..."));
1236
1212
  const reasoning = await testReasoning(model);
1237
- lines.push((0, import_format.info)(`Time: ${(0, import_format.msHuman)(reasoning.elapsedMs)}`));
1213
+ lines.push(info(`Time: ${msHuman(reasoning.elapsedMs)}`));
1238
1214
  if (reasoning.score === "STRONG") {
1239
- lines.push((0, import_format.ok)(`Answer: ${reasoning.answer} \u2014 Correct with clear reasoning (${reasoning.score})`));
1215
+ lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct with clear reasoning (${reasoning.score})`));
1240
1216
  } else if (reasoning.score === "MODERATE") {
1241
- lines.push((0, import_format.ok)(`Answer: ${reasoning.answer} \u2014 Correct but weak reasoning (${reasoning.score})`));
1217
+ lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct but weak reasoning (${reasoning.score})`));
1242
1218
  } else if (reasoning.score === "WEAK") {
1243
- lines.push((0, import_format.fail)(`Answer: ${reasoning.answer} \u2014 Reasoned but wrong answer (${reasoning.score})`));
1219
+ lines.push(fail(`Answer: ${reasoning.answer} \u2014 Reasoned but wrong answer (${reasoning.score})`));
1244
1220
  } else if (reasoning.score === "FAIL") {
1245
- lines.push((0, import_format.fail)(`Answer: ${reasoning.answer} \u2014 No reasoning detected (${reasoning.score})`));
1221
+ lines.push(fail(`Answer: ${reasoning.answer} \u2014 No reasoning detected (${reasoning.score})`));
1246
1222
  } else {
1247
- const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." : (0, import_format.truncate)(reasoning.reasoning, 300);
1248
- lines.push((0, import_format.fail)(`Error: ${errMsg}`));
1223
+ const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." : truncate(reasoning.reasoning, 300);
1224
+ lines.push(fail(`Error: ${errMsg}`));
1249
1225
  }
1250
- lines.push((0, import_format.info)(`Response: ${(0, import_format.sanitizeForReport)(reasoning.reasoning)}`));
1251
- lines.push((0, import_format.section)("THINKING TEST"));
1252
- lines.push((0, import_format.info)('Prompt: "Multiply 37 by 43. Explain your reasoning step by step."'));
1226
+ lines.push(info(`Response: ${sanitizeForReport(reasoning.reasoning)}`));
1227
+ lines.push(section("THINKING TEST"));
1228
+ lines.push(info('Prompt: "Multiply 37 by 43. Explain your reasoning step by step."'));
1253
1229
  await rateLimitDelay(lines);
1254
1230
  const thinking = await testThinking(model);
1255
- lines.push((0, import_format.info)(`Time: ${(0, import_format.msHuman)(thinking.elapsedMs)}`));
1231
+ lines.push(info(`Time: ${msHuman(thinking.elapsedMs)}`));
1256
1232
  if (thinking.supported) {
1257
- lines.push((0, import_format.ok)(`Thinking/reasoning tokens: SUPPORTED`));
1258
- lines.push((0, import_format.info)(`Thinking content: ${(0, import_format.sanitizeForReport)(thinking.thinkingContent)}`));
1233
+ lines.push(ok(`Thinking/reasoning tokens: SUPPORTED`));
1234
+ lines.push(info(`Thinking content: ${sanitizeForReport(thinking.thinkingContent)}`));
1259
1235
  } else {
1260
- lines.push((0, import_format.fail)(`Thinking/reasoning tokens: NOT SUPPORTED`));
1236
+ lines.push(fail(`Thinking/reasoning tokens: NOT SUPPORTED`));
1261
1237
  }
1262
- lines.push((0, import_format.info)(`Answer output: ${(0, import_format.sanitizeForReport)(thinking.answerContent)}`));
1263
- lines.push((0, import_format.section)("MODELS.JSON SYNC"));
1238
+ lines.push(info(`Answer output: ${sanitizeForReport(thinking.answerContent)}`));
1239
+ lines.push(section("MODELS.JSON SYNC"));
1264
1240
  const reasoningUpdate = updateModelsJsonReasoning(model, thinking.supported);
1265
- lines.push((0, import_format.info)(reasoningUpdate.message));
1266
- lines.push((0, import_format.section)("TOOL USAGE TEST"));
1267
- lines.push((0, import_format.info)(`Prompt: "What's the weather in Paris?" (with get_weather tool available)`));
1268
- lines.push((0, import_format.info)("Testing..."));
1241
+ lines.push(info(reasoningUpdate.message));
1242
+ lines.push(section("TOOL USAGE TEST"));
1243
+ lines.push(info(`Prompt: "What's the weather in Paris?" (with get_weather tool available)`));
1244
+ lines.push(info("Testing..."));
1269
1245
  await rateLimitDelay(lines);
1270
1246
  const tools = await testToolUsage(model);
1271
- lines.push((0, import_format.info)(`Time: ${(0, import_format.msHuman)(tools.elapsedMs)}`));
1247
+ lines.push(info(`Time: ${msHuman(tools.elapsedMs)}`));
1272
1248
  if (tools.score === "STRONG") {
1273
- lines.push((0, import_format.ok)(`Tool call: ${tools.toolCall} (${tools.score})`));
1249
+ lines.push(ok(`Tool call: ${tools.toolCall} (${tools.score})`));
1274
1250
  if (tools.response) {
1275
- lines.push((0, import_format.info)(`Raw response: ${(0, import_format.sanitizeForReport)(tools.response)}`));
1251
+ lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
1276
1252
  }
1277
1253
  } else if (tools.score === "MODERATE") {
1278
- lines.push((0, import_format.ok)(`Tool call: ${tools.toolCall} (${tools.score})`));
1254
+ lines.push(ok(`Tool call: ${tools.toolCall} (${tools.score})`));
1279
1255
  if (tools.response) {
1280
- lines.push((0, import_format.info)(`Raw response: ${(0, import_format.sanitizeForReport)(tools.response)}`));
1256
+ lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
1281
1257
  }
1282
1258
  } else if (tools.score === "WEAK") {
1283
- lines.push((0, import_format.warn)(`Tool call: ${tools.toolCall} (${tools.score}) \u2014 malformed call`));
1259
+ lines.push(warn(`Tool call: ${tools.toolCall} (${tools.score}) \u2014 malformed call`));
1284
1260
  if (tools.response) {
1285
- lines.push((0, import_format.info)(`Raw response: ${(0, import_format.sanitizeForReport)(tools.response)}`));
1261
+ lines.push(info(`Raw response: ${sanitizeForReport(tools.response)}`));
1286
1262
  }
1287
1263
  } else if (tools.score === "FAIL") {
1288
1264
  const hasResponse = tools.response && tools.response.trim().length > 0;
1289
- lines.push((0, import_format.fail)(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${tools.score})`));
1265
+ lines.push(fail(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${tools.score})`));
1290
1266
  if (hasResponse) {
1291
- lines.push((0, import_format.info)(`Text response: ${(0, import_format.sanitizeForReport)(tools.response)}`));
1267
+ lines.push(info(`Text response: ${sanitizeForReport(tools.response)}`));
1292
1268
  } else {
1293
- lines.push((0, import_format.info)("Text response: (empty)"));
1269
+ lines.push(info("Text response: (empty)"));
1294
1270
  }
1295
1271
  } else {
1296
- lines.push((0, import_format.fail)(`Error: ${tools.toolCall}`));
1272
+ lines.push(fail(`Error: ${tools.toolCall}`));
1297
1273
  }
1298
- lines.push((0, import_format.section)("REACT PARSING TEST"));
1299
- lines.push((0, import_format.info)(`Prompt: "What's the weather in Tokyo?" (ReAct format, no native tools)`));
1300
- lines.push((0, import_format.info)("Testing..."));
1274
+ lines.push(section("REACT PARSING TEST"));
1275
+ lines.push(info(`Prompt: "What's the weather in Tokyo?" (ReAct format, no native tools)`));
1276
+ lines.push(info("Testing..."));
1301
1277
  await rateLimitDelay(lines);
1302
1278
  const react = await testReactParsing(model);
1303
- lines.push((0, import_format.info)(`Time: ${(0, import_format.msHuman)(react.elapsedMs)}`));
1279
+ lines.push(info(`Time: ${msHuman(react.elapsedMs)}`));
1304
1280
  if (react.score === "STRONG") {
1305
- lines.push((0, import_format.ok)(`ReAct parsed: ${react.toolCall} (${react.score})`));
1281
+ lines.push(ok(`ReAct parsed: ${react.toolCall} (${react.score})`));
1306
1282
  if (react.thought) {
1307
- lines.push((0, import_format.info)(`Thought: ${(0, import_format.sanitizeForReport)(react.thought)}`));
1283
+ lines.push(info(`Thought: ${sanitizeForReport(react.thought)}`));
1308
1284
  }
1309
1285
  } else if (react.score === "MODERATE") {
1310
- lines.push((0, import_format.ok)(`ReAct parsed: ${react.toolCall} (${react.score})`));
1286
+ lines.push(ok(`ReAct parsed: ${react.toolCall} (${react.score})`));
1311
1287
  if (react.thought) {
1312
- lines.push((0, import_format.info)(`Thought: ${(0, import_format.sanitizeForReport)(react.thought)}`));
1288
+ lines.push(info(`Thought: ${sanitizeForReport(react.thought)}`));
1313
1289
  }
1314
1290
  } else if (react.score === "WEAK") {
1315
- lines.push((0, import_format.warn)(`ReAct parsed: ${react.toolCall} (${react.score}) \u2014 wrong tool or malformed args`));
1291
+ lines.push(warn(`ReAct parsed: ${react.toolCall} (${react.score}) \u2014 wrong tool or malformed args`));
1316
1292
  if (react.thought) {
1317
- lines.push((0, import_format.info)(`Thought: ${(0, import_format.sanitizeForReport)(react.thought)}`));
1293
+ lines.push(info(`Thought: ${sanitizeForReport(react.thought)}`));
1318
1294
  }
1319
1295
  } else if (react.score === "FAIL") {
1320
- lines.push((0, import_format.fail)(`ReAct parsing: ${react.toolCall} (${react.score})`));
1296
+ lines.push(fail(`ReAct parsing: ${react.toolCall} (${react.score})`));
1321
1297
  if (react.response) {
1322
- lines.push((0, import_format.info)(`Response: ${(0, import_format.sanitizeForReport)(react.response)}`));
1298
+ lines.push(info(`Response: ${sanitizeForReport(react.response)}`));
1323
1299
  }
1324
1300
  } else {
1325
- lines.push((0, import_format.fail)(`Error: ${react.toolCall}`));
1301
+ lines.push(fail(`Error: ${react.toolCall}`));
1326
1302
  }
1327
- lines.push((0, import_format.section)("INSTRUCTION FOLLOWING TEST"));
1328
- lines.push((0, import_format.info)("Prompt: Respond with ONLY a JSON object with keys: name, can_count, sum (15+27), language"));
1329
- lines.push((0, import_format.info)("Testing..."));
1303
+ lines.push(section("INSTRUCTION FOLLOWING TEST"));
1304
+ lines.push(info("Prompt: Respond with ONLY a JSON object with keys: name, can_count, sum (15+27), language"));
1305
+ lines.push(info("Testing..."));
1330
1306
  await rateLimitDelay(lines);
1331
1307
  const instructions = await testInstructionFollowing(model);
1332
- lines.push((0, import_format.info)(`Time: ${(0, import_format.msHuman)(instructions.elapsedMs)}`));
1308
+ lines.push(info(`Time: ${msHuman(instructions.elapsedMs)}`));
1333
1309
  if (instructions.score === "STRONG") {
1334
- lines.push((0, import_format.ok)(`JSON output valid with correct values (${instructions.score})`));
1310
+ lines.push(ok(`JSON output valid with correct values (${instructions.score})`));
1335
1311
  } else if (instructions.score === "MODERATE") {
1336
- lines.push((0, import_format.ok)(`JSON output valid but some values incorrect (${instructions.score})`));
1312
+ lines.push(ok(`JSON output valid but some values incorrect (${instructions.score})`));
1337
1313
  } else if (instructions.score === "WEAK") {
1338
- lines.push((0, import_format.warn)(`Partial JSON compliance (${instructions.score})`));
1314
+ lines.push(warn(`Partial JSON compliance (${instructions.score})`));
1339
1315
  } else {
1340
- lines.push((0, import_format.fail)(`Failed to produce valid JSON (${instructions.score})`));
1316
+ lines.push(fail(`Failed to produce valid JSON (${instructions.score})`));
1341
1317
  }
1342
- lines.push((0, import_format.info)(`Output: ${(0, import_format.sanitizeForReport)(instructions.output)}`));
1343
- lines.push((0, import_format.section)("TOOL SUPPORT DETECTION"));
1344
- lines.push((0, import_format.info)("Probing model for tool calling capability (native / ReAct / none)"));
1345
- lines.push((0, import_format.info)("Testing..."));
1318
+ lines.push(info(`Output: ${sanitizeForReport(instructions.output)}`));
1319
+ lines.push(section("TOOL SUPPORT DETECTION"));
1320
+ lines.push(info("Probing model for tool calling capability (native / ReAct / none)"));
1321
+ lines.push(info("Testing..."));
1346
1322
  await rateLimitDelay(lines);
1347
1323
  const toolSupport = await testToolSupport(model, detectedFamily);
1348
- lines.push((0, import_format.info)(`Time: ${(0, import_format.msHuman)(toolSupport.elapsedMs)}`));
1324
+ lines.push(info(`Time: ${msHuman(toolSupport.elapsedMs)}`));
1349
1325
  const supportLabel = (level) => {
1350
1326
  switch (level) {
1351
1327
  case "native":
@@ -1359,19 +1335,19 @@ The JSON object must have exactly these 4 keys:
1359
1335
  }
1360
1336
  };
1361
1337
  if (toolSupport.cached) {
1362
- lines.push((0, import_format.info)(`Result: ${supportLabel(toolSupport.level)} \u2014 from cache`));
1338
+ lines.push(info(`Result: ${supportLabel(toolSupport.level)} \u2014 from cache`));
1363
1339
  } else {
1364
1340
  if (toolSupport.level === "native") {
1365
- lines.push((0, import_format.ok)(`Tool support: ${supportLabel(toolSupport.level)}`));
1341
+ lines.push(ok(`Tool support: ${supportLabel(toolSupport.level)}`));
1366
1342
  } else if (toolSupport.level === "react") {
1367
- lines.push((0, import_format.ok)(`Tool support: ${supportLabel(toolSupport.level)}`));
1343
+ lines.push(ok(`Tool support: ${supportLabel(toolSupport.level)}`));
1368
1344
  } else {
1369
- lines.push((0, import_format.warn)(`Tool support: ${supportLabel(toolSupport.level)}`));
1345
+ lines.push(warn(`Tool support: ${supportLabel(toolSupport.level)}`));
1370
1346
  }
1371
1347
  }
1372
- lines.push((0, import_format.info)(`Evidence: ${toolSupport.evidence}`));
1373
- lines.push((0, import_format.info)(`Cache: ${TOOL_SUPPORT_CACHE_PATH}`));
1374
- lines.push((0, import_format.section)("SUMMARY"));
1348
+ lines.push(info(`Evidence: ${toolSupport.evidence}`));
1349
+ lines.push(info(`Cache: ${TOOL_SUPPORT_CACHE_PATH}`));
1350
+ lines.push(section("SUMMARY"));
1375
1351
  const totalMs = Date.now() - totalStart;
1376
1352
  const tests = [
1377
1353
  { name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
@@ -1384,19 +1360,19 @@ The JSON object must have exactly these 4 keys:
1384
1360
  const passed = tests.filter((t) => t.pass).length;
1385
1361
  const total = tests.length;
1386
1362
  for (const t of tests) {
1387
- lines.push(t.pass ? (0, import_format.ok)(`${t.name}: ${t.score}`) : (0, import_format.fail)(`${t.name}: ${t.score}`));
1363
+ lines.push(t.pass ? ok(`${t.name}: ${t.score}`) : fail(`${t.name}: ${t.score}`));
1388
1364
  }
1389
- lines.push((0, import_format.info)(`Total time: ${(0, import_format.msHuman)(totalMs)}`));
1390
- lines.push((0, import_format.info)(`Score: ${passed}/${total} tests passed`));
1391
- lines.push((0, import_format.section)("RECOMMENDATION"));
1365
+ lines.push(info(`Total time: ${msHuman(totalMs)}`));
1366
+ lines.push(info(`Score: ${passed}/${total} tests passed`));
1367
+ lines.push(section("RECOMMENDATION"));
1392
1368
  if (passed === 6) {
1393
- lines.push((0, import_format.ok)(`${model} is a STRONG model \u2014 full capability`));
1369
+ lines.push(ok(`${model} is a STRONG model \u2014 full capability`));
1394
1370
  } else if (passed >= 5) {
1395
- lines.push((0, import_format.ok)(`${model} is a GOOD model \u2014 most capabilities work`));
1371
+ lines.push(ok(`${model} is a GOOD model \u2014 most capabilities work`));
1396
1372
  } else if (passed >= 4) {
1397
- lines.push((0, import_format.warn)(`${model} is USABLE \u2014 some capabilities are limited`));
1373
+ lines.push(warn(`${model} is USABLE \u2014 some capabilities are limited`));
1398
1374
  } else {
1399
- lines.push((0, import_format.fail)(`${model} is WEAK \u2014 limited capabilities for agent use`));
1375
+ lines.push(fail(`${model} is WEAK \u2014 limited capabilities for agent use`));
1400
1376
  }
1401
1377
  return lines.join("\n");
1402
1378
  }
@@ -1404,106 +1380,106 @@ The JSON object must have exactly these 4 keys:
1404
1380
  const lines = [];
1405
1381
  const totalStart = Date.now();
1406
1382
  lines.push(branding);
1407
- lines.push((0, import_format.section)(`MODEL: ${model}`));
1408
- lines.push((0, import_format.info)(`Provider: ${providerInfo.name} (built-in)`));
1409
- lines.push((0, import_format.info)(`API: ${providerInfo.apiMode || "openai-completions"}`));
1410
- lines.push((0, import_format.info)(`Base URL: ${providerInfo.baseUrl || "unknown"}`));
1383
+ lines.push(section(`MODEL: ${model}`));
1384
+ lines.push(info(`Provider: ${providerInfo.name} (built-in)`));
1385
+ lines.push(info(`API: ${providerInfo.apiMode || "openai-completions"}`));
1386
+ lines.push(info(`Base URL: ${providerInfo.baseUrl || "unknown"}`));
1411
1387
  if (providerInfo.apiKey) {
1412
- lines.push((0, import_format.info)(`API Key: ****${providerInfo.apiKey.slice(-4)}`));
1388
+ lines.push(info(`API Key: ****${providerInfo.apiKey.slice(-4)}`));
1413
1389
  } else {
1414
- lines.push((0, import_format.warn)(`API Key: NOT SET (${providerInfo.envKey || "env var not found"})`));
1390
+ lines.push(warn(`API Key: NOT SET (${providerInfo.envKey || "env var not found"})`));
1415
1391
  }
1416
- lines.push((0, import_format.section)("CONNECTIVITY TEST"));
1417
- lines.push((0, import_format.info)("Sending minimal request to verify API reachability and key validity..."));
1392
+ lines.push(section("CONNECTIVITY TEST"));
1393
+ lines.push(info("Sending minimal request to verify API reachability and key validity..."));
1418
1394
  const connectivity = await testConnectivity(providerInfo, model);
1419
- lines.push((0, import_format.info)(`Time: ${(0, import_format.msHuman)(connectivity.elapsedMs)}`));
1395
+ lines.push(info(`Time: ${msHuman(connectivity.elapsedMs)}`));
1420
1396
  if (connectivity.pass) {
1421
- lines.push((0, import_format.ok)(`API reachable and authenticated`));
1397
+ lines.push(ok(`API reachable and authenticated`));
1422
1398
  } else {
1423
1399
  if (!connectivity.reachable) {
1424
- lines.push((0, import_format.fail)(`API not reachable: ${connectivity.error || "unknown error"}`));
1400
+ lines.push(fail(`API not reachable: ${connectivity.error || "unknown error"}`));
1425
1401
  } else if (!connectivity.authValid) {
1426
- lines.push((0, import_format.fail)(`Authentication failed: ${connectivity.error || "invalid or missing API key"}`));
1402
+ lines.push(fail(`Authentication failed: ${connectivity.error || "invalid or missing API key"}`));
1427
1403
  } else {
1428
- lines.push((0, import_format.fail)(`Connectivity error: ${connectivity.error || "unknown"}`));
1404
+ lines.push(fail(`Connectivity error: ${connectivity.error || "unknown"}`));
1429
1405
  }
1430
- lines.push((0, import_format.warn)("Skipping remaining tests \u2014 fix connectivity first"));
1431
- lines.push((0, import_format.info)("Tip: Check your API key is set correctly and the provider endpoint is accessible"));
1406
+ lines.push(warn("Skipping remaining tests \u2014 fix connectivity first"));
1407
+ lines.push(info("Tip: Check your API key is set correctly and the provider endpoint is accessible"));
1432
1408
  return lines.join("\n");
1433
1409
  }
1434
- lines.push((0, import_format.section)("REASONING TEST"));
1435
- lines.push((0, import_format.info)("Prompt: A snail climbs 3ft up a wall each day, slides 2ft back each night. Wall is 10ft. How many days?"));
1436
- lines.push((0, import_format.info)("Testing..."));
1410
+ lines.push(section("REASONING TEST"));
1411
+ lines.push(info("Prompt: A snail climbs 3ft up a wall each day, slides 2ft back each night. Wall is 10ft. How many days?"));
1412
+ lines.push(info("Testing..."));
1437
1413
  await rateLimitDelay(lines);
1438
1414
  const reasoning = await testReasoningProvider(providerInfo, model);
1439
- lines.push((0, import_format.info)(`Time: ${(0, import_format.msHuman)(reasoning.elapsedMs)}`));
1415
+ lines.push(info(`Time: ${msHuman(reasoning.elapsedMs)}`));
1440
1416
  if (reasoning.score === "STRONG") {
1441
- lines.push((0, import_format.ok)(`Answer: ${reasoning.answer} \u2014 Correct with clear reasoning (${reasoning.score})`));
1417
+ lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct with clear reasoning (${reasoning.score})`));
1442
1418
  } else if (reasoning.score === "MODERATE") {
1443
- lines.push((0, import_format.ok)(`Answer: ${reasoning.answer} \u2014 Correct but weak reasoning (${reasoning.score})`));
1419
+ lines.push(ok(`Answer: ${reasoning.answer} \u2014 Correct but weak reasoning (${reasoning.score})`));
1444
1420
  } else if (reasoning.score === "WEAK") {
1445
- lines.push((0, import_format.fail)(`Answer: ${reasoning.answer} \u2014 Reasoned but wrong answer (${reasoning.score})`));
1421
+ lines.push(fail(`Answer: ${reasoning.answer} \u2014 Reasoned but wrong answer (${reasoning.score})`));
1446
1422
  } else if (reasoning.score === "FAIL") {
1447
- lines.push((0, import_format.fail)(`Answer: ${reasoning.answer} \u2014 No reasoning detected (${reasoning.score})`));
1423
+ lines.push(fail(`Answer: ${reasoning.answer} \u2014 No reasoning detected (${reasoning.score})`));
1448
1424
  } else {
1449
- const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." : (0, import_format.truncate)(reasoning.reasoning, 300);
1450
- lines.push((0, import_format.fail)(`Error: ${errMsg}`));
1425
+ const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." : truncate(reasoning.reasoning, 300);
1426
+ lines.push(fail(`Error: ${errMsg}`));
1451
1427
  }
1452
- lines.push((0, import_format.info)(`Response: ${(0, import_format.sanitizeForReport)(reasoning.reasoning)}`));
1453
- lines.push((0, import_format.section)("INSTRUCTION FOLLOWING TEST"));
1454
- lines.push((0, import_format.info)("Prompt: Respond with ONLY a JSON object with keys: name, can_count, sum (15+27), language"));
1455
- lines.push((0, import_format.info)("Testing..."));
1428
+ lines.push(info(`Response: ${sanitizeForReport(reasoning.reasoning)}`));
1429
+ lines.push(section("INSTRUCTION FOLLOWING TEST"));
1430
+ lines.push(info("Prompt: Respond with ONLY a JSON object with keys: name, can_count, sum (15+27), language"));
1431
+ lines.push(info("Testing..."));
1456
1432
  await rateLimitDelay(lines);
1457
1433
  const instructions = await testInstructionFollowingProvider(providerInfo, model);
1458
- lines.push((0, import_format.info)(`Time: ${(0, import_format.msHuman)(instructions.elapsedMs)}`));
1434
+ lines.push(info(`Time: ${msHuman(instructions.elapsedMs)}`));
1459
1435
  if (instructions.score === "STRONG") {
1460
- lines.push((0, import_format.ok)(`JSON output valid with correct values (${instructions.score})`));
1436
+ lines.push(ok(`JSON output valid with correct values (${instructions.score})`));
1461
1437
  } else if (instructions.score === "MODERATE") {
1462
- lines.push((0, import_format.ok)(`JSON output valid but some values incorrect (${instructions.score})`));
1438
+ lines.push(ok(`JSON output valid but some values incorrect (${instructions.score})`));
1463
1439
  } else if (instructions.score === "WEAK") {
1464
- lines.push((0, import_format.warn)(`Partial JSON compliance (${instructions.score})`));
1440
+ lines.push(warn(`Partial JSON compliance (${instructions.score})`));
1465
1441
  } else {
1466
- lines.push((0, import_format.fail)(`Failed to produce valid JSON (${instructions.score})`));
1442
+ lines.push(fail(`Failed to produce valid JSON (${instructions.score})`));
1467
1443
  }
1468
- lines.push((0, import_format.info)(`Output: ${(0, import_format.sanitizeForReport)(instructions.output)}`));
1469
- lines.push((0, import_format.section)("TOOL USAGE TEST"));
1470
- lines.push((0, import_format.info)(`Prompt: "What's the weather in Paris?" (with get_weather tool available)`));
1471
- lines.push((0, import_format.info)("Testing..."));
1444
+ lines.push(info(`Output: ${sanitizeForReport(instructions.output)}`));
1445
+ lines.push(section("TOOL USAGE TEST"));
1446
+ lines.push(info(`Prompt: "What's the weather in Paris?" (with get_weather tool available)`));
1447
+ lines.push(info("Testing..."));
1472
1448
  await rateLimitDelay(lines);
1473
1449
  const toolTest = await testToolUsageProvider(providerInfo, model);
1474
- lines.push((0, import_format.info)(`Time: ${(0, import_format.msHuman)(toolTest.elapsedMs)}`));
1450
+ lines.push(info(`Time: ${msHuman(toolTest.elapsedMs)}`));
1475
1451
  if (toolTest.score === "STRONG") {
1476
- lines.push((0, import_format.ok)(`Tool call: ${toolTest.toolCall} (${toolTest.score})`));
1452
+ lines.push(ok(`Tool call: ${toolTest.toolCall} (${toolTest.score})`));
1477
1453
  if (toolTest.response) {
1478
- lines.push((0, import_format.info)(`Raw response: ${(0, import_format.sanitizeForReport)(toolTest.response)}`));
1454
+ lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
1479
1455
  }
1480
1456
  } else if (toolTest.score === "MODERATE") {
1481
- lines.push((0, import_format.ok)(`Tool call: ${toolTest.toolCall} (${toolTest.score})`));
1457
+ lines.push(ok(`Tool call: ${toolTest.toolCall} (${toolTest.score})`));
1482
1458
  if (toolTest.response) {
1483
- lines.push((0, import_format.info)(`Raw response: ${(0, import_format.sanitizeForReport)(toolTest.response)}`));
1459
+ lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
1484
1460
  }
1485
1461
  } else if (toolTest.score === "WEAK") {
1486
- lines.push((0, import_format.warn)(`Tool call: ${toolTest.toolCall} (${toolTest.score}) \u2014 malformed call`));
1462
+ lines.push(warn(`Tool call: ${toolTest.toolCall} (${toolTest.score}) \u2014 malformed call`));
1487
1463
  if (toolTest.response) {
1488
- lines.push((0, import_format.info)(`Raw response: ${(0, import_format.sanitizeForReport)(toolTest.response)}`));
1464
+ lines.push(info(`Raw response: ${sanitizeForReport(toolTest.response)}`));
1489
1465
  }
1490
1466
  } else if (toolTest.score === "FAIL") {
1491
1467
  const hasResponse = toolTest.response && toolTest.response.trim().length > 0;
1492
- lines.push((0, import_format.fail)(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${toolTest.score})`));
1468
+ lines.push(fail(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${toolTest.score})`));
1493
1469
  if (hasResponse) {
1494
- lines.push((0, import_format.info)(`Text response: ${(0, import_format.sanitizeForReport)(toolTest.response)}`));
1470
+ lines.push(info(`Text response: ${sanitizeForReport(toolTest.response)}`));
1495
1471
  } else {
1496
- lines.push((0, import_format.info)("Text response: (empty)"));
1472
+ lines.push(info("Text response: (empty)"));
1497
1473
  }
1498
1474
  } else {
1499
- lines.push((0, import_format.fail)(`Error: ${toolTest.toolCall}`));
1475
+ lines.push(fail(`Error: ${toolTest.toolCall}`));
1500
1476
  }
1501
- lines.push((0, import_format.section)("SKIPPED TESTS (OLLAMA-ONLY)"));
1502
- lines.push((0, import_format.warn)("Thinking test \u2014 Ollama-specific think:true option and message.thinking field"));
1503
- lines.push((0, import_format.warn)("ReAct parsing test \u2014 only relevant for Ollama models without native tool calling"));
1504
- lines.push((0, import_format.warn)("Tool support detection \u2014 Ollama-specific tool support cache"));
1505
- lines.push((0, import_format.warn)("Model metadata \u2014 Ollama-specific /api/tags endpoint"));
1506
- lines.push((0, import_format.section)("SUMMARY"));
1477
+ lines.push(section("SKIPPED TESTS (OLLAMA-ONLY)"));
1478
+ lines.push(warn("Thinking test \u2014 Ollama-specific think:true option and message.thinking field"));
1479
+ lines.push(warn("ReAct parsing test \u2014 only relevant for Ollama models without native tool calling"));
1480
+ lines.push(warn("Tool support detection \u2014 Ollama-specific tool support cache"));
1481
+ lines.push(warn("Model metadata \u2014 Ollama-specific /api/tags endpoint"));
1482
+ lines.push(section("SUMMARY"));
1507
1483
  const totalMs = Date.now() - totalStart;
1508
1484
  const tests = [
1509
1485
  { name: "Connectivity", pass: connectivity.pass, score: connectivity.pass ? "OK" : "FAIL" },
@@ -1514,19 +1490,19 @@ The JSON object must have exactly these 4 keys:
1514
1490
  const passed = tests.filter((t) => t.pass).length;
1515
1491
  const total = tests.length;
1516
1492
  for (const t of tests) {
1517
- lines.push(t.pass ? (0, import_format.ok)(`${t.name}: ${t.score}`) : (0, import_format.fail)(`${t.name}: ${t.score}`));
1493
+ lines.push(t.pass ? ok(`${t.name}: ${t.score}`) : fail(`${t.name}: ${t.score}`));
1518
1494
  }
1519
- lines.push((0, import_format.info)(`Total time: ${(0, import_format.msHuman)(totalMs)}`));
1520
- lines.push((0, import_format.info)(`Score: ${passed}/${total} tests passed`));
1521
- lines.push((0, import_format.section)("RECOMMENDATION"));
1495
+ lines.push(info(`Total time: ${msHuman(totalMs)}`));
1496
+ lines.push(info(`Score: ${passed}/${total} tests passed`));
1497
+ lines.push(section("RECOMMENDATION"));
1522
1498
  if (passed === 4) {
1523
- lines.push((0, import_format.ok)(`${model} is a STRONG model via ${providerInfo.name} \u2014 full capability`));
1499
+ lines.push(ok(`${model} is a STRONG model via ${providerInfo.name} \u2014 full capability`));
1524
1500
  } else if (passed >= 3) {
1525
- lines.push((0, import_format.ok)(`${model} is a GOOD model via ${providerInfo.name} \u2014 most capabilities work`));
1501
+ lines.push(ok(`${model} is a GOOD model via ${providerInfo.name} \u2014 most capabilities work`));
1526
1502
  } else if (passed >= 2) {
1527
- lines.push((0, import_format.warn)(`${model} is USABLE via ${providerInfo.name} \u2014 some capabilities are limited`));
1503
+ lines.push(warn(`${model} is USABLE via ${providerInfo.name} \u2014 some capabilities are limited`));
1528
1504
  } else {
1529
- lines.push((0, import_format.fail)(`${model} is WEAK via ${providerInfo.name} \u2014 limited capabilities for agent use`));
1505
+ lines.push(fail(`${model} is WEAK via ${providerInfo.name} \u2014 limited capabilities for agent use`));
1530
1506
  }
1531
1507
  return lines.join("\n");
1532
1508
  }
@@ -1647,3 +1623,6 @@ The JSON object must have exactly these 4 keys:
1647
1623
  }
1648
1624
  });
1649
1625
  }
1626
+ export {
1627
+ model_test_temp_default as default
1628
+ };
package/package.json CHANGED
@@ -1,11 +1,12 @@
1
1
  {
2
2
  "name": "@vtstech/pi-model-test",
3
- "version": "1.0.3",
3
+ "version": "1.0.4-1",
4
4
  "description": "Model benchmark/testing extension for Pi Coding Agent",
5
5
  "main": "model-test.js",
6
6
  "keywords": ["pi-package", "pi-extensions"],
7
7
  "license": "MIT",
8
8
  "access": "public",
9
+ "type": "module",
9
10
  "author": "VTSTech",
10
11
  "homepage": "https://www.vts-tech.org",
11
12
  "repository": {
@@ -13,7 +14,7 @@
13
14
  "url": "https://github.com/VTSTech/pi-coding-agent"
14
15
  },
15
16
  "dependencies": {
16
- "@vtstech/pi-shared": "1.0.3"
17
+ "@vtstech/pi-shared": "1.0.4-1"
17
18
  },
18
19
  "peerDependencies": {
19
20
  "@mariozechner/pi-coding-agent": ">=0.66"