@vtstech/pi-model-test 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/model-test.js +1649 -0
- package/package.json +24 -0
package/model-test.js
ADDED
|
@@ -0,0 +1,1649 @@
|
|
|
1
|
+
// esbuild-generated CommonJS/ESM interop helper prelude.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
// Defines every key of `all` on `target` as an enumerable lazy getter.
var __export = (target, all) => {
  for (var name in all)
    __defProp(target, name, { get: all[name], enumerable: true });
};
// Copies own properties of `from` onto `to` as getters, skipping `except`
// and keys already present on `to`; enumerability is taken from the source
// descriptor (the `desc` parameter is reused as a scratch variable).
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (let key of __getOwnPropNames(from))
      if (!__hasOwnProp.call(to, key) && key !== except)
        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
  }
  return to;
};
// Wraps a require()'d CommonJS module so it can be consumed like an ES module.
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
  mod
));
// Marks `mod` as an ES module and copies its exports onto a fresh object
// suitable for assignment to module.exports.
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
28
|
+
|
|
29
|
+
// .build-npm/model-test/model-test.temp.ts
// Module export wiring: the only export is the default entry point
// `model_test_temp_default` (defined below), exposed via lazy getters.
var model_test_temp_exports = {};
__export(model_test_temp_exports, {
  default: () => model_test_temp_default
});
module.exports = __toCommonJS(model_test_temp_exports);
// Node built-ins (wrapped for ESM-style access) and shared project helpers.
var fs = __toESM(require("node:fs"));
var os = __toESM(require("node:os"));
var path = __toESM(require("node:path"));
var import_format = require("@vtstech/pi-shared/format");
var import_ollama = require("@vtstech/pi-shared/ollama");
|
|
40
|
+
// Known cloud providers keyed by provider name: the wire API flavor used by
// providerChat(), the API base URL, and the env var holding the API key.
// Note: providerChat() appends "/chat/completions" to baseUrl, so each URL
// must be the OpenAI-compatible root.
var BUILTIN_PROVIDERS = {
  openrouter: { api: "openai-completions", baseUrl: "https://openrouter.ai/api/v1", envKey: "OPENROUTER_API_KEY" },
  anthropic: { api: "anthropic-messages", baseUrl: "https://api.anthropic.com/v1", envKey: "ANTHROPIC_API_KEY" },
  google: { api: "gemini", baseUrl: "https://generativelanguage.googleapis.com", envKey: "GOOGLE_API_KEY" },
  openai: { api: "openai-completions", baseUrl: "https://api.openai.com/v1", envKey: "OPENAI_API_KEY" },
  // Fix: Groq's OpenAI-compatible endpoint lives under /openai/v1, not /v1.
  groq: { api: "openai-completions", baseUrl: "https://api.groq.com/openai/v1", envKey: "GROQ_API_KEY" },
  deepseek: { api: "openai-completions", baseUrl: "https://api.deepseek.com/v1", envKey: "DEEPSEEK_API_KEY" },
  mistral: { api: "openai-completions", baseUrl: "https://api.mistral.ai/v1", envKey: "MISTRAL_API_KEY" },
  xai: { api: "openai-completions", baseUrl: "https://api.x.ai/v1", envKey: "XAI_API_KEY" },
  together: { api: "openai-completions", baseUrl: "https://api.together.xyz/v1", envKey: "TOGETHER_API_KEY" },
  fireworks: { api: "openai-completions", baseUrl: "https://api.fireworks.ai/inference/v1", envKey: "FIREWORKS_API_KEY" },
  cohere: { api: "cohere-chat", baseUrl: "https://api.cohere.com/v1", envKey: "COHERE_API_KEY" }
};
|
|
53
|
+
/**
 * Classifies the provider configured on ctx.model as one of:
 *   { kind: "ollama" }  - a local Ollama endpoint,
 *   { kind: "builtin" } - a user-configured or known cloud provider,
 *   { kind: "unknown" } - no model/provider, or an unrecognized name.
 * User configuration from models.json takes precedence over the
 * BUILTIN_PROVIDERS table. For builtin table hits the API key is read from
 * the provider's environment variable.
 */
function detectProvider(ctx) {
  const model = ctx.model;
  if (!model) return { kind: "unknown", name: "none" };
  const providerName = model.provider || "";
  if (!providerName) return { kind: "unknown", name: "none" };
  const modelsJson = (0, import_ollama.readModelsJson)();
  const userProviderCfg = (modelsJson.providers || {})[providerName];
  if (userProviderCfg) {
    const baseUrl = userProviderCfg.baseUrl || "";
    const apiMode = userProviderCfg.api || "";
    const apiKey = userProviderCfg.apiKey || "";
    // Ollama heuristics: provider name, localhost/loopback URL, the Ollama
    // /api/chat path, or an explicit api mode. (The original code repeated
    // the /api/chat check in a second, unreachable branch; removed.)
    const isOllama = /ollama/i.test(providerName) || /localhost:\d+/.test(baseUrl) || /127\.0\.0\.1:\d+/.test(baseUrl) || /\/api\/chat/.test(baseUrl) || apiMode === "ollama";
    if (isOllama) {
      return { kind: "ollama", name: providerName, apiMode: "ollama", baseUrl, apiKey };
    }
    return {
      kind: "builtin",
      name: providerName,
      // apiMode is already userProviderCfg.api || ""; only the final
      // default is needed here.
      apiMode: apiMode || "openai-completions",
      baseUrl,
      apiKey
    };
  }
  const builtin = BUILTIN_PROVIDERS[providerName];
  if (builtin) {
    const apiKey = process.env[builtin.envKey] || "";
    return {
      kind: "builtin",
      name: providerName,
      apiMode: builtin.api,
      baseUrl: builtin.baseUrl,
      envKey: builtin.envKey,
      apiKey
    };
  }
  return { kind: "unknown", name: providerName };
}
|
|
93
|
+
// Central timing and generation knobs for all model tests.
var CONFIG = {
  // General API settings
  DEFAULT_TIMEOUT_MS: 6e5,
  // 10 minutes - default timeout for model responses
  CONNECT_TIMEOUT_S: 30,
  // 30 seconds to establish connection
  MAX_RETRIES: 1,
  // Single retry for transient failures
  RETRY_DELAY_MS: 2e3,
  // 2 seconds between retries
  EXEC_BUFFER_MS: 5e3,
  // Extra buffer for exec timeout over curl timeout
  // Model generation settings
  NUM_PREDICT: 1024,
  // Max tokens in response
  TEMPERATURE: 0.1,
  // Low temperature for more deterministic output
  // Test-specific settings
  MIN_THINKING_LENGTH: 10,
  // Minimum chars to consider thinking tokens valid
  TOOL_TEST_TIMEOUT_MS: 9e4,
  // 90 seconds for tool usage tests
  TOOL_TEST_MAX_TIME_S: 9999,
  // Max curl time for tool tests (effectively unlimited)
  TOOL_SUPPORT_TIMEOUT_MS: 26e4,
  // ~4.3 minutes for tool support detection
  TOOL_SUPPORT_MAX_TIME_S: 240,
  // Max curl time for tool support detection
  // Metadata retrieval
  TAGS_TIMEOUT_MS: 15e3,
  // 15 seconds for /api/tags
  TAGS_CONNECT_TIMEOUT_S: 10,
  // 10 seconds connection timeout for tags
  MODEL_INFO_TIMEOUT_MS: 1e4,
  // 10 seconds for model info lookup
  // Provider API settings
  PROVIDER_TIMEOUT_MS: 12e4,
  // 2 minutes for cloud provider API calls
  PROVIDER_TOOL_TIMEOUT_MS: 6e4,
  // 60 seconds for tool usage tests on providers
  // Rate limiting
  TEST_DELAY_MS: 3e4
  // 30 seconds between tests to avoid rate limiting
};
|
|
137
|
+
// On-disk cache location recording each model's detected tool-calling
// support level (see read/write/getCached/cacheToolSupport below).
var TOOL_SUPPORT_CACHE_DIR = path.join(os.homedir(), ".pi", "agent", "cache");
var TOOL_SUPPORT_CACHE_PATH = path.join(TOOL_SUPPORT_CACHE_DIR, "tool_support.json");
|
|
139
|
+
// Loads the tool-support cache from disk. Best-effort: a missing,
// unreadable, or corrupt cache file simply yields an empty object.
function readToolSupportCache() {
  try {
    if (!fs.existsSync(TOOL_SUPPORT_CACHE_PATH)) return {};
    const contents = fs.readFileSync(TOOL_SUPPORT_CACHE_PATH, "utf-8");
    return JSON.parse(contents);
  } catch {
    // Deliberately swallowed: callers treat any failure as "no cache".
    return {};
  }
}
|
|
149
|
+
// Persists the tool-support cache as pretty-printed JSON with a trailing
// newline, creating the cache directory on first use.
function writeToolSupportCache(cache) {
  const dirExists = fs.existsSync(TOOL_SUPPORT_CACHE_DIR);
  if (!dirExists) {
    fs.mkdirSync(TOOL_SUPPORT_CACHE_DIR, { recursive: true });
  }
  const payload = JSON.stringify(cache, null, 2) + "\n";
  fs.writeFileSync(TOOL_SUPPORT_CACHE_PATH, payload, "utf-8");
}
|
|
155
|
+
// Returns the cached tool-support entry for `model`, or null when there is
// no entry or its `support` field is not one of the recognized levels.
function getCachedToolSupport(model) {
  const entry = readToolSupportCache()[model];
  if (!entry) return null;
  const knownLevels = ["native", "react", "none"];
  if (!entry.support || !knownLevels.includes(entry.support)) return null;
  return entry;
}
|
|
162
|
+
// Records `model`'s tool-support level (plus its family and a timestamp)
// in the on-disk cache, overwriting any previous entry.
function cacheToolSupport(model, support, family) {
  const cache = readToolSupportCache();
  const testedAt = (/* @__PURE__ */ new Date()).toISOString();
  cache[model] = { support, testedAt, family };
  writeToolSupportCache(cache);
}
|
|
171
|
+
function model_test_temp_default(pi) {
|
|
172
|
+
const OLLAMA_BASE = (0, import_ollama.getOllamaBaseUrl)();
|
|
173
|
+
// Sleeps CONFIG.TEST_DELAY_MS between tests (appending a status line) to
// avoid provider rate limits. No-op when the delay is configured as 0.
async function rateLimitDelay(lines) {
  const delayMs = CONFIG.TEST_DELAY_MS;
  if (delayMs <= 0) return;
  lines.push((0, import_format.info)(`Waiting ${(0, import_format.msHuman)(delayMs)} to avoid rate limiting...`));
  await new Promise((resolve) => setTimeout(resolve, delayMs));
}
|
|
179
|
+
/**
 * Sends a non-streaming chat request to the local Ollama /api/chat endpoint
 * via curl (through pi.exec), with a bounded retry loop.
 *
 * @param model     Ollama model name.
 * @param messages  Chat messages array ({ role, content }).
 * @param options   Extra Ollama `options` merged over num_predict/temperature.
 * @param timeoutMs Curl --max-time budget (exec gets EXEC_BUFFER_MS extra).
 * @param retries   Additional attempts after the first.
 * @returns { response: <parsed /api/chat JSON>, elapsedMs }
 * @throws  on curl failure, empty body after all retries, or JSON parse error.
 */
async function ollamaChat(model, messages, options = {}, timeoutMs = CONFIG.DEFAULT_TIMEOUT_MS, retries = CONFIG.MAX_RETRIES) {
  // Caller options are spread last so they can override the defaults.
  const body = { model, messages, stream: false, options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE, ...options } };
  for (let attempt = 0; attempt <= retries; attempt++) {
    const start = Date.now();
    try {
      const result = await pi.exec("curl", [
        "-s",
        "--fail-with-body",
        "-X",
        "POST",
        "--connect-timeout",
        String(CONFIG.CONNECT_TIMEOUT_S),
        "--max-time",
        String(Math.ceil(timeoutMs / 1e3)),
        `${OLLAMA_BASE}/api/chat`,
        "-H",
        "Content-Type: application/json",
        "-d",
        JSON.stringify(body)
      ], { timeout: timeoutMs + CONFIG.EXEC_BUFFER_MS });
      const elapsedMs = Date.now() - start;
      if (result.code !== 0) {
        const detail = result.stderr?.trim() || result.stdout?.trim() || "unknown error";
        // Thrown (not returned) so the retry classification below applies.
        throw new Error(`curl exited ${result.code}: ${detail}`);
      }
      if (!result.stdout.trim()) {
        // Empty body: retry in-loop while attempts remain.
        if (attempt < retries) {
          await new Promise((r) => setTimeout(r, CONFIG.RETRY_DELAY_MS));
          continue;
        }
        throw new Error(`Empty response from Ollama after ${attempt + 1} attempt(s)`);
      }
      const parsed = JSON.parse(result.stdout);
      return { response: parsed, elapsedMs };
    } catch (e) {
      // Retry only transient failures: empty body, timeouts, and curl exit
      // codes 22 (HTTP error), 28 (timeout), 35 (TLS), 52 (empty reply).
      if (attempt < retries && (e.message.includes("Empty response") || e.message.includes("timed out") || e.message.includes("curl exited 22") || e.message.includes("curl exited 28") || e.message.includes("curl exited 35") || e.message.includes("curl exited 52"))) {
        await new Promise((r) => setTimeout(r, CONFIG.RETRY_DELAY_MS));
        continue;
      }
      throw e;
    }
  }
  // The loop either returns or throws; this only guards against logic drift.
  throw new Error("Unreachable");
}
|
|
223
|
+
/**
 * Sends a non-streaming request to a cloud provider's OpenAI-compatible
 * /chat/completions endpoint using fetch with an AbortController timeout.
 *
 * @param providerInfo Result of detectProvider(): { name, baseUrl, apiKey, envKey? }.
 * @param model        Provider-side model identifier.
 * @param messages     Chat messages array ({ role, content }).
 * @param options      { maxTokens?, temperature?, timeoutMs?, tools? }.
 * @returns { content, toolCalls?, elapsedMs, usage } — toolCalls is
 *          undefined when the provider returned none.
 * @throws  on missing baseUrl/apiKey, non-2xx responses, or timeout.
 */
async function providerChat(providerInfo, model, messages, options = {}) {
  const { baseUrl, apiKey } = providerInfo;
  const maxTokens = options.maxTokens ?? CONFIG.NUM_PREDICT;
  const temperature = options.temperature ?? CONFIG.TEMPERATURE;
  const timeoutMs = options.timeoutMs ?? CONFIG.PROVIDER_TIMEOUT_MS;
  if (!baseUrl) throw new Error(`No base URL for provider "${providerInfo.name}"`);
  if (!apiKey) throw new Error(`No API key for provider "${providerInfo.name}". Set ${providerInfo.envKey || "the appropriate env var"}.`);
  const url = `${baseUrl}/chat/completions`;
  const body = {
    model,
    messages,
    max_tokens: maxTokens,
    temperature,
    stream: false
  };
  if (options.tools && options.tools.length > 0) {
    body.tools = options.tools;
  }
  // fetch has no built-in timeout; abort the request after timeoutMs.
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
  const start = Date.now();
  try {
    const res = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "Authorization": `Bearer ${apiKey}`
      },
      body: JSON.stringify(body),
      signal: controller.signal
    });
    const elapsedMs = Date.now() - start;
    if (!res.ok) {
      // Include a truncated response body so callers can classify the error.
      const errorText = await res.text().catch(() => "unknown error");
      throw new Error(`API returned ${res.status}: ${(0, import_format.truncate)(errorText, 200)}`);
    }
    const data = await res.json();
    const choice = data.choices?.[0];
    const message = choice?.message || {};
    const content = message.content || "";
    const toolCalls = message.tool_calls || void 0;
    return {
      content,
      toolCalls: toolCalls && toolCalls.length > 0 ? toolCalls : void 0,
      elapsedMs,
      usage: data.usage
    };
  } catch (e) {
    const elapsedMs = Date.now() - start;
    // AbortError means our own timeout fired; translate to a clearer error.
    if (e.name === "AbortError") {
      throw new Error(`Provider API timed out after ${(0, import_format.msHuman)(elapsedMs)}`);
    }
    throw e;
  } finally {
    // Always clear the timer so a fast response doesn't leave it pending.
    clearTimeout(timeoutId);
  }
}
|
|
280
|
+
/**
 * Quick reachability/auth probe for a cloud provider: sends a tiny prompt
 * and classifies any failure by inspecting the error message.
 * A successful round trip proves both reachability and valid auth; the
 * "PONG" reply content itself is not scored.
 *
 * @returns { pass, reachable, authValid, modelName, elapsedMs, error? }
 */
async function testConnectivity(providerInfo, model) {
  try {
    const start = Date.now();
    await providerChat(providerInfo, model, [
      { role: "user", content: "Reply with exactly: PONG" }
    ], { maxTokens: 10, timeoutMs: 3e4 });
    const elapsedMs = Date.now() - start;
    return {
      pass: true,
      reachable: true,
      authValid: true,
      modelName: model,
      elapsedMs
    };
  } catch (e) {
    const msg = e.message || "";
    let reachable = false;
    let authValid = false;
    if (msg.includes("timed out") || msg.includes("ECONNREFUSED") || msg.includes("ENOTFOUND") || msg.includes("fetch failed")) {
      // Network-level failure: the endpoint could not be reached at all.
      reachable = false;
      authValid = false;
    } else if (msg.includes("401") || msg.includes("403") || msg.includes("authentication") || msg.includes("unauthorized") || msg.includes("invalid API key")) {
      // The server answered but rejected our credentials.
      reachable = true;
      authValid = false;
    } else if (msg.includes("404") || msg.includes("model")) {
      // Server and key look fine; the model name is wrong or unavailable.
      reachable = true;
      authValid = true;
    } else {
      // Unknown error shape: assume reachable but treat auth as suspect.
      reachable = true;
      authValid = false;
    }
    return {
      pass: false,
      reachable,
      authValid,
      modelName: model,
      elapsedMs: 0,
      error: msg
    };
  }
}
|
|
326
|
+
/**
 * Scores a local Ollama model on a multi-step word problem (expected
 * answer: 8 days). If the first reply is completely empty, retries once
 * with { think: true } to coax thinking-capable models into responding.
 *
 * Scoring: STRONG = correct answer + reasoning cues; MODERATE = correct
 * answer only; WEAK = reasoning cues but wrong answer; FAIL = neither;
 * ERROR = transport failure or empty response.
 *
 * @returns { pass, score, reasoning, answer, elapsedMs }
 */
async function testReasoning(model) {
  const prompt = `A snail climbs 3 feet up a wall each day, but slides back 2 feet each night. The wall is 10 feet tall. How many days does it take the snail to reach the top? Think step by step and give the final answer on its own line like: ANSWER: <number>`;
  try {
    let response, elapsedMs;
    // NOTE(review): set by the fallback path but never read afterwards.
    let usedThinkingFallback = false;
    try {
      const result = await ollamaChat(model, [
        { role: "user", content: prompt }
      ]);
      response = result.response;
      elapsedMs = result.elapsedMs;
      const msg2 = response?.message?.content || "";
      const thinking2 = response?.message?.thinking || "";
      // Sentinel error triggers the think-mode retry in the catch below.
      if (msg2.trim().length === 0 && thinking2.trim().length === 0) {
        throw new Error("empty \u2014 will retry with thinking");
      }
    } catch (firstErr) {
      if (firstErr.message?.includes("empty \u2014 will retry with thinking")) {
        const retry = await ollamaChat(model, [
          { role: "user", content: prompt }
        ], { think: true });
        response = retry.response;
        elapsedMs = retry.elapsedMs;
        usedThinkingFallback = true;
      } else {
        throw firstErr;
      }
    }
    let msg = response?.message?.content || "";
    const thinking = response?.message?.thinking || "";
    // Fall back to thinking tokens when the content channel is empty.
    const effectiveMsg = msg.trim().length > 0 ? msg : thinking;
    if (effectiveMsg.trim().length === 0) {
      return { pass: false, score: "ERROR", reasoning: "Empty response from Ollama (no content or thinking tokens)", answer: "?", elapsedMs };
    }
    // The last number in the reply is taken as the model's final answer.
    const allNumbers = effectiveMsg.match(/\b(\d+)\b/g) || [];
    const answer = allNumbers.length > 0 ? allNumbers[allNumbers.length - 1] : "?";
    const isCorrect = answer === "8";
    // Lexical cues that suggest step-by-step reasoning occurred.
    const reasoningPatterns = [
      "because",
      "therefore",
      "since",
      "step",
      "subtract",
      "minus",
      "each day",
      "each night",
      "slides",
      "climbs",
      "night",
      "reaches",
      "finally",
      "last day"
    ];
    const hasReasoningWords = reasoningPatterns.some((w) => effectiveMsg.toLowerCase().includes(w));
    // A numbered list ("1. ...") also counts as visible reasoning.
    const hasNumberedSteps = /^\s*\d+\.\s/m.test(effectiveMsg);
    const hasReasoning = hasReasoningWords || hasNumberedSteps;
    let score;
    let pass;
    if (isCorrect && hasReasoning) {
      score = "STRONG";
      pass = true;
    } else if (isCorrect) {
      score = "MODERATE";
      pass = true;
    } else if (hasReasoning) {
      score = "WEAK";
      pass = false;
    } else {
      score = "FAIL";
      pass = false;
    }
    // Label the output when it came from thinking tokens, not content.
    const displayMsg = msg.trim().length > 0 ? effectiveMsg : `[thinking tokens] ${effectiveMsg}`;
    return { pass, score, reasoning: displayMsg, answer, elapsedMs };
  } catch (e) {
    return { pass: false, score: "ERROR", reasoning: e.message, answer: "?", elapsedMs: 0 };
  }
}
|
|
403
|
+
/**
 * Scores a cloud-provider model on the same multi-step word problem used by
 * testReasoning() (expected answer: 8 days). The last number in the reply is
 * taken as the final answer; lexical cues and numbered lists count as
 * visible reasoning.
 *
 * Scoring: STRONG = correct + reasoning, MODERATE = correct only,
 * WEAK = reasoning only, FAIL = neither, ERROR = transport/empty.
 *
 * @returns { pass, score, reasoning, answer, elapsedMs }
 */
async function testReasoningProvider(providerInfo, model) {
  const prompt = `A snail climbs 3 feet up a wall each day, but slides back 2 feet each night. The wall is 10 feet tall. How many days does it take the snail to reach the top? Think step by step and give the final answer on its own line like: ANSWER: <number>`;
  try {
    const result = await providerChat(providerInfo, model, [
      { role: "user", content: prompt }
    ]);
    const msg = result.content.trim();
    if (msg.length === 0) {
      return { pass: false, score: "ERROR", reasoning: "Empty response from provider", answer: "?", elapsedMs: result.elapsedMs };
    }
    const numbers = msg.match(/\b(\d+)\b/g) || [];
    const answer = numbers.length > 0 ? numbers[numbers.length - 1] : "?";
    const isCorrect = answer === "8";
    const reasoningCues = [
      "because",
      "therefore",
      "since",
      "step",
      "subtract",
      "minus",
      "each day",
      "each night",
      "slides",
      "climbs",
      "night",
      "reaches",
      "finally",
      "last day"
    ];
    const lowered = msg.toLowerCase();
    const cueFound = reasoningCues.some((cue) => lowered.includes(cue));
    const numberedSteps = /^\s*\d+\.\s/m.test(msg);
    const hasReasoning = cueFound || numberedSteps;
    let score = "FAIL";
    let pass = false;
    if (isCorrect) {
      score = hasReasoning ? "STRONG" : "MODERATE";
      pass = true;
    } else if (hasReasoning) {
      score = "WEAK";
    }
    return { pass, score, reasoning: msg, answer, elapsedMs: result.elapsedMs };
  } catch (e) {
    return { pass: false, score: "ERROR", reasoning: e.message, answer: "?", elapsedMs: 0 };
  }
}
|
|
455
|
+
/**
 * Probes whether a local Ollama model emits thinking tokens when asked with
 * { think: true } — either via the response's `thinking` field or via
 * inline <think>...</think> tags in the content.
 *
 * @returns { supported, thinkingContent, answerContent, elapsedMs }
 */
async function testThinking(model) {
  const prompt = "Multiply 37 by 43. Explain your reasoning step by step and give the final answer.";
  try {
    const { response, elapsedMs } = await ollamaChat(model, [
      { role: "user", content: prompt }
    ], { think: true });
    const content = response?.message?.content || "";
    const thinkingField = response?.message?.thinking || "";
    // Thinking tokens only count when longer than the minimum threshold.
    const hasThinkingField = !!thinkingField && thinkingField.length > CONFIG.MIN_THINKING_LENGTH;
    const tagMatch = content.match(/<think[^>]*>([\s\S]*?)<\/think>/i);
    let thinkingContent = "none";
    if (hasThinkingField) {
      thinkingContent = thinkingField;
    } else if (tagMatch) {
      thinkingContent = tagMatch[1];
    }
    // Strip all <think> blocks from the answer when tags were present.
    const answerContent = tagMatch ? content.replace(/<think[^>]*>[\s\S]*?<\/think>/gi, "").trim() : content;
    return {
      supported: hasThinkingField || !!tagMatch,
      thinkingContent,
      answerContent,
      elapsedMs
    };
  } catch (e) {
    return { supported: false, thinkingContent: `error: ${e.message}`, answerContent: "", elapsedMs: 0 };
  }
}
|
|
476
|
+
/**
 * Tests native tool-calling on a local Ollama model via /api/chat: sends a
 * get_weather tool schema plus a weather question, then scores the reply.
 * Falls back to scanning the text content for an inline JSON tool call when
 * no structured tool_calls are returned.
 *
 * Scoring: STRONG = get_weather with a "paris" location; MODERATE =
 * get_weather without it; WEAK = wrong tool or unparseable arguments;
 * FAIL = no tool call found; ERROR = transport/parse failure.
 *
 * @returns { pass, score, hasToolCalls, toolCall, response, elapsedMs }
 */
async function testToolUsage(model) {
  const tools = [
    {
      type: "function",
      function: {
        name: "get_weather",
        description: "Get the current weather for a location",
        parameters: {
          type: "object",
          properties: {
            location: { type: "string", description: "City name" },
            unit: { type: "string", enum: ["celsius", "fahrenheit"] }
          },
          required: ["location"]
        }
      }
    }
  ];
  const body = {
    model,
    messages: [
      { role: "system", content: "You are a helpful assistant. Use the available tools when needed." },
      { role: "user", content: "What's the weather like in Paris right now?" }
    ],
    tools,
    stream: false,
    options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE }
  };
  try {
    const start = Date.now();
    const result = await pi.exec("curl", [
      "-s",
      "--fail-with-body",
      "-X",
      "POST",
      "--connect-timeout",
      String(CONFIG.CONNECT_TIMEOUT_S),
      "--max-time",
      String(CONFIG.TOOL_TEST_MAX_TIME_S),
      `${OLLAMA_BASE}/api/chat`,
      "-H",
      "Content-Type: application/json",
      "-d",
      JSON.stringify(body)
    ], { timeout: CONFIG.TOOL_TEST_TIMEOUT_MS });
    const elapsedMs = Date.now() - start;
    if (result.code !== 0) {
      const detail = result.stderr?.trim() || result.stdout?.trim() || "unknown error";
      return { pass: false, score: "ERROR", hasToolCalls: false, toolCall: `curl error: ${result.code}: ${detail}`, response: "", elapsedMs };
    }
    if (!result.stdout.trim()) throw new Error("Empty response from Ollama");
    const parsed = JSON.parse(result.stdout);
    const toolCalls = parsed?.message?.tool_calls;
    const content = parsed?.message?.content || "";
    // Preferred path: the model returned structured tool_calls.
    if (toolCalls && toolCalls.length > 0) {
      const call = toolCalls[0];
      const fn = call.function || {};
      let args = {};
      try {
        // Arguments may arrive as a JSON string or as an object.
        args = typeof fn.arguments === "string" ? JSON.parse(fn.arguments) : fn.arguments || {};
      } catch {
        // A tool call with unparseable arguments still counts, but weakly.
        return {
          pass: true,
          score: "WEAK",
          hasToolCalls: true,
          toolCall: `malformed args: ${String(fn.arguments)}`,
          response: content,
          elapsedMs
        };
      }
      const hasCorrectTool = fn.name === "get_weather";
      const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
      let score;
      if (hasCorrectTool && hasLocation) {
        score = "STRONG";
      } else if (hasCorrectTool) {
        score = "MODERATE";
      } else {
        score = "WEAK";
      }
      return {
        pass: true,
        score,
        hasToolCalls: true,
        toolCall: `${fn.name}(${JSON.stringify(args)})`,
        response: content,
        elapsedMs
      };
    }
    // Fallback: some models emit the tool call as JSON inside plain text.
    // Try to parse the outermost {...} span of the content.
    const firstBrace = content.indexOf("{");
    let textToolParsed = null;
    if (firstBrace !== -1) {
      const lastBrace = content.lastIndexOf("}");
      if (lastBrace > firstBrace) {
        const jsonCandidate = content.slice(firstBrace, lastBrace + 1);
        try {
          textToolParsed = JSON.parse(jsonCandidate);
        } catch {
        }
      }
    }
    if (textToolParsed && typeof textToolParsed.name === "string") {
      const fnName = textToolParsed.name;
      // Arguments may be nested under "arguments" or inlined at top level.
      const rawArgs = textToolParsed.arguments || { ...textToolParsed };
      const { name: _, ...fnArgs } = rawArgs;
      const isWeatherTool = fnName === "get_weather";
      const hasLocation = typeof fnArgs.location === "string" && fnArgs.location.toLowerCase().includes("paris");
      let score;
      if (isWeatherTool && hasLocation) {
        score = "STRONG";
      } else if (isWeatherTool) {
        score = "MODERATE";
      } else {
        score = "WEAK";
      }
      return {
        pass: true,
        score,
        hasToolCalls: true,
        toolCall: `${fnName}(${JSON.stringify(fnArgs)})`,
        response: content,
        elapsedMs
      };
    }
    // No structured or inline tool call found.
    return {
      pass: false,
      score: "FAIL",
      hasToolCalls: false,
      toolCall: "none",
      response: content,
      elapsedMs
    };
  } catch (e) {
    return { pass: false, score: "ERROR", hasToolCalls: false, toolCall: `error: ${e.message}`, response: "", elapsedMs: 0 };
  }
}
|
|
612
|
+
/**
 * Cloud-provider variant of testToolUsage(): sends the same get_weather
 * schema and weather question through providerChat(), then applies the same
 * scoring, including the inline-JSON-in-text fallback.
 *
 * Scoring: STRONG = get_weather with a "paris" location; MODERATE =
 * get_weather without it; WEAK = wrong tool or unparseable arguments;
 * FAIL = no tool call found; ERROR = transport failure.
 *
 * @returns { pass, score, hasToolCalls, toolCall, response, elapsedMs }
 */
async function testToolUsageProvider(providerInfo, model) {
  const tools = [
    {
      type: "function",
      function: {
        name: "get_weather",
        description: "Get the current weather for a location",
        parameters: {
          type: "object",
          properties: {
            location: { type: "string", description: "City name" },
            unit: { type: "string", enum: ["celsius", "fahrenheit"] }
          },
          required: ["location"]
        }
      }
    }
  ];
  try {
    const result = await providerChat(providerInfo, model, [
      { role: "system", content: "You are a helpful assistant. Use the available tools when needed." },
      { role: "user", content: "What's the weather like in Paris right now?" }
    ], {
      maxTokens: CONFIG.NUM_PREDICT,
      tools,
      timeoutMs: CONFIG.PROVIDER_TOOL_TIMEOUT_MS
    });
    const content = result.content;
    const toolCalls = result.toolCalls;
    // Preferred path: the provider returned structured tool_calls.
    if (toolCalls && toolCalls.length > 0) {
      const call = toolCalls[0];
      const fn = call.function || {};
      let args = {};
      try {
        // Arguments may arrive as a JSON string or as an object.
        args = typeof fn.arguments === "string" ? JSON.parse(fn.arguments) : fn.arguments || {};
      } catch {
        // A tool call with unparseable arguments still counts, but weakly.
        return {
          pass: true,
          score: "WEAK",
          hasToolCalls: true,
          toolCall: `malformed args: ${String(fn.arguments)}`,
          response: content,
          elapsedMs: result.elapsedMs
        };
      }
      const hasCorrectTool = fn.name === "get_weather";
      const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
      let score;
      if (hasCorrectTool && hasLocation) {
        score = "STRONG";
      } else if (hasCorrectTool) {
        score = "MODERATE";
      } else {
        score = "WEAK";
      }
      return {
        pass: true,
        score,
        hasToolCalls: true,
        toolCall: `${fn.name}(${JSON.stringify(args)})`,
        response: content,
        elapsedMs: result.elapsedMs
      };
    }
    // Fallback: scan the text content for an inline JSON tool call by
    // parsing the outermost {...} span.
    const firstBrace = content.indexOf("{");
    let textToolParsed = null;
    if (firstBrace !== -1) {
      const lastBrace = content.lastIndexOf("}");
      if (lastBrace > firstBrace) {
        const jsonCandidate = content.slice(firstBrace, lastBrace + 1);
        try {
          textToolParsed = JSON.parse(jsonCandidate);
        } catch {
        }
      }
    }
    if (textToolParsed && typeof textToolParsed.name === "string") {
      const fnName = textToolParsed.name;
      // Arguments may be nested under "arguments" or inlined at top level.
      const rawArgs = textToolParsed.arguments || { ...textToolParsed };
      const { name: _, ...fnArgs } = rawArgs;
      const isWeatherTool = fnName === "get_weather";
      const hasLocation = typeof fnArgs.location === "string" && fnArgs.location.toLowerCase().includes("paris");
      let score;
      if (isWeatherTool && hasLocation) {
        score = "STRONG";
      } else if (isWeatherTool) {
        score = "MODERATE";
      } else {
        score = "WEAK";
      }
      return {
        pass: true,
        score,
        hasToolCalls: true,
        toolCall: `${fnName}(${JSON.stringify(fnArgs)})`,
        response: content,
        elapsedMs: result.elapsedMs
      };
    }
    // No structured or inline tool call found.
    return {
      pass: false,
      score: "FAIL",
      hasToolCalls: false,
      toolCall: "none",
      response: content,
      elapsedMs: result.elapsedMs
    };
  } catch (e) {
    return { pass: false, score: "ERROR", hasToolCalls: false, toolCall: `error: ${e.message}`, response: "", elapsedMs: 0 };
  }
}
|
|
723
|
+
/**
 * Probe a model's ability to emit tool calls in the ReAct text format
 * (Thought: / Action: / Action Input:) when no native tool API is used.
 *
 * Sends a weather question to the Ollama /api/chat endpoint via curl and
 * tries to parse an Action / Action Input pair out of the plain-text reply
 * using a cascade of progressively looser regexes.
 *
 * @param {string} model - Ollama model name to test.
 * @returns {Promise<{pass: boolean, score: string, toolCall: string, thought: string, response: string, elapsedMs: number}>}
 *   score: STRONG (right tool + parseable args), MODERATE (right tool,
 *   unparseable args), WEAK (wrong tool), FAIL (no ReAct output), ERROR.
 */
async function testReactParsing(model) {
  const systemPrompt = [
    "You are a helpful assistant with access to tools.",
    "When you need to use a tool, you MUST output in this EXACT format:",
    "Thought: <your reasoning about what to do>",
    "Action: <tool_name>",
    "Action Input: <JSON object with arguments>",
    "Do NOT output anything after the Action Input line.",
    "The available tools are: get_weather (parameters: location: string), calculate (parameters: expression: string)."
  ].join("\n");
  const body = {
    model,
    messages: [
      { role: "system", content: systemPrompt },
      { role: "user", content: "What's the weather like in Tokyo? Use the get_weather tool." }
    ],
    stream: false,
    options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE }
  };
  try {
    const start = Date.now();
    const result = await pi.exec("curl", [
      "-s",
      "--fail-with-body",
      "-X",
      "POST",
      "--connect-timeout",
      String(CONFIG.CONNECT_TIMEOUT_S),
      "--max-time",
      String(CONFIG.TOOL_TEST_MAX_TIME_S),
      `${OLLAMA_BASE}/api/chat`,
      "-H",
      "Content-Type: application/json",
      "-d",
      JSON.stringify(body)
    ], { timeout: CONFIG.TOOL_TEST_TIMEOUT_MS });
    const elapsedMs = Date.now() - start;
    if (result.code !== 0) {
      const detail = result.stderr?.trim() || result.stdout?.trim() || "unknown error";
      return { pass: false, score: "ERROR", toolCall: `curl error: ${result.code}: ${detail}`, thought: "", response: "", elapsedMs };
    }
    if (!result.stdout.trim()) throw new Error("Empty response from Ollama");
    const parsed = JSON.parse(result.stdout);
    const content = (parsed?.message?.content || "").trim();
    if (!content) {
      return { pass: false, score: "FAIL", toolCall: "empty response", thought: "", response: "", elapsedMs };
    }
    // Regex cascade, strictest first.
    const THOUGHT_RE = /Thought:\s*(.*?)(?=Action:|Final Answer:|$)/is;
    const ACTION_RE = /Action:\s*[`"']?(\w+)[`"']?\s*\n?\s*Action Input:\s*(.*?)(?=\n\s*(?:Observation:|Thought:|Final Answer:|Action:)|$)/is;
    const ACTION_RE_SAMELINE = /Action:\s*[`"']?(\w+)[`"']?\s+Action Input:\s*(.*?)(?=\n\s*(?:Observation:|Thought:|Final Answer:)|$)/is;
    const ACTION_RE_LOOSE = /Action:\s*(.+?)\n\s*Action Input:\s*(.*?)(?=\n\s*(?:Observation:|Thought:|Final Answer:|Action:)|$)/is;
    const ACTION_RE_PAREN = /Action:\s*(\w+)\s*\(([^)]*)\)/i;
    let thought = "";
    const thoughtMatch = THOUGHT_RE.exec(content);
    if (thoughtMatch) thought = thoughtMatch[1].trim();
    let match = ACTION_RE.exec(content);
    if (!match) match = ACTION_RE_SAMELINE.exec(content);
    // BUG FIX: each fallback flag must only be set when its regex actually
    // matched. The previous comma-operator form
    //   `if (!match) match = RE.exec(content), flag = true;`
    // set the flag unconditionally, so when the loose regex failed but the
    // paren regex matched, looseMatch was still true and the tool name was
    // wrongly post-processed by the loose-match heuristics.
    let looseMatch = false;
    if (!match) {
      match = ACTION_RE_LOOSE.exec(content);
      looseMatch = match !== null;
    }
    let parenMatch = false;
    if (!match) {
      match = ACTION_RE_PAREN.exec(content);
      parenMatch = match !== null;
    }
    if (match) {
      let toolName = match[1].trim().replace(/[`"']/g, "");
      if (looseMatch) {
        // The loose regex may capture a whole phrase; try to recover a
        // tool-like identifier from it.
        const actionText = toolName.toLowerCase();
        if (actionText.includes("get_weather")) toolName = "get_weather";
        else {
          const toolWords = actionText.match(/\b[a-z][a-z0-9]*(?:[_-][a-z0-9]+)+\b/gi) || [];
          if (toolWords.length > 0) toolName = toolWords[0];
        }
      }
      // Strip markdown code fences from the raw argument text. (The original
      // ternary here had byte-identical branches for paren and non-paren
      // matches, so it is collapsed into a single expression.)
      const rawArgs = match[2].trim().replace(/^```\w*\s*/gm, "").replace(/```\s*$/gm, "").trim();
      let argsParsed = false;
      let argsStr = rawArgs;
      if (parenMatch && rawArgs && !rawArgs.startsWith("{")) {
        // Paren style, e.g. "Action: get_weather(location: Tokyo)" —
        // convert the key: value pairs into a JSON object string.
        const pairs = rawArgs.match(/(\w+)\s*:\s*("[^"]*"|'[^']*'|\S+)/g);
        if (pairs) {
          const obj = {};
          for (const p of pairs) {
            const colonIdx = p.indexOf(":");
            const key = p.slice(0, colonIdx).trim();
            let val = p.slice(colonIdx + 1).trim();
            if (val.startsWith('"') && val.endsWith('"') || val.startsWith("'") && val.endsWith("'")) {
              val = val.slice(1, -1);
            }
            obj[key] = val;
          }
          try {
            argsStr = JSON.stringify(obj);
            argsParsed = true;
          } catch {
            // fall through to the balanced-brace scan below
          }
        }
      }
      if (!argsParsed) {
        // Balanced-brace scan: extract the first complete {...} object from
        // the argument text and verify it parses as JSON.
        const jsonStart = rawArgs.indexOf("{");
        if (jsonStart !== -1) {
          let depth = 0;
          let jsonEnd = -1;
          for (let i = jsonStart; i < rawArgs.length; i++) {
            if (rawArgs[i] === "{") depth++;
            else if (rawArgs[i] === "}") {
              depth--;
              if (depth === 0) {
                jsonEnd = i;
                break;
              }
            }
          }
          if (jsonEnd !== -1) {
            const jsonStr = rawArgs.slice(jsonStart, jsonEnd + 1);
            try {
              JSON.parse(jsonStr);
              argsParsed = true;
              argsStr = jsonStr;
            } catch {
              // not valid JSON — leave argsParsed false
            }
          }
        }
      }
      let score;
      // NOTE: `.includes("get_weather")` already covers exact equality, so the
      // former redundant `|| === "get_weather"` clause was removed.
      const isWeatherTool = toolName.toLowerCase().includes("get_weather");
      if (isWeatherTool && argsParsed) {
        score = "STRONG";
      } else if (isWeatherTool) {
        score = "MODERATE";
      } else {
        score = "WEAK";
      }
      return {
        pass: true,
        score,
        toolCall: `${toolName}(${argsStr})`,
        thought,
        response: content,
        elapsedMs
      };
    }
    const hasToolMention = /\bget_weather\b/i.test(content) || /\btool\b/i.test(content);
    if (hasToolMention) {
      return {
        pass: false,
        score: "FAIL",
        toolCall: "none \u2014 model mentioned tool but not in ReAct format",
        thought: "",
        response: content,
        elapsedMs
      };
    }
    return {
      pass: false,
      score: "FAIL",
      toolCall: "none",
      thought: "",
      response: content,
      elapsedMs
    };
  } catch (e) {
    return { pass: false, score: "ERROR", toolCall: `error: ${e.message}`, thought: "", response: "", elapsedMs: 0 };
  }
}
|
|
884
|
+
/**
 * Instruction-following test (Ollama path): the model must reply with ONLY a
 * JSON object containing four specific keys; scores how strictly it complies.
 *
 * @param {string} model - Ollama model name to test.
 * @returns {Promise<{pass: boolean, score: string, output: string, elapsedMs: number}>}
 *   score: STRONG (all keys, correct values), MODERATE (all keys, one value
 *   wrong), WEAK (partial compliance), FAIL (no parseable JSON), ERROR.
 */
async function testInstructionFollowing(model) {
  const prompt = `You must respond with ONLY a valid JSON object. No markdown, no explanation, no backticks, no extra text.

The JSON object must have exactly these 4 keys:
- "name" (string): your model name
- "can_count" (boolean): true
- "sum" (number): the result of 15 + 27
- "language" (string): the language you are responding in`;
  try {
    const { response, elapsedMs } = await ollamaChat(model, [
      { role: "user", content: prompt }
    ], { num_predict: CONFIG.NUM_PREDICT });
    const msg = (response?.message?.content || "").trim();
    let parsed = null;
    let repairNote = "";
    // Strip markdown code fences before parsing. Computed once — the original
    // duplicated this expression in both the try and catch branches.
    const cleaned = msg.replace(/```json?\s*/gi, "").replace(/```/g, "").trim();
    try {
      parsed = JSON.parse(cleaned);
    } catch {
      // Repair truncated JSON by appending any missing closing braces/brackets.
      const openBraces = (cleaned.match(/\{/g) || []).length;
      const closeBraces = (cleaned.match(/\}/g) || []).length;
      const openBrackets = (cleaned.match(/\[/g) || []).length;
      const closeBrackets = (cleaned.match(/\]/g) || []).length;
      if (openBraces > closeBraces || openBrackets > closeBrackets) {
        const repaired = cleaned + "}".repeat(Math.max(0, openBraces - closeBraces)) + "]".repeat(Math.max(0, openBrackets - closeBrackets));
        try {
          parsed = JSON.parse(repaired);
          repairNote = " (repaired truncated JSON)";
        } catch {
          // still unparseable — reported as FAIL below
        }
      }
    }
    if (!parsed) {
      return { pass: false, score: "FAIL", output: (0, import_format.sanitizeForReport)(msg), elapsedMs };
    }
    // BUG FIX: coerce to a real boolean. Previously `pass` could leak a truthy
    // non-boolean (the last operand of the && chain, e.g. the language string).
    const hasKeys = Boolean(parsed.name) && parsed.can_count !== void 0 && parsed.sum !== void 0 && Boolean(parsed.language);
    const correctSum = parsed.sum === 42; // 15 + 27
    const hasCorrectCount = parsed.can_count === true;
    let score;
    if (hasKeys && correctSum && hasCorrectCount) {
      score = "STRONG";
    } else if (hasKeys && (correctSum || hasCorrectCount)) {
      score = "MODERATE";
    } else if (parsed.sum !== void 0 || parsed.name) {
      score = "WEAK";
    } else {
      score = "FAIL";
    }
    return {
      pass: hasKeys,
      score,
      output: JSON.stringify(parsed) + repairNote,
      elapsedMs
    };
  } catch (e) {
    return { pass: false, score: "ERROR", output: e.message, elapsedMs: 0 };
  }
}
|
|
943
|
+
/**
 * Instruction-following test (generic provider path): same contract as
 * testInstructionFollowing but routed through providerChat instead of Ollama.
 *
 * @param {object} providerInfo - Provider descriptor passed through to providerChat.
 * @param {string} model - Model identifier for the provider.
 * @returns {Promise<{pass: boolean, score: string, output: string, elapsedMs: number}>}
 *   score: STRONG (all keys, correct values), MODERATE (all keys, one value
 *   wrong), WEAK (partial compliance), FAIL (no parseable JSON), ERROR.
 */
async function testInstructionFollowingProvider(providerInfo, model) {
  const prompt = `You must respond with ONLY a valid JSON object. No markdown, no explanation, no backticks, no extra text.

The JSON object must have exactly these 4 keys:
- "name" (string): your model name
- "can_count" (boolean): true
- "sum" (number): the result of 15 + 27
- "language" (string): the language you are responding in`;
  try {
    const result = await providerChat(providerInfo, model, [
      { role: "user", content: prompt }
    ]);
    const msg = result.content.trim();
    let parsed = null;
    let repairNote = "";
    // Strip markdown code fences before parsing. Computed once — the original
    // duplicated this expression in both the try and catch branches.
    const cleaned = msg.replace(/```json?\s*/gi, "").replace(/```/g, "").trim();
    try {
      parsed = JSON.parse(cleaned);
    } catch {
      // Repair truncated JSON by appending any missing closing braces/brackets.
      const openBraces = (cleaned.match(/\{/g) || []).length;
      const closeBraces = (cleaned.match(/\}/g) || []).length;
      const openBrackets = (cleaned.match(/\[/g) || []).length;
      const closeBrackets = (cleaned.match(/\]/g) || []).length;
      if (openBraces > closeBraces || openBrackets > closeBrackets) {
        const repaired = cleaned + "}".repeat(Math.max(0, openBraces - closeBraces)) + "]".repeat(Math.max(0, openBrackets - closeBrackets));
        try {
          parsed = JSON.parse(repaired);
          repairNote = " (repaired truncated JSON)";
        } catch {
          // still unparseable — reported as FAIL below
        }
      }
    }
    if (!parsed) {
      return { pass: false, score: "FAIL", output: (0, import_format.sanitizeForReport)(msg), elapsedMs: result.elapsedMs };
    }
    // BUG FIX: coerce to a real boolean. Previously `pass` could leak a truthy
    // non-boolean (the last operand of the && chain, e.g. the language string).
    const hasKeys = Boolean(parsed.name) && parsed.can_count !== void 0 && parsed.sum !== void 0 && Boolean(parsed.language);
    const correctSum = parsed.sum === 42; // 15 + 27
    const hasCorrectCount = parsed.can_count === true;
    let score;
    if (hasKeys && correctSum && hasCorrectCount) {
      score = "STRONG";
    } else if (hasKeys && (correctSum || hasCorrectCount)) {
      score = "MODERATE";
    } else if (parsed.sum !== void 0 || parsed.name) {
      score = "WEAK";
    } else {
      score = "FAIL";
    }
    return {
      pass: hasKeys,
      score,
      output: JSON.stringify(parsed) + repairNote,
      elapsedMs: result.elapsedMs
    };
  } catch (e) {
    return { pass: false, score: "ERROR", output: e.message, elapsedMs: 0 };
  }
}
|
|
1002
|
+
/**
 * Detect a model's tool-calling capability level: "native" (structured
 * tool_calls in the API response), "react" (tool intent expressed in text),
 * or "none". Results are cached per model via cacheToolSupport, and a prior
 * cached result short-circuits the probe entirely.
 *
 * @param {string} model - Ollama model name to probe.
 * @param {string} family - Detected model family, stored alongside the cache entry.
 * @returns {Promise<{level: string, cached: boolean, evidence: string, elapsedMs: number}>}
 */
async function testToolSupport(model, family) {
  // Fast path: reuse a previously cached verdict for this model.
  const cached = getCachedToolSupport(model);
  if (cached) {
    return {
      level: cached.support,
      cached: true,
      evidence: `cached (tested ${cached.testedAt})`,
      elapsedMs: 0
    };
  }
  // Single probe tool in OpenAI-style function schema.
  const tools = [
    {
      type: "function",
      function: {
        name: "get_weather",
        description: "Get the current weather for a location",
        parameters: {
          type: "object",
          properties: {
            location: { type: "string", description: "City name" },
            unit: { type: "string", enum: ["celsius", "fahrenheit"] }
          },
          required: ["location"]
        }
      }
    }
  ];
  const body = {
    model,
    messages: [
      {
        role: "system",
        content: "You are a helpful assistant with access to tools. When you need to look up information, use the available tools. Always use tools when asked about real-time data like weather."
      },
      { role: "user", content: "What's the weather like in Tokyo right now? Use the get_weather tool to find out." }
    ],
    tools,
    stream: false,
    options: { num_predict: 1024, temperature: 0.1 }
  };
  try {
    const start = Date.now();
    // curl is capped at 120s; the exec wrapper gets a slightly larger 130s
    // timeout so curl's own limit fires first.
    const result = await pi.exec("curl", [
      "-s",
      "--fail-with-body",
      "-X",
      "POST",
      "--connect-timeout",
      "30",
      "--max-time",
      "120",
      `${OLLAMA_BASE}/api/chat`,
      "-H",
      "Content-Type: application/json",
      "-d",
      JSON.stringify(body)
    ], { timeout: 13e4 });
    const elapsedMs = Date.now() - start;
    // Transport-level failure: record "none" so repeat probes are skipped.
    if (result.code !== 0 || !result.stdout.trim()) {
      const detail = result.stderr?.trim() || result.stdout?.trim() || "empty response";
      const level2 = "none";
      cacheToolSupport(model, level2, family);
      return { level: level2, cached: false, evidence: `API error: ${(0, import_format.truncate)(detail, 100)}`, elapsedMs };
    }
    const parsed = JSON.parse(result.stdout);
    const toolCalls = parsed?.message?.tool_calls;
    const content = (parsed?.message?.content || "").trim();
    // Phase 1: structured tool_calls array in the response => native support.
    if (toolCalls && Array.isArray(toolCalls) && toolCalls.length > 0) {
      const fn = toolCalls[0].function || {};
      const fnName = fn.name || "unknown";
      let argsStr;
      try {
        // Arguments may arrive as a JSON string or an already-parsed object.
        const args = typeof fn.arguments === "string" ? JSON.parse(fn.arguments) : fn.arguments || {};
        argsStr = JSON.stringify(args);
      } catch {
        argsStr = String(fn.arguments);
      }
      const level2 = "native";
      cacheToolSupport(model, level2, family);
      return {
        level: level2,
        cached: false,
        evidence: `API returned tool_calls: ${fnName}(${argsStr})`,
        elapsedMs
      };
    }
    // Phase 2: ReAct-style markers in the plain-text reply.
    const reactPatterns = [
      /^\s*Action:\s*/im,
      // "Action: get_weather"
      /^\s*Action Input:\s*/im,
      // "Action Input: {"location": "Tokyo"}"
      /^\s*Thought:\s*/im,
      // "Thought: I need to look up the weather"
      /Action:\s*\w+/i,
      // "Action: get_weather" anywhere
      /Action Input:\s*\{/i
      // "Action Input: {..." anywhere
    ];
    const hasReActPattern = reactPatterns.some((p) => p.test(content));
    if (hasReActPattern) {
      const level2 = "react";
      cacheToolSupport(model, level2, family);
      return {
        level: level2,
        cached: false,
        evidence: `ReAct format detected in text response`,
        elapsedMs
      };
    }
    // Phase 3: JSON-shaped tool call embedded in the text (code fences removed).
    const strippedContent = content.replace(/^\s*```\w*\s*/gm, "").replace(/```\s*$/gm, "").trim();
    const textToolPatterns = [
      /\bget_weather\b/i,
      // Model mentions the tool name
      /\bfunction_call\b/i,
      // Explicit function call marker
      /\btool_call\b/i,
      // Explicit tool call marker
      /"name"\s*:\s*"get_weather"/
      // JSON with tool name
    ];
    const hasTextToolSignal = textToolPatterns.some((p) => p.test(strippedContent));
    const hasJsonToolCall = /"name"\s*:\s*"get_weather"/i.test(strippedContent) && /"arguments"\s*:\s*\{/i.test(strippedContent);
    if (hasJsonToolCall) {
      const level2 = "react";
      cacheToolSupport(model, level2, family);
      return {
        level: level2,
        cached: false,
        evidence: `JSON tool call in text (no native API tool_calls \u2014 will use react-fallback)`,
        elapsedMs
      };
    }
    // Phase 4: no tool-calling signal at all.
    const level = "none";
    cacheToolSupport(model, level, family);
    const cleanContent = (0, import_format.truncate)(strippedContent, 150);
    const evidenceDetail = hasTextToolSignal ? `no structured tool calling (text mentions tool: ${cleanContent})` : `no tool calling patterns (text: ${cleanContent})`;
    return { level, cached: false, evidence: evidenceDetail, elapsedMs };
  } catch (e) {
    // Any exception (JSON parse, exec failure) is also cached as "none".
    const level = "none";
    cacheToolSupport(model, level, family);
    return { level, cached: false, evidence: `error: ${e.message}`, elapsedMs: 0 };
  }
}
|
|
1145
|
+
/**
 * List the model names currently available on the Ollama server.
 * Queries /api/tags via curl; any failure (non-zero exit, empty body,
 * bad JSON) yields an empty array rather than throwing.
 *
 * @returns {Promise<string[]>} model names, possibly empty.
 */
async function getOllamaModels() {
  try {
    const curlArgs = ["-s", "--connect-timeout", "10", `${OLLAMA_BASE}/api/tags`];
    const res = await pi.exec("curl", curlArgs, { timeout: 15000 });
    if (res.code !== 0) return [];
    if (!res.stdout.trim()) return [];
    const payload = JSON.parse(res.stdout);
    const names = [];
    for (const entry of payload.models || []) {
      // keep only entries that actually carry a name
      if (entry.name) names.push(entry.name);
    }
    return names;
  } catch {
    return [];
  }
}
|
|
1155
|
+
/**
 * Extract the id of the model currently selected in the given context.
 *
 * @param {object} ctx - Context object; may lack a model entry.
 * @returns {string|undefined} the model id, or undefined when no model is set.
 */
function getCurrentModel(ctx) {
  const { model } = ctx;
  if (model === null || model === undefined) return undefined;
  return model.id;
}
|
|
1158
|
+
/**
 * Sync the benchmark's thinking-test verdict into ~/.pi/agent/models.json by
 * setting the matching model entry's `reasoning` flag.
 *
 * Never throws: every outcome (file missing, model not listed, value already
 * correct, parse/write failure, success) is reported via the return value.
 *
 * @param {string} model - Model id to look up across all providers.
 * @param {boolean} hasReasoning - New value for the model's reasoning flag.
 * @returns {{updated: boolean, message: string}} outcome description.
 */
function updateModelsJsonReasoning(model, hasReasoning) {
  const modelsJsonPath = path.join(os.homedir(), ".pi", "agent", "models.json");
  if (!fs.existsSync(modelsJsonPath)) {
    return { updated: false, message: "models.json not found \u2014 skipped" };
  }
  try {
    const config = JSON.parse(fs.readFileSync(modelsJsonPath, "utf-8"));
    // Locate the first entry with a matching id across every provider.
    let target = null;
    search: for (const provider of Object.values(config.providers || {})) {
      for (const entry of provider.models || []) {
        if (entry.id === model) {
          target = entry;
          break search;
        }
      }
    }
    if (!target) {
      return { updated: false, message: `${model} not found in models.json \u2014 skipped` };
    }
    if (target.reasoning === hasReasoning) {
      return { updated: false, message: `reasoning already "${hasReasoning}" for ${model} \u2014 no change` };
    }
    target.reasoning = hasReasoning;
    // Persist with 2-space indentation and a trailing newline.
    fs.writeFileSync(modelsJsonPath, JSON.stringify(config, null, 2) + "\n", "utf-8");
    const action = hasReasoning ? "set reasoning: true" : "set reasoning: false";
    return { updated: true, message: `\u2705 Updated ${model}: ${action}` };
  } catch (e) {
    return { updated: false, message: `Failed to update models.json: ${e.message}` };
  }
}
|
|
1193
|
+
// Banner printed at the top of every benchmark report (see testModelOllama).
const branding = [
  ` \u26A1 Pi Model Benchmark v1.0.3`,
  ` Written by VTSTech`,
  ` GitHub: https://github.com/VTSTech`,
  ` Website: www.vts-tech.org`
].join("\n");
|
|
1199
|
+
async function testModelOllama(model) {
|
|
1200
|
+
const lines = [];
|
|
1201
|
+
const totalStart = Date.now();
|
|
1202
|
+
lines.push(branding);
|
|
1203
|
+
lines.push((0, import_format.section)(`MODEL: ${model}`));
|
|
1204
|
+
lines.push((0, import_format.info)("Provider: Ollama (local/remote)"));
|
|
1205
|
+
let modelSize = "unknown";
|
|
1206
|
+
let modelFamily = "unknown";
|
|
1207
|
+
let modelParams = "unknown";
|
|
1208
|
+
let modelQuant = "unknown";
|
|
1209
|
+
let modelModified = "unknown";
|
|
1210
|
+
try {
|
|
1211
|
+
const tagsResult = await pi.exec("curl", ["-s", `${OLLAMA_BASE}/api/tags`], { timeout: 1e4 });
|
|
1212
|
+
if (tagsResult.code === 0 && tagsResult.stdout.trim()) {
|
|
1213
|
+
const tags = JSON.parse(tagsResult.stdout);
|
|
1214
|
+
const entry = (tags.models || []).find((m) => m.name === model);
|
|
1215
|
+
if (entry) {
|
|
1216
|
+
const details = entry.details || {};
|
|
1217
|
+
const sizeBytes = entry.size || 0;
|
|
1218
|
+
const sizeGB = sizeBytes / (1024 * 1024 * 1024);
|
|
1219
|
+
const sizeMB = sizeBytes / (1024 * 1024);
|
|
1220
|
+
modelSize = sizeGB >= 1 ? `${sizeGB.toFixed(1)} GB` : `${sizeMB.toFixed(0)} MB`;
|
|
1221
|
+
modelFamily = details.family || details.families?.[0] || "unknown";
|
|
1222
|
+
modelParams = details.parameter_size || "unknown";
|
|
1223
|
+
modelQuant = details.quantization_level || "unknown";
|
|
1224
|
+
const modDate = entry.modified_at ? new Date(entry.modified_at) : null;
|
|
1225
|
+
modelModified = modDate ? modDate.toLocaleDateString() : "unknown";
|
|
1226
|
+
}
|
|
1227
|
+
}
|
|
1228
|
+
} catch {
|
|
1229
|
+
}
|
|
1230
|
+
const detectedFamily = (0, import_ollama.detectModelFamily)(model);
|
|
1231
|
+
lines.push((0, import_format.info)(`Size: ${modelSize} | Params: ${modelParams} | Quant: ${modelQuant}`));
|
|
1232
|
+
lines.push((0, import_format.info)(`Family: ${modelFamily} | Detected: ${detectedFamily} | Modified: ${modelModified}`));
|
|
1233
|
+
lines.push((0, import_format.section)("REASONING TEST"));
|
|
1234
|
+
lines.push((0, import_format.info)("Prompt: A snail climbs 3ft up a wall each day, slides 2ft back each night. Wall is 10ft. How many days?"));
|
|
1235
|
+
lines.push((0, import_format.info)("Testing..."));
|
|
1236
|
+
const reasoning = await testReasoning(model);
|
|
1237
|
+
lines.push((0, import_format.info)(`Time: ${(0, import_format.msHuman)(reasoning.elapsedMs)}`));
|
|
1238
|
+
if (reasoning.score === "STRONG") {
|
|
1239
|
+
lines.push((0, import_format.ok)(`Answer: ${reasoning.answer} \u2014 Correct with clear reasoning (${reasoning.score})`));
|
|
1240
|
+
} else if (reasoning.score === "MODERATE") {
|
|
1241
|
+
lines.push((0, import_format.ok)(`Answer: ${reasoning.answer} \u2014 Correct but weak reasoning (${reasoning.score})`));
|
|
1242
|
+
} else if (reasoning.score === "WEAK") {
|
|
1243
|
+
lines.push((0, import_format.fail)(`Answer: ${reasoning.answer} \u2014 Reasoned but wrong answer (${reasoning.score})`));
|
|
1244
|
+
} else if (reasoning.score === "FAIL") {
|
|
1245
|
+
lines.push((0, import_format.fail)(`Answer: ${reasoning.answer} \u2014 No reasoning detected (${reasoning.score})`));
|
|
1246
|
+
} else {
|
|
1247
|
+
const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." : (0, import_format.truncate)(reasoning.reasoning, 300);
|
|
1248
|
+
lines.push((0, import_format.fail)(`Error: ${errMsg}`));
|
|
1249
|
+
}
|
|
1250
|
+
lines.push((0, import_format.info)(`Response: ${(0, import_format.sanitizeForReport)(reasoning.reasoning)}`));
|
|
1251
|
+
lines.push((0, import_format.section)("THINKING TEST"));
|
|
1252
|
+
lines.push((0, import_format.info)('Prompt: "Multiply 37 by 43. Explain your reasoning step by step."'));
|
|
1253
|
+
await rateLimitDelay(lines);
|
|
1254
|
+
const thinking = await testThinking(model);
|
|
1255
|
+
lines.push((0, import_format.info)(`Time: ${(0, import_format.msHuman)(thinking.elapsedMs)}`));
|
|
1256
|
+
if (thinking.supported) {
|
|
1257
|
+
lines.push((0, import_format.ok)(`Thinking/reasoning tokens: SUPPORTED`));
|
|
1258
|
+
lines.push((0, import_format.info)(`Thinking content: ${(0, import_format.sanitizeForReport)(thinking.thinkingContent)}`));
|
|
1259
|
+
} else {
|
|
1260
|
+
lines.push((0, import_format.fail)(`Thinking/reasoning tokens: NOT SUPPORTED`));
|
|
1261
|
+
}
|
|
1262
|
+
lines.push((0, import_format.info)(`Answer output: ${(0, import_format.sanitizeForReport)(thinking.answerContent)}`));
|
|
1263
|
+
lines.push((0, import_format.section)("MODELS.JSON SYNC"));
|
|
1264
|
+
const reasoningUpdate = updateModelsJsonReasoning(model, thinking.supported);
|
|
1265
|
+
lines.push((0, import_format.info)(reasoningUpdate.message));
|
|
1266
|
+
lines.push((0, import_format.section)("TOOL USAGE TEST"));
|
|
1267
|
+
lines.push((0, import_format.info)(`Prompt: "What's the weather in Paris?" (with get_weather tool available)`));
|
|
1268
|
+
lines.push((0, import_format.info)("Testing..."));
|
|
1269
|
+
await rateLimitDelay(lines);
|
|
1270
|
+
const tools = await testToolUsage(model);
|
|
1271
|
+
lines.push((0, import_format.info)(`Time: ${(0, import_format.msHuman)(tools.elapsedMs)}`));
|
|
1272
|
+
if (tools.score === "STRONG") {
|
|
1273
|
+
lines.push((0, import_format.ok)(`Tool call: ${tools.toolCall} (${tools.score})`));
|
|
1274
|
+
if (tools.response) {
|
|
1275
|
+
lines.push((0, import_format.info)(`Raw response: ${(0, import_format.sanitizeForReport)(tools.response)}`));
|
|
1276
|
+
}
|
|
1277
|
+
} else if (tools.score === "MODERATE") {
|
|
1278
|
+
lines.push((0, import_format.ok)(`Tool call: ${tools.toolCall} (${tools.score})`));
|
|
1279
|
+
if (tools.response) {
|
|
1280
|
+
lines.push((0, import_format.info)(`Raw response: ${(0, import_format.sanitizeForReport)(tools.response)}`));
|
|
1281
|
+
}
|
|
1282
|
+
} else if (tools.score === "WEAK") {
|
|
1283
|
+
lines.push((0, import_format.warn)(`Tool call: ${tools.toolCall} (${tools.score}) \u2014 malformed call`));
|
|
1284
|
+
if (tools.response) {
|
|
1285
|
+
lines.push((0, import_format.info)(`Raw response: ${(0, import_format.sanitizeForReport)(tools.response)}`));
|
|
1286
|
+
}
|
|
1287
|
+
} else if (tools.score === "FAIL") {
|
|
1288
|
+
const hasResponse = tools.response && tools.response.trim().length > 0;
|
|
1289
|
+
lines.push((0, import_format.fail)(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${tools.score})`));
|
|
1290
|
+
if (hasResponse) {
|
|
1291
|
+
lines.push((0, import_format.info)(`Text response: ${(0, import_format.sanitizeForReport)(tools.response)}`));
|
|
1292
|
+
} else {
|
|
1293
|
+
lines.push((0, import_format.info)("Text response: (empty)"));
|
|
1294
|
+
}
|
|
1295
|
+
} else {
|
|
1296
|
+
lines.push((0, import_format.fail)(`Error: ${tools.toolCall}`));
|
|
1297
|
+
}
|
|
1298
|
+
lines.push((0, import_format.section)("REACT PARSING TEST"));
|
|
1299
|
+
lines.push((0, import_format.info)(`Prompt: "What's the weather in Tokyo?" (ReAct format, no native tools)`));
|
|
1300
|
+
lines.push((0, import_format.info)("Testing..."));
|
|
1301
|
+
await rateLimitDelay(lines);
|
|
1302
|
+
const react = await testReactParsing(model);
|
|
1303
|
+
lines.push((0, import_format.info)(`Time: ${(0, import_format.msHuman)(react.elapsedMs)}`));
|
|
1304
|
+
if (react.score === "STRONG") {
|
|
1305
|
+
lines.push((0, import_format.ok)(`ReAct parsed: ${react.toolCall} (${react.score})`));
|
|
1306
|
+
if (react.thought) {
|
|
1307
|
+
lines.push((0, import_format.info)(`Thought: ${(0, import_format.sanitizeForReport)(react.thought)}`));
|
|
1308
|
+
}
|
|
1309
|
+
} else if (react.score === "MODERATE") {
|
|
1310
|
+
lines.push((0, import_format.ok)(`ReAct parsed: ${react.toolCall} (${react.score})`));
|
|
1311
|
+
if (react.thought) {
|
|
1312
|
+
lines.push((0, import_format.info)(`Thought: ${(0, import_format.sanitizeForReport)(react.thought)}`));
|
|
1313
|
+
}
|
|
1314
|
+
} else if (react.score === "WEAK") {
|
|
1315
|
+
lines.push((0, import_format.warn)(`ReAct parsed: ${react.toolCall} (${react.score}) \u2014 wrong tool or malformed args`));
|
|
1316
|
+
if (react.thought) {
|
|
1317
|
+
lines.push((0, import_format.info)(`Thought: ${(0, import_format.sanitizeForReport)(react.thought)}`));
|
|
1318
|
+
}
|
|
1319
|
+
} else if (react.score === "FAIL") {
|
|
1320
|
+
lines.push((0, import_format.fail)(`ReAct parsing: ${react.toolCall} (${react.score})`));
|
|
1321
|
+
if (react.response) {
|
|
1322
|
+
lines.push((0, import_format.info)(`Response: ${(0, import_format.sanitizeForReport)(react.response)}`));
|
|
1323
|
+
}
|
|
1324
|
+
} else {
|
|
1325
|
+
lines.push((0, import_format.fail)(`Error: ${react.toolCall}`));
|
|
1326
|
+
}
|
|
1327
|
+
lines.push((0, import_format.section)("INSTRUCTION FOLLOWING TEST"));
|
|
1328
|
+
lines.push((0, import_format.info)("Prompt: Respond with ONLY a JSON object with keys: name, can_count, sum (15+27), language"));
|
|
1329
|
+
lines.push((0, import_format.info)("Testing..."));
|
|
1330
|
+
await rateLimitDelay(lines);
|
|
1331
|
+
const instructions = await testInstructionFollowing(model);
|
|
1332
|
+
lines.push((0, import_format.info)(`Time: ${(0, import_format.msHuman)(instructions.elapsedMs)}`));
|
|
1333
|
+
if (instructions.score === "STRONG") {
|
|
1334
|
+
lines.push((0, import_format.ok)(`JSON output valid with correct values (${instructions.score})`));
|
|
1335
|
+
} else if (instructions.score === "MODERATE") {
|
|
1336
|
+
lines.push((0, import_format.ok)(`JSON output valid but some values incorrect (${instructions.score})`));
|
|
1337
|
+
} else if (instructions.score === "WEAK") {
|
|
1338
|
+
lines.push((0, import_format.warn)(`Partial JSON compliance (${instructions.score})`));
|
|
1339
|
+
} else {
|
|
1340
|
+
lines.push((0, import_format.fail)(`Failed to produce valid JSON (${instructions.score})`));
|
|
1341
|
+
}
|
|
1342
|
+
lines.push((0, import_format.info)(`Output: ${(0, import_format.sanitizeForReport)(instructions.output)}`));
|
|
1343
|
+
lines.push((0, import_format.section)("TOOL SUPPORT DETECTION"));
|
|
1344
|
+
lines.push((0, import_format.info)("Probing model for tool calling capability (native / ReAct / none)"));
|
|
1345
|
+
lines.push((0, import_format.info)("Testing..."));
|
|
1346
|
+
await rateLimitDelay(lines);
|
|
1347
|
+
const toolSupport = await testToolSupport(model, detectedFamily);
|
|
1348
|
+
lines.push((0, import_format.info)(`Time: ${(0, import_format.msHuman)(toolSupport.elapsedMs)}`));
|
|
1349
|
+
const supportLabel = (level) => {
|
|
1350
|
+
switch (level) {
|
|
1351
|
+
case "native":
|
|
1352
|
+
return "NATIVE (structured API tool_calls)";
|
|
1353
|
+
case "react":
|
|
1354
|
+
return "REACT (Action:/Action Input: text format)";
|
|
1355
|
+
case "none":
|
|
1356
|
+
return "NONE (no tool support detected)";
|
|
1357
|
+
default:
|
|
1358
|
+
return "UNKNOWN";
|
|
1359
|
+
}
|
|
1360
|
+
};
|
|
1361
|
+
if (toolSupport.cached) {
|
|
1362
|
+
lines.push((0, import_format.info)(`Result: ${supportLabel(toolSupport.level)} \u2014 from cache`));
|
|
1363
|
+
} else {
|
|
1364
|
+
if (toolSupport.level === "native") {
|
|
1365
|
+
lines.push((0, import_format.ok)(`Tool support: ${supportLabel(toolSupport.level)}`));
|
|
1366
|
+
} else if (toolSupport.level === "react") {
|
|
1367
|
+
lines.push((0, import_format.ok)(`Tool support: ${supportLabel(toolSupport.level)}`));
|
|
1368
|
+
} else {
|
|
1369
|
+
lines.push((0, import_format.warn)(`Tool support: ${supportLabel(toolSupport.level)}`));
|
|
1370
|
+
}
|
|
1371
|
+
}
|
|
1372
|
+
lines.push((0, import_format.info)(`Evidence: ${toolSupport.evidence}`));
|
|
1373
|
+
lines.push((0, import_format.info)(`Cache: ${TOOL_SUPPORT_CACHE_PATH}`));
|
|
1374
|
+
lines.push((0, import_format.section)("SUMMARY"));
|
|
1375
|
+
const totalMs = Date.now() - totalStart;
|
|
1376
|
+
const tests = [
|
|
1377
|
+
{ name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
|
|
1378
|
+
{ name: "Thinking", pass: thinking.supported, score: thinking.supported ? "YES" : "NO" },
|
|
1379
|
+
{ name: "Tool Usage", pass: tools.pass, score: tools.score },
|
|
1380
|
+
{ name: "ReAct Parse", pass: react.pass, score: react.score },
|
|
1381
|
+
{ name: "Instructions", pass: instructions.pass, score: instructions.score },
|
|
1382
|
+
{ name: "Tool Support", pass: toolSupport.level === "native" || toolSupport.level === "react", score: toolSupport.level.toUpperCase() }
|
|
1383
|
+
];
|
|
1384
|
+
const passed = tests.filter((t) => t.pass).length;
|
|
1385
|
+
const total = tests.length;
|
|
1386
|
+
for (const t of tests) {
|
|
1387
|
+
lines.push(t.pass ? (0, import_format.ok)(`${t.name}: ${t.score}`) : (0, import_format.fail)(`${t.name}: ${t.score}`));
|
|
1388
|
+
}
|
|
1389
|
+
lines.push((0, import_format.info)(`Total time: ${(0, import_format.msHuman)(totalMs)}`));
|
|
1390
|
+
lines.push((0, import_format.info)(`Score: ${passed}/${total} tests passed`));
|
|
1391
|
+
lines.push((0, import_format.section)("RECOMMENDATION"));
|
|
1392
|
+
if (passed === 6) {
|
|
1393
|
+
lines.push((0, import_format.ok)(`${model} is a STRONG model \u2014 full capability`));
|
|
1394
|
+
} else if (passed >= 5) {
|
|
1395
|
+
lines.push((0, import_format.ok)(`${model} is a GOOD model \u2014 most capabilities work`));
|
|
1396
|
+
} else if (passed >= 4) {
|
|
1397
|
+
lines.push((0, import_format.warn)(`${model} is USABLE \u2014 some capabilities are limited`));
|
|
1398
|
+
} else {
|
|
1399
|
+
lines.push((0, import_format.fail)(`${model} is WEAK \u2014 limited capabilities for agent use`));
|
|
1400
|
+
}
|
|
1401
|
+
return lines.join("\n");
|
|
1402
|
+
}
|
|
1403
|
+
/**
 * Run the capability test suite against a model served by a built-in cloud
 * provider (OpenRouter, Anthropic, OpenAI, ...) rather than local Ollama.
 *
 * Tests run in order: connectivity, reasoning, instruction following, tool
 * usage. Ollama-only tests (thinking, ReAct parsing, tool-support cache,
 * /api/tags metadata) are listed as skipped. If connectivity fails, the
 * remaining tests are skipped and a short report is returned immediately.
 *
 * @param {object} providerInfo - Provider descriptor (name, apiMode, baseUrl,
 *   apiKey, envKey) as produced by detectProvider().
 * @param {string} model - Model identifier to test (e.g. "anthropic/claude-3.5-sonnet").
 * @returns {Promise<string>} Multi-line human-readable report.
 */
async function testModelProvider(providerInfo, model) {
  const lines = [];
  const totalStart = Date.now();

  // Header: provider identity and (masked) credential status.
  lines.push(branding);
  lines.push((0, import_format.section)(`MODEL: ${model}`));
  lines.push((0, import_format.info)(`Provider: ${providerInfo.name} (built-in)`));
  lines.push((0, import_format.info)(`API: ${providerInfo.apiMode || "openai-completions"}`));
  lines.push((0, import_format.info)(`Base URL: ${providerInfo.baseUrl || "unknown"}`));
  if (providerInfo.apiKey) {
    // Never print the full key; show only the last 4 characters.
    lines.push((0, import_format.info)(`API Key: ****${providerInfo.apiKey.slice(-4)}`));
  } else {
    lines.push((0, import_format.warn)(`API Key: NOT SET (${providerInfo.envKey || "env var not found"})`));
  }

  // --- Connectivity test: gate for everything else. ---
  lines.push((0, import_format.section)("CONNECTIVITY TEST"));
  lines.push((0, import_format.info)("Sending minimal request to verify API reachability and key validity..."));
  const connectivity = await testConnectivity(providerInfo, model);
  lines.push((0, import_format.info)(`Time: ${(0, import_format.msHuman)(connectivity.elapsedMs)}`));
  if (connectivity.pass) {
    lines.push((0, import_format.ok)(`API reachable and authenticated`));
  } else {
    // Distinguish network failure from auth failure for a clearer hint.
    if (!connectivity.reachable) {
      lines.push((0, import_format.fail)(`API not reachable: ${connectivity.error || "unknown error"}`));
    } else if (!connectivity.authValid) {
      lines.push((0, import_format.fail)(`Authentication failed: ${connectivity.error || "invalid or missing API key"}`));
    } else {
      lines.push((0, import_format.fail)(`Connectivity error: ${connectivity.error || "unknown"}`));
    }
    lines.push((0, import_format.warn)("Skipping remaining tests \u2014 fix connectivity first"));
    lines.push((0, import_format.info)("Tip: Check your API key is set correctly and the provider endpoint is accessible"));
    return lines.join("\n");
  }

  // --- Reasoning test: classic snail-and-wall puzzle. ---
  lines.push((0, import_format.section)("REASONING TEST"));
  lines.push((0, import_format.info)("Prompt: A snail climbs 3ft up a wall each day, slides 2ft back each night. Wall is 10ft. How many days?"));
  lines.push((0, import_format.info)("Testing..."));
  await rateLimitDelay(lines);
  const reasoning = await testReasoningProvider(providerInfo, model);
  lines.push((0, import_format.info)(`Time: ${(0, import_format.msHuman)(reasoning.elapsedMs)}`));
  if (reasoning.score === "STRONG") {
    lines.push((0, import_format.ok)(`Answer: ${reasoning.answer} \u2014 Correct with clear reasoning (${reasoning.score})`));
  } else if (reasoning.score === "MODERATE") {
    lines.push((0, import_format.ok)(`Answer: ${reasoning.answer} \u2014 Correct but weak reasoning (${reasoning.score})`));
  } else if (reasoning.score === "WEAK") {
    lines.push((0, import_format.fail)(`Answer: ${reasoning.answer} \u2014 Reasoned but wrong answer (${reasoning.score})`));
  } else if (reasoning.score === "FAIL") {
    lines.push((0, import_format.fail)(`Answer: ${reasoning.answer} \u2014 No reasoning detected (${reasoning.score})`));
  } else {
    // Unknown score means the call errored; if the provider returned an HTML
    // error page, show only its first line instead of dumping markup.
    const errMsg = reasoning.reasoning.includes("<!DOCTYPE") || reasoning.reasoning.includes("<html") ? reasoning.reasoning.split("\n")[0].slice(0, 100) + "..." : (0, import_format.truncate)(reasoning.reasoning, 300);
    lines.push((0, import_format.fail)(`Error: ${errMsg}`));
  }
  lines.push((0, import_format.info)(`Response: ${(0, import_format.sanitizeForReport)(reasoning.reasoning)}`));

  // --- Instruction-following test: strict JSON-only output. ---
  lines.push((0, import_format.section)("INSTRUCTION FOLLOWING TEST"));
  lines.push((0, import_format.info)("Prompt: Respond with ONLY a JSON object with keys: name, can_count, sum (15+27), language"));
  lines.push((0, import_format.info)("Testing..."));
  await rateLimitDelay(lines);
  const instructions = await testInstructionFollowingProvider(providerInfo, model);
  lines.push((0, import_format.info)(`Time: ${(0, import_format.msHuman)(instructions.elapsedMs)}`));
  if (instructions.score === "STRONG") {
    lines.push((0, import_format.ok)(`JSON output valid with correct values (${instructions.score})`));
  } else if (instructions.score === "MODERATE") {
    lines.push((0, import_format.ok)(`JSON output valid but some values incorrect (${instructions.score})`));
  } else if (instructions.score === "WEAK") {
    lines.push((0, import_format.warn)(`Partial JSON compliance (${instructions.score})`));
  } else {
    lines.push((0, import_format.fail)(`Failed to produce valid JSON (${instructions.score})`));
  }
  lines.push((0, import_format.info)(`Output: ${(0, import_format.sanitizeForReport)(instructions.output)}`));

  // --- Tool usage test: does the model emit a structured tool call? ---
  lines.push((0, import_format.section)("TOOL USAGE TEST"));
  lines.push((0, import_format.info)(`Prompt: "What's the weather in Paris?" (with get_weather tool available)`));
  lines.push((0, import_format.info)("Testing..."));
  await rateLimitDelay(lines);
  const toolTest = await testToolUsageProvider(providerInfo, model);
  lines.push((0, import_format.info)(`Time: ${(0, import_format.msHuman)(toolTest.elapsedMs)}`));
  if (toolTest.score === "STRONG" || toolTest.score === "MODERATE") {
    // STRONG and MODERATE are reported identically (the score string itself
    // carries the distinction), so the two branches are merged.
    lines.push((0, import_format.ok)(`Tool call: ${toolTest.toolCall} (${toolTest.score})`));
    if (toolTest.response) {
      lines.push((0, import_format.info)(`Raw response: ${(0, import_format.sanitizeForReport)(toolTest.response)}`));
    }
  } else if (toolTest.score === "WEAK") {
    lines.push((0, import_format.warn)(`Tool call: ${toolTest.toolCall} (${toolTest.score}) \u2014 malformed call`));
    if (toolTest.response) {
      lines.push((0, import_format.info)(`Raw response: ${(0, import_format.sanitizeForReport)(toolTest.response)}`));
    }
  } else if (toolTest.score === "FAIL") {
    // FAIL: no tool call at all; report whether the model answered in prose
    // or returned nothing.
    const hasResponse = toolTest.response && toolTest.response.trim().length > 0;
    lines.push((0, import_format.fail)(`Tool call: none \u2014 ${hasResponse ? "model responded in text instead" : "model returned empty response"} (${toolTest.score})`));
    if (hasResponse) {
      lines.push((0, import_format.info)(`Text response: ${(0, import_format.sanitizeForReport)(toolTest.response)}`));
    } else {
      lines.push((0, import_format.info)("Text response: (empty)"));
    }
  } else {
    // Unknown score: toolCall carries the error description here.
    lines.push((0, import_format.fail)(`Error: ${toolTest.toolCall}`));
  }

  // Tests that only make sense against a local Ollama server.
  lines.push((0, import_format.section)("SKIPPED TESTS (OLLAMA-ONLY)"));
  lines.push((0, import_format.warn)("Thinking test \u2014 Ollama-specific think:true option and message.thinking field"));
  lines.push((0, import_format.warn)("ReAct parsing test \u2014 only relevant for Ollama models without native tool calling"));
  lines.push((0, import_format.warn)("Tool support detection \u2014 Ollama-specific tool support cache"));
  lines.push((0, import_format.warn)("Model metadata \u2014 Ollama-specific /api/tags endpoint"));

  // --- Summary and recommendation. ---
  lines.push((0, import_format.section)("SUMMARY"));
  const totalMs = Date.now() - totalStart;
  const tests = [
    { name: "Connectivity", pass: connectivity.pass, score: connectivity.pass ? "OK" : "FAIL" },
    { name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
    { name: "Instructions", pass: instructions.pass, score: instructions.score },
    { name: "Tool Usage", pass: toolTest.pass, score: toolTest.score }
  ];
  const passed = tests.filter((t) => t.pass).length;
  const total = tests.length;
  for (const t of tests) {
    lines.push(t.pass ? (0, import_format.ok)(`${t.name}: ${t.score}`) : (0, import_format.fail)(`${t.name}: ${t.score}`));
  }
  lines.push((0, import_format.info)(`Total time: ${(0, import_format.msHuman)(totalMs)}`));
  lines.push((0, import_format.info)(`Score: ${passed}/${total} tests passed`));
  lines.push((0, import_format.section)("RECOMMENDATION"));
  // Compare against `total` rather than a hard-coded 4 so the tiers stay
  // correct if a test is ever added or removed from the suite above.
  if (passed === total) {
    lines.push((0, import_format.ok)(`${model} is a STRONG model via ${providerInfo.name} \u2014 full capability`));
  } else if (passed >= 3) {
    lines.push((0, import_format.ok)(`${model} is a GOOD model via ${providerInfo.name} \u2014 most capabilities work`));
  } else if (passed >= 2) {
    lines.push((0, import_format.warn)(`${model} is USABLE via ${providerInfo.name} \u2014 some capabilities are limited`));
  } else {
    lines.push((0, import_format.fail)(`${model} is WEAK via ${providerInfo.name} \u2014 limited capabilities for agent use`));
  }
  return lines.join("\n");
}
|
|
1533
|
+
/**
 * Dispatch a model test to the appropriate runner based on the active
 * provider. Built-in cloud providers get the provider suite; everything
 * else (including the explicit "ollama" kind and unknown kinds) falls
 * back to the Ollama suite, matching the no-context default.
 *
 * @param {string} model - Model identifier to test.
 * @param {object} [ctx] - Command/tool context used for provider detection.
 * @returns {Promise<string>} The generated report text.
 */
async function testModel(model, ctx) {
  const providerInfo = ctx ? detectProvider(ctx) : { kind: "ollama", name: "ollama" };
  if (providerInfo.kind === "builtin") {
    return testModelProvider(providerInfo, model);
  }
  // "ollama" and any unrecognized provider kind both use the Ollama runner.
  return testModelOllama(model);
}
|
|
1543
|
+
// Register the interactive /model-test command. Accepts an optional model
// name (tab-completed from local Ollama models) or --all to test every
// Ollama model in sequence.
pi.registerCommand("model-test", {
  description: "Test a model for reasoning, thinking, tool usage, ReAct parsing, instruction following, and tool support level. Supports both Ollama and cloud providers. Use: /model-test [model] or /model-test --all",
  // Completion source: local Ollama model names filtered by the typed prefix.
  getArgumentCompletions: async (prefix) => {
    try {
      const models = await getOllamaModels();
      return models.map((m) => ({ label: m, description: `Test ${m}` })).filter((m) => m.label.startsWith(prefix));
    } catch {
      // No Ollama server reachable: silently offer no completions.
      return [];
    }
  },
  handler: async (args, ctx) => {
    if (!ctx.hasUI) {
      ctx.ui.notify("model-test requires TUI mode", "error");
      return;
    }
    // Shared emitter for the report message; previously duplicated verbatim
    // in the --all loop and the single-model path.
    const emitReport = (modelName, report) => {
      pi.sendMessage({
        customType: "model-test-report",
        content: report,
        display: { type: "content", content: report },
        details: { model: modelName, timestamp: new Date().toISOString() }
      });
    };
    const arg = args.trim();
    if (arg === "--all") {
      // --all enumerates local Ollama models, so it cannot work against a
      // cloud provider.
      const providerInfo = detectProvider(ctx);
      if (providerInfo.kind !== "ollama") {
        ctx.ui.notify(`--all is only supported for Ollama models. Current provider: ${providerInfo.name} (${providerInfo.kind})`, "error");
        return;
      }
      ctx.ui.notify("Testing all models \u2014 this will take a while...", "info");
      let models;
      try {
        models = await getOllamaModels();
      } catch {
        ctx.ui.notify("Could not list Ollama models", "error");
        return;
      }
      if (models.length === 0) {
        ctx.ui.notify("No models found in Ollama", "error");
        return;
      }
      for (const model2 of models) {
        ctx.ui.notify(`Testing ${model2}...`, "info");
        try {
          const report = await testModel(model2, ctx);
          emitReport(model2, report);
        } catch (e) {
          // One failing model should not abort the whole sweep.
          ctx.ui.notify(`Failed to test ${model2}: ${e.message}`, "error");
        }
      }
      ctx.ui.notify(`Done testing ${models.length} models`, "info");
      return;
    }
    // Single-model path: explicit argument wins, else the currently
    // selected model.
    const model = arg || getCurrentModel(ctx);
    if (!model) {
      ctx.ui.notify("No model specified and no model currently selected", "error");
      return;
    }
    ctx.ui.notify(`Testing ${model}...`, "info");
    try {
      const report = await testModel(model, ctx);
      emitReport(model, report);
    } catch (e) {
      ctx.ui.notify(`Model test failed: ${e.message}`, "error");
    }
  }
});
|
|
1613
|
+
// Register the model_test tool so the agent itself can trigger a capability
// test and receive the report as tool output.
pi.registerTool({
  name: "model_test",
  label: "Model Test",
  description: "Test a model for reasoning ability, thinking/reasoning token support, tool usage capability, instruction following, and tool support level. Supports both Ollama and built-in cloud providers (OpenRouter, Anthropic, Google, OpenAI, etc.). Returns a detailed report with scores.",
  promptSnippet: "model_test - test a model's capabilities",
  promptGuidelines: [
    "When the user asks to test or evaluate a model, call model_test with the model name."
  ],
  parameters: {
    type: "object",
    properties: {
      model: { type: "string", description: "Model name to test (e.g. qwen3:0.6b, anthropic/claude-3.5-sonnet). If omitted, tests the current model." }
    }
  },
  execute: async (toolCallId, params, signal, onUpdate, ctx) => {
    // Uniform shape for tool results: a single text content item.
    const textResult = (text, isError) => ({
      content: [{ type: "text", text }],
      isError
    });
    // Explicit model argument takes precedence over the session's current model.
    const target = params?.model || getCurrentModel(ctx);
    if (!target) {
      return textResult("No model currently selected to test.", true);
    }
    try {
      const report = await testModel(target, ctx);
      return textResult(report, false);
    } catch (e) {
      return textResult(`Model test failed: ${e.message}`, true);
    }
  }
});
|
|
1649
|
+
}
|