@vtstech/pi-model-test 1.1.0 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -2
- package/model-test.js +168 -261
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -43,12 +43,13 @@ pi install "npm:@vtstech/pi-model-test"
|
|
|
43
43
|
## Features
|
|
44
44
|
|
|
45
45
|
- Auto-detects Ollama vs cloud provider (OpenRouter, Anthropic, Google, OpenAI, Groq, DeepSeek, Mistral, xAI, Together, Fireworks, Cohere)
|
|
46
|
-
-
|
|
46
|
+
- Uses native `fetch()` for all HTTP communication (no shell subprocess or curl dependency)
|
|
47
|
+
- Automatic remote Ollama URL resolution (reads from `models.json` on every call — picks up config changes immediately)
|
|
47
48
|
- Timeout resilience with auto-retry on empty responses
|
|
48
49
|
- Rate limit delay between tests (configurable)
|
|
49
50
|
- Thinking model fallback (retries with `think: true`)
|
|
50
51
|
- Tool support cache (`~/.pi/agent/cache/tool_support.json`)
|
|
51
|
-
- JSON repair for truncated output
|
|
52
|
+
- JSON repair for truncated output (stack-based nesting-aware parser)
|
|
52
53
|
- Tab-completion for model names
|
|
53
54
|
|
|
54
55
|
## Links
|
package/model-test.js
CHANGED
|
@@ -23,8 +23,6 @@ var CONFIG = {
|
|
|
23
23
|
// Single retry for transient failures
|
|
24
24
|
RETRY_DELAY_MS: 1e4,
|
|
25
25
|
// 10 seconds between retries
|
|
26
|
-
EXEC_BUFFER_MS: 8e3,
|
|
27
|
-
// Extra buffer for exec timeout over curl timeout
|
|
28
26
|
// Model generation settings
|
|
29
27
|
NUM_PREDICT: 1024,
|
|
30
28
|
// Max tokens in response
|
|
@@ -35,17 +33,11 @@ var CONFIG = {
|
|
|
35
33
|
// Minimum chars to consider thinking tokens valid
|
|
36
34
|
TOOL_TEST_TIMEOUT_MS: 999999,
|
|
37
35
|
// Effectively unlimited for slow tool usage tests
|
|
38
|
-
TOOL_TEST_MAX_TIME_S: 999999,
|
|
39
|
-
// Max curl time for tool tests (effectively unlimited)
|
|
40
36
|
TOOL_SUPPORT_TIMEOUT_MS: 999999,
|
|
41
37
|
// Effectively unlimited for tool support detection
|
|
42
|
-
TOOL_SUPPORT_MAX_TIME_S: 999999,
|
|
43
|
-
// Max curl time for tool support detection
|
|
44
38
|
// Metadata retrieval
|
|
45
39
|
TAGS_TIMEOUT_MS: 15e3,
|
|
46
40
|
// 15 seconds for /api/tags
|
|
47
|
-
TAGS_CONNECT_TIMEOUT_S: 30,
|
|
48
|
-
// 30 seconds connection timeout for tags
|
|
49
41
|
MODEL_INFO_TIMEOUT_MS: 3e4,
|
|
50
42
|
// 30 seconds for model info lookup
|
|
51
43
|
// Provider API settings
|
|
@@ -95,53 +87,120 @@ function cacheToolSupport(model, support, family) {
|
|
|
95
87
|
writeToolSupportCache(cache);
|
|
96
88
|
}
|
|
97
89
|
function model_test_temp_default(pi) {
|
|
98
|
-
|
|
90
|
+
/**
 * Resolve the Ollama base URL at call time.
 *
 * Delegates to getOllamaBaseUrl() on every invocation instead of holding a
 * value captured once, so each request builds its endpoint from whatever
 * that helper currently returns.
 *
 * @returns {*} Whatever getOllamaBaseUrl() returns; callers interpolate it
 *   into endpoint URLs such as `${ollamaBase()}/api/chat`.
 */
function ollamaBase() {
  const baseUrl = getOllamaBaseUrl();
  return baseUrl;
}
|
|
99
93
|
/**
 * Pause between tests when a delay is configured.
 *
 * When CONFIG.TEST_DELAY_MS is greater than zero, appends an informational
 * line to `lines` and then waits that many milliseconds. Otherwise resolves
 * immediately without touching `lines`.
 *
 * @param {Array} lines - Output buffer that receives the "Waiting ..." notice.
 * @returns {Promise<void>} Resolves once the delay (if any) has elapsed.
 */
async function rateLimitDelay(lines) {
  const delayMs = CONFIG.TEST_DELAY_MS;
  // Guard clause: zero, negative, or non-numeric delays mean "no pause".
  if (!(delayMs > 0)) {
    return;
  }
  lines.push(info(`Waiting ${msHuman(delayMs)} to avoid rate limiting...`));
  await new Promise((resolve) => {
    setTimeout(resolve, delayMs);
  });
}
|
|
99
|
+
/**
 * Grade a model's free-text answer to the reasoning test.
 *
 * The last standalone integer in the message is taken as the model's final
 * answer and compared with the expected value "8". Independently, the message
 * is checked for signs of worked reasoning: any of a fixed list of keywords
 * (case-insensitive substring match) or a line that starts with a numbered
 * step such as "1. ".
 *
 * Grades:
 *   STRONG   - correct answer with reasoning (pass)
 *   MODERATE - correct answer without reasoning (pass)
 *   WEAK     - reasoning present but wrong or missing answer (fail)
 *   FAIL     - neither (fail)
 *
 * @param {string} msg - Model output. Non-string values are coerced; null and
 *   undefined are treated as an empty message instead of throwing.
 * @returns {{score: string, pass: boolean}} The grade and pass/fail flag.
 */
function scoreReasoning(msg) {
  // Defensive coercion: a missing message should grade as FAIL, not crash.
  let text = "";
  if (typeof msg === "string") {
    text = msg;
  } else if (msg != null) {
    text = String(msg);
  }

  const allNumbers = text.match(/\b(\d+)\b/g) || [];
  const answer = allNumbers.length > 0 ? allNumbers[allNumbers.length - 1] : "?";
  const isCorrect = answer === "8";

  const reasoningPatterns = [
    "because",
    "therefore",
    "since",
    "step",
    "subtract",
    "minus",
    "each day",
    "each night",
    "slides",
    "climbs",
    "night",
    "reaches",
    "finally",
    "last day"
  ];
  // Lowercase once rather than once per keyword.
  const lowered = text.toLowerCase();
  const hasReasoningWords = reasoningPatterns.some((w) => lowered.includes(w));
  const hasNumberedSteps = /^\s*\d+\.\s/m.test(text);
  const hasReasoning = hasReasoningWords || hasNumberedSteps;

  if (isCorrect && hasReasoning) return { score: "STRONG", pass: true };
  if (isCorrect) return { score: "MODERATE", pass: true };
  if (hasReasoning) return { score: "WEAK", pass: false };
  return { score: "FAIL", pass: false };
}
|
|
127
|
+
/**
 * Grade a native (structured) tool call produced for the weather test.
 *
 * Grades:
 *   STRONG   - tool is "get_weather", location mentions "paris", and `unit`
 *              is either absent or "celsius"/"fahrenheit" (case-insensitive)
 *   MODERATE - correct tool and location, but an invalid `unit`
 *   WEAK     - anything else (fail)
 *
 * @param {string} fnName - Name of the function the model called.
 * @param {object|string|null|undefined} args - Call arguments. An object is
 *   used as-is. A string is decoded as JSON, since some APIs return arguments
 *   as a JSON-encoded string. Anything unusable is treated as "no arguments"
 *   rather than throwing.
 * @returns {{score: string, pass: boolean}} The grade and pass/fail flag.
 */
function scoreNativeToolCall(fnName, args) {
  let params = args;
  if (typeof params === "string") {
    try {
      params = JSON.parse(params);
    } catch {
      // Undecodable argument text carries no usable fields.
      params = null;
    }
  }
  if (params === null || typeof params !== "object") {
    params = {};
  }

  const hasCorrectTool = fnName === "get_weather";
  const hasLocation = typeof params.location === "string" && params.location.toLowerCase().includes("paris");
  const unitValid =
    params.unit === void 0 ||
    (typeof params.unit === "string" && ["celsius", "fahrenheit"].includes(params.unit.toLowerCase()));

  if (hasCorrectTool && hasLocation && unitValid) return { score: "STRONG", pass: true };
  if (hasCorrectTool && hasLocation) return { score: "MODERATE", pass: true };
  return { score: "WEAK", pass: false };
}
|
|
135
|
+
/**
 * Grade a tool call that was recovered from plain text rather than from a
 * structured tool_calls field.
 *
 * Grades:
 *   STRONG   - tool is "get_weather" and location mentions "paris"
 *   MODERATE - tool is "get_weather" but the location is missing or different
 *   WEAK     - any other tool name (fail)
 *
 * @param {string} fnName - Name of the function found in the text.
 * @param {object|null|undefined} args - Parsed arguments. A missing value is
 *   treated as "no arguments" instead of throwing.
 * @returns {{score: string, pass: boolean}} The grade and pass/fail flag.
 */
function scoreTextToolCall(fnName, args) {
  const isWeatherTool = fnName === "get_weather";
  // Optional chaining keeps a null/undefined argument bag from crashing.
  const location = args?.location;
  const hasLocation = typeof location === "string" && location.toLowerCase().includes("paris");

  if (isWeatherTool && hasLocation) return { score: "STRONG", pass: true };
  if (isWeatherTool) return { score: "MODERATE", pass: true };
  return { score: "WEAK", pass: false };
}
|
|
142
|
+
/**
 * Recover a tool call that a model wrote as JSON inside plain text.
 *
 * Two shapes are accepted:
 *   nested: {"name": "...", "arguments": {...}}
 *   flat:   {"name": "...", "location": "..."}  (all other keys are arguments)
 *
 * The span from the first "{" to the last "}" is tried first. If that does
 * not parse (for example, prose containing braces follows the JSON), the
 * object that starts at the first "{" is located by brace matching and tried
 * instead.
 *
 * @param {string} content - Raw message text from the model.
 * @returns {{fnName: string, args: object} | null} The recovered call, or
 *   null when no JSON object with a string `name` can be found.
 */
function parseTextToolCall(content) {
  if (typeof content !== "string") return null;
  const firstBrace = content.indexOf("{");
  if (firstBrace === -1) return null;
  const lastBrace = content.lastIndexOf("}");
  if (lastBrace <= firstBrace) return null;

  // JSON.parse wrapper that reports failure as null instead of throwing.
  const tryParse = (candidate) => {
    try {
      return JSON.parse(candidate);
    } catch {
      return null;
    }
  };

  // Index of the "}" closing the object opened at `start`, ignoring braces
  // that appear inside JSON string literals; -1 when it never closes.
  const findObjectEnd = (text, start) => {
    let depth = 0;
    let inString = false;
    let escaped = false;
    for (let i = start; i < text.length; i++) {
      const ch = text[i];
      if (inString) {
        if (escaped) escaped = false;
        else if (ch === "\\") escaped = true;
        else if (ch === '"') inString = false;
        continue;
      }
      if (ch === '"') inString = true;
      else if (ch === "{") depth++;
      else if (ch === "}") {
        depth--;
        if (depth === 0) return i;
      }
    }
    return -1;
  };

  let textToolParsed = tryParse(content.slice(firstBrace, lastBrace + 1));
  if (textToolParsed === null) {
    const balancedEnd = findObjectEnd(content, firstBrace);
    // Skip the retry when it would be the exact span that already failed.
    if (balancedEnd !== -1 && balancedEnd !== lastBrace) {
      textToolParsed = tryParse(content.slice(firstBrace, balancedEnd + 1));
    }
  }
  if (!textToolParsed || typeof textToolParsed.name !== "string") return null;

  let fnArgs;
  if (textToolParsed.arguments) {
    let nested = textToolParsed.arguments;
    // Some models emit arguments as a JSON-encoded string; decode it.
    if (typeof nested === "string") nested = tryParse(nested);
    const usable = nested !== null && typeof nested === "object" && !Array.isArray(nested);
    // Nested form: every key, including one called "name", is a real argument.
    fnArgs = usable ? { ...nested } : {};
  } else {
    // Flat form: everything except the tool's own name is an argument.
    const { name: _, ...rest } = textToolParsed;
    fnArgs = rest;
  }
  return { fnName: textToolParsed.name, args: fnArgs };
}
|
|
105
159
|
async function ollamaChat(model, messages, options = {}, timeoutMs = CONFIG.DEFAULT_TIMEOUT_MS, retries = CONFIG.MAX_RETRIES) {
|
|
106
160
|
const body = { model, messages, stream: false, options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE, ...options } };
|
|
161
|
+
const url = `${ollamaBase()}/api/chat`;
|
|
107
162
|
for (let attempt = 0; attempt <= retries; attempt++) {
|
|
108
163
|
const start = Date.now();
|
|
164
|
+
const controller = new AbortController();
|
|
165
|
+
const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
|
|
109
166
|
try {
|
|
110
|
-
const
|
|
111
|
-
"
|
|
112
|
-
"
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
String(CONFIG.CONNECT_TIMEOUT_S),
|
|
117
|
-
"--max-time",
|
|
118
|
-
String(Math.ceil(timeoutMs / 1e3)),
|
|
119
|
-
`${OLLAMA_BASE}/api/chat`,
|
|
120
|
-
"-H",
|
|
121
|
-
"Content-Type: application/json",
|
|
122
|
-
"-d",
|
|
123
|
-
JSON.stringify(body)
|
|
124
|
-
], { timeout: timeoutMs + CONFIG.EXEC_BUFFER_MS });
|
|
167
|
+
const res = await fetch(url, {
|
|
168
|
+
method: "POST",
|
|
169
|
+
headers: { "Content-Type": "application/json" },
|
|
170
|
+
body: JSON.stringify(body),
|
|
171
|
+
signal: controller.signal
|
|
172
|
+
});
|
|
125
173
|
const elapsedMs = Date.now() - start;
|
|
126
|
-
if (
|
|
127
|
-
const
|
|
128
|
-
throw new Error(`
|
|
174
|
+
if (!res.ok) {
|
|
175
|
+
const errorText = await res.text().catch(() => "unknown error");
|
|
176
|
+
throw new Error(`Ollama API returned ${res.status}: ${truncate(errorText, 200)}`);
|
|
129
177
|
}
|
|
130
|
-
|
|
178
|
+
const text = await res.text();
|
|
179
|
+
if (!text.trim()) {
|
|
131
180
|
if (attempt < retries) {
|
|
132
181
|
await new Promise((r) => setTimeout(r, CONFIG.RETRY_DELAY_MS));
|
|
133
182
|
continue;
|
|
134
183
|
}
|
|
135
184
|
throw new Error(`Empty response from Ollama after ${attempt + 1} attempt(s)`);
|
|
136
185
|
}
|
|
137
|
-
const parsed = JSON.parse(
|
|
186
|
+
const parsed = JSON.parse(text);
|
|
138
187
|
return { response: parsed, elapsedMs };
|
|
139
188
|
} catch (e) {
|
|
140
|
-
|
|
189
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
190
|
+
if (e instanceof Error && e.name === "AbortError") {
|
|
191
|
+
if (attempt < retries) {
|
|
192
|
+
await new Promise((r) => setTimeout(r, CONFIG.RETRY_DELAY_MS));
|
|
193
|
+
continue;
|
|
194
|
+
}
|
|
195
|
+
throw new Error(`Ollama API timed out after ${msHuman(timeoutMs)}`);
|
|
196
|
+
}
|
|
197
|
+
if (attempt < retries && (msg.includes("Empty response") || msg.includes("ECONNREFUSED") || msg.includes("ECONNRESET") || msg.includes("fetch failed"))) {
|
|
141
198
|
await new Promise((r) => setTimeout(r, CONFIG.RETRY_DELAY_MS));
|
|
142
199
|
continue;
|
|
143
200
|
}
|
|
144
201
|
throw e;
|
|
202
|
+
} finally {
|
|
203
|
+
clearTimeout(timeoutId);
|
|
145
204
|
}
|
|
146
205
|
}
|
|
147
206
|
throw new Error("Unreachable");
|
|
@@ -280,41 +339,7 @@ function model_test_temp_default(pi) {
|
|
|
280
339
|
}
|
|
281
340
|
const allNumbers = effectiveMsg.match(/\b(\d+)\b/g) || [];
|
|
282
341
|
const answer = allNumbers.length > 0 ? allNumbers[allNumbers.length - 1] : "?";
|
|
283
|
-
const
|
|
284
|
-
const reasoningPatterns = [
|
|
285
|
-
"because",
|
|
286
|
-
"therefore",
|
|
287
|
-
"since",
|
|
288
|
-
"step",
|
|
289
|
-
"subtract",
|
|
290
|
-
"minus",
|
|
291
|
-
"each day",
|
|
292
|
-
"each night",
|
|
293
|
-
"slides",
|
|
294
|
-
"climbs",
|
|
295
|
-
"night",
|
|
296
|
-
"reaches",
|
|
297
|
-
"finally",
|
|
298
|
-
"last day"
|
|
299
|
-
];
|
|
300
|
-
const hasReasoningWords = reasoningPatterns.some((w) => effectiveMsg.toLowerCase().includes(w));
|
|
301
|
-
const hasNumberedSteps = /^\s*\d+\.\s/m.test(effectiveMsg);
|
|
302
|
-
const hasReasoning = hasReasoningWords || hasNumberedSteps;
|
|
303
|
-
let score;
|
|
304
|
-
let pass;
|
|
305
|
-
if (isCorrect && hasReasoning) {
|
|
306
|
-
score = "STRONG";
|
|
307
|
-
pass = true;
|
|
308
|
-
} else if (isCorrect) {
|
|
309
|
-
score = "MODERATE";
|
|
310
|
-
pass = true;
|
|
311
|
-
} else if (hasReasoning) {
|
|
312
|
-
score = "WEAK";
|
|
313
|
-
pass = false;
|
|
314
|
-
} else {
|
|
315
|
-
score = "FAIL";
|
|
316
|
-
pass = false;
|
|
317
|
-
}
|
|
342
|
+
const { score, pass } = scoreReasoning(effectiveMsg);
|
|
318
343
|
const displayMsg = msg.trim().length > 0 ? effectiveMsg : `[thinking tokens] ${effectiveMsg}`;
|
|
319
344
|
return { pass, score, reasoning: displayMsg, answer, elapsedMs };
|
|
320
345
|
} catch (e) {
|
|
@@ -333,41 +358,7 @@ function model_test_temp_default(pi) {
|
|
|
333
358
|
}
|
|
334
359
|
const allNumbers = msg.match(/\b(\d+)\b/g) || [];
|
|
335
360
|
const answer = allNumbers.length > 0 ? allNumbers[allNumbers.length - 1] : "?";
|
|
336
|
-
const
|
|
337
|
-
const reasoningPatterns = [
|
|
338
|
-
"because",
|
|
339
|
-
"therefore",
|
|
340
|
-
"since",
|
|
341
|
-
"step",
|
|
342
|
-
"subtract",
|
|
343
|
-
"minus",
|
|
344
|
-
"each day",
|
|
345
|
-
"each night",
|
|
346
|
-
"slides",
|
|
347
|
-
"climbs",
|
|
348
|
-
"night",
|
|
349
|
-
"reaches",
|
|
350
|
-
"finally",
|
|
351
|
-
"last day"
|
|
352
|
-
];
|
|
353
|
-
const hasReasoningWords = reasoningPatterns.some((w) => msg.toLowerCase().includes(w));
|
|
354
|
-
const hasNumberedSteps = /^\s*\d+\.\s/m.test(msg);
|
|
355
|
-
const hasReasoning = hasReasoningWords || hasNumberedSteps;
|
|
356
|
-
let score;
|
|
357
|
-
let pass;
|
|
358
|
-
if (isCorrect && hasReasoning) {
|
|
359
|
-
score = "STRONG";
|
|
360
|
-
pass = true;
|
|
361
|
-
} else if (isCorrect) {
|
|
362
|
-
score = "MODERATE";
|
|
363
|
-
pass = true;
|
|
364
|
-
} else if (hasReasoning) {
|
|
365
|
-
score = "WEAK";
|
|
366
|
-
pass = false;
|
|
367
|
-
} else {
|
|
368
|
-
score = "FAIL";
|
|
369
|
-
pass = false;
|
|
370
|
-
}
|
|
361
|
+
const { score, pass } = scoreReasoning(msg);
|
|
371
362
|
return { pass, score, reasoning: msg, answer, elapsedMs: result.elapsedMs };
|
|
372
363
|
} catch (e) {
|
|
373
364
|
return { pass: false, score: "ERROR", reasoning: e.message, answer: "?", elapsedMs: 0 };
|
|
@@ -423,29 +414,24 @@ function model_test_temp_default(pi) {
|
|
|
423
414
|
options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE }
|
|
424
415
|
};
|
|
425
416
|
try {
|
|
417
|
+
const controller = new AbortController();
|
|
418
|
+
const timeoutId = setTimeout(() => controller.abort(), CONFIG.TOOL_TEST_TIMEOUT_MS);
|
|
426
419
|
const start = Date.now();
|
|
427
|
-
const
|
|
428
|
-
"
|
|
429
|
-
"
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
String(CONFIG.CONNECT_TIMEOUT_S),
|
|
434
|
-
"--max-time",
|
|
435
|
-
String(CONFIG.TOOL_TEST_MAX_TIME_S),
|
|
436
|
-
`${OLLAMA_BASE}/api/chat`,
|
|
437
|
-
"-H",
|
|
438
|
-
"Content-Type: application/json",
|
|
439
|
-
"-d",
|
|
440
|
-
JSON.stringify(body)
|
|
441
|
-
], { timeout: CONFIG.TOOL_TEST_TIMEOUT_MS });
|
|
420
|
+
const res = await fetch(`${ollamaBase()}/api/chat`, {
|
|
421
|
+
method: "POST",
|
|
422
|
+
headers: { "Content-Type": "application/json" },
|
|
423
|
+
body: JSON.stringify(body),
|
|
424
|
+
signal: controller.signal
|
|
425
|
+
});
|
|
442
426
|
const elapsedMs = Date.now() - start;
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
427
|
+
clearTimeout(timeoutId);
|
|
428
|
+
if (!res.ok) {
|
|
429
|
+
const errorText = await res.text().catch(() => "unknown error");
|
|
430
|
+
return { pass: false, score: "ERROR", hasToolCalls: false, toolCall: `fetch error: ${res.status}`, response: "", elapsedMs };
|
|
446
431
|
}
|
|
447
|
-
|
|
448
|
-
|
|
432
|
+
const text = await res.text();
|
|
433
|
+
if (!text.trim()) throw new Error("Empty response from Ollama");
|
|
434
|
+
const parsed = JSON.parse(text);
|
|
449
435
|
const toolCalls = parsed?.message?.tool_calls;
|
|
450
436
|
const content = parsed?.message?.content || "";
|
|
451
437
|
if (toolCalls && toolCalls.length > 0) {
|
|
@@ -464,20 +450,7 @@ function model_test_temp_default(pi) {
|
|
|
464
450
|
elapsedMs
|
|
465
451
|
};
|
|
466
452
|
}
|
|
467
|
-
const
|
|
468
|
-
const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
|
|
469
|
-
const unitValid = args.unit === void 0 || typeof args.unit === "string" && ["celsius", "fahrenheit"].includes(args.unit.toLowerCase());
|
|
470
|
-
let score;
|
|
471
|
-
if (hasCorrectTool && hasLocation && unitValid) {
|
|
472
|
-
score = "STRONG";
|
|
473
|
-
} else if (hasCorrectTool && hasLocation) {
|
|
474
|
-
score = "MODERATE";
|
|
475
|
-
} else if (hasCorrectTool) {
|
|
476
|
-
score = "WEAK";
|
|
477
|
-
} else {
|
|
478
|
-
score = "WEAK";
|
|
479
|
-
}
|
|
480
|
-
const pass = score !== "WEAK";
|
|
453
|
+
const { score, pass } = scoreNativeToolCall(fn.name || "", args);
|
|
481
454
|
return {
|
|
482
455
|
pass,
|
|
483
456
|
score,
|
|
@@ -487,38 +460,14 @@ function model_test_temp_default(pi) {
|
|
|
487
460
|
elapsedMs
|
|
488
461
|
};
|
|
489
462
|
}
|
|
490
|
-
const
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
const lastBrace = content.lastIndexOf("}");
|
|
494
|
-
if (lastBrace > firstBrace) {
|
|
495
|
-
const jsonCandidate = content.slice(firstBrace, lastBrace + 1);
|
|
496
|
-
try {
|
|
497
|
-
textToolParsed = JSON.parse(jsonCandidate);
|
|
498
|
-
} catch {
|
|
499
|
-
}
|
|
500
|
-
}
|
|
501
|
-
}
|
|
502
|
-
if (textToolParsed && typeof textToolParsed.name === "string") {
|
|
503
|
-
const fnName = textToolParsed.name;
|
|
504
|
-
const rawArgs = textToolParsed.arguments || { ...textToolParsed };
|
|
505
|
-
const { name: _, ...fnArgs } = rawArgs;
|
|
506
|
-
const isWeatherTool = fnName === "get_weather";
|
|
507
|
-
const hasLocation = typeof fnArgs.location === "string" && fnArgs.location.toLowerCase().includes("paris");
|
|
508
|
-
let score;
|
|
509
|
-
if (isWeatherTool && hasLocation) {
|
|
510
|
-
score = "STRONG";
|
|
511
|
-
} else if (isWeatherTool) {
|
|
512
|
-
score = "MODERATE";
|
|
513
|
-
} else {
|
|
514
|
-
score = "WEAK";
|
|
515
|
-
}
|
|
516
|
-
const pass = score !== "WEAK";
|
|
463
|
+
const textParsed = parseTextToolCall(content);
|
|
464
|
+
if (textParsed) {
|
|
465
|
+
const { score, pass } = scoreTextToolCall(textParsed.fnName, textParsed.args);
|
|
517
466
|
return {
|
|
518
467
|
pass,
|
|
519
468
|
score,
|
|
520
469
|
hasToolCalls: true,
|
|
521
|
-
toolCall: `${fnName}(${JSON.stringify(
|
|
470
|
+
toolCall: `${textParsed.fnName}(${JSON.stringify(textParsed.args)})`,
|
|
522
471
|
response: content,
|
|
523
472
|
elapsedMs
|
|
524
473
|
};
|
|
@@ -580,20 +529,7 @@ function model_test_temp_default(pi) {
|
|
|
580
529
|
elapsedMs: result.elapsedMs
|
|
581
530
|
};
|
|
582
531
|
}
|
|
583
|
-
const
|
|
584
|
-
const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
|
|
585
|
-
const unitValid = args.unit === void 0 || typeof args.unit === "string" && ["celsius", "fahrenheit"].includes(args.unit.toLowerCase());
|
|
586
|
-
let score;
|
|
587
|
-
if (hasCorrectTool && hasLocation && unitValid) {
|
|
588
|
-
score = "STRONG";
|
|
589
|
-
} else if (hasCorrectTool && hasLocation) {
|
|
590
|
-
score = "MODERATE";
|
|
591
|
-
} else if (hasCorrectTool) {
|
|
592
|
-
score = "WEAK";
|
|
593
|
-
} else {
|
|
594
|
-
score = "WEAK";
|
|
595
|
-
}
|
|
596
|
-
const pass = score !== "WEAK";
|
|
532
|
+
const { score, pass } = scoreNativeToolCall(fn.name || "", args);
|
|
597
533
|
return {
|
|
598
534
|
pass,
|
|
599
535
|
score,
|
|
@@ -603,38 +539,14 @@ function model_test_temp_default(pi) {
|
|
|
603
539
|
elapsedMs: result.elapsedMs
|
|
604
540
|
};
|
|
605
541
|
}
|
|
606
|
-
const
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
const lastBrace = content.lastIndexOf("}");
|
|
610
|
-
if (lastBrace > firstBrace) {
|
|
611
|
-
const jsonCandidate = content.slice(firstBrace, lastBrace + 1);
|
|
612
|
-
try {
|
|
613
|
-
textToolParsed = JSON.parse(jsonCandidate);
|
|
614
|
-
} catch {
|
|
615
|
-
}
|
|
616
|
-
}
|
|
617
|
-
}
|
|
618
|
-
if (textToolParsed && typeof textToolParsed.name === "string") {
|
|
619
|
-
const fnName = textToolParsed.name;
|
|
620
|
-
const rawArgs = textToolParsed.arguments || { ...textToolParsed };
|
|
621
|
-
const { name: _, ...fnArgs } = rawArgs;
|
|
622
|
-
const isWeatherTool = fnName === "get_weather";
|
|
623
|
-
const hasLocation = typeof fnArgs.location === "string" && fnArgs.location.toLowerCase().includes("paris");
|
|
624
|
-
let score;
|
|
625
|
-
if (isWeatherTool && hasLocation) {
|
|
626
|
-
score = "STRONG";
|
|
627
|
-
} else if (isWeatherTool) {
|
|
628
|
-
score = "MODERATE";
|
|
629
|
-
} else {
|
|
630
|
-
score = "WEAK";
|
|
631
|
-
}
|
|
632
|
-
const pass = score !== "WEAK";
|
|
542
|
+
const textParsed = parseTextToolCall(content);
|
|
543
|
+
if (textParsed) {
|
|
544
|
+
const { score, pass } = scoreTextToolCall(textParsed.fnName, textParsed.args);
|
|
633
545
|
return {
|
|
634
546
|
pass,
|
|
635
547
|
score,
|
|
636
548
|
hasToolCalls: true,
|
|
637
|
-
toolCall: `${fnName}(${JSON.stringify(
|
|
549
|
+
toolCall: `${textParsed.fnName}(${JSON.stringify(textParsed.args)})`,
|
|
638
550
|
response: content,
|
|
639
551
|
elapsedMs: result.elapsedMs
|
|
640
552
|
};
|
|
@@ -671,29 +583,24 @@ function model_test_temp_default(pi) {
|
|
|
671
583
|
options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE }
|
|
672
584
|
};
|
|
673
585
|
try {
|
|
586
|
+
const controller = new AbortController();
|
|
587
|
+
const timeoutId = setTimeout(() => controller.abort(), CONFIG.TOOL_TEST_TIMEOUT_MS);
|
|
674
588
|
const start = Date.now();
|
|
675
|
-
const
|
|
676
|
-
"
|
|
677
|
-
"
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
String(CONFIG.CONNECT_TIMEOUT_S),
|
|
682
|
-
"--max-time",
|
|
683
|
-
String(CONFIG.TOOL_TEST_MAX_TIME_S),
|
|
684
|
-
`${OLLAMA_BASE}/api/chat`,
|
|
685
|
-
"-H",
|
|
686
|
-
"Content-Type: application/json",
|
|
687
|
-
"-d",
|
|
688
|
-
JSON.stringify(body)
|
|
689
|
-
], { timeout: CONFIG.TOOL_TEST_TIMEOUT_MS });
|
|
589
|
+
const res = await fetch(`${ollamaBase()}/api/chat`, {
|
|
590
|
+
method: "POST",
|
|
591
|
+
headers: { "Content-Type": "application/json" },
|
|
592
|
+
body: JSON.stringify(body),
|
|
593
|
+
signal: controller.signal
|
|
594
|
+
});
|
|
690
595
|
const elapsedMs = Date.now() - start;
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
596
|
+
clearTimeout(timeoutId);
|
|
597
|
+
if (!res.ok) {
|
|
598
|
+
const errorText = await res.text().catch(() => "unknown error");
|
|
599
|
+
return { pass: false, score: "ERROR", toolCall: `fetch error: ${res.status}`, thought: "", response: "", elapsedMs };
|
|
694
600
|
}
|
|
695
|
-
|
|
696
|
-
|
|
601
|
+
const text = await res.text();
|
|
602
|
+
if (!text.trim()) throw new Error("Empty response from Ollama");
|
|
603
|
+
const parsed = JSON.parse(text);
|
|
697
604
|
const content = (parsed?.message?.content || "").trim();
|
|
698
605
|
if (!content) {
|
|
699
606
|
return { pass: false, score: "FAIL", toolCall: "empty response", thought: "", response: "", elapsedMs };
|
|
@@ -702,20 +609,20 @@ function model_test_temp_default(pi) {
|
|
|
702
609
|
const sharedParser = pi._reactParser;
|
|
703
610
|
if (sharedParser?.ALL_DIALECT_PATTERNS) {
|
|
704
611
|
for (const dp of sharedParser.ALL_DIALECT_PATTERNS) {
|
|
705
|
-
const
|
|
706
|
-
if (
|
|
707
|
-
let toolName =
|
|
612
|
+
const result = sharedParser.parseReactWithPatterns(content, dp, true);
|
|
613
|
+
if (result) {
|
|
614
|
+
let toolName = result.name;
|
|
708
615
|
let argsStr;
|
|
709
|
-
const rawArgs =
|
|
616
|
+
const rawArgs = result.args ? JSON.stringify(result.args) : "";
|
|
710
617
|
if (rawArgs && rawArgs !== "{}") {
|
|
711
618
|
argsStr = rawArgs;
|
|
712
|
-
} else if (
|
|
713
|
-
const jsonStart =
|
|
619
|
+
} else if (result.raw) {
|
|
620
|
+
const jsonStart = result.raw.indexOf("{");
|
|
714
621
|
if (jsonStart !== -1) {
|
|
715
622
|
let depth = 0, jsonEnd = -1;
|
|
716
|
-
for (let i = jsonStart; i <
|
|
717
|
-
if (
|
|
718
|
-
else if (
|
|
623
|
+
for (let i = jsonStart; i < result.raw.length; i++) {
|
|
624
|
+
if (result.raw[i] === "{") depth++;
|
|
625
|
+
else if (result.raw[i] === "}") {
|
|
719
626
|
depth--;
|
|
720
627
|
if (depth === 0) {
|
|
721
628
|
jsonEnd = i;
|
|
@@ -723,14 +630,14 @@ function model_test_temp_default(pi) {
|
|
|
723
630
|
}
|
|
724
631
|
}
|
|
725
632
|
}
|
|
726
|
-
argsStr = jsonEnd !== -1 ?
|
|
633
|
+
argsStr = jsonEnd !== -1 ? result.raw.slice(jsonStart, jsonEnd + 1) : "";
|
|
727
634
|
} else {
|
|
728
635
|
argsStr = "";
|
|
729
636
|
}
|
|
730
637
|
} else {
|
|
731
638
|
argsStr = "";
|
|
732
639
|
}
|
|
733
|
-
parsedResult = { name: toolName, args: argsStr, thought:
|
|
640
|
+
parsedResult = { name: toolName, args: argsStr, thought: result.thought || "", dialect: result.dialect };
|
|
734
641
|
break;
|
|
735
642
|
}
|
|
736
643
|
}
|
|
@@ -1051,29 +958,29 @@ The JSON object must have exactly these 4 keys:
|
|
|
1051
958
|
};
|
|
1052
959
|
try {
|
|
1053
960
|
const start = Date.now();
|
|
1054
|
-
const
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
"
|
|
1058
|
-
"
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
"120",
|
|
1063
|
-
`${OLLAMA_BASE}/api/chat`,
|
|
1064
|
-
"-H",
|
|
1065
|
-
"Content-Type: application/json",
|
|
1066
|
-
"-d",
|
|
1067
|
-
JSON.stringify(body)
|
|
1068
|
-
], { timeout: 13e4 });
|
|
961
|
+
const controller = new AbortController();
|
|
962
|
+
const timeoutId = setTimeout(() => controller.abort(), 13e4);
|
|
963
|
+
const res = await fetch(`${ollamaBase()}/api/chat`, {
|
|
964
|
+
method: "POST",
|
|
965
|
+
headers: { "Content-Type": "application/json" },
|
|
966
|
+
body: JSON.stringify(body),
|
|
967
|
+
signal: controller.signal
|
|
968
|
+
});
|
|
1069
969
|
const elapsedMs = Date.now() - start;
|
|
1070
|
-
|
|
1071
|
-
|
|
970
|
+
clearTimeout(timeoutId);
|
|
971
|
+
if (!res.ok) {
|
|
972
|
+
const detail = await res.text().catch(() => "unknown error");
|
|
973
|
+
const level2 = "none";
|
|
974
|
+
cacheToolSupport(model, level2, family);
|
|
975
|
+
return { level: level2, cached: false, evidence: `API error ${res.status}: ${truncate(detail, 100)}`, elapsedMs };
|
|
976
|
+
}
|
|
977
|
+
const text = await res.text();
|
|
978
|
+
if (!text.trim()) {
|
|
1072
979
|
const level2 = "none";
|
|
1073
980
|
cacheToolSupport(model, level2, family);
|
|
1074
|
-
return { level: level2, cached: false, evidence:
|
|
981
|
+
return { level: level2, cached: false, evidence: "empty response from Ollama", elapsedMs };
|
|
1075
982
|
}
|
|
1076
|
-
const parsed = JSON.parse(
|
|
983
|
+
const parsed = JSON.parse(text);
|
|
1077
984
|
const toolCalls = parsed?.message?.tool_calls;
|
|
1078
985
|
const content = (parsed?.message?.content || "").trim();
|
|
1079
986
|
if (toolCalls && Array.isArray(toolCalls) && toolCalls.length > 0) {
|
|
@@ -1169,9 +1076,9 @@ The JSON object must have exactly these 4 keys:
|
|
|
1169
1076
|
}
|
|
1170
1077
|
async function getOllamaModels() {
|
|
1171
1078
|
try {
|
|
1172
|
-
const
|
|
1173
|
-
if (
|
|
1174
|
-
const data =
|
|
1079
|
+
const res = await fetch(`${ollamaBase()}/api/tags`, { signal: AbortSignal.timeout(15e3) });
|
|
1080
|
+
if (!res.ok) return [];
|
|
1081
|
+
const data = await res.json();
|
|
1175
1082
|
return (data.models || []).map((m) => m.name).filter(Boolean);
|
|
1176
1083
|
} catch {
|
|
1177
1084
|
return [];
|
|
@@ -1231,7 +1138,7 @@ The JSON object must have exactly these 4 keys:
|
|
|
1231
1138
|
}
|
|
1232
1139
|
}
|
|
1233
1140
|
lines.push(info(`API: ${apiMode}`));
|
|
1234
|
-
const nativeContext = await fetchModelContextLength(
|
|
1141
|
+
const nativeContext = await fetchModelContextLength(ollamaBase(), model);
|
|
1235
1142
|
if (nativeContext !== void 0) {
|
|
1236
1143
|
const ctxStr = nativeContext >= 1e3 ? `${(nativeContext / 1e3).toFixed(1)}k` : String(nativeContext);
|
|
1237
1144
|
lines.push(info(`Context: ${ctxStr} tokens (native max)`));
|
|
@@ -1242,9 +1149,9 @@ The JSON object must have exactly these 4 keys:
|
|
|
1242
1149
|
let modelQuant = "unknown";
|
|
1243
1150
|
let modelModified = "unknown";
|
|
1244
1151
|
try {
|
|
1245
|
-
const
|
|
1246
|
-
if (
|
|
1247
|
-
const tags =
|
|
1152
|
+
const tagsRes = await fetch(`${ollamaBase()}/api/tags`, { signal: AbortSignal.timeout(1e4) });
|
|
1153
|
+
if (tagsRes.ok) {
|
|
1154
|
+
const tags = await tagsRes.json();
|
|
1248
1155
|
const entry = (tags.models || []).find((m) => m.name === model);
|
|
1249
1156
|
if (entry) {
|
|
1250
1157
|
const details = entry.details || {};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vtstech/pi-model-test",
|
|
3
|
-
"version": "1.1.0",
|
|
3
|
+
"version": "1.1.2",
|
|
4
4
|
"description": "Model benchmark/testing extension for Pi Coding Agent",
|
|
5
5
|
"main": "model-test.js",
|
|
6
6
|
"keywords": ["pi-extensions"],
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
"url": "https://github.com/VTSTech/pi-coding-agent"
|
|
15
15
|
},
|
|
16
16
|
"dependencies": {
|
|
17
|
-
"@vtstech/pi-shared": "1.1.
|
|
17
|
+
"@vtstech/pi-shared": "1.1.2"
|
|
18
18
|
},
|
|
19
19
|
"peerDependencies": {
|
|
20
20
|
"@mariozechner/pi-coding-agent": ">=0.66"
|