@vtstech/pi-model-test 1.1.2 → 1.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/model-test.js +110 -544
- package/package.json +2 -2
package/model-test.js
CHANGED
|
@@ -1,7 +1,4 @@
|
|
|
1
1
|
// .build-npm/model-test/model-test.temp.ts
|
|
2
|
-
import * as fs from "node:fs";
|
|
3
|
-
import * as os from "node:os";
|
|
4
|
-
import * as path from "node:path";
|
|
5
2
|
import {
|
|
6
3
|
section,
|
|
7
4
|
ok,
|
|
@@ -13,79 +10,21 @@ import {
|
|
|
13
10
|
sanitizeForReport
|
|
14
11
|
} from "@vtstech/pi-shared/format";
|
|
15
12
|
import { getOllamaBaseUrl, detectModelFamily, readModelsJson, writeModelsJson, fetchModelContextLength, EXTENSION_VERSION, detectProvider } from "@vtstech/pi-shared/ollama";
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
// Test-specific settings
|
|
32
|
-
MIN_THINKING_LENGTH: 10,
|
|
33
|
-
// Minimum chars to consider thinking tokens valid
|
|
34
|
-
TOOL_TEST_TIMEOUT_MS: 999999,
|
|
35
|
-
// Effectively unlimited for slow tool usage tests
|
|
36
|
-
TOOL_SUPPORT_TIMEOUT_MS: 999999,
|
|
37
|
-
// Effectively unlimited for tool support detection
|
|
38
|
-
// Metadata retrieval
|
|
39
|
-
TAGS_TIMEOUT_MS: 15e3,
|
|
40
|
-
// 15 seconds for /api/tags
|
|
41
|
-
MODEL_INFO_TIMEOUT_MS: 3e4,
|
|
42
|
-
// 30 seconds for model info lookup
|
|
43
|
-
// Provider API settings
|
|
44
|
-
PROVIDER_TIMEOUT_MS: 999999,
|
|
45
|
-
// Effectively unlimited for cloud provider API calls
|
|
46
|
-
PROVIDER_TOOL_TIMEOUT_MS: 12e4,
|
|
47
|
-
// 120 seconds for tool usage tests on providers
|
|
48
|
-
// Rate limiting
|
|
49
|
-
TEST_DELAY_MS: 1e4
|
|
50
|
-
// 10 seconds between tests to avoid rate limiting
|
|
51
|
-
};
|
|
52
|
-
var TOOL_SUPPORT_CACHE_DIR = path.join(os.homedir(), ".pi", "agent", "cache");
|
|
53
|
-
var TOOL_SUPPORT_CACHE_PATH = path.join(TOOL_SUPPORT_CACHE_DIR, "tool_support.json");
|
|
54
|
-
var _toolSupportCacheInMemory = null;
|
|
55
|
-
function readToolSupportCache() {
|
|
56
|
-
try {
|
|
57
|
-
if (fs.existsSync(TOOL_SUPPORT_CACHE_PATH)) {
|
|
58
|
-
const raw = fs.readFileSync(TOOL_SUPPORT_CACHE_PATH, "utf-8");
|
|
59
|
-
return JSON.parse(raw);
|
|
60
|
-
}
|
|
61
|
-
} catch {
|
|
62
|
-
}
|
|
63
|
-
return {};
|
|
64
|
-
}
|
|
65
|
-
function writeToolSupportCache(cache) {
|
|
66
|
-
if (!fs.existsSync(TOOL_SUPPORT_CACHE_DIR)) {
|
|
67
|
-
fs.mkdirSync(TOOL_SUPPORT_CACHE_DIR, { recursive: true });
|
|
68
|
-
}
|
|
69
|
-
fs.writeFileSync(TOOL_SUPPORT_CACHE_PATH, JSON.stringify(cache, null, 2) + "\n", "utf-8");
|
|
70
|
-
}
|
|
71
|
-
function getCachedToolSupport(model) {
|
|
72
|
-
const cache = _toolSupportCacheInMemory || readToolSupportCache();
|
|
73
|
-
if (!_toolSupportCacheInMemory) _toolSupportCacheInMemory = cache;
|
|
74
|
-
const entry = cache[model];
|
|
75
|
-
if (!entry) return null;
|
|
76
|
-
if (!entry.support || !["native", "react", "none"].includes(entry.support)) return null;
|
|
77
|
-
return entry;
|
|
78
|
-
}
|
|
79
|
-
function cacheToolSupport(model, support, family) {
|
|
80
|
-
const cache = _toolSupportCacheInMemory || readToolSupportCache();
|
|
81
|
-
cache[model] = {
|
|
82
|
-
support,
|
|
83
|
-
testedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
84
|
-
family
|
|
85
|
-
};
|
|
86
|
-
_toolSupportCacheInMemory = cache;
|
|
87
|
-
writeToolSupportCache(cache);
|
|
88
|
-
}
|
|
13
|
+
import {
|
|
14
|
+
ALL_DIALECT_PATTERNS,
|
|
15
|
+
parseReactWithPatterns
|
|
16
|
+
} from "@vtstech/pi-shared/react-parser";
|
|
17
|
+
import {
|
|
18
|
+
CONFIG,
|
|
19
|
+
WEATHER_TOOL_DEFINITION,
|
|
20
|
+
scoreReasoning,
|
|
21
|
+
getCachedToolSupport,
|
|
22
|
+
cacheToolSupport,
|
|
23
|
+
testToolUsageUnified,
|
|
24
|
+
testReasoningUnified,
|
|
25
|
+
testInstructionFollowingUnified,
|
|
26
|
+
TOOL_SUPPORT_CACHE_PATH
|
|
27
|
+
} from "@vtstech/pi-shared/model-test-utils";
|
|
89
28
|
function model_test_temp_default(pi) {
|
|
90
29
|
function ollamaBase() {
|
|
91
30
|
return getOllamaBaseUrl();
|
|
@@ -96,65 +35,75 @@ function model_test_temp_default(pi) {
|
|
|
96
35
|
await new Promise((r) => setTimeout(r, CONFIG.TEST_DELAY_MS));
|
|
97
36
|
}
|
|
98
37
|
}
|
|
99
|
-
function
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
"subtract",
|
|
109
|
-
"minus",
|
|
110
|
-
"each day",
|
|
111
|
-
"each night",
|
|
112
|
-
"slides",
|
|
113
|
-
"climbs",
|
|
114
|
-
"night",
|
|
115
|
-
"reaches",
|
|
116
|
-
"finally",
|
|
117
|
-
"last day"
|
|
118
|
-
];
|
|
119
|
-
const hasReasoningWords = reasoningPatterns.some((w) => msg.toLowerCase().includes(w));
|
|
120
|
-
const hasNumberedSteps = /^\s*\d+\.\s/m.test(msg);
|
|
121
|
-
const hasReasoning = hasReasoningWords || hasNumberedSteps;
|
|
122
|
-
if (isCorrect && hasReasoning) return { score: "STRONG", pass: true };
|
|
123
|
-
if (isCorrect) return { score: "MODERATE", pass: true };
|
|
124
|
-
if (hasReasoning) return { score: "WEAK", pass: false };
|
|
125
|
-
return { score: "FAIL", pass: false };
|
|
126
|
-
}
|
|
127
|
-
function scoreNativeToolCall(fnName, args) {
|
|
128
|
-
const hasCorrectTool = fnName === "get_weather";
|
|
129
|
-
const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
|
|
130
|
-
const unitValid = args.unit === void 0 || typeof args.unit === "string" && ["celsius", "fahrenheit"].includes(args.unit.toLowerCase());
|
|
131
|
-
if (hasCorrectTool && hasLocation && unitValid) return { score: "STRONG", pass: true };
|
|
132
|
-
if (hasCorrectTool && hasLocation) return { score: "MODERATE", pass: true };
|
|
133
|
-
return { score: "WEAK", pass: false };
|
|
38
|
+
function makeOllamaChatFn() {
|
|
39
|
+
return async (model, messages, _options) => {
|
|
40
|
+
const result = await ollamaChat(model, messages);
|
|
41
|
+
return {
|
|
42
|
+
content: result.response?.message?.content || "",
|
|
43
|
+
elapsedMs: result.elapsedMs,
|
|
44
|
+
raw: result.response
|
|
45
|
+
};
|
|
46
|
+
};
|
|
134
47
|
}
|
|
135
|
-
function
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
48
|
+
function makeOllamaToolChatFn() {
|
|
49
|
+
return async (model, messages, options) => {
|
|
50
|
+
const tools = options?.tools || void 0;
|
|
51
|
+
const body = {
|
|
52
|
+
model,
|
|
53
|
+
messages,
|
|
54
|
+
stream: false,
|
|
55
|
+
options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE }
|
|
56
|
+
};
|
|
57
|
+
if (tools && tools.length > 0) {
|
|
58
|
+
body.tools = tools;
|
|
59
|
+
}
|
|
60
|
+
const controller = new AbortController();
|
|
61
|
+
const timeoutId = setTimeout(() => controller.abort(), CONFIG.TOOL_TEST_TIMEOUT_MS);
|
|
62
|
+
const start = Date.now();
|
|
63
|
+
try {
|
|
64
|
+
const res = await fetch(`${ollamaBase()}/api/chat`, {
|
|
65
|
+
method: "POST",
|
|
66
|
+
headers: { "Content-Type": "application/json" },
|
|
67
|
+
body: JSON.stringify(body),
|
|
68
|
+
signal: controller.signal
|
|
69
|
+
});
|
|
70
|
+
const elapsedMs = Date.now() - start;
|
|
71
|
+
clearTimeout(timeoutId);
|
|
72
|
+
if (!res.ok) {
|
|
73
|
+
const errorText = await res.text().catch(() => "unknown error");
|
|
74
|
+
throw new Error(`Ollama API returned ${res.status}: ${truncate(errorText, 200)}`);
|
|
75
|
+
}
|
|
76
|
+
const text = await res.text();
|
|
77
|
+
if (!text.trim()) throw new Error("Empty response from Ollama");
|
|
78
|
+
const parsed = JSON.parse(text);
|
|
79
|
+
const toolCalls = parsed?.message?.tool_calls;
|
|
80
|
+
const content = parsed?.message?.content || "";
|
|
81
|
+
return {
|
|
82
|
+
content,
|
|
83
|
+
toolCalls: toolCalls && toolCalls.length > 0 ? toolCalls : void 0,
|
|
84
|
+
elapsedMs,
|
|
85
|
+
raw: parsed
|
|
86
|
+
};
|
|
87
|
+
} catch (e) {
|
|
88
|
+
clearTimeout(timeoutId);
|
|
89
|
+
throw e;
|
|
90
|
+
}
|
|
91
|
+
};
|
|
141
92
|
}
|
|
142
|
-
function
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
const { name: _, ...fnArgs } = rawArgs;
|
|
157
|
-
return { fnName: textToolParsed.name, args: fnArgs };
|
|
93
|
+
function makeProviderChatFn(providerInfo) {
|
|
94
|
+
return async (model, messages, options) => {
|
|
95
|
+
const result = await providerChat(providerInfo, model, messages, {
|
|
96
|
+
maxTokens: CONFIG.NUM_PREDICT,
|
|
97
|
+
tools: options?.tools,
|
|
98
|
+
timeoutMs: CONFIG.PROVIDER_TOOL_TIMEOUT_MS
|
|
99
|
+
});
|
|
100
|
+
return {
|
|
101
|
+
content: result.content,
|
|
102
|
+
toolCalls: result.toolCalls,
|
|
103
|
+
elapsedMs: result.elapsedMs,
|
|
104
|
+
raw: void 0
|
|
105
|
+
};
|
|
106
|
+
};
|
|
158
107
|
}
|
|
159
108
|
async function ollamaChat(model, messages, options = {}, timeoutMs = CONFIG.DEFAULT_TIMEOUT_MS, retries = CONFIG.MAX_RETRIES) {
|
|
160
109
|
const body = { model, messages, stream: false, options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE, ...options } };
|
|
@@ -347,22 +296,7 @@ function model_test_temp_default(pi) {
|
|
|
347
296
|
}
|
|
348
297
|
}
|
|
349
298
|
async function testReasoningProvider(providerInfo, model) {
|
|
350
|
-
|
|
351
|
-
try {
|
|
352
|
-
const result = await providerChat(providerInfo, model, [
|
|
353
|
-
{ role: "user", content: prompt }
|
|
354
|
-
]);
|
|
355
|
-
const msg = result.content.trim();
|
|
356
|
-
if (msg.length === 0) {
|
|
357
|
-
return { pass: false, score: "ERROR", reasoning: "Empty response from provider", answer: "?", elapsedMs: result.elapsedMs };
|
|
358
|
-
}
|
|
359
|
-
const allNumbers = msg.match(/\b(\d+)\b/g) || [];
|
|
360
|
-
const answer = allNumbers.length > 0 ? allNumbers[allNumbers.length - 1] : "?";
|
|
361
|
-
const { score, pass } = scoreReasoning(msg);
|
|
362
|
-
return { pass, score, reasoning: msg, answer, elapsedMs: result.elapsedMs };
|
|
363
|
-
} catch (e) {
|
|
364
|
-
return { pass: false, score: "ERROR", reasoning: e.message, answer: "?", elapsedMs: 0 };
|
|
365
|
-
}
|
|
299
|
+
return testReasoningUnified(makeProviderChatFn(providerInfo), model);
|
|
366
300
|
}
|
|
367
301
|
async function testThinking(model) {
|
|
368
302
|
const prompt = "Multiply 37 by 43. Explain your reasoning step by step and give the final answer.";
|
|
@@ -386,182 +320,10 @@ function model_test_temp_default(pi) {
|
|
|
386
320
|
}
|
|
387
321
|
}
|
|
388
322
|
async function testToolUsage(model) {
|
|
389
|
-
|
|
390
|
-
{
|
|
391
|
-
type: "function",
|
|
392
|
-
function: {
|
|
393
|
-
name: "get_weather",
|
|
394
|
-
description: "Get the current weather for a location",
|
|
395
|
-
parameters: {
|
|
396
|
-
type: "object",
|
|
397
|
-
properties: {
|
|
398
|
-
location: { type: "string", description: "City name" },
|
|
399
|
-
unit: { type: "string", enum: ["celsius", "fahrenheit"] }
|
|
400
|
-
},
|
|
401
|
-
required: ["location"]
|
|
402
|
-
}
|
|
403
|
-
}
|
|
404
|
-
}
|
|
405
|
-
];
|
|
406
|
-
const body = {
|
|
407
|
-
model,
|
|
408
|
-
messages: [
|
|
409
|
-
{ role: "system", content: "You are a helpful assistant. Use the available tools when needed." },
|
|
410
|
-
{ role: "user", content: "What's the weather like in Paris right now?" }
|
|
411
|
-
],
|
|
412
|
-
tools,
|
|
413
|
-
stream: false,
|
|
414
|
-
options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE }
|
|
415
|
-
};
|
|
416
|
-
try {
|
|
417
|
-
const controller = new AbortController();
|
|
418
|
-
const timeoutId = setTimeout(() => controller.abort(), CONFIG.TOOL_TEST_TIMEOUT_MS);
|
|
419
|
-
const start = Date.now();
|
|
420
|
-
const res = await fetch(`${ollamaBase()}/api/chat`, {
|
|
421
|
-
method: "POST",
|
|
422
|
-
headers: { "Content-Type": "application/json" },
|
|
423
|
-
body: JSON.stringify(body),
|
|
424
|
-
signal: controller.signal
|
|
425
|
-
});
|
|
426
|
-
const elapsedMs = Date.now() - start;
|
|
427
|
-
clearTimeout(timeoutId);
|
|
428
|
-
if (!res.ok) {
|
|
429
|
-
const errorText = await res.text().catch(() => "unknown error");
|
|
430
|
-
return { pass: false, score: "ERROR", hasToolCalls: false, toolCall: `fetch error: ${res.status}`, response: "", elapsedMs };
|
|
431
|
-
}
|
|
432
|
-
const text = await res.text();
|
|
433
|
-
if (!text.trim()) throw new Error("Empty response from Ollama");
|
|
434
|
-
const parsed = JSON.parse(text);
|
|
435
|
-
const toolCalls = parsed?.message?.tool_calls;
|
|
436
|
-
const content = parsed?.message?.content || "";
|
|
437
|
-
if (toolCalls && toolCalls.length > 0) {
|
|
438
|
-
const call = toolCalls[0];
|
|
439
|
-
const fn = call.function || {};
|
|
440
|
-
let args = {};
|
|
441
|
-
try {
|
|
442
|
-
args = typeof fn.arguments === "string" ? JSON.parse(fn.arguments) : fn.arguments || {};
|
|
443
|
-
} catch {
|
|
444
|
-
return {
|
|
445
|
-
pass: true,
|
|
446
|
-
score: "WEAK",
|
|
447
|
-
hasToolCalls: true,
|
|
448
|
-
toolCall: `malformed args: ${String(fn.arguments)}`,
|
|
449
|
-
response: content,
|
|
450
|
-
elapsedMs
|
|
451
|
-
};
|
|
452
|
-
}
|
|
453
|
-
const { score, pass } = scoreNativeToolCall(fn.name || "", args);
|
|
454
|
-
return {
|
|
455
|
-
pass,
|
|
456
|
-
score,
|
|
457
|
-
hasToolCalls: true,
|
|
458
|
-
toolCall: `${fn.name}(${JSON.stringify(args)})`,
|
|
459
|
-
response: content,
|
|
460
|
-
elapsedMs
|
|
461
|
-
};
|
|
462
|
-
}
|
|
463
|
-
const textParsed = parseTextToolCall(content);
|
|
464
|
-
if (textParsed) {
|
|
465
|
-
const { score, pass } = scoreTextToolCall(textParsed.fnName, textParsed.args);
|
|
466
|
-
return {
|
|
467
|
-
pass,
|
|
468
|
-
score,
|
|
469
|
-
hasToolCalls: true,
|
|
470
|
-
toolCall: `${textParsed.fnName}(${JSON.stringify(textParsed.args)})`,
|
|
471
|
-
response: content,
|
|
472
|
-
elapsedMs
|
|
473
|
-
};
|
|
474
|
-
}
|
|
475
|
-
return {
|
|
476
|
-
pass: false,
|
|
477
|
-
score: "FAIL",
|
|
478
|
-
hasToolCalls: false,
|
|
479
|
-
toolCall: "none",
|
|
480
|
-
response: content,
|
|
481
|
-
elapsedMs
|
|
482
|
-
};
|
|
483
|
-
} catch (e) {
|
|
484
|
-
return { pass: false, score: "ERROR", hasToolCalls: false, toolCall: `error: ${e.message}`, response: "", elapsedMs: 0 };
|
|
485
|
-
}
|
|
323
|
+
return testToolUsageUnified(makeOllamaToolChatFn(), model);
|
|
486
324
|
}
|
|
487
325
|
async function testToolUsageProvider(providerInfo, model) {
|
|
488
|
-
|
|
489
|
-
{
|
|
490
|
-
type: "function",
|
|
491
|
-
function: {
|
|
492
|
-
name: "get_weather",
|
|
493
|
-
description: "Get the current weather for a location",
|
|
494
|
-
parameters: {
|
|
495
|
-
type: "object",
|
|
496
|
-
properties: {
|
|
497
|
-
location: { type: "string", description: "City name" },
|
|
498
|
-
unit: { type: "string", enum: ["celsius", "fahrenheit"] }
|
|
499
|
-
},
|
|
500
|
-
required: ["location"]
|
|
501
|
-
}
|
|
502
|
-
}
|
|
503
|
-
}
|
|
504
|
-
];
|
|
505
|
-
try {
|
|
506
|
-
const result = await providerChat(providerInfo, model, [
|
|
507
|
-
{ role: "system", content: "You are a helpful assistant. Use the available tools when needed." },
|
|
508
|
-
{ role: "user", content: "What's the weather like in Paris right now?" }
|
|
509
|
-
], {
|
|
510
|
-
maxTokens: CONFIG.NUM_PREDICT,
|
|
511
|
-
tools,
|
|
512
|
-
timeoutMs: CONFIG.PROVIDER_TOOL_TIMEOUT_MS
|
|
513
|
-
});
|
|
514
|
-
const content = result.content;
|
|
515
|
-
const toolCalls = result.toolCalls;
|
|
516
|
-
if (toolCalls && toolCalls.length > 0) {
|
|
517
|
-
const call = toolCalls[0];
|
|
518
|
-
const fn = call.function || {};
|
|
519
|
-
let args = {};
|
|
520
|
-
try {
|
|
521
|
-
args = typeof fn.arguments === "string" ? JSON.parse(fn.arguments) : fn.arguments || {};
|
|
522
|
-
} catch {
|
|
523
|
-
return {
|
|
524
|
-
pass: true,
|
|
525
|
-
score: "WEAK",
|
|
526
|
-
hasToolCalls: true,
|
|
527
|
-
toolCall: `malformed args: ${String(fn.arguments)}`,
|
|
528
|
-
response: content,
|
|
529
|
-
elapsedMs: result.elapsedMs
|
|
530
|
-
};
|
|
531
|
-
}
|
|
532
|
-
const { score, pass } = scoreNativeToolCall(fn.name || "", args);
|
|
533
|
-
return {
|
|
534
|
-
pass,
|
|
535
|
-
score,
|
|
536
|
-
hasToolCalls: true,
|
|
537
|
-
toolCall: `${fn.name}(${JSON.stringify(args)})`,
|
|
538
|
-
response: content,
|
|
539
|
-
elapsedMs: result.elapsedMs
|
|
540
|
-
};
|
|
541
|
-
}
|
|
542
|
-
const textParsed = parseTextToolCall(content);
|
|
543
|
-
if (textParsed) {
|
|
544
|
-
const { score, pass } = scoreTextToolCall(textParsed.fnName, textParsed.args);
|
|
545
|
-
return {
|
|
546
|
-
pass,
|
|
547
|
-
score,
|
|
548
|
-
hasToolCalls: true,
|
|
549
|
-
toolCall: `${textParsed.fnName}(${JSON.stringify(textParsed.args)})`,
|
|
550
|
-
response: content,
|
|
551
|
-
elapsedMs: result.elapsedMs
|
|
552
|
-
};
|
|
553
|
-
}
|
|
554
|
-
return {
|
|
555
|
-
pass: false,
|
|
556
|
-
score: "FAIL",
|
|
557
|
-
hasToolCalls: false,
|
|
558
|
-
toolCall: "none",
|
|
559
|
-
response: content,
|
|
560
|
-
elapsedMs: result.elapsedMs
|
|
561
|
-
};
|
|
562
|
-
} catch (e) {
|
|
563
|
-
return { pass: false, score: "ERROR", hasToolCalls: false, toolCall: `error: ${e.message}`, response: "", elapsedMs: 0 };
|
|
564
|
-
}
|
|
326
|
+
return testToolUsageUnified(makeProviderChatFn(providerInfo), model);
|
|
565
327
|
}
|
|
566
328
|
async function testReactParsing(model) {
|
|
567
329
|
const systemPrompt = [
|
|
@@ -642,67 +404,35 @@ function model_test_temp_default(pi) {
|
|
|
642
404
|
}
|
|
643
405
|
}
|
|
644
406
|
} else {
|
|
645
|
-
const
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
isParen = true;
|
|
663
|
-
}
|
|
664
|
-
if (m) {
|
|
665
|
-
const toolName = m[1].trim().replace(/[`"']/g, "");
|
|
666
|
-
const rawArgs = m[2].trim().replace(/^```\w*\s*/gm, "").replace(/```\s*$/gm, "").trim();
|
|
667
|
-
let argsStr = "";
|
|
668
|
-
if (isParen && rawArgs && !rawArgs.startsWith("{")) {
|
|
669
|
-
const pairs = rawArgs.match(/(\w+)\s*:\s*("[^"]*"|'[^']*'|\S+)/g);
|
|
670
|
-
if (pairs) {
|
|
671
|
-
const obj = {};
|
|
672
|
-
for (const p of pairs) {
|
|
673
|
-
const ci = p.indexOf(":");
|
|
674
|
-
let v = p.slice(ci + 1).trim();
|
|
675
|
-
if (v.startsWith('"') && v.endsWith('"') || v.startsWith("'") && v.endsWith("'")) v = v.slice(1, -1);
|
|
676
|
-
obj[p.slice(0, ci).trim()] = v;
|
|
677
|
-
}
|
|
678
|
-
argsStr = JSON.stringify(obj);
|
|
679
|
-
} else {
|
|
680
|
-
argsStr = rawArgs;
|
|
681
|
-
}
|
|
682
|
-
} else {
|
|
683
|
-
const js = rawArgs.indexOf("{");
|
|
684
|
-
if (js !== -1) {
|
|
685
|
-
let d = 0, je = -1;
|
|
686
|
-
for (let i = js; i < rawArgs.length; i++) {
|
|
687
|
-
if (rawArgs[i] === "{") d++;
|
|
688
|
-
else if (rawArgs[i] === "}") {
|
|
689
|
-
d--;
|
|
690
|
-
if (d === 0) {
|
|
691
|
-
je = i;
|
|
407
|
+
for (const dp of ALL_DIALECT_PATTERNS) {
|
|
408
|
+
const result = parseReactWithPatterns(content, dp, true);
|
|
409
|
+
if (result) {
|
|
410
|
+
let argsStr;
|
|
411
|
+
const rawArgs = result.args ? JSON.stringify(result.args) : "";
|
|
412
|
+
if (rawArgs && rawArgs !== "{}") {
|
|
413
|
+
argsStr = rawArgs;
|
|
414
|
+
} else if (result.raw) {
|
|
415
|
+
const jsonStart = result.raw.indexOf("{");
|
|
416
|
+
if (jsonStart !== -1) {
|
|
417
|
+
let depth = 0, jsonEnd = -1;
|
|
418
|
+
for (let i = jsonStart; i < result.raw.length; i++) {
|
|
419
|
+
if (result.raw[i] === "{") depth++;
|
|
420
|
+
else if (result.raw[i] === "}") {
|
|
421
|
+
depth--;
|
|
422
|
+
if (depth === 0) {
|
|
423
|
+
jsonEnd = i;
|
|
692
424
|
break;
|
|
693
425
|
}
|
|
694
426
|
}
|
|
695
427
|
}
|
|
696
|
-
argsStr =
|
|
428
|
+
argsStr = jsonEnd !== -1 ? result.raw.slice(jsonStart, jsonEnd + 1) : "";
|
|
697
429
|
} else {
|
|
698
|
-
argsStr =
|
|
430
|
+
argsStr = "";
|
|
699
431
|
}
|
|
432
|
+
} else {
|
|
433
|
+
argsStr = "";
|
|
700
434
|
}
|
|
701
|
-
|
|
702
|
-
const thoughtRe = /Thought:\s*(.*?)(?=Action:|Function:|Tool:|Call:|Final Answer:|$)/is;
|
|
703
|
-
const tm = thoughtRe.exec(content);
|
|
704
|
-
if (tm) thought = tm[1].trim();
|
|
705
|
-
parsedResult = { name: toolName, args: argsStr, thought, dialect: dd.name };
|
|
435
|
+
parsedResult = { name: result.name, args: argsStr, thought: result.thought || "", dialect: result.dialect };
|
|
706
436
|
break;
|
|
707
437
|
}
|
|
708
438
|
}
|
|
@@ -763,158 +493,10 @@ function model_test_temp_default(pi) {
|
|
|
763
493
|
}
|
|
764
494
|
}
|
|
765
495
|
async function testInstructionFollowing(model) {
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
The JSON object must have exactly these 4 keys:
|
|
769
|
-
- "name" (string): your model name
|
|
770
|
-
- "can_count" (boolean): true
|
|
771
|
-
- "sum" (number): the result of 15 + 27
|
|
772
|
-
- "language" (string): the language you are responding in`;
|
|
773
|
-
try {
|
|
774
|
-
const { response, elapsedMs } = await ollamaChat(model, [
|
|
775
|
-
{ role: "user", content: prompt }
|
|
776
|
-
], { num_predict: CONFIG.NUM_PREDICT });
|
|
777
|
-
const msg = (response?.message?.content || "").trim();
|
|
778
|
-
let parsed = null;
|
|
779
|
-
let repairNote = "";
|
|
780
|
-
try {
|
|
781
|
-
const cleaned = msg.replace(/```json?\s*/gi, "").replace(/```/g, "").trim();
|
|
782
|
-
parsed = JSON.parse(cleaned);
|
|
783
|
-
} catch {
|
|
784
|
-
const cleaned = msg.replace(/```json?\s*/gi, "").replace(/```/g, "").trim();
|
|
785
|
-
let braceDepth = 0, bracketDepth = 0;
|
|
786
|
-
let inString = false, escapeNext = false;
|
|
787
|
-
for (let i = 0; i < cleaned.length; i++) {
|
|
788
|
-
const c = cleaned[i];
|
|
789
|
-
if (escapeNext) {
|
|
790
|
-
escapeNext = false;
|
|
791
|
-
continue;
|
|
792
|
-
}
|
|
793
|
-
if (c === "\\") {
|
|
794
|
-
if (inString) escapeNext = true;
|
|
795
|
-
continue;
|
|
796
|
-
}
|
|
797
|
-
if (c === '"') {
|
|
798
|
-
inString = !inString;
|
|
799
|
-
continue;
|
|
800
|
-
}
|
|
801
|
-
if (inString) continue;
|
|
802
|
-
if (c === "{") braceDepth++;
|
|
803
|
-
else if (c === "}") braceDepth = Math.max(0, braceDepth - 1);
|
|
804
|
-
else if (c === "[") bracketDepth++;
|
|
805
|
-
else if (c === "]") bracketDepth = Math.max(0, bracketDepth - 1);
|
|
806
|
-
}
|
|
807
|
-
if (braceDepth > 0 || bracketDepth > 0) {
|
|
808
|
-
const repaired = cleaned + "}".repeat(braceDepth) + "]".repeat(bracketDepth);
|
|
809
|
-
try {
|
|
810
|
-
parsed = JSON.parse(repaired);
|
|
811
|
-
repairNote = " (repaired truncated JSON)";
|
|
812
|
-
} catch {
|
|
813
|
-
}
|
|
814
|
-
}
|
|
815
|
-
}
|
|
816
|
-
if (!parsed) {
|
|
817
|
-
return { pass: false, score: "FAIL", output: sanitizeForReport(msg), elapsedMs };
|
|
818
|
-
}
|
|
819
|
-
const hasKeys = parsed.name && parsed.can_count !== void 0 && parsed.sum !== void 0 && parsed.language;
|
|
820
|
-
const correctSum = parsed.sum === 42;
|
|
821
|
-
const hasCorrectCount = parsed.can_count === true;
|
|
822
|
-
let score;
|
|
823
|
-
if (hasKeys && correctSum && hasCorrectCount) {
|
|
824
|
-
score = "STRONG";
|
|
825
|
-
} else if (hasKeys && (correctSum || hasCorrectCount)) {
|
|
826
|
-
score = "MODERATE";
|
|
827
|
-
} else if (parsed.sum !== void 0 || parsed.name) {
|
|
828
|
-
score = "WEAK";
|
|
829
|
-
} else {
|
|
830
|
-
score = "FAIL";
|
|
831
|
-
}
|
|
832
|
-
return {
|
|
833
|
-
pass: hasKeys,
|
|
834
|
-
score,
|
|
835
|
-
output: JSON.stringify(parsed) + repairNote,
|
|
836
|
-
elapsedMs
|
|
837
|
-
};
|
|
838
|
-
} catch (e) {
|
|
839
|
-
return { pass: false, score: "ERROR", output: e.message, elapsedMs: 0 };
|
|
840
|
-
}
|
|
496
|
+
return testInstructionFollowingUnified(makeOllamaChatFn(), model);
|
|
841
497
|
}
|
|
842
498
|
async function testInstructionFollowingProvider(providerInfo, model) {
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
The JSON object must have exactly these 4 keys:
|
|
846
|
-
- "name" (string): your model name
|
|
847
|
-
- "can_count" (boolean): true
|
|
848
|
-
- "sum" (number): the result of 15 + 27
|
|
849
|
-
- "language" (string): the language you are responding in`;
|
|
850
|
-
try {
|
|
851
|
-
const result = await providerChat(providerInfo, model, [
|
|
852
|
-
{ role: "user", content: prompt }
|
|
853
|
-
]);
|
|
854
|
-
const msg = result.content.trim();
|
|
855
|
-
let parsed = null;
|
|
856
|
-
let repairNote = "";
|
|
857
|
-
try {
|
|
858
|
-
const cleaned = msg.replace(/```json?\s*/gi, "").replace(/```/g, "").trim();
|
|
859
|
-
parsed = JSON.parse(cleaned);
|
|
860
|
-
} catch {
|
|
861
|
-
const cleaned = msg.replace(/```json?\s*/gi, "").replace(/```/g, "").trim();
|
|
862
|
-
let braceDepth = 0, bracketDepth = 0;
|
|
863
|
-
let inString = false, escapeNext = false;
|
|
864
|
-
for (let i = 0; i < cleaned.length; i++) {
|
|
865
|
-
const c = cleaned[i];
|
|
866
|
-
if (escapeNext) {
|
|
867
|
-
escapeNext = false;
|
|
868
|
-
continue;
|
|
869
|
-
}
|
|
870
|
-
if (c === "\\") {
|
|
871
|
-
if (inString) escapeNext = true;
|
|
872
|
-
continue;
|
|
873
|
-
}
|
|
874
|
-
if (c === '"') {
|
|
875
|
-
inString = !inString;
|
|
876
|
-
continue;
|
|
877
|
-
}
|
|
878
|
-
if (inString) continue;
|
|
879
|
-
if (c === "{") braceDepth++;
|
|
880
|
-
else if (c === "}") braceDepth = Math.max(0, braceDepth - 1);
|
|
881
|
-
else if (c === "[") bracketDepth++;
|
|
882
|
-
else if (c === "]") bracketDepth = Math.max(0, bracketDepth - 1);
|
|
883
|
-
}
|
|
884
|
-
if (braceDepth > 0 || bracketDepth > 0) {
|
|
885
|
-
const repaired = cleaned + "}".repeat(braceDepth) + "]".repeat(bracketDepth);
|
|
886
|
-
try {
|
|
887
|
-
parsed = JSON.parse(repaired);
|
|
888
|
-
repairNote = " (repaired truncated JSON)";
|
|
889
|
-
} catch {
|
|
890
|
-
}
|
|
891
|
-
}
|
|
892
|
-
}
|
|
893
|
-
if (!parsed) {
|
|
894
|
-
return { pass: false, score: "FAIL", output: sanitizeForReport(msg), elapsedMs: result.elapsedMs };
|
|
895
|
-
}
|
|
896
|
-
const hasKeys = parsed.name && parsed.can_count !== void 0 && parsed.sum !== void 0 && parsed.language;
|
|
897
|
-
const correctSum = parsed.sum === 42;
|
|
898
|
-
const hasCorrectCount = parsed.can_count === true;
|
|
899
|
-
let score;
|
|
900
|
-
if (hasKeys && correctSum && hasCorrectCount) {
|
|
901
|
-
score = "STRONG";
|
|
902
|
-
} else if (hasKeys && (correctSum || hasCorrectCount)) {
|
|
903
|
-
score = "MODERATE";
|
|
904
|
-
} else if (parsed.sum !== void 0 || parsed.name) {
|
|
905
|
-
score = "WEAK";
|
|
906
|
-
} else {
|
|
907
|
-
score = "FAIL";
|
|
908
|
-
}
|
|
909
|
-
return {
|
|
910
|
-
pass: hasKeys,
|
|
911
|
-
score,
|
|
912
|
-
output: JSON.stringify(parsed) + repairNote,
|
|
913
|
-
elapsedMs: result.elapsedMs
|
|
914
|
-
};
|
|
915
|
-
} catch (e) {
|
|
916
|
-
return { pass: false, score: "ERROR", output: e.message, elapsedMs: 0 };
|
|
917
|
-
}
|
|
499
|
+
return testInstructionFollowingUnified(makeProviderChatFn(providerInfo), model);
|
|
918
500
|
}
|
|
919
501
|
async function testToolSupport(model, family) {
|
|
920
502
|
const cached = getCachedToolSupport(model);
|
|
@@ -926,23 +508,7 @@ The JSON object must have exactly these 4 keys:
|
|
|
926
508
|
elapsedMs: 0
|
|
927
509
|
};
|
|
928
510
|
}
|
|
929
|
-
const tools = [
|
|
930
|
-
{
|
|
931
|
-
type: "function",
|
|
932
|
-
function: {
|
|
933
|
-
name: "get_weather",
|
|
934
|
-
description: "Get the current weather for a location",
|
|
935
|
-
parameters: {
|
|
936
|
-
type: "object",
|
|
937
|
-
properties: {
|
|
938
|
-
location: { type: "string", description: "City name" },
|
|
939
|
-
unit: { type: "string", enum: ["celsius", "fahrenheit"] }
|
|
940
|
-
},
|
|
941
|
-
required: ["location"]
|
|
942
|
-
}
|
|
943
|
-
}
|
|
944
|
-
}
|
|
945
|
-
];
|
|
511
|
+
const tools = [WEATHER_TOOL_DEFINITION];
|
|
946
512
|
const body = {
|
|
947
513
|
model,
|
|
948
514
|
messages: [
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vtstech/pi-model-test",
|
|
3
|
-
"version": "1.1.2",
|
|
3
|
+
"version": "1.1.4",
|
|
4
4
|
"description": "Model benchmark/testing extension for Pi Coding Agent",
|
|
5
5
|
"main": "model-test.js",
|
|
6
6
|
"keywords": ["pi-extensions"],
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
"url": "https://github.com/VTSTech/pi-coding-agent"
|
|
15
15
|
},
|
|
16
16
|
"dependencies": {
|
|
17
|
-
"@vtstech/pi-shared": "1.1.2"
|
|
17
|
+
"@vtstech/pi-shared": "1.1.4"
|
|
18
18
|
},
|
|
19
19
|
"peerDependencies": {
|
|
20
20
|
"@mariozechner/pi-coding-agent": ">=0.66"
|