@vtstech/pi-model-test 1.0.6 → 1.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/model-test.js +75 -31
- package/package.json +2 -2
package/model-test.js
CHANGED
|
@@ -12,7 +12,7 @@ import {
|
|
|
12
12
|
truncate,
|
|
13
13
|
sanitizeForReport
|
|
14
14
|
} from "@vtstech/pi-shared/format";
|
|
15
|
-
import { getOllamaBaseUrl, detectModelFamily, readModelsJson, BUILTIN_PROVIDERS } from "@vtstech/pi-shared/ollama";
|
|
15
|
+
import { getOllamaBaseUrl, detectModelFamily, readModelsJson, BUILTIN_PROVIDERS, fetchModelContextLength } from "@vtstech/pi-shared/ollama";
|
|
16
16
|
function detectProvider(ctx) {
|
|
17
17
|
const model = ctx.model;
|
|
18
18
|
if (!model) return { kind: "unknown", name: "none" };
|
|
@@ -55,15 +55,15 @@ function detectProvider(ctx) {
|
|
|
55
55
|
}
|
|
56
56
|
var CONFIG = {
|
|
57
57
|
// General API settings
|
|
58
|
-
DEFAULT_TIMEOUT_MS:
|
|
58
|
+
DEFAULT_TIMEOUT_MS: 999999,
|
|
59
59
|
// ~16.7 minutes - default timeout for model responses
|
|
60
|
-
CONNECT_TIMEOUT_S:
|
|
60
|
+
CONNECT_TIMEOUT_S: 60,
|
|
61
61
|
// 60 seconds to establish connection
|
|
62
62
|
MAX_RETRIES: 1,
|
|
63
63
|
// Single retry for transient failures
|
|
64
|
-
RETRY_DELAY_MS:
|
|
64
|
+
RETRY_DELAY_MS: 1e4,
|
|
65
65
|
// 10 seconds between retries
|
|
66
|
-
EXEC_BUFFER_MS:
|
|
66
|
+
EXEC_BUFFER_MS: 8e3,
|
|
67
67
|
// Extra buffer for exec timeout over curl timeout
|
|
68
68
|
// Model generation settings
|
|
69
69
|
NUM_PREDICT: 1024,
|
|
@@ -73,28 +73,28 @@ var CONFIG = {
|
|
|
73
73
|
// Test-specific settings
|
|
74
74
|
MIN_THINKING_LENGTH: 10,
|
|
75
75
|
// Minimum chars to consider thinking tokens valid
|
|
76
|
-
TOOL_TEST_TIMEOUT_MS:
|
|
76
|
+
TOOL_TEST_TIMEOUT_MS: 999999,
|
|
77
77
|
// ~16.7 minutes for tool usage tests
|
|
78
|
-
TOOL_TEST_MAX_TIME_S:
|
|
78
|
+
TOOL_TEST_MAX_TIME_S: 999999,
|
|
79
79
|
// Max curl time for tool tests (effectively unlimited)
|
|
80
|
-
TOOL_SUPPORT_TIMEOUT_MS:
|
|
80
|
+
TOOL_SUPPORT_TIMEOUT_MS: 999999,
|
|
81
81
|
// ~16.7 minutes for tool support detection
|
|
82
|
-
TOOL_SUPPORT_MAX_TIME_S:
|
|
82
|
+
TOOL_SUPPORT_MAX_TIME_S: 999999,
|
|
83
83
|
// Max curl time for tool support detection
|
|
84
84
|
// Metadata retrieval
|
|
85
85
|
TAGS_TIMEOUT_MS: 15e3,
|
|
86
86
|
// 15 seconds for /api/tags
|
|
87
|
-
TAGS_CONNECT_TIMEOUT_S:
|
|
87
|
+
TAGS_CONNECT_TIMEOUT_S: 30,
|
|
88
88
|
// 30 seconds connection timeout for tags
|
|
89
|
-
MODEL_INFO_TIMEOUT_MS:
|
|
89
|
+
MODEL_INFO_TIMEOUT_MS: 3e4,
|
|
90
90
|
// 30 seconds for model info lookup
|
|
91
91
|
// Provider API settings
|
|
92
|
-
PROVIDER_TIMEOUT_MS:
|
|
92
|
+
PROVIDER_TIMEOUT_MS: 999999,
|
|
93
93
|
// ~16.7 minutes for cloud provider API calls
|
|
94
|
-
PROVIDER_TOOL_TIMEOUT_MS:
|
|
94
|
+
PROVIDER_TOOL_TIMEOUT_MS: 12e4,
|
|
95
95
|
// 120 seconds for tool usage tests on providers
|
|
96
96
|
// Rate limiting
|
|
97
|
-
TEST_DELAY_MS:
|
|
97
|
+
TEST_DELAY_MS: 1e4
|
|
98
98
|
// 10 seconds between tests to avoid rate limiting
|
|
99
99
|
};
|
|
100
100
|
var TOOL_SUPPORT_CACHE_DIR = path.join(os.homedir(), ".pi", "agent", "cache");
|
|
@@ -508,16 +508,20 @@ function model_test_temp_default(pi) {
|
|
|
508
508
|
}
|
|
509
509
|
const hasCorrectTool = fn.name === "get_weather";
|
|
510
510
|
const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
|
|
511
|
+
const unitValid = args.unit === void 0 || typeof args.unit === "string" && ["celsius", "fahrenheit"].includes(args.unit.toLowerCase());
|
|
511
512
|
let score;
|
|
512
|
-
if (hasCorrectTool && hasLocation) {
|
|
513
|
+
if (hasCorrectTool && hasLocation && unitValid) {
|
|
513
514
|
score = "STRONG";
|
|
514
|
-
} else if (hasCorrectTool) {
|
|
515
|
+
} else if (hasCorrectTool && hasLocation) {
|
|
515
516
|
score = "MODERATE";
|
|
517
|
+
} else if (hasCorrectTool) {
|
|
518
|
+
score = "WEAK";
|
|
516
519
|
} else {
|
|
517
520
|
score = "WEAK";
|
|
518
521
|
}
|
|
522
|
+
const pass = score !== "WEAK";
|
|
519
523
|
return {
|
|
520
|
-
pass
|
|
524
|
+
pass,
|
|
521
525
|
score,
|
|
522
526
|
hasToolCalls: true,
|
|
523
527
|
toolCall: `${fn.name}(${JSON.stringify(args)})`,
|
|
@@ -551,8 +555,9 @@ function model_test_temp_default(pi) {
|
|
|
551
555
|
} else {
|
|
552
556
|
score = "WEAK";
|
|
553
557
|
}
|
|
558
|
+
const pass = score !== "WEAK";
|
|
554
559
|
return {
|
|
555
|
-
pass
|
|
560
|
+
pass,
|
|
556
561
|
score,
|
|
557
562
|
hasToolCalls: true,
|
|
558
563
|
toolCall: `${fnName}(${JSON.stringify(fnArgs)})`,
|
|
@@ -619,16 +624,20 @@ function model_test_temp_default(pi) {
|
|
|
619
624
|
}
|
|
620
625
|
const hasCorrectTool = fn.name === "get_weather";
|
|
621
626
|
const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
|
|
627
|
+
const unitValid = args.unit === void 0 || typeof args.unit === "string" && ["celsius", "fahrenheit"].includes(args.unit.toLowerCase());
|
|
622
628
|
let score;
|
|
623
|
-
if (hasCorrectTool && hasLocation) {
|
|
629
|
+
if (hasCorrectTool && hasLocation && unitValid) {
|
|
624
630
|
score = "STRONG";
|
|
625
|
-
} else if (hasCorrectTool) {
|
|
631
|
+
} else if (hasCorrectTool && hasLocation) {
|
|
626
632
|
score = "MODERATE";
|
|
633
|
+
} else if (hasCorrectTool) {
|
|
634
|
+
score = "WEAK";
|
|
627
635
|
} else {
|
|
628
636
|
score = "WEAK";
|
|
629
637
|
}
|
|
638
|
+
const pass = score !== "WEAK";
|
|
630
639
|
return {
|
|
631
|
-
pass
|
|
640
|
+
pass,
|
|
632
641
|
score,
|
|
633
642
|
hasToolCalls: true,
|
|
634
643
|
toolCall: `${fn.name}(${JSON.stringify(args)})`,
|
|
@@ -662,8 +671,9 @@ function model_test_temp_default(pi) {
|
|
|
662
671
|
} else {
|
|
663
672
|
score = "WEAK";
|
|
664
673
|
}
|
|
674
|
+
const pass = score !== "WEAK";
|
|
665
675
|
return {
|
|
666
|
-
pass
|
|
676
|
+
pass,
|
|
667
677
|
score,
|
|
668
678
|
hasToolCalls: true,
|
|
669
679
|
toolCall: `${fnName}(${JSON.stringify(fnArgs)})`,
|
|
@@ -741,7 +751,18 @@ function model_test_temp_default(pi) {
|
|
|
741
751
|
let match = ACTION_RE.exec(content);
|
|
742
752
|
if (!match) match = ACTION_RE_SAMELINE.exec(content);
|
|
743
753
|
let looseMatch = false;
|
|
744
|
-
if (!match)
|
|
754
|
+
if (!match) {
|
|
755
|
+
const looseResult = ACTION_RE_LOOSE.exec(content);
|
|
756
|
+
if (looseResult) {
|
|
757
|
+
const candidate = looseResult[1].trim().replace(/[`"']/g, "");
|
|
758
|
+
const isToolIdentifier = /^\w+$/.test(candidate) && (candidate.includes("_") || candidate.includes("-"));
|
|
759
|
+
const isKnownTool = /^(get_weather|calculate)$/i.test(candidate);
|
|
760
|
+
if (isToolIdentifier || isKnownTool) {
|
|
761
|
+
match = looseResult;
|
|
762
|
+
looseMatch = true;
|
|
763
|
+
}
|
|
764
|
+
}
|
|
765
|
+
}
|
|
745
766
|
let parenMatch = false;
|
|
746
767
|
if (!match) match = ACTION_RE_PAREN.exec(content), parenMatch = true;
|
|
747
768
|
if (match) {
|
|
@@ -812,8 +833,9 @@ function model_test_temp_default(pi) {
|
|
|
812
833
|
} else {
|
|
813
834
|
score = "WEAK";
|
|
814
835
|
}
|
|
836
|
+
const pass = score !== "WEAK";
|
|
815
837
|
return {
|
|
816
|
-
pass
|
|
838
|
+
pass,
|
|
817
839
|
score,
|
|
818
840
|
toolCall: `${toolName}(${argsStr})`,
|
|
819
841
|
thought,
|
|
@@ -1154,17 +1176,32 @@ The JSON object must have exactly these 4 keys:
|
|
|
1154
1176
|
}
|
|
1155
1177
|
}
|
|
1156
1178
|
const branding = [
|
|
1157
|
-
` \u26A1 Pi Model Benchmark v1.0.
|
|
1179
|
+
` \u26A1 Pi Model Benchmark v1.0.8`,
|
|
1158
1180
|
` Written by VTSTech`,
|
|
1159
1181
|
` GitHub: https://github.com/VTSTech`,
|
|
1160
1182
|
` Website: www.vts-tech.org`
|
|
1161
1183
|
].join("\n");
|
|
1162
|
-
async function testModelOllama(model) {
|
|
1184
|
+
async function testModelOllama(model, providerInfo, ctx) {
|
|
1163
1185
|
const lines = [];
|
|
1164
1186
|
const totalStart = Date.now();
|
|
1165
1187
|
lines.push(branding);
|
|
1166
1188
|
lines.push(section(`MODEL: ${model}`));
|
|
1167
1189
|
lines.push(info("Provider: Ollama (local/remote)"));
|
|
1190
|
+
const modelsJson = readModelsJson();
|
|
1191
|
+
let apiMode = "ollama";
|
|
1192
|
+
const providerName = ctx?.model?.provider || providerInfo?.name || "";
|
|
1193
|
+
if (providerName && modelsJson) {
|
|
1194
|
+
const providerCfg = (modelsJson.providers || {})[providerName];
|
|
1195
|
+
if (providerCfg) {
|
|
1196
|
+
apiMode = providerCfg.api || "ollama";
|
|
1197
|
+
}
|
|
1198
|
+
}
|
|
1199
|
+
lines.push(info(`API: ${apiMode}`));
|
|
1200
|
+
const nativeContext = await fetchModelContextLength(OLLAMA_BASE, model);
|
|
1201
|
+
if (nativeContext !== void 0) {
|
|
1202
|
+
const ctxStr = nativeContext >= 1e3 ? `${(nativeContext / 1e3).toFixed(1)}k` : String(nativeContext);
|
|
1203
|
+
lines.push(info(`Context: ${ctxStr} tokens (native max)`));
|
|
1204
|
+
}
|
|
1168
1205
|
let modelSize = "unknown";
|
|
1169
1206
|
let modelFamily = "unknown";
|
|
1170
1207
|
let modelParams = "unknown";
|
|
@@ -1336,11 +1373,13 @@ The JSON object must have exactly these 4 keys:
|
|
|
1336
1373
|
lines.push(info(`Cache: ${TOOL_SUPPORT_CACHE_PATH}`));
|
|
1337
1374
|
lines.push(section("SUMMARY"));
|
|
1338
1375
|
const totalMs = Date.now() - totalStart;
|
|
1376
|
+
const toolPass = tools.score === "STRONG" || tools.score === "MODERATE";
|
|
1377
|
+
const reactPass = react.score === "STRONG" || react.score === "MODERATE";
|
|
1339
1378
|
const tests = [
|
|
1340
1379
|
{ name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
|
|
1341
1380
|
{ name: "Thinking", pass: thinking.supported, score: thinking.supported ? "YES" : "NO" },
|
|
1342
|
-
{ name: "Tool Usage", pass:
|
|
1343
|
-
{ name: "ReAct Parse", pass:
|
|
1381
|
+
{ name: "Tool Usage", pass: toolPass, score: tools.score },
|
|
1382
|
+
{ name: "ReAct Parse", pass: reactPass, score: react.score },
|
|
1344
1383
|
{ name: "Instructions", pass: instructions.pass, score: instructions.score },
|
|
1345
1384
|
{ name: "Tool Support", pass: toolSupport.level === "native" || toolSupport.level === "react", score: toolSupport.level.toUpperCase() }
|
|
1346
1385
|
];
|
|
@@ -1363,7 +1402,7 @@ The JSON object must have exactly these 4 keys:
|
|
|
1363
1402
|
}
|
|
1364
1403
|
return lines.join("\n");
|
|
1365
1404
|
}
|
|
1366
|
-
async function testModelProvider(providerInfo, model) {
|
|
1405
|
+
async function testModelProvider(providerInfo, model, ctx) {
|
|
1367
1406
|
const lines = [];
|
|
1368
1407
|
const totalStart = Date.now();
|
|
1369
1408
|
lines.push(branding);
|
|
@@ -1376,6 +1415,11 @@ The JSON object must have exactly these 4 keys:
|
|
|
1376
1415
|
} else {
|
|
1377
1416
|
lines.push(warn(`API Key: NOT SET (${providerInfo.envKey || "env var not found"})`));
|
|
1378
1417
|
}
|
|
1418
|
+
const contextWindow = ctx?.model?.contextWindow ?? null;
|
|
1419
|
+
if (contextWindow !== null) {
|
|
1420
|
+
const ctxStr = contextWindow >= 1e3 ? `${(contextWindow / 1e3).toFixed(1)}k` : String(contextWindow);
|
|
1421
|
+
lines.push(info(`Context: ${ctxStr} tokens`));
|
|
1422
|
+
}
|
|
1379
1423
|
lines.push(section("CONNECTIVITY TEST"));
|
|
1380
1424
|
lines.push(info("Sending minimal request to verify API reachability and key validity..."));
|
|
1381
1425
|
const connectivity = await testConnectivity(providerInfo, model);
|
|
@@ -1496,9 +1540,9 @@ The JSON object must have exactly these 4 keys:
|
|
|
1496
1540
|
async function testModel(model, ctx) {
|
|
1497
1541
|
const providerInfo = ctx ? detectProvider(ctx) : { kind: "ollama", name: "ollama" };
|
|
1498
1542
|
if (providerInfo.kind === "ollama") {
|
|
1499
|
-
return testModelOllama(model);
|
|
1543
|
+
return testModelOllama(model, providerInfo, ctx);
|
|
1500
1544
|
} else if (providerInfo.kind === "builtin") {
|
|
1501
|
-
return testModelProvider(providerInfo, model);
|
|
1545
|
+
return testModelProvider(providerInfo, model, ctx);
|
|
1502
1546
|
} else {
|
|
1503
1547
|
return testModelOllama(model);
|
|
1504
1548
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vtstech/pi-model-test",
|
|
3
|
-
"version": "1.0.6",
|
|
3
|
+
"version": "1.0.8",
|
|
4
4
|
"description": "Model benchmark/testing extension for Pi Coding Agent",
|
|
5
5
|
"main": "model-test.js",
|
|
6
6
|
"keywords": ["pi-extensions"],
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
"url": "https://github.com/VTSTech/pi-coding-agent"
|
|
15
15
|
},
|
|
16
16
|
"dependencies": {
|
|
17
|
-
"@vtstech/pi-shared": "1.0.
|
|
17
|
+
"@vtstech/pi-shared": "1.0.8"
|
|
18
18
|
},
|
|
19
19
|
"peerDependencies": {
|
|
20
20
|
"@mariozechner/pi-coding-agent": ">=0.66"
|