@vtstech/pi-model-test 1.0.5 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/model-test.js +39 -28
- package/package.json +2 -2
package/model-test.js
CHANGED
|
@@ -12,20 +12,7 @@ import {
|
|
|
12
12
|
truncate,
|
|
13
13
|
sanitizeForReport
|
|
14
14
|
} from "@vtstech/pi-shared/format";
|
|
15
|
-
import { getOllamaBaseUrl, detectModelFamily, readModelsJson } from "@vtstech/pi-shared/ollama";
|
|
16
|
-
var BUILTIN_PROVIDERS = {
|
|
17
|
-
openrouter: { api: "openai-completions", baseUrl: "https://openrouter.ai/api/v1", envKey: "OPENROUTER_API_KEY" },
|
|
18
|
-
anthropic: { api: "anthropic-messages", baseUrl: "https://api.anthropic.com/v1", envKey: "ANTHROPIC_API_KEY" },
|
|
19
|
-
google: { api: "gemini", baseUrl: "https://generativelanguage.googleapis.com", envKey: "GOOGLE_API_KEY" },
|
|
20
|
-
openai: { api: "openai-completions", baseUrl: "https://api.openai.com/v1", envKey: "OPENAI_API_KEY" },
|
|
21
|
-
groq: { api: "openai-completions", baseUrl: "https://api.groq.com/v1", envKey: "GROQ_API_KEY" },
|
|
22
|
-
deepseek: { api: "openai-completions", baseUrl: "https://api.deepseek.com/v1", envKey: "DEEPSEEK_API_KEY" },
|
|
23
|
-
mistral: { api: "openai-completions", baseUrl: "https://api.mistral.ai/v1", envKey: "MISTRAL_API_KEY" },
|
|
24
|
-
xai: { api: "openai-completions", baseUrl: "https://api.x.ai/v1", envKey: "XAI_API_KEY" },
|
|
25
|
-
together: { api: "openai-completions", baseUrl: "https://api.together.xyz/v1", envKey: "TOGETHER_API_KEY" },
|
|
26
|
-
fireworks: { api: "openai-completions", baseUrl: "https://api.fireworks.ai/inference/v1", envKey: "FIREWORKS_API_KEY" },
|
|
27
|
-
cohere: { api: "cohere-chat", baseUrl: "https://api.cohere.com/v1", envKey: "COHERE_API_KEY" }
|
|
28
|
-
};
|
|
15
|
+
import { getOllamaBaseUrl, detectModelFamily, readModelsJson, BUILTIN_PROVIDERS } from "@vtstech/pi-shared/ollama";
|
|
29
16
|
function detectProvider(ctx) {
|
|
30
17
|
const model = ctx.model;
|
|
31
18
|
if (!model) return { kind: "unknown", name: "none" };
|
|
@@ -37,7 +24,7 @@ function detectProvider(ctx) {
|
|
|
37
24
|
const baseUrl = userProviderCfg.baseUrl || "";
|
|
38
25
|
const apiMode = userProviderCfg.api || "";
|
|
39
26
|
const apiKey = userProviderCfg.apiKey || "";
|
|
40
|
-
const isOllama = /ollama/i.test(providerName) || /localhost:\d+/.test(baseUrl) || /127\.0\.0\.1:\d+/.test(baseUrl) || /\/api\/chat/.test(baseUrl) || apiMode === "ollama";
|
|
27
|
+
const isOllama = /ollama/i.test(providerName) || /localhost:\d+/.test(baseUrl) || /127\.0\.0\.1:\d+/.test(baseUrl) || /0\.0\.0\.0:\d+/.test(baseUrl) || /\/api\/chat/.test(baseUrl) || apiMode === "ollama";
|
|
41
28
|
if (isOllama) {
|
|
42
29
|
return { kind: "ollama", name: providerName, apiMode: "ollama", baseUrl, apiKey };
|
|
43
30
|
}
|
|
@@ -521,16 +508,20 @@ function model_test_temp_default(pi) {
|
|
|
521
508
|
}
|
|
522
509
|
const hasCorrectTool = fn.name === "get_weather";
|
|
523
510
|
const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
|
|
511
|
+
const unitValid = args.unit === void 0 || typeof args.unit === "string" && ["celsius", "fahrenheit"].includes(args.unit.toLowerCase());
|
|
524
512
|
let score;
|
|
525
|
-
if (hasCorrectTool && hasLocation) {
|
|
513
|
+
if (hasCorrectTool && hasLocation && unitValid) {
|
|
526
514
|
score = "STRONG";
|
|
527
|
-
} else if (hasCorrectTool) {
|
|
515
|
+
} else if (hasCorrectTool && hasLocation) {
|
|
528
516
|
score = "MODERATE";
|
|
517
|
+
} else if (hasCorrectTool) {
|
|
518
|
+
score = "WEAK";
|
|
529
519
|
} else {
|
|
530
520
|
score = "WEAK";
|
|
531
521
|
}
|
|
522
|
+
const pass = score !== "WEAK";
|
|
532
523
|
return {
|
|
533
|
-
pass
|
|
524
|
+
pass,
|
|
534
525
|
score,
|
|
535
526
|
hasToolCalls: true,
|
|
536
527
|
toolCall: `${fn.name}(${JSON.stringify(args)})`,
|
|
@@ -564,8 +555,9 @@ function model_test_temp_default(pi) {
|
|
|
564
555
|
} else {
|
|
565
556
|
score = "WEAK";
|
|
566
557
|
}
|
|
558
|
+
const pass = score !== "WEAK";
|
|
567
559
|
return {
|
|
568
|
-
pass
|
|
560
|
+
pass,
|
|
569
561
|
score,
|
|
570
562
|
hasToolCalls: true,
|
|
571
563
|
toolCall: `${fnName}(${JSON.stringify(fnArgs)})`,
|
|
@@ -632,16 +624,20 @@ function model_test_temp_default(pi) {
|
|
|
632
624
|
}
|
|
633
625
|
const hasCorrectTool = fn.name === "get_weather";
|
|
634
626
|
const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
|
|
627
|
+
const unitValid = args.unit === void 0 || typeof args.unit === "string" && ["celsius", "fahrenheit"].includes(args.unit.toLowerCase());
|
|
635
628
|
let score;
|
|
636
|
-
if (hasCorrectTool && hasLocation) {
|
|
629
|
+
if (hasCorrectTool && hasLocation && unitValid) {
|
|
637
630
|
score = "STRONG";
|
|
638
|
-
} else if (hasCorrectTool) {
|
|
631
|
+
} else if (hasCorrectTool && hasLocation) {
|
|
639
632
|
score = "MODERATE";
|
|
633
|
+
} else if (hasCorrectTool) {
|
|
634
|
+
score = "WEAK";
|
|
640
635
|
} else {
|
|
641
636
|
score = "WEAK";
|
|
642
637
|
}
|
|
638
|
+
const pass = score !== "WEAK";
|
|
643
639
|
return {
|
|
644
|
-
pass
|
|
640
|
+
pass,
|
|
645
641
|
score,
|
|
646
642
|
hasToolCalls: true,
|
|
647
643
|
toolCall: `${fn.name}(${JSON.stringify(args)})`,
|
|
@@ -675,8 +671,9 @@ function model_test_temp_default(pi) {
|
|
|
675
671
|
} else {
|
|
676
672
|
score = "WEAK";
|
|
677
673
|
}
|
|
674
|
+
const pass = score !== "WEAK";
|
|
678
675
|
return {
|
|
679
|
-
pass
|
|
676
|
+
pass,
|
|
680
677
|
score,
|
|
681
678
|
hasToolCalls: true,
|
|
682
679
|
toolCall: `${fnName}(${JSON.stringify(fnArgs)})`,
|
|
@@ -754,7 +751,18 @@ function model_test_temp_default(pi) {
|
|
|
754
751
|
let match = ACTION_RE.exec(content);
|
|
755
752
|
if (!match) match = ACTION_RE_SAMELINE.exec(content);
|
|
756
753
|
let looseMatch = false;
|
|
757
|
-
if (!match)
|
|
754
|
+
if (!match) {
|
|
755
|
+
const looseResult = ACTION_RE_LOOSE.exec(content);
|
|
756
|
+
if (looseResult) {
|
|
757
|
+
const candidate = looseResult[1].trim().replace(/[`"']/g, "");
|
|
758
|
+
const isToolIdentifier = /^\w+$/.test(candidate) && (candidate.includes("_") || candidate.includes("-"));
|
|
759
|
+
const isKnownTool = /^(get_weather|calculate)$/i.test(candidate);
|
|
760
|
+
if (isToolIdentifier || isKnownTool) {
|
|
761
|
+
match = looseResult;
|
|
762
|
+
looseMatch = true;
|
|
763
|
+
}
|
|
764
|
+
}
|
|
765
|
+
}
|
|
758
766
|
let parenMatch = false;
|
|
759
767
|
if (!match) match = ACTION_RE_PAREN.exec(content), parenMatch = true;
|
|
760
768
|
if (match) {
|
|
@@ -825,8 +833,9 @@ function model_test_temp_default(pi) {
|
|
|
825
833
|
} else {
|
|
826
834
|
score = "WEAK";
|
|
827
835
|
}
|
|
836
|
+
const pass = score !== "WEAK";
|
|
828
837
|
return {
|
|
829
|
-
pass
|
|
838
|
+
pass,
|
|
830
839
|
score,
|
|
831
840
|
toolCall: `${toolName}(${argsStr})`,
|
|
832
841
|
thought,
|
|
@@ -1167,7 +1176,7 @@ The JSON object must have exactly these 4 keys:
|
|
|
1167
1176
|
}
|
|
1168
1177
|
}
|
|
1169
1178
|
const branding = [
|
|
1170
|
-
` \u26A1 Pi Model Benchmark v1.0.5`,
|
|
1179
|
+
` \u26A1 Pi Model Benchmark v1.0.7`,
|
|
1171
1180
|
` Written by VTSTech`,
|
|
1172
1181
|
` GitHub: https://github.com/VTSTech`,
|
|
1173
1182
|
` Website: www.vts-tech.org`
|
|
@@ -1349,11 +1358,13 @@ The JSON object must have exactly these 4 keys:
|
|
|
1349
1358
|
lines.push(info(`Cache: ${TOOL_SUPPORT_CACHE_PATH}`));
|
|
1350
1359
|
lines.push(section("SUMMARY"));
|
|
1351
1360
|
const totalMs = Date.now() - totalStart;
|
|
1361
|
+
const toolPass = tools.score === "STRONG" || tools.score === "MODERATE";
|
|
1362
|
+
const reactPass = react.score === "STRONG" || react.score === "MODERATE";
|
|
1352
1363
|
const tests = [
|
|
1353
1364
|
{ name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
|
|
1354
1365
|
{ name: "Thinking", pass: thinking.supported, score: thinking.supported ? "YES" : "NO" },
|
|
1355
|
-
{ name: "Tool Usage", pass:
|
|
1356
|
-
{ name: "ReAct Parse", pass:
|
|
1366
|
+
{ name: "Tool Usage", pass: toolPass, score: tools.score },
|
|
1367
|
+
{ name: "ReAct Parse", pass: reactPass, score: react.score },
|
|
1357
1368
|
{ name: "Instructions", pass: instructions.pass, score: instructions.score },
|
|
1358
1369
|
{ name: "Tool Support", pass: toolSupport.level === "native" || toolSupport.level === "react", score: toolSupport.level.toUpperCase() }
|
|
1359
1370
|
];
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vtstech/pi-model-test",
|
|
3
|
-
"version": "1.0.5",
|
|
3
|
+
"version": "1.0.7",
|
|
4
4
|
"description": "Model benchmark/testing extension for Pi Coding Agent",
|
|
5
5
|
"main": "model-test.js",
|
|
6
6
|
"keywords": ["pi-extensions"],
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
"url": "https://github.com/VTSTech/pi-coding-agent"
|
|
15
15
|
},
|
|
16
16
|
"dependencies": {
|
|
17
|
-
"@vtstech/pi-shared": "1.0.5"
|
|
17
|
+
"@vtstech/pi-shared": "1.0.7"
|
|
18
18
|
},
|
|
19
19
|
"peerDependencies": {
|
|
20
20
|
"@mariozechner/pi-coding-agent": ">=0.66"
|