@vtstech/pi-model-test 1.0.6 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/model-test.js +37 -13
- package/package.json +2 -2
package/model-test.js
CHANGED
|
@@ -508,16 +508,20 @@ function model_test_temp_default(pi) {
|
|
|
508
508
|
}
|
|
509
509
|
const hasCorrectTool = fn.name === "get_weather";
|
|
510
510
|
const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
|
|
511
|
+
const unitValid = args.unit === void 0 || typeof args.unit === "string" && ["celsius", "fahrenheit"].includes(args.unit.toLowerCase());
|
|
511
512
|
let score;
|
|
512
|
-
if (hasCorrectTool && hasLocation) {
|
|
513
|
+
if (hasCorrectTool && hasLocation && unitValid) {
|
|
513
514
|
score = "STRONG";
|
|
514
|
-
} else if (hasCorrectTool) {
|
|
515
|
+
} else if (hasCorrectTool && hasLocation) {
|
|
515
516
|
score = "MODERATE";
|
|
517
|
+
} else if (hasCorrectTool) {
|
|
518
|
+
score = "WEAK";
|
|
516
519
|
} else {
|
|
517
520
|
score = "WEAK";
|
|
518
521
|
}
|
|
522
|
+
const pass = score !== "WEAK";
|
|
519
523
|
return {
|
|
520
|
-
pass
|
|
524
|
+
pass,
|
|
521
525
|
score,
|
|
522
526
|
hasToolCalls: true,
|
|
523
527
|
toolCall: `${fn.name}(${JSON.stringify(args)})`,
|
|
@@ -551,8 +555,9 @@ function model_test_temp_default(pi) {
|
|
|
551
555
|
} else {
|
|
552
556
|
score = "WEAK";
|
|
553
557
|
}
|
|
558
|
+
const pass = score !== "WEAK";
|
|
554
559
|
return {
|
|
555
|
-
pass
|
|
560
|
+
pass,
|
|
556
561
|
score,
|
|
557
562
|
hasToolCalls: true,
|
|
558
563
|
toolCall: `${fnName}(${JSON.stringify(fnArgs)})`,
|
|
@@ -619,16 +624,20 @@ function model_test_temp_default(pi) {
|
|
|
619
624
|
}
|
|
620
625
|
const hasCorrectTool = fn.name === "get_weather";
|
|
621
626
|
const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
|
|
627
|
+
const unitValid = args.unit === void 0 || typeof args.unit === "string" && ["celsius", "fahrenheit"].includes(args.unit.toLowerCase());
|
|
622
628
|
let score;
|
|
623
|
-
if (hasCorrectTool && hasLocation) {
|
|
629
|
+
if (hasCorrectTool && hasLocation && unitValid) {
|
|
624
630
|
score = "STRONG";
|
|
625
|
-
} else if (hasCorrectTool) {
|
|
631
|
+
} else if (hasCorrectTool && hasLocation) {
|
|
626
632
|
score = "MODERATE";
|
|
633
|
+
} else if (hasCorrectTool) {
|
|
634
|
+
score = "WEAK";
|
|
627
635
|
} else {
|
|
628
636
|
score = "WEAK";
|
|
629
637
|
}
|
|
638
|
+
const pass = score !== "WEAK";
|
|
630
639
|
return {
|
|
631
|
-
pass
|
|
640
|
+
pass,
|
|
632
641
|
score,
|
|
633
642
|
hasToolCalls: true,
|
|
634
643
|
toolCall: `${fn.name}(${JSON.stringify(args)})`,
|
|
@@ -662,8 +671,9 @@ function model_test_temp_default(pi) {
|
|
|
662
671
|
} else {
|
|
663
672
|
score = "WEAK";
|
|
664
673
|
}
|
|
674
|
+
const pass = score !== "WEAK";
|
|
665
675
|
return {
|
|
666
|
-
pass
|
|
676
|
+
pass,
|
|
667
677
|
score,
|
|
668
678
|
hasToolCalls: true,
|
|
669
679
|
toolCall: `${fnName}(${JSON.stringify(fnArgs)})`,
|
|
@@ -741,7 +751,18 @@ function model_test_temp_default(pi) {
|
|
|
741
751
|
let match = ACTION_RE.exec(content);
|
|
742
752
|
if (!match) match = ACTION_RE_SAMELINE.exec(content);
|
|
743
753
|
let looseMatch = false;
|
|
744
|
-
if (!match)
|
|
754
|
+
if (!match) {
|
|
755
|
+
const looseResult = ACTION_RE_LOOSE.exec(content);
|
|
756
|
+
if (looseResult) {
|
|
757
|
+
const candidate = looseResult[1].trim().replace(/[`"']/g, "");
|
|
758
|
+
const isToolIdentifier = /^\w+$/.test(candidate) && (candidate.includes("_") || candidate.includes("-"));
|
|
759
|
+
const isKnownTool = /^(get_weather|calculate)$/i.test(candidate);
|
|
760
|
+
if (isToolIdentifier || isKnownTool) {
|
|
761
|
+
match = looseResult;
|
|
762
|
+
looseMatch = true;
|
|
763
|
+
}
|
|
764
|
+
}
|
|
765
|
+
}
|
|
745
766
|
let parenMatch = false;
|
|
746
767
|
if (!match) match = ACTION_RE_PAREN.exec(content), parenMatch = true;
|
|
747
768
|
if (match) {
|
|
@@ -812,8 +833,9 @@ function model_test_temp_default(pi) {
|
|
|
812
833
|
} else {
|
|
813
834
|
score = "WEAK";
|
|
814
835
|
}
|
|
836
|
+
const pass = score !== "WEAK";
|
|
815
837
|
return {
|
|
816
|
-
pass
|
|
838
|
+
pass,
|
|
817
839
|
score,
|
|
818
840
|
toolCall: `${toolName}(${argsStr})`,
|
|
819
841
|
thought,
|
|
@@ -1154,7 +1176,7 @@ The JSON object must have exactly these 4 keys:
|
|
|
1154
1176
|
}
|
|
1155
1177
|
}
|
|
1156
1178
|
const branding = [
|
|
1157
|
-
` \u26A1 Pi Model Benchmark v1.0.
|
|
1179
|
+
` \u26A1 Pi Model Benchmark v1.0.7`,
|
|
1158
1180
|
` Written by VTSTech`,
|
|
1159
1181
|
` GitHub: https://github.com/VTSTech`,
|
|
1160
1182
|
` Website: www.vts-tech.org`
|
|
@@ -1336,11 +1358,13 @@ The JSON object must have exactly these 4 keys:
|
|
|
1336
1358
|
lines.push(info(`Cache: ${TOOL_SUPPORT_CACHE_PATH}`));
|
|
1337
1359
|
lines.push(section("SUMMARY"));
|
|
1338
1360
|
const totalMs = Date.now() - totalStart;
|
|
1361
|
+
const toolPass = tools.score === "STRONG" || tools.score === "MODERATE";
|
|
1362
|
+
const reactPass = react.score === "STRONG" || react.score === "MODERATE";
|
|
1339
1363
|
const tests = [
|
|
1340
1364
|
{ name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
|
|
1341
1365
|
{ name: "Thinking", pass: thinking.supported, score: thinking.supported ? "YES" : "NO" },
|
|
1342
|
-
{ name: "Tool Usage", pass:
|
|
1343
|
-
{ name: "ReAct Parse", pass:
|
|
1366
|
+
{ name: "Tool Usage", pass: toolPass, score: tools.score },
|
|
1367
|
+
{ name: "ReAct Parse", pass: reactPass, score: react.score },
|
|
1344
1368
|
{ name: "Instructions", pass: instructions.pass, score: instructions.score },
|
|
1345
1369
|
{ name: "Tool Support", pass: toolSupport.level === "native" || toolSupport.level === "react", score: toolSupport.level.toUpperCase() }
|
|
1346
1370
|
];
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vtstech/pi-model-test",
|
|
3
|
-
"version": "1.0.6",
|
|
3
|
+
"version": "1.0.7",
|
|
4
4
|
"description": "Model benchmark/testing extension for Pi Coding Agent",
|
|
5
5
|
"main": "model-test.js",
|
|
6
6
|
"keywords": ["pi-extensions"],
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
"url": "https://github.com/VTSTech/pi-coding-agent"
|
|
15
15
|
},
|
|
16
16
|
"dependencies": {
|
|
17
|
-
"@vtstech/pi-shared": "1.0.
|
|
17
|
+
"@vtstech/pi-shared": "1.0.7"
|
|
18
18
|
},
|
|
19
19
|
"peerDependencies": {
|
|
20
20
|
"@mariozechner/pi-coding-agent": ">=0.66"
|