@vtstech/pi-model-test 1.0.6 → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/model-test.js +37 -13
  2. package/package.json +2 -2
package/model-test.js CHANGED
@@ -508,16 +508,20 @@ function model_test_temp_default(pi) {
508
508
  }
509
509
  const hasCorrectTool = fn.name === "get_weather";
510
510
  const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
511
+ const unitValid = args.unit === void 0 || typeof args.unit === "string" && ["celsius", "fahrenheit"].includes(args.unit.toLowerCase());
511
512
  let score;
512
- if (hasCorrectTool && hasLocation) {
513
+ if (hasCorrectTool && hasLocation && unitValid) {
513
514
  score = "STRONG";
514
- } else if (hasCorrectTool) {
515
+ } else if (hasCorrectTool && hasLocation) {
515
516
  score = "MODERATE";
517
+ } else if (hasCorrectTool) {
518
+ score = "WEAK";
516
519
  } else {
517
520
  score = "WEAK";
518
521
  }
522
+ const pass = score !== "WEAK";
519
523
  return {
520
- pass: true,
524
+ pass,
521
525
  score,
522
526
  hasToolCalls: true,
523
527
  toolCall: `${fn.name}(${JSON.stringify(args)})`,
@@ -551,8 +555,9 @@ function model_test_temp_default(pi) {
551
555
  } else {
552
556
  score = "WEAK";
553
557
  }
558
+ const pass = score !== "WEAK";
554
559
  return {
555
- pass: true,
560
+ pass,
556
561
  score,
557
562
  hasToolCalls: true,
558
563
  toolCall: `${fnName}(${JSON.stringify(fnArgs)})`,
@@ -619,16 +624,20 @@ function model_test_temp_default(pi) {
619
624
  }
620
625
  const hasCorrectTool = fn.name === "get_weather";
621
626
  const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
627
+ const unitValid = args.unit === void 0 || typeof args.unit === "string" && ["celsius", "fahrenheit"].includes(args.unit.toLowerCase());
622
628
  let score;
623
- if (hasCorrectTool && hasLocation) {
629
+ if (hasCorrectTool && hasLocation && unitValid) {
624
630
  score = "STRONG";
625
- } else if (hasCorrectTool) {
631
+ } else if (hasCorrectTool && hasLocation) {
626
632
  score = "MODERATE";
633
+ } else if (hasCorrectTool) {
634
+ score = "WEAK";
627
635
  } else {
628
636
  score = "WEAK";
629
637
  }
638
+ const pass = score !== "WEAK";
630
639
  return {
631
- pass: true,
640
+ pass,
632
641
  score,
633
642
  hasToolCalls: true,
634
643
  toolCall: `${fn.name}(${JSON.stringify(args)})`,
@@ -662,8 +671,9 @@ function model_test_temp_default(pi) {
662
671
  } else {
663
672
  score = "WEAK";
664
673
  }
674
+ const pass = score !== "WEAK";
665
675
  return {
666
- pass: true,
676
+ pass,
667
677
  score,
668
678
  hasToolCalls: true,
669
679
  toolCall: `${fnName}(${JSON.stringify(fnArgs)})`,
@@ -741,7 +751,18 @@ function model_test_temp_default(pi) {
741
751
  let match = ACTION_RE.exec(content);
742
752
  if (!match) match = ACTION_RE_SAMELINE.exec(content);
743
753
  let looseMatch = false;
744
- if (!match) match = ACTION_RE_LOOSE.exec(content), looseMatch = true;
754
+ if (!match) {
755
+ const looseResult = ACTION_RE_LOOSE.exec(content);
756
+ if (looseResult) {
757
+ const candidate = looseResult[1].trim().replace(/[`"']/g, "");
758
+ const isToolIdentifier = /^\w+$/.test(candidate) && (candidate.includes("_") || candidate.includes("-"));
759
+ const isKnownTool = /^(get_weather|calculate)$/i.test(candidate);
760
+ if (isToolIdentifier || isKnownTool) {
761
+ match = looseResult;
762
+ looseMatch = true;
763
+ }
764
+ }
765
+ }
745
766
  let parenMatch = false;
746
767
  if (!match) match = ACTION_RE_PAREN.exec(content), parenMatch = true;
747
768
  if (match) {
@@ -812,8 +833,9 @@ function model_test_temp_default(pi) {
812
833
  } else {
813
834
  score = "WEAK";
814
835
  }
836
+ const pass = score !== "WEAK";
815
837
  return {
816
- pass: true,
838
+ pass,
817
839
  score,
818
840
  toolCall: `${toolName}(${argsStr})`,
819
841
  thought,
@@ -1154,7 +1176,7 @@ The JSON object must have exactly these 4 keys:
1154
1176
  }
1155
1177
  }
1156
1178
  const branding = [
1157
- ` \u26A1 Pi Model Benchmark v1.0.6`,
1179
+ ` \u26A1 Pi Model Benchmark v1.0.7`,
1158
1180
  ` Written by VTSTech`,
1159
1181
  ` GitHub: https://github.com/VTSTech`,
1160
1182
  ` Website: www.vts-tech.org`
@@ -1336,11 +1358,13 @@ The JSON object must have exactly these 4 keys:
1336
1358
  lines.push(info(`Cache: ${TOOL_SUPPORT_CACHE_PATH}`));
1337
1359
  lines.push(section("SUMMARY"));
1338
1360
  const totalMs = Date.now() - totalStart;
1361
+ const toolPass = tools.score === "STRONG" || tools.score === "MODERATE";
1362
+ const reactPass = react.score === "STRONG" || react.score === "MODERATE";
1339
1363
  const tests = [
1340
1364
  { name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
1341
1365
  { name: "Thinking", pass: thinking.supported, score: thinking.supported ? "YES" : "NO" },
1342
- { name: "Tool Usage", pass: tools.pass, score: tools.score },
1343
- { name: "ReAct Parse", pass: react.pass, score: react.score },
1366
+ { name: "Tool Usage", pass: toolPass, score: tools.score },
1367
+ { name: "ReAct Parse", pass: reactPass, score: react.score },
1344
1368
  { name: "Instructions", pass: instructions.pass, score: instructions.score },
1345
1369
  { name: "Tool Support", pass: toolSupport.level === "native" || toolSupport.level === "react", score: toolSupport.level.toUpperCase() }
1346
1370
  ];
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vtstech/pi-model-test",
3
- "version": "1.0.6",
3
+ "version": "1.0.7",
4
4
  "description": "Model benchmark/testing extension for Pi Coding Agent",
5
5
  "main": "model-test.js",
6
6
  "keywords": ["pi-extensions"],
@@ -14,7 +14,7 @@
14
14
  "url": "https://github.com/VTSTech/pi-coding-agent"
15
15
  },
16
16
  "dependencies": {
17
- "@vtstech/pi-shared": "1.0.6"
17
+ "@vtstech/pi-shared": "1.0.7"
18
18
  },
19
19
  "peerDependencies": {
20
20
  "@mariozechner/pi-coding-agent": ">=0.66"