@vtstech/pi-model-test 1.0.5 → 1.0.7

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/model-test.js +39 -28
  2. package/package.json +2 -2
package/model-test.js CHANGED
@@ -12,20 +12,7 @@ import {
12
12
  truncate,
13
13
  sanitizeForReport
14
14
  } from "@vtstech/pi-shared/format";
15
- import { getOllamaBaseUrl, detectModelFamily, readModelsJson } from "@vtstech/pi-shared/ollama";
16
- var BUILTIN_PROVIDERS = {
17
- openrouter: { api: "openai-completions", baseUrl: "https://openrouter.ai/api/v1", envKey: "OPENROUTER_API_KEY" },
18
- anthropic: { api: "anthropic-messages", baseUrl: "https://api.anthropic.com/v1", envKey: "ANTHROPIC_API_KEY" },
19
- google: { api: "gemini", baseUrl: "https://generativelanguage.googleapis.com", envKey: "GOOGLE_API_KEY" },
20
- openai: { api: "openai-completions", baseUrl: "https://api.openai.com/v1", envKey: "OPENAI_API_KEY" },
21
- groq: { api: "openai-completions", baseUrl: "https://api.groq.com/v1", envKey: "GROQ_API_KEY" },
22
- deepseek: { api: "openai-completions", baseUrl: "https://api.deepseek.com/v1", envKey: "DEEPSEEK_API_KEY" },
23
- mistral: { api: "openai-completions", baseUrl: "https://api.mistral.ai/v1", envKey: "MISTRAL_API_KEY" },
24
- xai: { api: "openai-completions", baseUrl: "https://api.x.ai/v1", envKey: "XAI_API_KEY" },
25
- together: { api: "openai-completions", baseUrl: "https://api.together.xyz/v1", envKey: "TOGETHER_API_KEY" },
26
- fireworks: { api: "openai-completions", baseUrl: "https://api.fireworks.ai/inference/v1", envKey: "FIREWORKS_API_KEY" },
27
- cohere: { api: "cohere-chat", baseUrl: "https://api.cohere.com/v1", envKey: "COHERE_API_KEY" }
28
- };
15
+ import { getOllamaBaseUrl, detectModelFamily, readModelsJson, BUILTIN_PROVIDERS } from "@vtstech/pi-shared/ollama";
29
16
  function detectProvider(ctx) {
30
17
  const model = ctx.model;
31
18
  if (!model) return { kind: "unknown", name: "none" };
@@ -37,7 +24,7 @@ function detectProvider(ctx) {
37
24
  const baseUrl = userProviderCfg.baseUrl || "";
38
25
  const apiMode = userProviderCfg.api || "";
39
26
  const apiKey = userProviderCfg.apiKey || "";
40
- const isOllama = /ollama/i.test(providerName) || /localhost:\d+/.test(baseUrl) || /127\.0\.0\.1:\d+/.test(baseUrl) || /\/api\/chat/.test(baseUrl) || apiMode === "ollama";
27
+ const isOllama = /ollama/i.test(providerName) || /localhost:\d+/.test(baseUrl) || /127\.0\.0\.1:\d+/.test(baseUrl) || /0\.0\.0\.0:\d+/.test(baseUrl) || /\/api\/chat/.test(baseUrl) || apiMode === "ollama";
41
28
  if (isOllama) {
42
29
  return { kind: "ollama", name: providerName, apiMode: "ollama", baseUrl, apiKey };
43
30
  }
@@ -521,16 +508,20 @@ function model_test_temp_default(pi) {
521
508
  }
522
509
  const hasCorrectTool = fn.name === "get_weather";
523
510
  const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
511
+ const unitValid = args.unit === void 0 || typeof args.unit === "string" && ["celsius", "fahrenheit"].includes(args.unit.toLowerCase());
524
512
  let score;
525
- if (hasCorrectTool && hasLocation) {
513
+ if (hasCorrectTool && hasLocation && unitValid) {
526
514
  score = "STRONG";
527
- } else if (hasCorrectTool) {
515
+ } else if (hasCorrectTool && hasLocation) {
528
516
  score = "MODERATE";
517
+ } else if (hasCorrectTool) {
518
+ score = "WEAK";
529
519
  } else {
530
520
  score = "WEAK";
531
521
  }
522
+ const pass = score !== "WEAK";
532
523
  return {
533
- pass: true,
524
+ pass,
534
525
  score,
535
526
  hasToolCalls: true,
536
527
  toolCall: `${fn.name}(${JSON.stringify(args)})`,
@@ -564,8 +555,9 @@ function model_test_temp_default(pi) {
564
555
  } else {
565
556
  score = "WEAK";
566
557
  }
558
+ const pass = score !== "WEAK";
567
559
  return {
568
- pass: true,
560
+ pass,
569
561
  score,
570
562
  hasToolCalls: true,
571
563
  toolCall: `${fnName}(${JSON.stringify(fnArgs)})`,
@@ -632,16 +624,20 @@ function model_test_temp_default(pi) {
632
624
  }
633
625
  const hasCorrectTool = fn.name === "get_weather";
634
626
  const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
627
+ const unitValid = args.unit === void 0 || typeof args.unit === "string" && ["celsius", "fahrenheit"].includes(args.unit.toLowerCase());
635
628
  let score;
636
- if (hasCorrectTool && hasLocation) {
629
+ if (hasCorrectTool && hasLocation && unitValid) {
637
630
  score = "STRONG";
638
- } else if (hasCorrectTool) {
631
+ } else if (hasCorrectTool && hasLocation) {
639
632
  score = "MODERATE";
633
+ } else if (hasCorrectTool) {
634
+ score = "WEAK";
640
635
  } else {
641
636
  score = "WEAK";
642
637
  }
638
+ const pass = score !== "WEAK";
643
639
  return {
644
- pass: true,
640
+ pass,
645
641
  score,
646
642
  hasToolCalls: true,
647
643
  toolCall: `${fn.name}(${JSON.stringify(args)})`,
@@ -675,8 +671,9 @@ function model_test_temp_default(pi) {
675
671
  } else {
676
672
  score = "WEAK";
677
673
  }
674
+ const pass = score !== "WEAK";
678
675
  return {
679
- pass: true,
676
+ pass,
680
677
  score,
681
678
  hasToolCalls: true,
682
679
  toolCall: `${fnName}(${JSON.stringify(fnArgs)})`,
@@ -754,7 +751,18 @@ function model_test_temp_default(pi) {
754
751
  let match = ACTION_RE.exec(content);
755
752
  if (!match) match = ACTION_RE_SAMELINE.exec(content);
756
753
  let looseMatch = false;
757
- if (!match) match = ACTION_RE_LOOSE.exec(content), looseMatch = true;
754
+ if (!match) {
755
+ const looseResult = ACTION_RE_LOOSE.exec(content);
756
+ if (looseResult) {
757
+ const candidate = looseResult[1].trim().replace(/[`"']/g, "");
758
+ const isToolIdentifier = /^\w+$/.test(candidate) && (candidate.includes("_") || candidate.includes("-"));
759
+ const isKnownTool = /^(get_weather|calculate)$/i.test(candidate);
760
+ if (isToolIdentifier || isKnownTool) {
761
+ match = looseResult;
762
+ looseMatch = true;
763
+ }
764
+ }
765
+ }
758
766
  let parenMatch = false;
759
767
  if (!match) match = ACTION_RE_PAREN.exec(content), parenMatch = true;
760
768
  if (match) {
@@ -825,8 +833,9 @@ function model_test_temp_default(pi) {
825
833
  } else {
826
834
  score = "WEAK";
827
835
  }
836
+ const pass = score !== "WEAK";
828
837
  return {
829
- pass: true,
838
+ pass,
830
839
  score,
831
840
  toolCall: `${toolName}(${argsStr})`,
832
841
  thought,
@@ -1167,7 +1176,7 @@ The JSON object must have exactly these 4 keys:
1167
1176
  }
1168
1177
  }
1169
1178
  const branding = [
1170
- ` \u26A1 Pi Model Benchmark v1.0.5`,
1179
+ ` \u26A1 Pi Model Benchmark v1.0.7`,
1171
1180
  ` Written by VTSTech`,
1172
1181
  ` GitHub: https://github.com/VTSTech`,
1173
1182
  ` Website: www.vts-tech.org`
@@ -1349,11 +1358,13 @@ The JSON object must have exactly these 4 keys:
1349
1358
  lines.push(info(`Cache: ${TOOL_SUPPORT_CACHE_PATH}`));
1350
1359
  lines.push(section("SUMMARY"));
1351
1360
  const totalMs = Date.now() - totalStart;
1361
+ const toolPass = tools.score === "STRONG" || tools.score === "MODERATE";
1362
+ const reactPass = react.score === "STRONG" || react.score === "MODERATE";
1352
1363
  const tests = [
1353
1364
  { name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
1354
1365
  { name: "Thinking", pass: thinking.supported, score: thinking.supported ? "YES" : "NO" },
1355
- { name: "Tool Usage", pass: tools.pass, score: tools.score },
1356
- { name: "ReAct Parse", pass: react.pass, score: react.score },
1366
+ { name: "Tool Usage", pass: toolPass, score: tools.score },
1367
+ { name: "ReAct Parse", pass: reactPass, score: react.score },
1357
1368
  { name: "Instructions", pass: instructions.pass, score: instructions.score },
1358
1369
  { name: "Tool Support", pass: toolSupport.level === "native" || toolSupport.level === "react", score: toolSupport.level.toUpperCase() }
1359
1370
  ];
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vtstech/pi-model-test",
3
- "version": "1.0.5",
3
+ "version": "1.0.7",
4
4
  "description": "Model benchmark/testing extension for Pi Coding Agent",
5
5
  "main": "model-test.js",
6
6
  "keywords": ["pi-extensions"],
@@ -14,7 +14,7 @@
14
14
  "url": "https://github.com/VTSTech/pi-coding-agent"
15
15
  },
16
16
  "dependencies": {
17
- "@vtstech/pi-shared": "1.0.5"
17
+ "@vtstech/pi-shared": "1.0.7"
18
18
  },
19
19
  "peerDependencies": {
20
20
  "@mariozechner/pi-coding-agent": ">=0.66"