@vtstech/pi-model-test 1.0.6 → 1.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/model-test.js +75 -31
  2. package/package.json +2 -2
package/model-test.js CHANGED
@@ -12,7 +12,7 @@ import {
12
12
  truncate,
13
13
  sanitizeForReport
14
14
  } from "@vtstech/pi-shared/format";
15
- import { getOllamaBaseUrl, detectModelFamily, readModelsJson, BUILTIN_PROVIDERS } from "@vtstech/pi-shared/ollama";
15
+ import { getOllamaBaseUrl, detectModelFamily, readModelsJson, BUILTIN_PROVIDERS, fetchModelContextLength } from "@vtstech/pi-shared/ollama";
16
16
  function detectProvider(ctx) {
17
17
  const model = ctx.model;
18
18
  if (!model) return { kind: "unknown", name: "none" };
@@ -55,15 +55,15 @@ function detectProvider(ctx) {
55
55
  }
56
56
  var CONFIG = {
57
57
  // General API settings
58
- DEFAULT_TIMEOUT_MS: 6e5,
58
+ DEFAULT_TIMEOUT_MS: 999999,
59
59
  // ~16.7 minutes (999999 ms) - default timeout for model responses
60
- CONNECT_TIMEOUT_S: 30,
60
+ CONNECT_TIMEOUT_S: 60,
61
61
  // 60 seconds to establish connection
62
62
  MAX_RETRIES: 1,
63
63
  // Single retry for transient failures
64
- RETRY_DELAY_MS: 2e3,
64
+ RETRY_DELAY_MS: 1e4,
65
65
  // 10 seconds between retries
66
- EXEC_BUFFER_MS: 5e3,
66
+ EXEC_BUFFER_MS: 8e3,
67
67
  // Extra buffer for exec timeout over curl timeout
68
68
  // Model generation settings
69
69
  NUM_PREDICT: 1024,
@@ -73,28 +73,28 @@ var CONFIG = {
73
73
  // Test-specific settings
74
74
  MIN_THINKING_LENGTH: 10,
75
75
  // Minimum chars to consider thinking tokens valid
76
- TOOL_TEST_TIMEOUT_MS: 9e4,
76
+ TOOL_TEST_TIMEOUT_MS: 999999,
77
77
  // ~16.7 minutes (999999 ms) for tool usage tests
78
- TOOL_TEST_MAX_TIME_S: 9999,
78
+ TOOL_TEST_MAX_TIME_S: 999999,
79
79
  // Max curl time for tool tests (effectively unlimited)
80
- TOOL_SUPPORT_TIMEOUT_MS: 26e4,
80
+ TOOL_SUPPORT_TIMEOUT_MS: 999999,
81
81
  // ~16.7 minutes (999999 ms) for tool support detection
82
- TOOL_SUPPORT_MAX_TIME_S: 240,
82
+ TOOL_SUPPORT_MAX_TIME_S: 999999,
83
83
  // Max curl time for tool support detection
84
84
  // Metadata retrieval
85
85
  TAGS_TIMEOUT_MS: 15e3,
86
86
  // 15 seconds for /api/tags
87
- TAGS_CONNECT_TIMEOUT_S: 10,
87
+ TAGS_CONNECT_TIMEOUT_S: 30,
88
88
  // 30 seconds connection timeout for tags
89
- MODEL_INFO_TIMEOUT_MS: 1e4,
89
+ MODEL_INFO_TIMEOUT_MS: 3e4,
90
90
  // 30 seconds for model info lookup
91
91
  // Provider API settings
92
- PROVIDER_TIMEOUT_MS: 12e4,
92
+ PROVIDER_TIMEOUT_MS: 999999,
93
93
  // ~16.7 minutes (999999 ms) for cloud provider API calls
94
- PROVIDER_TOOL_TIMEOUT_MS: 6e4,
94
+ PROVIDER_TOOL_TIMEOUT_MS: 12e4,
95
95
  // 120 seconds for tool usage tests on providers
96
96
  // Rate limiting
97
- TEST_DELAY_MS: 3e4
97
+ TEST_DELAY_MS: 1e4
98
98
  // 10 seconds between tests to avoid rate limiting
99
99
  };
100
100
  var TOOL_SUPPORT_CACHE_DIR = path.join(os.homedir(), ".pi", "agent", "cache");
@@ -508,16 +508,20 @@ function model_test_temp_default(pi) {
508
508
  }
509
509
  const hasCorrectTool = fn.name === "get_weather";
510
510
  const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
511
+ const unitValid = args.unit === void 0 || typeof args.unit === "string" && ["celsius", "fahrenheit"].includes(args.unit.toLowerCase());
511
512
  let score;
512
- if (hasCorrectTool && hasLocation) {
513
+ if (hasCorrectTool && hasLocation && unitValid) {
513
514
  score = "STRONG";
514
- } else if (hasCorrectTool) {
515
+ } else if (hasCorrectTool && hasLocation) {
515
516
  score = "MODERATE";
517
+ } else if (hasCorrectTool) {
518
+ score = "WEAK";
516
519
  } else {
517
520
  score = "WEAK";
518
521
  }
522
+ const pass = score !== "WEAK";
519
523
  return {
520
- pass: true,
524
+ pass,
521
525
  score,
522
526
  hasToolCalls: true,
523
527
  toolCall: `${fn.name}(${JSON.stringify(args)})`,
@@ -551,8 +555,9 @@ function model_test_temp_default(pi) {
551
555
  } else {
552
556
  score = "WEAK";
553
557
  }
558
+ const pass = score !== "WEAK";
554
559
  return {
555
- pass: true,
560
+ pass,
556
561
  score,
557
562
  hasToolCalls: true,
558
563
  toolCall: `${fnName}(${JSON.stringify(fnArgs)})`,
@@ -619,16 +624,20 @@ function model_test_temp_default(pi) {
619
624
  }
620
625
  const hasCorrectTool = fn.name === "get_weather";
621
626
  const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
627
+ const unitValid = args.unit === void 0 || typeof args.unit === "string" && ["celsius", "fahrenheit"].includes(args.unit.toLowerCase());
622
628
  let score;
623
- if (hasCorrectTool && hasLocation) {
629
+ if (hasCorrectTool && hasLocation && unitValid) {
624
630
  score = "STRONG";
625
- } else if (hasCorrectTool) {
631
+ } else if (hasCorrectTool && hasLocation) {
626
632
  score = "MODERATE";
633
+ } else if (hasCorrectTool) {
634
+ score = "WEAK";
627
635
  } else {
628
636
  score = "WEAK";
629
637
  }
638
+ const pass = score !== "WEAK";
630
639
  return {
631
- pass: true,
640
+ pass,
632
641
  score,
633
642
  hasToolCalls: true,
634
643
  toolCall: `${fn.name}(${JSON.stringify(args)})`,
@@ -662,8 +671,9 @@ function model_test_temp_default(pi) {
662
671
  } else {
663
672
  score = "WEAK";
664
673
  }
674
+ const pass = score !== "WEAK";
665
675
  return {
666
- pass: true,
676
+ pass,
667
677
  score,
668
678
  hasToolCalls: true,
669
679
  toolCall: `${fnName}(${JSON.stringify(fnArgs)})`,
@@ -741,7 +751,18 @@ function model_test_temp_default(pi) {
741
751
  let match = ACTION_RE.exec(content);
742
752
  if (!match) match = ACTION_RE_SAMELINE.exec(content);
743
753
  let looseMatch = false;
744
- if (!match) match = ACTION_RE_LOOSE.exec(content), looseMatch = true;
754
+ if (!match) {
755
+ const looseResult = ACTION_RE_LOOSE.exec(content);
756
+ if (looseResult) {
757
+ const candidate = looseResult[1].trim().replace(/[`"']/g, "");
758
+ const isToolIdentifier = /^\w+$/.test(candidate) && (candidate.includes("_") || candidate.includes("-"));
759
+ const isKnownTool = /^(get_weather|calculate)$/i.test(candidate);
760
+ if (isToolIdentifier || isKnownTool) {
761
+ match = looseResult;
762
+ looseMatch = true;
763
+ }
764
+ }
765
+ }
745
766
  let parenMatch = false;
746
767
  if (!match) match = ACTION_RE_PAREN.exec(content), parenMatch = true;
747
768
  if (match) {
@@ -812,8 +833,9 @@ function model_test_temp_default(pi) {
812
833
  } else {
813
834
  score = "WEAK";
814
835
  }
836
+ const pass = score !== "WEAK";
815
837
  return {
816
- pass: true,
838
+ pass,
817
839
  score,
818
840
  toolCall: `${toolName}(${argsStr})`,
819
841
  thought,
@@ -1154,17 +1176,32 @@ The JSON object must have exactly these 4 keys:
1154
1176
  }
1155
1177
  }
1156
1178
  const branding = [
1157
- ` \u26A1 Pi Model Benchmark v1.0.6`,
1179
+ ` \u26A1 Pi Model Benchmark v1.0.8`,
1158
1180
  ` Written by VTSTech`,
1159
1181
  ` GitHub: https://github.com/VTSTech`,
1160
1182
  ` Website: www.vts-tech.org`
1161
1183
  ].join("\n");
1162
- async function testModelOllama(model) {
1184
+ async function testModelOllama(model, providerInfo, ctx) {
1163
1185
  const lines = [];
1164
1186
  const totalStart = Date.now();
1165
1187
  lines.push(branding);
1166
1188
  lines.push(section(`MODEL: ${model}`));
1167
1189
  lines.push(info("Provider: Ollama (local/remote)"));
1190
+ const modelsJson = readModelsJson();
1191
+ let apiMode = "ollama";
1192
+ const providerName = ctx?.model?.provider || providerInfo?.name || "";
1193
+ if (providerName && modelsJson) {
1194
+ const providerCfg = (modelsJson.providers || {})[providerName];
1195
+ if (providerCfg) {
1196
+ apiMode = providerCfg.api || "ollama";
1197
+ }
1198
+ }
1199
+ lines.push(info(`API: ${apiMode}`));
1200
+ const nativeContext = await fetchModelContextLength(OLLAMA_BASE, model);
1201
+ if (nativeContext !== void 0) {
1202
+ const ctxStr = nativeContext >= 1e3 ? `${(nativeContext / 1e3).toFixed(1)}k` : String(nativeContext);
1203
+ lines.push(info(`Context: ${ctxStr} tokens (native max)`));
1204
+ }
1168
1205
  let modelSize = "unknown";
1169
1206
  let modelFamily = "unknown";
1170
1207
  let modelParams = "unknown";
@@ -1336,11 +1373,13 @@ The JSON object must have exactly these 4 keys:
1336
1373
  lines.push(info(`Cache: ${TOOL_SUPPORT_CACHE_PATH}`));
1337
1374
  lines.push(section("SUMMARY"));
1338
1375
  const totalMs = Date.now() - totalStart;
1376
+ const toolPass = tools.score === "STRONG" || tools.score === "MODERATE";
1377
+ const reactPass = react.score === "STRONG" || react.score === "MODERATE";
1339
1378
  const tests = [
1340
1379
  { name: "Reasoning", pass: reasoning.score === "STRONG" || reasoning.score === "MODERATE", score: reasoning.score },
1341
1380
  { name: "Thinking", pass: thinking.supported, score: thinking.supported ? "YES" : "NO" },
1342
- { name: "Tool Usage", pass: tools.pass, score: tools.score },
1343
- { name: "ReAct Parse", pass: react.pass, score: react.score },
1381
+ { name: "Tool Usage", pass: toolPass, score: tools.score },
1382
+ { name: "ReAct Parse", pass: reactPass, score: react.score },
1344
1383
  { name: "Instructions", pass: instructions.pass, score: instructions.score },
1345
1384
  { name: "Tool Support", pass: toolSupport.level === "native" || toolSupport.level === "react", score: toolSupport.level.toUpperCase() }
1346
1385
  ];
@@ -1363,7 +1402,7 @@ The JSON object must have exactly these 4 keys:
1363
1402
  }
1364
1403
  return lines.join("\n");
1365
1404
  }
1366
- async function testModelProvider(providerInfo, model) {
1405
+ async function testModelProvider(providerInfo, model, ctx) {
1367
1406
  const lines = [];
1368
1407
  const totalStart = Date.now();
1369
1408
  lines.push(branding);
@@ -1376,6 +1415,11 @@ The JSON object must have exactly these 4 keys:
1376
1415
  } else {
1377
1416
  lines.push(warn(`API Key: NOT SET (${providerInfo.envKey || "env var not found"})`));
1378
1417
  }
1418
+ const contextWindow = ctx?.model?.contextWindow ?? null;
1419
+ if (contextWindow !== null) {
1420
+ const ctxStr = contextWindow >= 1e3 ? `${(contextWindow / 1e3).toFixed(1)}k` : String(contextWindow);
1421
+ lines.push(info(`Context: ${ctxStr} tokens`));
1422
+ }
1379
1423
  lines.push(section("CONNECTIVITY TEST"));
1380
1424
  lines.push(info("Sending minimal request to verify API reachability and key validity..."));
1381
1425
  const connectivity = await testConnectivity(providerInfo, model);
@@ -1496,9 +1540,9 @@ The JSON object must have exactly these 4 keys:
1496
1540
  async function testModel(model, ctx) {
1497
1541
  const providerInfo = ctx ? detectProvider(ctx) : { kind: "ollama", name: "ollama" };
1498
1542
  if (providerInfo.kind === "ollama") {
1499
- return testModelOllama(model);
1543
+ return testModelOllama(model, providerInfo, ctx);
1500
1544
  } else if (providerInfo.kind === "builtin") {
1501
- return testModelProvider(providerInfo, model);
1545
+ return testModelProvider(providerInfo, model, ctx);
1502
1546
  } else {
1503
1547
  return testModelOllama(model);
1504
1548
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vtstech/pi-model-test",
3
- "version": "1.0.6",
3
+ "version": "1.0.8",
4
4
  "description": "Model benchmark/testing extension for Pi Coding Agent",
5
5
  "main": "model-test.js",
6
6
  "keywords": ["pi-extensions"],
@@ -14,7 +14,7 @@
14
14
  "url": "https://github.com/VTSTech/pi-coding-agent"
15
15
  },
16
16
  "dependencies": {
17
- "@vtstech/pi-shared": "1.0.6"
17
+ "@vtstech/pi-shared": "1.0.8"
18
18
  },
19
19
  "peerDependencies": {
20
20
  "@mariozechner/pi-coding-agent": ">=0.66"