@vtstech/pi-model-test 1.0.7 → 1.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/model-test.js +176 -109
  2. package/package.json +2 -2
package/model-test.js CHANGED
@@ -12,7 +12,7 @@ import {
12
12
  truncate,
13
13
  sanitizeForReport
14
14
  } from "@vtstech/pi-shared/format";
15
- import { getOllamaBaseUrl, detectModelFamily, readModelsJson, BUILTIN_PROVIDERS } from "@vtstech/pi-shared/ollama";
15
+ import { getOllamaBaseUrl, detectModelFamily, readModelsJson, BUILTIN_PROVIDERS, fetchModelContextLength } from "@vtstech/pi-shared/ollama";
16
16
  function detectProvider(ctx) {
17
17
  const model = ctx.model;
18
18
  if (!model) return { kind: "unknown", name: "none" };
@@ -55,15 +55,15 @@ function detectProvider(ctx) {
55
55
  }
56
56
  var CONFIG = {
57
57
  // General API settings
58
- DEFAULT_TIMEOUT_MS: 6e5,
58
+ DEFAULT_TIMEOUT_MS: 999999,
59
59
  // 8.3 minutes - default timeout for model responses
60
- CONNECT_TIMEOUT_S: 30,
60
+ CONNECT_TIMEOUT_S: 60,
61
61
  // 30 seconds to establish connection
62
62
  MAX_RETRIES: 1,
63
63
  // Single retry for transient failures
64
- RETRY_DELAY_MS: 2e3,
64
+ RETRY_DELAY_MS: 1e4,
65
65
  // 2 seconds between retries
66
- EXEC_BUFFER_MS: 5e3,
66
+ EXEC_BUFFER_MS: 8e3,
67
67
  // Extra buffer for exec timeout over curl timeout
68
68
  // Model generation settings
69
69
  NUM_PREDICT: 1024,
@@ -73,28 +73,28 @@ var CONFIG = {
73
73
  // Test-specific settings
74
74
  MIN_THINKING_LENGTH: 10,
75
75
  // Minimum chars to consider thinking tokens valid
76
- TOOL_TEST_TIMEOUT_MS: 9e4,
76
+ TOOL_TEST_TIMEOUT_MS: 999999,
77
77
  // 90 seconds for tool usage tests
78
- TOOL_TEST_MAX_TIME_S: 9999,
78
+ TOOL_TEST_MAX_TIME_S: 999999,
79
79
  // Max curl time for tool tests (effectively unlimited)
80
- TOOL_SUPPORT_TIMEOUT_MS: 26e4,
80
+ TOOL_SUPPORT_TIMEOUT_MS: 999999,
81
81
  // 2+ minutes for tool support detection
82
- TOOL_SUPPORT_MAX_TIME_S: 240,
82
+ TOOL_SUPPORT_MAX_TIME_S: 999999,
83
83
  // Max curl time for tool support detection
84
84
  // Metadata retrieval
85
85
  TAGS_TIMEOUT_MS: 15e3,
86
86
  // 15 seconds for /api/tags
87
- TAGS_CONNECT_TIMEOUT_S: 10,
87
+ TAGS_CONNECT_TIMEOUT_S: 30,
88
88
  // 10 seconds connection timeout for tags
89
- MODEL_INFO_TIMEOUT_MS: 1e4,
89
+ MODEL_INFO_TIMEOUT_MS: 3e4,
90
90
  // 10 seconds for model info lookup
91
91
  // Provider API settings
92
- PROVIDER_TIMEOUT_MS: 12e4,
92
+ PROVIDER_TIMEOUT_MS: 999999,
93
93
  // 2 minutes for cloud provider API calls
94
- PROVIDER_TOOL_TIMEOUT_MS: 6e4,
94
+ PROVIDER_TOOL_TIMEOUT_MS: 12e4,
95
95
  // 60 seconds for tool usage tests on providers
96
96
  // Rate limiting
97
- TEST_DELAY_MS: 3e4
97
+ TEST_DELAY_MS: 1e4
98
98
  // 30 seconds between tests to avoid rate limiting
99
99
  };
100
100
  var TOOL_SUPPORT_CACHE_DIR = path.join(os.homedir(), ".pi", "agent", "cache");
@@ -740,90 +740,111 @@ function model_test_temp_default(pi) {
740
740
  if (!content) {
741
741
  return { pass: false, score: "FAIL", toolCall: "empty response", thought: "", response: "", elapsedMs };
742
742
  }
743
- const THOUGHT_RE = /Thought:\s*(.*?)(?=Action:|Final Answer:|$)/is;
744
- const ACTION_RE = /Action:\s*[`"']?(\w+)[`"']?\s*\n?\s*Action Input:\s*(.*?)(?=\n\s*(?:Observation:|Thought:|Final Answer:|Action:)|$)/is;
745
- const ACTION_RE_SAMELINE = /Action:\s*[`"']?(\w+)[`"']?\s+Action Input:\s*(.*?)(?=\n\s*(?:Observation:|Thought:|Final Answer:)|$)/is;
746
- const ACTION_RE_LOOSE = /Action:\s*(.+?)\n\s*Action Input:\s*(.*?)(?=\n\s*(?:Observation:|Thought:|Final Answer:|Action:)|$)/is;
747
- const ACTION_RE_PAREN = /Action:\s*(\w+)\s*\(([^)]*)\)/i;
748
- let thought = "";
749
- const thoughtMatch = THOUGHT_RE.exec(content);
750
- if (thoughtMatch) thought = thoughtMatch[1].trim();
751
- let match = ACTION_RE.exec(content);
752
- if (!match) match = ACTION_RE_SAMELINE.exec(content);
753
- let looseMatch = false;
754
- if (!match) {
755
- const looseResult = ACTION_RE_LOOSE.exec(content);
756
- if (looseResult) {
757
- const candidate = looseResult[1].trim().replace(/[`"']/g, "");
758
- const isToolIdentifier = /^\w+$/.test(candidate) && (candidate.includes("_") || candidate.includes("-"));
759
- const isKnownTool = /^(get_weather|calculate)$/i.test(candidate);
760
- if (isToolIdentifier || isKnownTool) {
761
- match = looseResult;
762
- looseMatch = true;
763
- }
764
- }
765
- }
766
- let parenMatch = false;
767
- if (!match) match = ACTION_RE_PAREN.exec(content), parenMatch = true;
768
- if (match) {
769
- let toolName = match[1].trim().replace(/[`"']/g, "");
770
- if (looseMatch) {
771
- const actionText = toolName.toLowerCase();
772
- if (actionText.includes("get_weather")) toolName = "get_weather";
773
- else {
774
- const toolWords = actionText.match(/\b[a-z][a-z0-9]*(?:[_-][a-z0-9]+)+\b/gi) || [];
775
- if (toolWords.length > 0) toolName = toolWords[0];
776
- }
777
- }
778
- const rawArgs = parenMatch ? match[2].trim().replace(/^```\w*\s*/gm, "").replace(/```\s*$/gm, "").trim() : match[2].trim().replace(/^```\w*\s*/gm, "").replace(/```\s*$/gm, "").trim();
779
- let argsParsed = false;
780
- let argsStr = rawArgs;
781
- if (parenMatch && rawArgs && !rawArgs.startsWith("{")) {
782
- const pairs = rawArgs.match(/(\w+)\s*:\s*("[^"]*"|'[^']*'|\S+)/g);
783
- if (pairs) {
784
- const obj = {};
785
- for (const p of pairs) {
786
- const colonIdx = p.indexOf(":");
787
- const key = p.slice(0, colonIdx).trim();
788
- let val = p.slice(colonIdx + 1).trim();
789
- if (val.startsWith('"') && val.endsWith('"') || val.startsWith("'") && val.endsWith("'")) {
790
- val = val.slice(1, -1);
743
+ let parsedResult = null;
744
+ const sharedParser = pi._reactParser;
745
+ if (sharedParser?.ALL_DIALECT_PATTERNS) {
746
+ for (const dp of sharedParser.ALL_DIALECT_PATTERNS) {
747
+ const result2 = sharedParser.parseReactWithPatterns(content, dp, true);
748
+ if (result2) {
749
+ let toolName = result2.name;
750
+ let argsStr;
751
+ const rawArgs = result2.args ? JSON.stringify(result2.args) : "";
752
+ if (rawArgs && rawArgs !== "{}") {
753
+ argsStr = rawArgs;
754
+ } else if (result2.raw) {
755
+ const jsonStart = result2.raw.indexOf("{");
756
+ if (jsonStart !== -1) {
757
+ let depth = 0, jsonEnd = -1;
758
+ for (let i = jsonStart; i < result2.raw.length; i++) {
759
+ if (result2.raw[i] === "{") depth++;
760
+ else if (result2.raw[i] === "}") {
761
+ depth--;
762
+ if (depth === 0) {
763
+ jsonEnd = i;
764
+ break;
765
+ }
766
+ }
767
+ }
768
+ argsStr = jsonEnd !== -1 ? result2.raw.slice(jsonStart, jsonEnd + 1) : "";
769
+ } else {
770
+ argsStr = "";
791
771
  }
792
- obj[key] = val;
793
- }
794
- try {
795
- argsStr = JSON.stringify(obj);
796
- argsParsed = true;
797
- } catch {
772
+ } else {
773
+ argsStr = "";
798
774
  }
775
+ parsedResult = { name: toolName, args: argsStr, thought: result2.thought || "", dialect: result2.dialect };
776
+ break;
799
777
  }
800
778
  }
801
- if (!argsParsed) {
802
- const jsonStart = rawArgs.indexOf("{");
803
- if (jsonStart !== -1) {
804
- let depth = 0;
805
- let jsonEnd = -1;
806
- for (let i = jsonStart; i < rawArgs.length; i++) {
807
- if (rawArgs[i] === "{") depth++;
808
- else if (rawArgs[i] === "}") {
809
- depth--;
810
- if (depth === 0) {
811
- jsonEnd = i;
812
- break;
779
+ } else {
780
+ const dialectDefs = [
781
+ { name: "react", action: "Action:", input: "Action Input:" },
782
+ { name: "function", action: "Function:", input: "Function Input:" },
783
+ { name: "tool", action: "Tool:", input: "Tool Input:" },
784
+ { name: "call", action: "Call:", input: "Input:" }
785
+ ];
786
+ for (const dd of dialectDefs) {
787
+ const esc = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
788
+ const aT = esc(dd.action);
789
+ const iT = esc(dd.input);
790
+ const primaryRe = new RegExp(`${aT}\\s*[\\x60"']?(\\w+)[\\x60"']?\\s*\\n?\\s*${iT}\\s*([\\s\\S]*?)(?=\\n\\s*(?:Observation:|Thought:|Final Answer:|${dd.action})|$)`, "is");
791
+ const sameRe = new RegExp(`${aT}\\s*[\\x60"']?(\\w+)[\\x60"']?\\s+${iT}\\s*([\\s\\S]*?)(?=\\n\\s*(?:Observation:|Thought:|Final Answer:|${dd.action})|$)`, "is");
792
+ const parenRe = new RegExp(`${aT}\\s*(\\w+)\\s*\\(([^)]*)\\)`, "i");
793
+ let m = primaryRe.exec(content) || sameRe.exec(content);
794
+ let isParen = false;
795
+ if (!m) {
796
+ m = parenRe.exec(content);
797
+ isParen = true;
798
+ }
799
+ if (m) {
800
+ const toolName = m[1].trim().replace(/[`"']/g, "");
801
+ const rawArgs = m[2].trim().replace(/^```\w*\s*/gm, "").replace(/```\s*$/gm, "").trim();
802
+ let argsStr = "";
803
+ if (isParen && rawArgs && !rawArgs.startsWith("{")) {
804
+ const pairs = rawArgs.match(/(\w+)\s*:\s*("[^"]*"|'[^']*'|\S+)/g);
805
+ if (pairs) {
806
+ const obj = {};
807
+ for (const p of pairs) {
808
+ const ci = p.indexOf(":");
809
+ let v = p.slice(ci + 1).trim();
810
+ if (v.startsWith('"') && v.endsWith('"') || v.startsWith("'") && v.endsWith("'")) v = v.slice(1, -1);
811
+ obj[p.slice(0, ci).trim()] = v;
813
812
  }
813
+ argsStr = JSON.stringify(obj);
814
+ } else {
815
+ argsStr = rawArgs;
814
816
  }
815
- }
816
- if (jsonEnd !== -1) {
817
- const jsonStr = rawArgs.slice(jsonStart, jsonEnd + 1);
818
- try {
819
- JSON.parse(jsonStr);
820
- argsParsed = true;
821
- argsStr = jsonStr;
822
- } catch {
817
+ } else {
818
+ const js = rawArgs.indexOf("{");
819
+ if (js !== -1) {
820
+ let d = 0, je = -1;
821
+ for (let i = js; i < rawArgs.length; i++) {
822
+ if (rawArgs[i] === "{") d++;
823
+ else if (rawArgs[i] === "}") {
824
+ d--;
825
+ if (d === 0) {
826
+ je = i;
827
+ break;
828
+ }
829
+ }
830
+ }
831
+ argsStr = je !== -1 ? rawArgs.slice(js, je + 1) : rawArgs;
832
+ } else {
833
+ argsStr = rawArgs;
823
834
  }
824
835
  }
836
+ let thought = "";
837
+ const thoughtRe = /Thought:\s*(.*?)(?=Action:|Function:|Tool:|Call:|Final Answer:|$)/is;
838
+ const tm = thoughtRe.exec(content);
839
+ if (tm) thought = tm[1].trim();
840
+ parsedResult = { name: toolName, args: argsStr, thought, dialect: dd.name };
841
+ break;
825
842
  }
826
843
  }
844
+ }
845
+ if (parsedResult) {
846
+ let { name: toolName, args: argsStr, thought, dialect } = parsedResult;
847
+ const argsParsed = argsStr.length > 0;
827
848
  let score;
828
849
  const isWeatherTool = toolName.toLowerCase().includes("get_weather") || toolName.toLowerCase() === "get_weather";
829
850
  if (isWeatherTool && argsParsed) {
@@ -840,15 +861,25 @@ function model_test_temp_default(pi) {
840
861
  toolCall: `${toolName}(${argsStr})`,
841
862
  thought,
842
863
  response: content,
843
- elapsedMs
864
+ elapsedMs,
865
+ dialect: dialect || "react"
844
866
  };
845
867
  }
868
+ const altTagPatterns = [
869
+ /^\s*Function:\s*/im,
870
+ /^\s*Tool:\s*/im,
871
+ /^\s*Call:\s*/im,
872
+ /<function_call/i,
873
+ /<invoke\s/i
874
+ ];
875
+ const hasAltTag = altTagPatterns.some((p) => p.test(content));
846
876
  const hasToolMention = /\bget_weather\b/i.test(content) || /\btool\b/i.test(content);
847
- if (hasToolMention) {
877
+ if (hasAltTag || hasToolMention) {
878
+ const detail = hasAltTag ? "model used alternative tool-call tags but format was not parseable" : "model mentioned tool but not in ReAct format";
848
879
  return {
849
880
  pass: false,
850
881
  score: "FAIL",
851
- toolCall: "none \u2014 model mentioned tool but not in ReAct format",
882
+ toolCall: `none \u2014 ${detail}`,
852
883
  thought: "",
853
884
  response: content,
854
885
  elapsedMs
@@ -1071,25 +1102,40 @@ The JSON object must have exactly these 4 keys:
1071
1102
  };
1072
1103
  }
1073
1104
  const reactPatterns = [
1105
+ // Classic ReAct
1074
1106
  /^\s*Action:\s*/im,
1075
- // "Action: get_weather"
1076
1107
  /^\s*Action Input:\s*/im,
1077
- // "Action Input: {"location": "Tokyo"}"
1078
1108
  /^\s*Thought:\s*/im,
1079
- // "Thought: I need to look up the weather"
1080
1109
  /Action:\s*\w+/i,
1081
- // "Action: get_weather" anywhere
1082
- /Action Input:\s*\{/i
1083
- // "Action Input: {..." anywhere
1110
+ /Action Input:\s*\{/i,
1111
+ // Function dialect
1112
+ /^\s*Function:\s*/im,
1113
+ /^\s*Function Input:\s*/im,
1114
+ /Function:\s*\w+/i,
1115
+ // Tool dialect
1116
+ /^\s*Tool:\s*/im,
1117
+ /^\s*Tool Input:\s*/im,
1118
+ /Tool:\s*\w+/i,
1119
+ // Call dialect
1120
+ /^\s*Call:\s*/im,
1121
+ /^\s*Input:\s*/im,
1122
+ /Call:\s*\w+/i
1084
1123
  ];
1085
- const hasReActPattern = reactPatterns.some((p) => p.test(content));
1086
- if (hasReActPattern) {
1124
+ const matchedPatterns = [];
1125
+ for (const p of reactPatterns) {
1126
+ if (p.test(content)) matchedPatterns.push(p.source);
1127
+ }
1128
+ if (matchedPatterns.length > 0) {
1129
+ let dialectName = "react";
1130
+ if (/Function:/i.test(content)) dialectName = "function";
1131
+ else if (/Tool:/i.test(content)) dialectName = "tool";
1132
+ else if (/Call:/i.test(content)) dialectName = "call";
1087
1133
  const level2 = "react";
1088
1134
  cacheToolSupport(model, level2, family);
1089
1135
  return {
1090
1136
  level: level2,
1091
1137
  cached: false,
1092
- evidence: `ReAct format detected in text response`,
1138
+ evidence: `ReAct format detected (${dialectName} dialect) in text response`,
1093
1139
  elapsedMs
1094
1140
  };
1095
1141
  }
@@ -1176,17 +1222,32 @@ The JSON object must have exactly these 4 keys:
1176
1222
  }
1177
1223
  }
1178
1224
  const branding = [
1179
- ` \u26A1 Pi Model Benchmark v1.0.7`,
1225
+ ` \u26A1 Pi Model Benchmark v1.0.9`,
1180
1226
  ` Written by VTSTech`,
1181
1227
  ` GitHub: https://github.com/VTSTech`,
1182
1228
  ` Website: www.vts-tech.org`
1183
1229
  ].join("\n");
1184
- async function testModelOllama(model) {
1230
+ async function testModelOllama(model, providerInfo, ctx) {
1185
1231
  const lines = [];
1186
1232
  const totalStart = Date.now();
1187
1233
  lines.push(branding);
1188
1234
  lines.push(section(`MODEL: ${model}`));
1189
1235
  lines.push(info("Provider: Ollama (local/remote)"));
1236
+ const modelsJson = readModelsJson();
1237
+ let apiMode = "ollama";
1238
+ const providerName = ctx?.model?.provider || providerInfo?.name || "";
1239
+ if (providerName && modelsJson) {
1240
+ const providerCfg = (modelsJson.providers || {})[providerName];
1241
+ if (providerCfg) {
1242
+ apiMode = providerCfg.api || "ollama";
1243
+ }
1244
+ }
1245
+ lines.push(info(`API: ${apiMode}`));
1246
+ const nativeContext = await fetchModelContextLength(OLLAMA_BASE, model);
1247
+ if (nativeContext !== void 0) {
1248
+ const ctxStr = nativeContext >= 1e3 ? `${(nativeContext / 1e3).toFixed(1)}k` : String(nativeContext);
1249
+ lines.push(info(`Context: ${ctxStr} tokens (native max)`));
1250
+ }
1190
1251
  let modelSize = "unknown";
1191
1252
  let modelFamily = "unknown";
1192
1253
  let modelParams = "unknown";
@@ -1286,23 +1347,24 @@ The JSON object must have exactly these 4 keys:
1286
1347
  await rateLimitDelay(lines);
1287
1348
  const react = await testReactParsing(model);
1288
1349
  lines.push(info(`Time: ${msHuman(react.elapsedMs)}`));
1350
+ const dialectTag = react.dialect && react.dialect !== "react" ? ` [${react.dialect} dialect]` : "";
1289
1351
  if (react.score === "STRONG") {
1290
- lines.push(ok(`ReAct parsed: ${react.toolCall} (${react.score})`));
1352
+ lines.push(ok(`ReAct parsed: ${react.toolCall} (${react.score})${dialectTag}`));
1291
1353
  if (react.thought) {
1292
1354
  lines.push(info(`Thought: ${sanitizeForReport(react.thought)}`));
1293
1355
  }
1294
1356
  } else if (react.score === "MODERATE") {
1295
- lines.push(ok(`ReAct parsed: ${react.toolCall} (${react.score})`));
1357
+ lines.push(ok(`ReAct parsed: ${react.toolCall} (${react.score})${dialectTag}`));
1296
1358
  if (react.thought) {
1297
1359
  lines.push(info(`Thought: ${sanitizeForReport(react.thought)}`));
1298
1360
  }
1299
1361
  } else if (react.score === "WEAK") {
1300
- lines.push(warn(`ReAct parsed: ${react.toolCall} (${react.score}) \u2014 wrong tool or malformed args`));
1362
+ lines.push(warn(`ReAct parsed: ${react.toolCall} (${react.score}) \u2014 wrong tool or malformed args${dialectTag}`));
1301
1363
  if (react.thought) {
1302
1364
  lines.push(info(`Thought: ${sanitizeForReport(react.thought)}`));
1303
1365
  }
1304
1366
  } else if (react.score === "FAIL") {
1305
- lines.push(fail(`ReAct parsing: ${react.toolCall} (${react.score})`));
1367
+ lines.push(fail(`ReAct parsing: ${react.toolCall} (${react.score})${dialectTag}`));
1306
1368
  if (react.response) {
1307
1369
  lines.push(info(`Response: ${sanitizeForReport(react.response)}`));
1308
1370
  }
@@ -1387,7 +1449,7 @@ The JSON object must have exactly these 4 keys:
1387
1449
  }
1388
1450
  return lines.join("\n");
1389
1451
  }
1390
- async function testModelProvider(providerInfo, model) {
1452
+ async function testModelProvider(providerInfo, model, ctx) {
1391
1453
  const lines = [];
1392
1454
  const totalStart = Date.now();
1393
1455
  lines.push(branding);
@@ -1400,6 +1462,11 @@ The JSON object must have exactly these 4 keys:
1400
1462
  } else {
1401
1463
  lines.push(warn(`API Key: NOT SET (${providerInfo.envKey || "env var not found"})`));
1402
1464
  }
1465
+ const contextWindow = ctx?.model?.contextWindow ?? null;
1466
+ if (contextWindow !== null) {
1467
+ const ctxStr = contextWindow >= 1e3 ? `${(contextWindow / 1e3).toFixed(1)}k` : String(contextWindow);
1468
+ lines.push(info(`Context: ${ctxStr} tokens`));
1469
+ }
1403
1470
  lines.push(section("CONNECTIVITY TEST"));
1404
1471
  lines.push(info("Sending minimal request to verify API reachability and key validity..."));
1405
1472
  const connectivity = await testConnectivity(providerInfo, model);
@@ -1520,9 +1587,9 @@ The JSON object must have exactly these 4 keys:
1520
1587
  async function testModel(model, ctx) {
1521
1588
  const providerInfo = ctx ? detectProvider(ctx) : { kind: "ollama", name: "ollama" };
1522
1589
  if (providerInfo.kind === "ollama") {
1523
- return testModelOllama(model);
1590
+ return testModelOllama(model, providerInfo, ctx);
1524
1591
  } else if (providerInfo.kind === "builtin") {
1525
- return testModelProvider(providerInfo, model);
1592
+ return testModelProvider(providerInfo, model, ctx);
1526
1593
  } else {
1527
1594
  return testModelOllama(model);
1528
1595
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vtstech/pi-model-test",
3
- "version": "1.0.7",
3
+ "version": "1.0.9",
4
4
  "description": "Model benchmark/testing extension for Pi Coding Agent",
5
5
  "main": "model-test.js",
6
6
  "keywords": ["pi-extensions"],
@@ -14,7 +14,7 @@
14
14
  "url": "https://github.com/VTSTech/pi-coding-agent"
15
15
  },
16
16
  "dependencies": {
17
- "@vtstech/pi-shared": "1.0.7"
17
+ "@vtstech/pi-shared": "1.0.9"
18
18
  },
19
19
  "peerDependencies": {
20
20
  "@mariozechner/pi-coding-agent": ">=0.66"