@vtstech/pi-model-test 1.0.8 → 1.0.9

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (2):
  1. package/model-test.js +138 -91
  2. package/package.json +2 -2
package/model-test.js CHANGED
@@ -740,90 +740,111 @@ function model_test_temp_default(pi) {
740
740
  if (!content) {
741
741
  return { pass: false, score: "FAIL", toolCall: "empty response", thought: "", response: "", elapsedMs };
742
742
  }
743
- const THOUGHT_RE = /Thought:\s*(.*?)(?=Action:|Final Answer:|$)/is;
744
- const ACTION_RE = /Action:\s*[`"']?(\w+)[`"']?\s*\n?\s*Action Input:\s*(.*?)(?=\n\s*(?:Observation:|Thought:|Final Answer:|Action:)|$)/is;
745
- const ACTION_RE_SAMELINE = /Action:\s*[`"']?(\w+)[`"']?\s+Action Input:\s*(.*?)(?=\n\s*(?:Observation:|Thought:|Final Answer:)|$)/is;
746
- const ACTION_RE_LOOSE = /Action:\s*(.+?)\n\s*Action Input:\s*(.*?)(?=\n\s*(?:Observation:|Thought:|Final Answer:|Action:)|$)/is;
747
- const ACTION_RE_PAREN = /Action:\s*(\w+)\s*\(([^)]*)\)/i;
748
- let thought = "";
749
- const thoughtMatch = THOUGHT_RE.exec(content);
750
- if (thoughtMatch) thought = thoughtMatch[1].trim();
751
- let match = ACTION_RE.exec(content);
752
- if (!match) match = ACTION_RE_SAMELINE.exec(content);
753
- let looseMatch = false;
754
- if (!match) {
755
- const looseResult = ACTION_RE_LOOSE.exec(content);
756
- if (looseResult) {
757
- const candidate = looseResult[1].trim().replace(/[`"']/g, "");
758
- const isToolIdentifier = /^\w+$/.test(candidate) && (candidate.includes("_") || candidate.includes("-"));
759
- const isKnownTool = /^(get_weather|calculate)$/i.test(candidate);
760
- if (isToolIdentifier || isKnownTool) {
761
- match = looseResult;
762
- looseMatch = true;
763
- }
764
- }
765
- }
766
- let parenMatch = false;
767
- if (!match) match = ACTION_RE_PAREN.exec(content), parenMatch = true;
768
- if (match) {
769
- let toolName = match[1].trim().replace(/[`"']/g, "");
770
- if (looseMatch) {
771
- const actionText = toolName.toLowerCase();
772
- if (actionText.includes("get_weather")) toolName = "get_weather";
773
- else {
774
- const toolWords = actionText.match(/\b[a-z][a-z0-9]*(?:[_-][a-z0-9]+)+\b/gi) || [];
775
- if (toolWords.length > 0) toolName = toolWords[0];
776
- }
777
- }
778
- const rawArgs = parenMatch ? match[2].trim().replace(/^```\w*\s*/gm, "").replace(/```\s*$/gm, "").trim() : match[2].trim().replace(/^```\w*\s*/gm, "").replace(/```\s*$/gm, "").trim();
779
- let argsParsed = false;
780
- let argsStr = rawArgs;
781
- if (parenMatch && rawArgs && !rawArgs.startsWith("{")) {
782
- const pairs = rawArgs.match(/(\w+)\s*:\s*("[^"]*"|'[^']*'|\S+)/g);
783
- if (pairs) {
784
- const obj = {};
785
- for (const p of pairs) {
786
- const colonIdx = p.indexOf(":");
787
- const key = p.slice(0, colonIdx).trim();
788
- let val = p.slice(colonIdx + 1).trim();
789
- if (val.startsWith('"') && val.endsWith('"') || val.startsWith("'") && val.endsWith("'")) {
790
- val = val.slice(1, -1);
743
+ let parsedResult = null;
744
+ const sharedParser = pi._reactParser;
745
+ if (sharedParser?.ALL_DIALECT_PATTERNS) {
746
+ for (const dp of sharedParser.ALL_DIALECT_PATTERNS) {
747
+ const result2 = sharedParser.parseReactWithPatterns(content, dp, true);
748
+ if (result2) {
749
+ let toolName = result2.name;
750
+ let argsStr;
751
+ const rawArgs = result2.args ? JSON.stringify(result2.args) : "";
752
+ if (rawArgs && rawArgs !== "{}") {
753
+ argsStr = rawArgs;
754
+ } else if (result2.raw) {
755
+ const jsonStart = result2.raw.indexOf("{");
756
+ if (jsonStart !== -1) {
757
+ let depth = 0, jsonEnd = -1;
758
+ for (let i = jsonStart; i < result2.raw.length; i++) {
759
+ if (result2.raw[i] === "{") depth++;
760
+ else if (result2.raw[i] === "}") {
761
+ depth--;
762
+ if (depth === 0) {
763
+ jsonEnd = i;
764
+ break;
765
+ }
766
+ }
767
+ }
768
+ argsStr = jsonEnd !== -1 ? result2.raw.slice(jsonStart, jsonEnd + 1) : "";
769
+ } else {
770
+ argsStr = "";
791
771
  }
792
- obj[key] = val;
793
- }
794
- try {
795
- argsStr = JSON.stringify(obj);
796
- argsParsed = true;
797
- } catch {
772
+ } else {
773
+ argsStr = "";
798
774
  }
775
+ parsedResult = { name: toolName, args: argsStr, thought: result2.thought || "", dialect: result2.dialect };
776
+ break;
799
777
  }
800
778
  }
801
- if (!argsParsed) {
802
- const jsonStart = rawArgs.indexOf("{");
803
- if (jsonStart !== -1) {
804
- let depth = 0;
805
- let jsonEnd = -1;
806
- for (let i = jsonStart; i < rawArgs.length; i++) {
807
- if (rawArgs[i] === "{") depth++;
808
- else if (rawArgs[i] === "}") {
809
- depth--;
810
- if (depth === 0) {
811
- jsonEnd = i;
812
- break;
779
+ } else {
780
+ const dialectDefs = [
781
+ { name: "react", action: "Action:", input: "Action Input:" },
782
+ { name: "function", action: "Function:", input: "Function Input:" },
783
+ { name: "tool", action: "Tool:", input: "Tool Input:" },
784
+ { name: "call", action: "Call:", input: "Input:" }
785
+ ];
786
+ for (const dd of dialectDefs) {
787
+ const esc = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
788
+ const aT = esc(dd.action);
789
+ const iT = esc(dd.input);
790
+ const primaryRe = new RegExp(`${aT}\\s*[\\x60"']?(\\w+)[\\x60"']?\\s*\\n?\\s*${iT}\\s*([\\s\\S]*?)(?=\\n\\s*(?:Observation:|Thought:|Final Answer:|${dd.action})|$)`, "is");
791
+ const sameRe = new RegExp(`${aT}\\s*[\\x60"']?(\\w+)[\\x60"']?\\s+${iT}\\s*([\\s\\S]*?)(?=\\n\\s*(?:Observation:|Thought:|Final Answer:|${dd.action})|$)`, "is");
792
+ const parenRe = new RegExp(`${aT}\\s*(\\w+)\\s*\\(([^)]*)\\)`, "i");
793
+ let m = primaryRe.exec(content) || sameRe.exec(content);
794
+ let isParen = false;
795
+ if (!m) {
796
+ m = parenRe.exec(content);
797
+ isParen = true;
798
+ }
799
+ if (m) {
800
+ const toolName = m[1].trim().replace(/[`"']/g, "");
801
+ const rawArgs = m[2].trim().replace(/^```\w*\s*/gm, "").replace(/```\s*$/gm, "").trim();
802
+ let argsStr = "";
803
+ if (isParen && rawArgs && !rawArgs.startsWith("{")) {
804
+ const pairs = rawArgs.match(/(\w+)\s*:\s*("[^"]*"|'[^']*'|\S+)/g);
805
+ if (pairs) {
806
+ const obj = {};
807
+ for (const p of pairs) {
808
+ const ci = p.indexOf(":");
809
+ let v = p.slice(ci + 1).trim();
810
+ if (v.startsWith('"') && v.endsWith('"') || v.startsWith("'") && v.endsWith("'")) v = v.slice(1, -1);
811
+ obj[p.slice(0, ci).trim()] = v;
813
812
  }
813
+ argsStr = JSON.stringify(obj);
814
+ } else {
815
+ argsStr = rawArgs;
814
816
  }
815
- }
816
- if (jsonEnd !== -1) {
817
- const jsonStr = rawArgs.slice(jsonStart, jsonEnd + 1);
818
- try {
819
- JSON.parse(jsonStr);
820
- argsParsed = true;
821
- argsStr = jsonStr;
822
- } catch {
817
+ } else {
818
+ const js = rawArgs.indexOf("{");
819
+ if (js !== -1) {
820
+ let d = 0, je = -1;
821
+ for (let i = js; i < rawArgs.length; i++) {
822
+ if (rawArgs[i] === "{") d++;
823
+ else if (rawArgs[i] === "}") {
824
+ d--;
825
+ if (d === 0) {
826
+ je = i;
827
+ break;
828
+ }
829
+ }
830
+ }
831
+ argsStr = je !== -1 ? rawArgs.slice(js, je + 1) : rawArgs;
832
+ } else {
833
+ argsStr = rawArgs;
823
834
  }
824
835
  }
836
+ let thought = "";
837
+ const thoughtRe = /Thought:\s*(.*?)(?=Action:|Function:|Tool:|Call:|Final Answer:|$)/is;
838
+ const tm = thoughtRe.exec(content);
839
+ if (tm) thought = tm[1].trim();
840
+ parsedResult = { name: toolName, args: argsStr, thought, dialect: dd.name };
841
+ break;
825
842
  }
826
843
  }
844
+ }
845
+ if (parsedResult) {
846
+ let { name: toolName, args: argsStr, thought, dialect } = parsedResult;
847
+ const argsParsed = argsStr.length > 0;
827
848
  let score;
828
849
  const isWeatherTool = toolName.toLowerCase().includes("get_weather") || toolName.toLowerCase() === "get_weather";
829
850
  if (isWeatherTool && argsParsed) {
@@ -840,15 +861,25 @@ function model_test_temp_default(pi) {
840
861
  toolCall: `${toolName}(${argsStr})`,
841
862
  thought,
842
863
  response: content,
843
- elapsedMs
864
+ elapsedMs,
865
+ dialect: dialect || "react"
844
866
  };
845
867
  }
868
+ const altTagPatterns = [
869
+ /^\s*Function:\s*/im,
870
+ /^\s*Tool:\s*/im,
871
+ /^\s*Call:\s*/im,
872
+ /<function_call/i,
873
+ /<invoke\s/i
874
+ ];
875
+ const hasAltTag = altTagPatterns.some((p) => p.test(content));
846
876
  const hasToolMention = /\bget_weather\b/i.test(content) || /\btool\b/i.test(content);
847
- if (hasToolMention) {
877
+ if (hasAltTag || hasToolMention) {
878
+ const detail = hasAltTag ? "model used alternative tool-call tags but format was not parseable" : "model mentioned tool but not in ReAct format";
848
879
  return {
849
880
  pass: false,
850
881
  score: "FAIL",
851
- toolCall: "none \u2014 model mentioned tool but not in ReAct format",
882
+ toolCall: `none \u2014 ${detail}`,
852
883
  thought: "",
853
884
  response: content,
854
885
  elapsedMs
@@ -1071,25 +1102,40 @@ The JSON object must have exactly these 4 keys:
1071
1102
  };
1072
1103
  }
1073
1104
  const reactPatterns = [
1105
+ // Classic ReAct
1074
1106
  /^\s*Action:\s*/im,
1075
- // "Action: get_weather"
1076
1107
  /^\s*Action Input:\s*/im,
1077
- // "Action Input: {"location": "Tokyo"}"
1078
1108
  /^\s*Thought:\s*/im,
1079
- // "Thought: I need to look up the weather"
1080
1109
  /Action:\s*\w+/i,
1081
- // "Action: get_weather" anywhere
1082
- /Action Input:\s*\{/i
1083
- // "Action Input: {..." anywhere
1110
+ /Action Input:\s*\{/i,
1111
+ // Function dialect
1112
+ /^\s*Function:\s*/im,
1113
+ /^\s*Function Input:\s*/im,
1114
+ /Function:\s*\w+/i,
1115
+ // Tool dialect
1116
+ /^\s*Tool:\s*/im,
1117
+ /^\s*Tool Input:\s*/im,
1118
+ /Tool:\s*\w+/i,
1119
+ // Call dialect
1120
+ /^\s*Call:\s*/im,
1121
+ /^\s*Input:\s*/im,
1122
+ /Call:\s*\w+/i
1084
1123
  ];
1085
- const hasReActPattern = reactPatterns.some((p) => p.test(content));
1086
- if (hasReActPattern) {
1124
+ const matchedPatterns = [];
1125
+ for (const p of reactPatterns) {
1126
+ if (p.test(content)) matchedPatterns.push(p.source);
1127
+ }
1128
+ if (matchedPatterns.length > 0) {
1129
+ let dialectName = "react";
1130
+ if (/Function:/i.test(content)) dialectName = "function";
1131
+ else if (/Tool:/i.test(content)) dialectName = "tool";
1132
+ else if (/Call:/i.test(content)) dialectName = "call";
1087
1133
  const level2 = "react";
1088
1134
  cacheToolSupport(model, level2, family);
1089
1135
  return {
1090
1136
  level: level2,
1091
1137
  cached: false,
1092
- evidence: `ReAct format detected in text response`,
1138
+ evidence: `ReAct format detected (${dialectName} dialect) in text response`,
1093
1139
  elapsedMs
1094
1140
  };
1095
1141
  }
@@ -1176,7 +1222,7 @@ The JSON object must have exactly these 4 keys:
1176
1222
  }
1177
1223
  }
1178
1224
  const branding = [
1179
- ` \u26A1 Pi Model Benchmark v1.0.8`,
1225
+ ` \u26A1 Pi Model Benchmark v1.0.9`,
1180
1226
  ` Written by VTSTech`,
1181
1227
  ` GitHub: https://github.com/VTSTech`,
1182
1228
  ` Website: www.vts-tech.org`
@@ -1301,23 +1347,24 @@ The JSON object must have exactly these 4 keys:
1301
1347
  await rateLimitDelay(lines);
1302
1348
  const react = await testReactParsing(model);
1303
1349
  lines.push(info(`Time: ${msHuman(react.elapsedMs)}`));
1350
+ const dialectTag = react.dialect && react.dialect !== "react" ? ` [${react.dialect} dialect]` : "";
1304
1351
  if (react.score === "STRONG") {
1305
- lines.push(ok(`ReAct parsed: ${react.toolCall} (${react.score})`));
1352
+ lines.push(ok(`ReAct parsed: ${react.toolCall} (${react.score})${dialectTag}`));
1306
1353
  if (react.thought) {
1307
1354
  lines.push(info(`Thought: ${sanitizeForReport(react.thought)}`));
1308
1355
  }
1309
1356
  } else if (react.score === "MODERATE") {
1310
- lines.push(ok(`ReAct parsed: ${react.toolCall} (${react.score})`));
1357
+ lines.push(ok(`ReAct parsed: ${react.toolCall} (${react.score})${dialectTag}`));
1311
1358
  if (react.thought) {
1312
1359
  lines.push(info(`Thought: ${sanitizeForReport(react.thought)}`));
1313
1360
  }
1314
1361
  } else if (react.score === "WEAK") {
1315
- lines.push(warn(`ReAct parsed: ${react.toolCall} (${react.score}) \u2014 wrong tool or malformed args`));
1362
+ lines.push(warn(`ReAct parsed: ${react.toolCall} (${react.score}) \u2014 wrong tool or malformed args${dialectTag}`));
1316
1363
  if (react.thought) {
1317
1364
  lines.push(info(`Thought: ${sanitizeForReport(react.thought)}`));
1318
1365
  }
1319
1366
  } else if (react.score === "FAIL") {
1320
- lines.push(fail(`ReAct parsing: ${react.toolCall} (${react.score})`));
1367
+ lines.push(fail(`ReAct parsing: ${react.toolCall} (${react.score})${dialectTag}`));
1321
1368
  if (react.response) {
1322
1369
  lines.push(info(`Response: ${sanitizeForReport(react.response)}`));
1323
1370
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vtstech/pi-model-test",
3
- "version": "1.0.8",
3
+ "version": "1.0.9",
4
4
  "description": "Model benchmark/testing extension for Pi Coding Agent",
5
5
  "main": "model-test.js",
6
6
  "keywords": ["pi-extensions"],
@@ -14,7 +14,7 @@
14
14
  "url": "https://github.com/VTSTech/pi-coding-agent"
15
15
  },
16
16
  "dependencies": {
17
- "@vtstech/pi-shared": "1.0.8"
17
+ "@vtstech/pi-shared": "1.0.9"
18
18
  },
19
19
  "peerDependencies": {
20
20
  "@mariozechner/pi-coding-agent": ">=0.66"