@vtstech/pi-model-test 1.0.8 → 1.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/model-test.js +138 -91
- package/package.json +2 -2
package/model-test.js
CHANGED
|
@@ -740,90 +740,111 @@ function model_test_temp_default(pi) {
|
|
|
740
740
|
if (!content) {
|
|
741
741
|
return { pass: false, score: "FAIL", toolCall: "empty response", thought: "", response: "", elapsedMs };
|
|
742
742
|
}
|
|
743
|
-
|
|
744
|
-
const
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
const actionText = toolName.toLowerCase();
|
|
772
|
-
if (actionText.includes("get_weather")) toolName = "get_weather";
|
|
773
|
-
else {
|
|
774
|
-
const toolWords = actionText.match(/\b[a-z][a-z0-9]*(?:[_-][a-z0-9]+)+\b/gi) || [];
|
|
775
|
-
if (toolWords.length > 0) toolName = toolWords[0];
|
|
776
|
-
}
|
|
777
|
-
}
|
|
778
|
-
const rawArgs = parenMatch ? match[2].trim().replace(/^```\w*\s*/gm, "").replace(/```\s*$/gm, "").trim() : match[2].trim().replace(/^```\w*\s*/gm, "").replace(/```\s*$/gm, "").trim();
|
|
779
|
-
let argsParsed = false;
|
|
780
|
-
let argsStr = rawArgs;
|
|
781
|
-
if (parenMatch && rawArgs && !rawArgs.startsWith("{")) {
|
|
782
|
-
const pairs = rawArgs.match(/(\w+)\s*:\s*("[^"]*"|'[^']*'|\S+)/g);
|
|
783
|
-
if (pairs) {
|
|
784
|
-
const obj = {};
|
|
785
|
-
for (const p of pairs) {
|
|
786
|
-
const colonIdx = p.indexOf(":");
|
|
787
|
-
const key = p.slice(0, colonIdx).trim();
|
|
788
|
-
let val = p.slice(colonIdx + 1).trim();
|
|
789
|
-
if (val.startsWith('"') && val.endsWith('"') || val.startsWith("'") && val.endsWith("'")) {
|
|
790
|
-
val = val.slice(1, -1);
|
|
743
|
+
let parsedResult = null;
|
|
744
|
+
const sharedParser = pi._reactParser;
|
|
745
|
+
if (sharedParser?.ALL_DIALECT_PATTERNS) {
|
|
746
|
+
for (const dp of sharedParser.ALL_DIALECT_PATTERNS) {
|
|
747
|
+
const result2 = sharedParser.parseReactWithPatterns(content, dp, true);
|
|
748
|
+
if (result2) {
|
|
749
|
+
let toolName = result2.name;
|
|
750
|
+
let argsStr;
|
|
751
|
+
const rawArgs = result2.args ? JSON.stringify(result2.args) : "";
|
|
752
|
+
if (rawArgs && rawArgs !== "{}") {
|
|
753
|
+
argsStr = rawArgs;
|
|
754
|
+
} else if (result2.raw) {
|
|
755
|
+
const jsonStart = result2.raw.indexOf("{");
|
|
756
|
+
if (jsonStart !== -1) {
|
|
757
|
+
let depth = 0, jsonEnd = -1;
|
|
758
|
+
for (let i = jsonStart; i < result2.raw.length; i++) {
|
|
759
|
+
if (result2.raw[i] === "{") depth++;
|
|
760
|
+
else if (result2.raw[i] === "}") {
|
|
761
|
+
depth--;
|
|
762
|
+
if (depth === 0) {
|
|
763
|
+
jsonEnd = i;
|
|
764
|
+
break;
|
|
765
|
+
}
|
|
766
|
+
}
|
|
767
|
+
}
|
|
768
|
+
argsStr = jsonEnd !== -1 ? result2.raw.slice(jsonStart, jsonEnd + 1) : "";
|
|
769
|
+
} else {
|
|
770
|
+
argsStr = "";
|
|
791
771
|
}
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
try {
|
|
795
|
-
argsStr = JSON.stringify(obj);
|
|
796
|
-
argsParsed = true;
|
|
797
|
-
} catch {
|
|
772
|
+
} else {
|
|
773
|
+
argsStr = "";
|
|
798
774
|
}
|
|
775
|
+
parsedResult = { name: toolName, args: argsStr, thought: result2.thought || "", dialect: result2.dialect };
|
|
776
|
+
break;
|
|
799
777
|
}
|
|
800
778
|
}
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
779
|
+
} else {
|
|
780
|
+
const dialectDefs = [
|
|
781
|
+
{ name: "react", action: "Action:", input: "Action Input:" },
|
|
782
|
+
{ name: "function", action: "Function:", input: "Function Input:" },
|
|
783
|
+
{ name: "tool", action: "Tool:", input: "Tool Input:" },
|
|
784
|
+
{ name: "call", action: "Call:", input: "Input:" }
|
|
785
|
+
];
|
|
786
|
+
for (const dd of dialectDefs) {
|
|
787
|
+
const esc = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
788
|
+
const aT = esc(dd.action);
|
|
789
|
+
const iT = esc(dd.input);
|
|
790
|
+
const primaryRe = new RegExp(`${aT}\\s*[\\x60"']?(\\w+)[\\x60"']?\\s*\\n?\\s*${iT}\\s*([\\s\\S]*?)(?=\\n\\s*(?:Observation:|Thought:|Final Answer:|${dd.action})|$)`, "is");
|
|
791
|
+
const sameRe = new RegExp(`${aT}\\s*[\\x60"']?(\\w+)[\\x60"']?\\s+${iT}\\s*([\\s\\S]*?)(?=\\n\\s*(?:Observation:|Thought:|Final Answer:|${dd.action})|$)`, "is");
|
|
792
|
+
const parenRe = new RegExp(`${aT}\\s*(\\w+)\\s*\\(([^)]*)\\)`, "i");
|
|
793
|
+
let m = primaryRe.exec(content) || sameRe.exec(content);
|
|
794
|
+
let isParen = false;
|
|
795
|
+
if (!m) {
|
|
796
|
+
m = parenRe.exec(content);
|
|
797
|
+
isParen = true;
|
|
798
|
+
}
|
|
799
|
+
if (m) {
|
|
800
|
+
const toolName = m[1].trim().replace(/[`"']/g, "");
|
|
801
|
+
const rawArgs = m[2].trim().replace(/^```\w*\s*/gm, "").replace(/```\s*$/gm, "").trim();
|
|
802
|
+
let argsStr = "";
|
|
803
|
+
if (isParen && rawArgs && !rawArgs.startsWith("{")) {
|
|
804
|
+
const pairs = rawArgs.match(/(\w+)\s*:\s*("[^"]*"|'[^']*'|\S+)/g);
|
|
805
|
+
if (pairs) {
|
|
806
|
+
const obj = {};
|
|
807
|
+
for (const p of pairs) {
|
|
808
|
+
const ci = p.indexOf(":");
|
|
809
|
+
let v = p.slice(ci + 1).trim();
|
|
810
|
+
if (v.startsWith('"') && v.endsWith('"') || v.startsWith("'") && v.endsWith("'")) v = v.slice(1, -1);
|
|
811
|
+
obj[p.slice(0, ci).trim()] = v;
|
|
813
812
|
}
|
|
813
|
+
argsStr = JSON.stringify(obj);
|
|
814
|
+
} else {
|
|
815
|
+
argsStr = rawArgs;
|
|
814
816
|
}
|
|
815
|
-
}
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
817
|
+
} else {
|
|
818
|
+
const js = rawArgs.indexOf("{");
|
|
819
|
+
if (js !== -1) {
|
|
820
|
+
let d = 0, je = -1;
|
|
821
|
+
for (let i = js; i < rawArgs.length; i++) {
|
|
822
|
+
if (rawArgs[i] === "{") d++;
|
|
823
|
+
else if (rawArgs[i] === "}") {
|
|
824
|
+
d--;
|
|
825
|
+
if (d === 0) {
|
|
826
|
+
je = i;
|
|
827
|
+
break;
|
|
828
|
+
}
|
|
829
|
+
}
|
|
830
|
+
}
|
|
831
|
+
argsStr = je !== -1 ? rawArgs.slice(js, je + 1) : rawArgs;
|
|
832
|
+
} else {
|
|
833
|
+
argsStr = rawArgs;
|
|
823
834
|
}
|
|
824
835
|
}
|
|
836
|
+
let thought = "";
|
|
837
|
+
const thoughtRe = /Thought:\s*(.*?)(?=Action:|Function:|Tool:|Call:|Final Answer:|$)/is;
|
|
838
|
+
const tm = thoughtRe.exec(content);
|
|
839
|
+
if (tm) thought = tm[1].trim();
|
|
840
|
+
parsedResult = { name: toolName, args: argsStr, thought, dialect: dd.name };
|
|
841
|
+
break;
|
|
825
842
|
}
|
|
826
843
|
}
|
|
844
|
+
}
|
|
845
|
+
if (parsedResult) {
|
|
846
|
+
let { name: toolName, args: argsStr, thought, dialect } = parsedResult;
|
|
847
|
+
const argsParsed = argsStr.length > 0;
|
|
827
848
|
let score;
|
|
828
849
|
const isWeatherTool = toolName.toLowerCase().includes("get_weather") || toolName.toLowerCase() === "get_weather";
|
|
829
850
|
if (isWeatherTool && argsParsed) {
|
|
@@ -840,15 +861,25 @@ function model_test_temp_default(pi) {
|
|
|
840
861
|
toolCall: `${toolName}(${argsStr})`,
|
|
841
862
|
thought,
|
|
842
863
|
response: content,
|
|
843
|
-
elapsedMs
|
|
864
|
+
elapsedMs,
|
|
865
|
+
dialect: dialect || "react"
|
|
844
866
|
};
|
|
845
867
|
}
|
|
868
|
+
const altTagPatterns = [
|
|
869
|
+
/^\s*Function:\s*/im,
|
|
870
|
+
/^\s*Tool:\s*/im,
|
|
871
|
+
/^\s*Call:\s*/im,
|
|
872
|
+
/<function_call/i,
|
|
873
|
+
/<invoke\s/i
|
|
874
|
+
];
|
|
875
|
+
const hasAltTag = altTagPatterns.some((p) => p.test(content));
|
|
846
876
|
const hasToolMention = /\bget_weather\b/i.test(content) || /\btool\b/i.test(content);
|
|
847
|
-
if (hasToolMention) {
|
|
877
|
+
if (hasAltTag || hasToolMention) {
|
|
878
|
+
const detail = hasAltTag ? "model used alternative tool-call tags but format was not parseable" : "model mentioned tool but not in ReAct format";
|
|
848
879
|
return {
|
|
849
880
|
pass: false,
|
|
850
881
|
score: "FAIL",
|
|
851
|
-
toolCall:
|
|
882
|
+
toolCall: `none \u2014 ${detail}`,
|
|
852
883
|
thought: "",
|
|
853
884
|
response: content,
|
|
854
885
|
elapsedMs
|
|
@@ -1071,25 +1102,40 @@ The JSON object must have exactly these 4 keys:
|
|
|
1071
1102
|
};
|
|
1072
1103
|
}
|
|
1073
1104
|
const reactPatterns = [
|
|
1105
|
+
// Classic ReAct
|
|
1074
1106
|
/^\s*Action:\s*/im,
|
|
1075
|
-
// "Action: get_weather"
|
|
1076
1107
|
/^\s*Action Input:\s*/im,
|
|
1077
|
-
// "Action Input: {"location": "Tokyo"}"
|
|
1078
1108
|
/^\s*Thought:\s*/im,
|
|
1079
|
-
// "Thought: I need to look up the weather"
|
|
1080
1109
|
/Action:\s*\w+/i,
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1110
|
+
/Action Input:\s*\{/i,
|
|
1111
|
+
// Function dialect
|
|
1112
|
+
/^\s*Function:\s*/im,
|
|
1113
|
+
/^\s*Function Input:\s*/im,
|
|
1114
|
+
/Function:\s*\w+/i,
|
|
1115
|
+
// Tool dialect
|
|
1116
|
+
/^\s*Tool:\s*/im,
|
|
1117
|
+
/^\s*Tool Input:\s*/im,
|
|
1118
|
+
/Tool:\s*\w+/i,
|
|
1119
|
+
// Call dialect
|
|
1120
|
+
/^\s*Call:\s*/im,
|
|
1121
|
+
/^\s*Input:\s*/im,
|
|
1122
|
+
/Call:\s*\w+/i
|
|
1084
1123
|
];
|
|
1085
|
-
const
|
|
1086
|
-
|
|
1124
|
+
const matchedPatterns = [];
|
|
1125
|
+
for (const p of reactPatterns) {
|
|
1126
|
+
if (p.test(content)) matchedPatterns.push(p.source);
|
|
1127
|
+
}
|
|
1128
|
+
if (matchedPatterns.length > 0) {
|
|
1129
|
+
let dialectName = "react";
|
|
1130
|
+
if (/Function:/i.test(content)) dialectName = "function";
|
|
1131
|
+
else if (/Tool:/i.test(content)) dialectName = "tool";
|
|
1132
|
+
else if (/Call:/i.test(content)) dialectName = "call";
|
|
1087
1133
|
const level2 = "react";
|
|
1088
1134
|
cacheToolSupport(model, level2, family);
|
|
1089
1135
|
return {
|
|
1090
1136
|
level: level2,
|
|
1091
1137
|
cached: false,
|
|
1092
|
-
evidence: `ReAct format detected in text response`,
|
|
1138
|
+
evidence: `ReAct format detected (${dialectName} dialect) in text response`,
|
|
1093
1139
|
elapsedMs
|
|
1094
1140
|
};
|
|
1095
1141
|
}
|
|
@@ -1176,7 +1222,7 @@ The JSON object must have exactly these 4 keys:
|
|
|
1176
1222
|
}
|
|
1177
1223
|
}
|
|
1178
1224
|
const branding = [
|
|
1179
|
-
` \u26A1 Pi Model Benchmark v1.0.8`,
|
|
1225
|
+
` \u26A1 Pi Model Benchmark v1.0.9`,
|
|
1180
1226
|
` Written by VTSTech`,
|
|
1181
1227
|
` GitHub: https://github.com/VTSTech`,
|
|
1182
1228
|
` Website: www.vts-tech.org`
|
|
@@ -1301,23 +1347,24 @@ The JSON object must have exactly these 4 keys:
|
|
|
1301
1347
|
await rateLimitDelay(lines);
|
|
1302
1348
|
const react = await testReactParsing(model);
|
|
1303
1349
|
lines.push(info(`Time: ${msHuman(react.elapsedMs)}`));
|
|
1350
|
+
const dialectTag = react.dialect && react.dialect !== "react" ? ` [${react.dialect} dialect]` : "";
|
|
1304
1351
|
if (react.score === "STRONG") {
|
|
1305
|
-
lines.push(ok(`ReAct parsed: ${react.toolCall} (${react.score})`));
|
|
1352
|
+
lines.push(ok(`ReAct parsed: ${react.toolCall} (${react.score})${dialectTag}`));
|
|
1306
1353
|
if (react.thought) {
|
|
1307
1354
|
lines.push(info(`Thought: ${sanitizeForReport(react.thought)}`));
|
|
1308
1355
|
}
|
|
1309
1356
|
} else if (react.score === "MODERATE") {
|
|
1310
|
-
lines.push(ok(`ReAct parsed: ${react.toolCall} (${react.score})`));
|
|
1357
|
+
lines.push(ok(`ReAct parsed: ${react.toolCall} (${react.score})${dialectTag}`));
|
|
1311
1358
|
if (react.thought) {
|
|
1312
1359
|
lines.push(info(`Thought: ${sanitizeForReport(react.thought)}`));
|
|
1313
1360
|
}
|
|
1314
1361
|
} else if (react.score === "WEAK") {
|
|
1315
|
-
lines.push(warn(`ReAct parsed: ${react.toolCall} (${react.score}) \u2014 wrong tool or malformed args`));
|
|
1362
|
+
lines.push(warn(`ReAct parsed: ${react.toolCall} (${react.score}) \u2014 wrong tool or malformed args${dialectTag}`));
|
|
1316
1363
|
if (react.thought) {
|
|
1317
1364
|
lines.push(info(`Thought: ${sanitizeForReport(react.thought)}`));
|
|
1318
1365
|
}
|
|
1319
1366
|
} else if (react.score === "FAIL") {
|
|
1320
|
-
lines.push(fail(`ReAct parsing: ${react.toolCall} (${react.score})`));
|
|
1367
|
+
lines.push(fail(`ReAct parsing: ${react.toolCall} (${react.score})${dialectTag}`));
|
|
1321
1368
|
if (react.response) {
|
|
1322
1369
|
lines.push(info(`Response: ${sanitizeForReport(react.response)}`));
|
|
1323
1370
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vtstech/pi-model-test",
|
|
3
|
-
"version": "1.0.8",
|
|
3
|
+
"version": "1.0.9",
|
|
4
4
|
"description": "Model benchmark/testing extension for Pi Coding Agent",
|
|
5
5
|
"main": "model-test.js",
|
|
6
6
|
"keywords": ["pi-extensions"],
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
"url": "https://github.com/VTSTech/pi-coding-agent"
|
|
15
15
|
},
|
|
16
16
|
"dependencies": {
|
|
17
|
-
"@vtstech/pi-shared": "1.0.8"
|
|
17
|
+
"@vtstech/pi-shared": "1.0.9"
|
|
18
18
|
},
|
|
19
19
|
"peerDependencies": {
|
|
20
20
|
"@mariozechner/pi-coding-agent": ">=0.66"
|