@vtstech/pi-model-test 1.0.8 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/model-test.js +204 -169
- package/package.json +2 -2
package/model-test.js
CHANGED
|
@@ -12,57 +12,17 @@ import {
|
|
|
12
12
|
truncate,
|
|
13
13
|
sanitizeForReport
|
|
14
14
|
} from "@vtstech/pi-shared/format";
|
|
15
|
-
import { getOllamaBaseUrl, detectModelFamily, readModelsJson,
|
|
16
|
-
function detectProvider(ctx) {
|
|
17
|
-
const model = ctx.model;
|
|
18
|
-
if (!model) return { kind: "unknown", name: "none" };
|
|
19
|
-
const providerName = model.provider || "";
|
|
20
|
-
if (!providerName) return { kind: "unknown", name: "none" };
|
|
21
|
-
const modelsJson = readModelsJson();
|
|
22
|
-
const userProviderCfg = (modelsJson.providers || {})[providerName];
|
|
23
|
-
if (userProviderCfg) {
|
|
24
|
-
const baseUrl = userProviderCfg.baseUrl || "";
|
|
25
|
-
const apiMode = userProviderCfg.api || "";
|
|
26
|
-
const apiKey = userProviderCfg.apiKey || "";
|
|
27
|
-
const isOllama = /ollama/i.test(providerName) || /localhost:\d+/.test(baseUrl) || /127\.0\.0\.1:\d+/.test(baseUrl) || /0\.0\.0\.0:\d+/.test(baseUrl) || /\/api\/chat/.test(baseUrl) || apiMode === "ollama";
|
|
28
|
-
if (isOllama) {
|
|
29
|
-
return { kind: "ollama", name: providerName, apiMode: "ollama", baseUrl, apiKey };
|
|
30
|
-
}
|
|
31
|
-
if (/\/api\/chat/.test(baseUrl)) {
|
|
32
|
-
return { kind: "ollama", name: providerName, apiMode: "ollama", baseUrl, apiKey };
|
|
33
|
-
}
|
|
34
|
-
return {
|
|
35
|
-
kind: "builtin",
|
|
36
|
-
name: providerName,
|
|
37
|
-
apiMode: apiMode || userProviderCfg.api || "openai-completions",
|
|
38
|
-
baseUrl,
|
|
39
|
-
apiKey
|
|
40
|
-
};
|
|
41
|
-
}
|
|
42
|
-
const builtin = BUILTIN_PROVIDERS[providerName];
|
|
43
|
-
if (builtin) {
|
|
44
|
-
const apiKey = process.env[builtin.envKey] || "";
|
|
45
|
-
return {
|
|
46
|
-
kind: "builtin",
|
|
47
|
-
name: providerName,
|
|
48
|
-
apiMode: builtin.api,
|
|
49
|
-
baseUrl: builtin.baseUrl,
|
|
50
|
-
envKey: builtin.envKey,
|
|
51
|
-
apiKey
|
|
52
|
-
};
|
|
53
|
-
}
|
|
54
|
-
return { kind: "unknown", name: providerName };
|
|
55
|
-
}
|
|
15
|
+
import { getOllamaBaseUrl, detectModelFamily, readModelsJson, writeModelsJson, fetchModelContextLength, EXTENSION_VERSION, detectProvider } from "@vtstech/pi-shared/ollama";
|
|
56
16
|
var CONFIG = {
|
|
57
17
|
// General API settings
|
|
58
18
|
DEFAULT_TIMEOUT_MS: 999999,
|
|
59
|
-
//
|
|
19
|
+
// ~16.7 minutes — effectively unlimited for slow models
|
|
60
20
|
CONNECT_TIMEOUT_S: 60,
|
|
61
|
-
//
|
|
21
|
+
// 60 seconds to establish connection
|
|
62
22
|
MAX_RETRIES: 1,
|
|
63
23
|
// Single retry for transient failures
|
|
64
24
|
RETRY_DELAY_MS: 1e4,
|
|
65
|
-
//
|
|
25
|
+
// 10 seconds between retries
|
|
66
26
|
EXEC_BUFFER_MS: 8e3,
|
|
67
27
|
// Extra buffer for exec timeout over curl timeout
|
|
68
28
|
// Model generation settings
|
|
@@ -74,31 +34,32 @@ var CONFIG = {
|
|
|
74
34
|
MIN_THINKING_LENGTH: 10,
|
|
75
35
|
// Minimum chars to consider thinking tokens valid
|
|
76
36
|
TOOL_TEST_TIMEOUT_MS: 999999,
|
|
77
|
-
//
|
|
37
|
+
// Effectively unlimited for slow tool usage tests
|
|
78
38
|
TOOL_TEST_MAX_TIME_S: 999999,
|
|
79
39
|
// Max curl time for tool tests (effectively unlimited)
|
|
80
40
|
TOOL_SUPPORT_TIMEOUT_MS: 999999,
|
|
81
|
-
//
|
|
41
|
+
// Effectively unlimited for tool support detection
|
|
82
42
|
TOOL_SUPPORT_MAX_TIME_S: 999999,
|
|
83
43
|
// Max curl time for tool support detection
|
|
84
44
|
// Metadata retrieval
|
|
85
45
|
TAGS_TIMEOUT_MS: 15e3,
|
|
86
46
|
// 15 seconds for /api/tags
|
|
87
47
|
TAGS_CONNECT_TIMEOUT_S: 30,
|
|
88
|
-
//
|
|
48
|
+
// 30 seconds connection timeout for tags
|
|
89
49
|
MODEL_INFO_TIMEOUT_MS: 3e4,
|
|
90
|
-
//
|
|
50
|
+
// 30 seconds for model info lookup
|
|
91
51
|
// Provider API settings
|
|
92
52
|
PROVIDER_TIMEOUT_MS: 999999,
|
|
93
|
-
//
|
|
53
|
+
// Effectively unlimited for cloud provider API calls
|
|
94
54
|
PROVIDER_TOOL_TIMEOUT_MS: 12e4,
|
|
95
|
-
//
|
|
55
|
+
// 120 seconds for tool usage tests on providers
|
|
96
56
|
// Rate limiting
|
|
97
57
|
TEST_DELAY_MS: 1e4
|
|
98
|
-
//
|
|
58
|
+
// 10 seconds between tests to avoid rate limiting
|
|
99
59
|
};
|
|
100
60
|
var TOOL_SUPPORT_CACHE_DIR = path.join(os.homedir(), ".pi", "agent", "cache");
|
|
101
61
|
var TOOL_SUPPORT_CACHE_PATH = path.join(TOOL_SUPPORT_CACHE_DIR, "tool_support.json");
|
|
62
|
+
var _toolSupportCacheInMemory = null;
|
|
102
63
|
function readToolSupportCache() {
|
|
103
64
|
try {
|
|
104
65
|
if (fs.existsSync(TOOL_SUPPORT_CACHE_PATH)) {
|
|
@@ -116,19 +77,21 @@ function writeToolSupportCache(cache) {
|
|
|
116
77
|
fs.writeFileSync(TOOL_SUPPORT_CACHE_PATH, JSON.stringify(cache, null, 2) + "\n", "utf-8");
|
|
117
78
|
}
|
|
118
79
|
function getCachedToolSupport(model) {
|
|
119
|
-
const cache = readToolSupportCache();
|
|
80
|
+
const cache = _toolSupportCacheInMemory || readToolSupportCache();
|
|
81
|
+
if (!_toolSupportCacheInMemory) _toolSupportCacheInMemory = cache;
|
|
120
82
|
const entry = cache[model];
|
|
121
83
|
if (!entry) return null;
|
|
122
84
|
if (!entry.support || !["native", "react", "none"].includes(entry.support)) return null;
|
|
123
85
|
return entry;
|
|
124
86
|
}
|
|
125
87
|
function cacheToolSupport(model, support, family) {
|
|
126
|
-
const cache = readToolSupportCache();
|
|
88
|
+
const cache = _toolSupportCacheInMemory || readToolSupportCache();
|
|
127
89
|
cache[model] = {
|
|
128
90
|
support,
|
|
129
91
|
testedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
130
92
|
family
|
|
131
93
|
};
|
|
94
|
+
_toolSupportCacheInMemory = cache;
|
|
132
95
|
writeToolSupportCache(cache);
|
|
133
96
|
}
|
|
134
97
|
function model_test_temp_default(pi) {
|
|
@@ -247,10 +210,8 @@ function model_test_temp_default(pi) {
|
|
|
247
210
|
{ role: "user", content: "Reply with exactly: PONG" }
|
|
248
211
|
], { maxTokens: 10, timeoutMs: 3e4 });
|
|
249
212
|
const elapsedMs = Date.now() - start;
|
|
250
|
-
const content = result.content.trim().toUpperCase();
|
|
251
213
|
const reachable = true;
|
|
252
214
|
const authValid = true;
|
|
253
|
-
const hasPong = content.includes("PONG");
|
|
254
215
|
return {
|
|
255
216
|
pass: reachable && authValid,
|
|
256
217
|
reachable,
|
|
@@ -259,7 +220,6 @@ function model_test_temp_default(pi) {
|
|
|
259
220
|
elapsedMs
|
|
260
221
|
};
|
|
261
222
|
} catch (e) {
|
|
262
|
-
const start = Date.now();
|
|
263
223
|
let reachable = false;
|
|
264
224
|
let authValid = false;
|
|
265
225
|
const msg = e.message || "";
|
|
@@ -290,7 +250,6 @@ function model_test_temp_default(pi) {
|
|
|
290
250
|
const prompt = `A snail climbs 3 feet up a wall each day, but slides back 2 feet each night. The wall is 10 feet tall. How many days does it take the snail to reach the top? Think step by step and give the final answer on its own line like: ANSWER: <number>`;
|
|
291
251
|
try {
|
|
292
252
|
let response, elapsedMs;
|
|
293
|
-
let usedThinkingFallback = false;
|
|
294
253
|
try {
|
|
295
254
|
const result = await ollamaChat(model, [
|
|
296
255
|
{ role: "user", content: prompt }
|
|
@@ -309,7 +268,6 @@ function model_test_temp_default(pi) {
|
|
|
309
268
|
], { think: true });
|
|
310
269
|
response = retry.response;
|
|
311
270
|
elapsedMs = retry.elapsedMs;
|
|
312
|
-
usedThinkingFallback = true;
|
|
313
271
|
} else {
|
|
314
272
|
throw firstErr;
|
|
315
273
|
}
|
|
@@ -740,90 +698,111 @@ function model_test_temp_default(pi) {
|
|
|
740
698
|
if (!content) {
|
|
741
699
|
return { pass: false, score: "FAIL", toolCall: "empty response", thought: "", response: "", elapsedMs };
|
|
742
700
|
}
|
|
743
|
-
|
|
744
|
-
const
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
const actionText = toolName.toLowerCase();
|
|
772
|
-
if (actionText.includes("get_weather")) toolName = "get_weather";
|
|
773
|
-
else {
|
|
774
|
-
const toolWords = actionText.match(/\b[a-z][a-z0-9]*(?:[_-][a-z0-9]+)+\b/gi) || [];
|
|
775
|
-
if (toolWords.length > 0) toolName = toolWords[0];
|
|
776
|
-
}
|
|
777
|
-
}
|
|
778
|
-
const rawArgs = parenMatch ? match[2].trim().replace(/^```\w*\s*/gm, "").replace(/```\s*$/gm, "").trim() : match[2].trim().replace(/^```\w*\s*/gm, "").replace(/```\s*$/gm, "").trim();
|
|
779
|
-
let argsParsed = false;
|
|
780
|
-
let argsStr = rawArgs;
|
|
781
|
-
if (parenMatch && rawArgs && !rawArgs.startsWith("{")) {
|
|
782
|
-
const pairs = rawArgs.match(/(\w+)\s*:\s*("[^"]*"|'[^']*'|\S+)/g);
|
|
783
|
-
if (pairs) {
|
|
784
|
-
const obj = {};
|
|
785
|
-
for (const p of pairs) {
|
|
786
|
-
const colonIdx = p.indexOf(":");
|
|
787
|
-
const key = p.slice(0, colonIdx).trim();
|
|
788
|
-
let val = p.slice(colonIdx + 1).trim();
|
|
789
|
-
if (val.startsWith('"') && val.endsWith('"') || val.startsWith("'") && val.endsWith("'")) {
|
|
790
|
-
val = val.slice(1, -1);
|
|
701
|
+
let parsedResult = null;
|
|
702
|
+
const sharedParser = pi._reactParser;
|
|
703
|
+
if (sharedParser?.ALL_DIALECT_PATTERNS) {
|
|
704
|
+
for (const dp of sharedParser.ALL_DIALECT_PATTERNS) {
|
|
705
|
+
const result2 = sharedParser.parseReactWithPatterns(content, dp, true);
|
|
706
|
+
if (result2) {
|
|
707
|
+
let toolName = result2.name;
|
|
708
|
+
let argsStr;
|
|
709
|
+
const rawArgs = result2.args ? JSON.stringify(result2.args) : "";
|
|
710
|
+
if (rawArgs && rawArgs !== "{}") {
|
|
711
|
+
argsStr = rawArgs;
|
|
712
|
+
} else if (result2.raw) {
|
|
713
|
+
const jsonStart = result2.raw.indexOf("{");
|
|
714
|
+
if (jsonStart !== -1) {
|
|
715
|
+
let depth = 0, jsonEnd = -1;
|
|
716
|
+
for (let i = jsonStart; i < result2.raw.length; i++) {
|
|
717
|
+
if (result2.raw[i] === "{") depth++;
|
|
718
|
+
else if (result2.raw[i] === "}") {
|
|
719
|
+
depth--;
|
|
720
|
+
if (depth === 0) {
|
|
721
|
+
jsonEnd = i;
|
|
722
|
+
break;
|
|
723
|
+
}
|
|
724
|
+
}
|
|
725
|
+
}
|
|
726
|
+
argsStr = jsonEnd !== -1 ? result2.raw.slice(jsonStart, jsonEnd + 1) : "";
|
|
727
|
+
} else {
|
|
728
|
+
argsStr = "";
|
|
791
729
|
}
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
try {
|
|
795
|
-
argsStr = JSON.stringify(obj);
|
|
796
|
-
argsParsed = true;
|
|
797
|
-
} catch {
|
|
730
|
+
} else {
|
|
731
|
+
argsStr = "";
|
|
798
732
|
}
|
|
733
|
+
parsedResult = { name: toolName, args: argsStr, thought: result2.thought || "", dialect: result2.dialect };
|
|
734
|
+
break;
|
|
799
735
|
}
|
|
800
736
|
}
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
737
|
+
} else {
|
|
738
|
+
const dialectDefs = [
|
|
739
|
+
{ name: "react", action: "Action:", input: "Action Input:" },
|
|
740
|
+
{ name: "function", action: "Function:", input: "Function Input:" },
|
|
741
|
+
{ name: "tool", action: "Tool:", input: "Tool Input:" },
|
|
742
|
+
{ name: "call", action: "Call:", input: "Input:" }
|
|
743
|
+
];
|
|
744
|
+
for (const dd of dialectDefs) {
|
|
745
|
+
const esc = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
746
|
+
const aT = esc(dd.action);
|
|
747
|
+
const iT = esc(dd.input);
|
|
748
|
+
const primaryRe = new RegExp(`${aT}\\s*[\\x60"']?(\\w+)[\\x60"']?\\s*\\n?\\s*${iT}\\s*([\\s\\S]*?)(?=\\n\\s*(?:Observation:|Thought:|Final Answer:|${dd.action})|$)`, "is");
|
|
749
|
+
const sameRe = new RegExp(`${aT}\\s*[\\x60"']?(\\w+)[\\x60"']?\\s+${iT}\\s*([\\s\\S]*?)(?=\\n\\s*(?:Observation:|Thought:|Final Answer:|${dd.action})|$)`, "is");
|
|
750
|
+
const parenRe = new RegExp(`${aT}\\s*(\\w+)\\s*\\(([^)]*)\\)`, "i");
|
|
751
|
+
let m = primaryRe.exec(content) || sameRe.exec(content);
|
|
752
|
+
let isParen = false;
|
|
753
|
+
if (!m) {
|
|
754
|
+
m = parenRe.exec(content);
|
|
755
|
+
isParen = true;
|
|
756
|
+
}
|
|
757
|
+
if (m) {
|
|
758
|
+
const toolName = m[1].trim().replace(/[`"']/g, "");
|
|
759
|
+
const rawArgs = m[2].trim().replace(/^```\w*\s*/gm, "").replace(/```\s*$/gm, "").trim();
|
|
760
|
+
let argsStr = "";
|
|
761
|
+
if (isParen && rawArgs && !rawArgs.startsWith("{")) {
|
|
762
|
+
const pairs = rawArgs.match(/(\w+)\s*:\s*("[^"]*"|'[^']*'|\S+)/g);
|
|
763
|
+
if (pairs) {
|
|
764
|
+
const obj = {};
|
|
765
|
+
for (const p of pairs) {
|
|
766
|
+
const ci = p.indexOf(":");
|
|
767
|
+
let v = p.slice(ci + 1).trim();
|
|
768
|
+
if (v.startsWith('"') && v.endsWith('"') || v.startsWith("'") && v.endsWith("'")) v = v.slice(1, -1);
|
|
769
|
+
obj[p.slice(0, ci).trim()] = v;
|
|
813
770
|
}
|
|
771
|
+
argsStr = JSON.stringify(obj);
|
|
772
|
+
} else {
|
|
773
|
+
argsStr = rawArgs;
|
|
814
774
|
}
|
|
815
|
-
}
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
775
|
+
} else {
|
|
776
|
+
const js = rawArgs.indexOf("{");
|
|
777
|
+
if (js !== -1) {
|
|
778
|
+
let d = 0, je = -1;
|
|
779
|
+
for (let i = js; i < rawArgs.length; i++) {
|
|
780
|
+
if (rawArgs[i] === "{") d++;
|
|
781
|
+
else if (rawArgs[i] === "}") {
|
|
782
|
+
d--;
|
|
783
|
+
if (d === 0) {
|
|
784
|
+
je = i;
|
|
785
|
+
break;
|
|
786
|
+
}
|
|
787
|
+
}
|
|
788
|
+
}
|
|
789
|
+
argsStr = je !== -1 ? rawArgs.slice(js, je + 1) : rawArgs;
|
|
790
|
+
} else {
|
|
791
|
+
argsStr = rawArgs;
|
|
823
792
|
}
|
|
824
793
|
}
|
|
794
|
+
let thought = "";
|
|
795
|
+
const thoughtRe = /Thought:\s*(.*?)(?=Action:|Function:|Tool:|Call:|Final Answer:|$)/is;
|
|
796
|
+
const tm = thoughtRe.exec(content);
|
|
797
|
+
if (tm) thought = tm[1].trim();
|
|
798
|
+
parsedResult = { name: toolName, args: argsStr, thought, dialect: dd.name };
|
|
799
|
+
break;
|
|
825
800
|
}
|
|
826
801
|
}
|
|
802
|
+
}
|
|
803
|
+
if (parsedResult) {
|
|
804
|
+
let { name: toolName, args: argsStr, thought, dialect } = parsedResult;
|
|
805
|
+
const argsParsed = argsStr.length > 0;
|
|
827
806
|
let score;
|
|
828
807
|
const isWeatherTool = toolName.toLowerCase().includes("get_weather") || toolName.toLowerCase() === "get_weather";
|
|
829
808
|
if (isWeatherTool && argsParsed) {
|
|
@@ -840,15 +819,25 @@ function model_test_temp_default(pi) {
|
|
|
840
819
|
toolCall: `${toolName}(${argsStr})`,
|
|
841
820
|
thought,
|
|
842
821
|
response: content,
|
|
843
|
-
elapsedMs
|
|
822
|
+
elapsedMs,
|
|
823
|
+
dialect: dialect || "react"
|
|
844
824
|
};
|
|
845
825
|
}
|
|
826
|
+
const altTagPatterns = [
|
|
827
|
+
/^\s*Function:\s*/im,
|
|
828
|
+
/^\s*Tool:\s*/im,
|
|
829
|
+
/^\s*Call:\s*/im,
|
|
830
|
+
/<function_call/i,
|
|
831
|
+
/<invoke\s/i
|
|
832
|
+
];
|
|
833
|
+
const hasAltTag = altTagPatterns.some((p) => p.test(content));
|
|
846
834
|
const hasToolMention = /\bget_weather\b/i.test(content) || /\btool\b/i.test(content);
|
|
847
|
-
if (hasToolMention) {
|
|
835
|
+
if (hasAltTag || hasToolMention) {
|
|
836
|
+
const detail = hasAltTag ? "model used alternative tool-call tags but format was not parseable" : "model mentioned tool but not in ReAct format";
|
|
848
837
|
return {
|
|
849
838
|
pass: false,
|
|
850
839
|
score: "FAIL",
|
|
851
|
-
toolCall:
|
|
840
|
+
toolCall: `none \u2014 ${detail}`,
|
|
852
841
|
thought: "",
|
|
853
842
|
response: content,
|
|
854
843
|
elapsedMs
|
|
@@ -886,12 +875,30 @@ The JSON object must have exactly these 4 keys:
|
|
|
886
875
|
parsed = JSON.parse(cleaned);
|
|
887
876
|
} catch {
|
|
888
877
|
const cleaned = msg.replace(/```json?\s*/gi, "").replace(/```/g, "").trim();
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
878
|
+
let braceDepth = 0, bracketDepth = 0;
|
|
879
|
+
let inString = false, escapeNext = false;
|
|
880
|
+
for (let i = 0; i < cleaned.length; i++) {
|
|
881
|
+
const c = cleaned[i];
|
|
882
|
+
if (escapeNext) {
|
|
883
|
+
escapeNext = false;
|
|
884
|
+
continue;
|
|
885
|
+
}
|
|
886
|
+
if (c === "\\") {
|
|
887
|
+
if (inString) escapeNext = true;
|
|
888
|
+
continue;
|
|
889
|
+
}
|
|
890
|
+
if (c === '"') {
|
|
891
|
+
inString = !inString;
|
|
892
|
+
continue;
|
|
893
|
+
}
|
|
894
|
+
if (inString) continue;
|
|
895
|
+
if (c === "{") braceDepth++;
|
|
896
|
+
else if (c === "}") braceDepth = Math.max(0, braceDepth - 1);
|
|
897
|
+
else if (c === "[") bracketDepth++;
|
|
898
|
+
else if (c === "]") bracketDepth = Math.max(0, bracketDepth - 1);
|
|
899
|
+
}
|
|
900
|
+
if (braceDepth > 0 || bracketDepth > 0) {
|
|
901
|
+
const repaired = cleaned + "}".repeat(braceDepth) + "]".repeat(bracketDepth);
|
|
895
902
|
try {
|
|
896
903
|
parsed = JSON.parse(repaired);
|
|
897
904
|
repairNote = " (repaired truncated JSON)";
|
|
@@ -945,12 +952,30 @@ The JSON object must have exactly these 4 keys:
|
|
|
945
952
|
parsed = JSON.parse(cleaned);
|
|
946
953
|
} catch {
|
|
947
954
|
const cleaned = msg.replace(/```json?\s*/gi, "").replace(/```/g, "").trim();
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
955
|
+
let braceDepth = 0, bracketDepth = 0;
|
|
956
|
+
let inString = false, escapeNext = false;
|
|
957
|
+
for (let i = 0; i < cleaned.length; i++) {
|
|
958
|
+
const c = cleaned[i];
|
|
959
|
+
if (escapeNext) {
|
|
960
|
+
escapeNext = false;
|
|
961
|
+
continue;
|
|
962
|
+
}
|
|
963
|
+
if (c === "\\") {
|
|
964
|
+
if (inString) escapeNext = true;
|
|
965
|
+
continue;
|
|
966
|
+
}
|
|
967
|
+
if (c === '"') {
|
|
968
|
+
inString = !inString;
|
|
969
|
+
continue;
|
|
970
|
+
}
|
|
971
|
+
if (inString) continue;
|
|
972
|
+
if (c === "{") braceDepth++;
|
|
973
|
+
else if (c === "}") braceDepth = Math.max(0, braceDepth - 1);
|
|
974
|
+
else if (c === "[") bracketDepth++;
|
|
975
|
+
else if (c === "]") bracketDepth = Math.max(0, bracketDepth - 1);
|
|
976
|
+
}
|
|
977
|
+
if (braceDepth > 0 || bracketDepth > 0) {
|
|
978
|
+
const repaired = cleaned + "}".repeat(braceDepth) + "]".repeat(bracketDepth);
|
|
954
979
|
try {
|
|
955
980
|
parsed = JSON.parse(repaired);
|
|
956
981
|
repairNote = " (repaired truncated JSON)";
|
|
@@ -1071,25 +1096,40 @@ The JSON object must have exactly these 4 keys:
|
|
|
1071
1096
|
};
|
|
1072
1097
|
}
|
|
1073
1098
|
const reactPatterns = [
|
|
1099
|
+
// Classic ReAct
|
|
1074
1100
|
/^\s*Action:\s*/im,
|
|
1075
|
-
// "Action: get_weather"
|
|
1076
1101
|
/^\s*Action Input:\s*/im,
|
|
1077
|
-
// "Action Input: {"location": "Tokyo"}"
|
|
1078
1102
|
/^\s*Thought:\s*/im,
|
|
1079
|
-
// "Thought: I need to look up the weather"
|
|
1080
1103
|
/Action:\s*\w+/i,
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1104
|
+
/Action Input:\s*\{/i,
|
|
1105
|
+
// Function dialect
|
|
1106
|
+
/^\s*Function:\s*/im,
|
|
1107
|
+
/^\s*Function Input:\s*/im,
|
|
1108
|
+
/Function:\s*\w+/i,
|
|
1109
|
+
// Tool dialect
|
|
1110
|
+
/^\s*Tool:\s*/im,
|
|
1111
|
+
/^\s*Tool Input:\s*/im,
|
|
1112
|
+
/Tool:\s*\w+/i,
|
|
1113
|
+
// Call dialect
|
|
1114
|
+
/^\s*Call:\s*/im,
|
|
1115
|
+
/^\s*Input:\s*/im,
|
|
1116
|
+
/Call:\s*\w+/i
|
|
1084
1117
|
];
|
|
1085
|
-
const
|
|
1086
|
-
|
|
1118
|
+
const matchedPatterns = [];
|
|
1119
|
+
for (const p of reactPatterns) {
|
|
1120
|
+
if (p.test(content)) matchedPatterns.push(p.source);
|
|
1121
|
+
}
|
|
1122
|
+
if (matchedPatterns.length > 0) {
|
|
1123
|
+
let dialectName = "react";
|
|
1124
|
+
if (/Function:/i.test(content)) dialectName = "function";
|
|
1125
|
+
else if (/Tool:/i.test(content)) dialectName = "tool";
|
|
1126
|
+
else if (/Call:/i.test(content)) dialectName = "call";
|
|
1087
1127
|
const level2 = "react";
|
|
1088
1128
|
cacheToolSupport(model, level2, family);
|
|
1089
1129
|
return {
|
|
1090
1130
|
level: level2,
|
|
1091
1131
|
cached: false,
|
|
1092
|
-
evidence: `ReAct format detected in text response`,
|
|
1132
|
+
evidence: `ReAct format detected (${dialectName} dialect) in text response`,
|
|
1093
1133
|
elapsedMs
|
|
1094
1134
|
};
|
|
1095
1135
|
}
|
|
@@ -1141,14 +1181,8 @@ The JSON object must have exactly these 4 keys:
|
|
|
1141
1181
|
return ctx.model?.id;
|
|
1142
1182
|
}
|
|
1143
1183
|
function updateModelsJsonReasoning(model, hasReasoning) {
|
|
1144
|
-
const agentDir = path.join(os.homedir(), ".pi", "agent");
|
|
1145
|
-
const modelsJsonPath = path.join(agentDir, "models.json");
|
|
1146
|
-
if (!fs.existsSync(modelsJsonPath)) {
|
|
1147
|
-
return { updated: false, message: "models.json not found \u2014 skipped" };
|
|
1148
|
-
}
|
|
1149
1184
|
try {
|
|
1150
|
-
const
|
|
1151
|
-
const config = JSON.parse(raw);
|
|
1185
|
+
const config = readModelsJson();
|
|
1152
1186
|
let updated = false;
|
|
1153
1187
|
for (const provider of Object.values(config.providers || {})) {
|
|
1154
1188
|
const models = provider.models || [];
|
|
@@ -1168,7 +1202,7 @@ The JSON object must have exactly these 4 keys:
|
|
|
1168
1202
|
if (!updated) {
|
|
1169
1203
|
return { updated: false, message: `${model} not found in models.json \u2014 skipped` };
|
|
1170
1204
|
}
|
|
1171
|
-
|
|
1205
|
+
writeModelsJson(config);
|
|
1172
1206
|
const action = hasReasoning ? "set reasoning: true" : "set reasoning: false";
|
|
1173
1207
|
return { updated: true, message: `\u2705 Updated ${model}: ${action}` };
|
|
1174
1208
|
} catch (e) {
|
|
@@ -1176,7 +1210,7 @@ The JSON object must have exactly these 4 keys:
|
|
|
1176
1210
|
}
|
|
1177
1211
|
}
|
|
1178
1212
|
const branding = [
|
|
1179
|
-
` \u26A1 Pi Model Benchmark
|
|
1213
|
+
` \u26A1 Pi Model Benchmark v${EXTENSION_VERSION}`,
|
|
1180
1214
|
` Written by VTSTech`,
|
|
1181
1215
|
` GitHub: https://github.com/VTSTech`,
|
|
1182
1216
|
` Website: www.vts-tech.org`
|
|
@@ -1301,23 +1335,24 @@ The JSON object must have exactly these 4 keys:
|
|
|
1301
1335
|
await rateLimitDelay(lines);
|
|
1302
1336
|
const react = await testReactParsing(model);
|
|
1303
1337
|
lines.push(info(`Time: ${msHuman(react.elapsedMs)}`));
|
|
1338
|
+
const dialectTag = react.dialect && react.dialect !== "react" ? ` [${react.dialect} dialect]` : "";
|
|
1304
1339
|
if (react.score === "STRONG") {
|
|
1305
|
-
lines.push(ok(`ReAct parsed: ${react.toolCall} (${react.score})`));
|
|
1340
|
+
lines.push(ok(`ReAct parsed: ${react.toolCall} (${react.score})${dialectTag}`));
|
|
1306
1341
|
if (react.thought) {
|
|
1307
1342
|
lines.push(info(`Thought: ${sanitizeForReport(react.thought)}`));
|
|
1308
1343
|
}
|
|
1309
1344
|
} else if (react.score === "MODERATE") {
|
|
1310
|
-
lines.push(ok(`ReAct parsed: ${react.toolCall} (${react.score})`));
|
|
1345
|
+
lines.push(ok(`ReAct parsed: ${react.toolCall} (${react.score})${dialectTag}`));
|
|
1311
1346
|
if (react.thought) {
|
|
1312
1347
|
lines.push(info(`Thought: ${sanitizeForReport(react.thought)}`));
|
|
1313
1348
|
}
|
|
1314
1349
|
} else if (react.score === "WEAK") {
|
|
1315
|
-
lines.push(warn(`ReAct parsed: ${react.toolCall} (${react.score}) \u2014 wrong tool or malformed args`));
|
|
1350
|
+
lines.push(warn(`ReAct parsed: ${react.toolCall} (${react.score}) \u2014 wrong tool or malformed args${dialectTag}`));
|
|
1316
1351
|
if (react.thought) {
|
|
1317
1352
|
lines.push(info(`Thought: ${sanitizeForReport(react.thought)}`));
|
|
1318
1353
|
}
|
|
1319
1354
|
} else if (react.score === "FAIL") {
|
|
1320
|
-
lines.push(fail(`ReAct parsing: ${react.toolCall} (${react.score})`));
|
|
1355
|
+
lines.push(fail(`ReAct parsing: ${react.toolCall} (${react.score})${dialectTag}`));
|
|
1321
1356
|
if (react.response) {
|
|
1322
1357
|
lines.push(info(`Response: ${sanitizeForReport(react.response)}`));
|
|
1323
1358
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vtstech/pi-model-test",
|
|
3
|
-
"version": "1.0.8",
|
|
3
|
+
"version": "1.1.0",
|
|
4
4
|
"description": "Model benchmark/testing extension for Pi Coding Agent",
|
|
5
5
|
"main": "model-test.js",
|
|
6
6
|
"keywords": ["pi-extensions"],
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
"url": "https://github.com/VTSTech/pi-coding-agent"
|
|
15
15
|
},
|
|
16
16
|
"dependencies": {
|
|
17
|
-
"@vtstech/pi-shared": "1.0
|
|
17
|
+
"@vtstech/pi-shared": "1.1.0"
|
|
18
18
|
},
|
|
19
19
|
"peerDependencies": {
|
|
20
20
|
"@mariozechner/pi-coding-agent": ">=0.66"
|