@vtstech/pi-model-test 1.0.9 → 1.1.1

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +3 -2
  2. package/model-test.js +234 -339
  3. package/package.json +2 -2
package/README.md CHANGED
@@ -43,12 +43,13 @@ pi install "npm:@vtstech/pi-model-test"
43
43
  ## Features
44
44
 
45
45
  - Auto-detects Ollama vs cloud provider (OpenRouter, Anthropic, Google, OpenAI, Groq, DeepSeek, Mistral, xAI, Together, Fireworks, Cohere)
46
- - Automatic remote Ollama URL resolution
46
+ - Uses native `fetch()` for all HTTP communication (no shell subprocess or curl dependency)
47
+ - Automatic remote Ollama URL resolution (reads from `models.json` on every call — picks up config changes immediately)
47
48
  - Timeout resilience with auto-retry on empty responses
48
49
  - Rate limit delay between tests (configurable)
49
50
  - Thinking model fallback (retries with `think: true`)
50
51
  - Tool support cache (`~/.pi/agent/cache/tool_support.json`)
51
- - JSON repair for truncated output
52
+ - JSON repair for truncated output (stack-based nesting-aware parser)
52
53
  - Tab-completion for model names
53
54
 
54
55
  ## Links
package/model-test.js CHANGED
@@ -12,59 +12,17 @@ import {
12
12
  truncate,
13
13
  sanitizeForReport
14
14
  } from "@vtstech/pi-shared/format";
15
- import { getOllamaBaseUrl, detectModelFamily, readModelsJson, BUILTIN_PROVIDERS, fetchModelContextLength } from "@vtstech/pi-shared/ollama";
16
- function detectProvider(ctx) {
17
- const model = ctx.model;
18
- if (!model) return { kind: "unknown", name: "none" };
19
- const providerName = model.provider || "";
20
- if (!providerName) return { kind: "unknown", name: "none" };
21
- const modelsJson = readModelsJson();
22
- const userProviderCfg = (modelsJson.providers || {})[providerName];
23
- if (userProviderCfg) {
24
- const baseUrl = userProviderCfg.baseUrl || "";
25
- const apiMode = userProviderCfg.api || "";
26
- const apiKey = userProviderCfg.apiKey || "";
27
- const isOllama = /ollama/i.test(providerName) || /localhost:\d+/.test(baseUrl) || /127\.0\.0\.1:\d+/.test(baseUrl) || /0\.0\.0\.0:\d+/.test(baseUrl) || /\/api\/chat/.test(baseUrl) || apiMode === "ollama";
28
- if (isOllama) {
29
- return { kind: "ollama", name: providerName, apiMode: "ollama", baseUrl, apiKey };
30
- }
31
- if (/\/api\/chat/.test(baseUrl)) {
32
- return { kind: "ollama", name: providerName, apiMode: "ollama", baseUrl, apiKey };
33
- }
34
- return {
35
- kind: "builtin",
36
- name: providerName,
37
- apiMode: apiMode || userProviderCfg.api || "openai-completions",
38
- baseUrl,
39
- apiKey
40
- };
41
- }
42
- const builtin = BUILTIN_PROVIDERS[providerName];
43
- if (builtin) {
44
- const apiKey = process.env[builtin.envKey] || "";
45
- return {
46
- kind: "builtin",
47
- name: providerName,
48
- apiMode: builtin.api,
49
- baseUrl: builtin.baseUrl,
50
- envKey: builtin.envKey,
51
- apiKey
52
- };
53
- }
54
- return { kind: "unknown", name: providerName };
55
- }
15
+ import { getOllamaBaseUrl, detectModelFamily, readModelsJson, writeModelsJson, fetchModelContextLength, EXTENSION_VERSION, detectProvider } from "@vtstech/pi-shared/ollama";
56
16
  var CONFIG = {
57
17
  // General API settings
58
18
  DEFAULT_TIMEOUT_MS: 999999,
59
- // 8.3 minutes - default timeout for model responses
19
+ // ~16.7 minutes effectively unlimited for slow models
60
20
  CONNECT_TIMEOUT_S: 60,
61
- // 30 seconds to establish connection
21
+ // 60 seconds to establish connection
62
22
  MAX_RETRIES: 1,
63
23
  // Single retry for transient failures
64
24
  RETRY_DELAY_MS: 1e4,
65
- // 2 seconds between retries
66
- EXEC_BUFFER_MS: 8e3,
67
- // Extra buffer for exec timeout over curl timeout
25
+ // 10 seconds between retries
68
26
  // Model generation settings
69
27
  NUM_PREDICT: 1024,
70
28
  // Max tokens in response
@@ -74,31 +32,26 @@ var CONFIG = {
74
32
  MIN_THINKING_LENGTH: 10,
75
33
  // Minimum chars to consider thinking tokens valid
76
34
  TOOL_TEST_TIMEOUT_MS: 999999,
77
- // 90 seconds for tool usage tests
78
- TOOL_TEST_MAX_TIME_S: 999999,
79
- // Max curl time for tool tests (effectively unlimited)
35
+ // Effectively unlimited for slow tool usage tests
80
36
  TOOL_SUPPORT_TIMEOUT_MS: 999999,
81
- // 2+ minutes for tool support detection
82
- TOOL_SUPPORT_MAX_TIME_S: 999999,
83
- // Max curl time for tool support detection
37
+ // Effectively unlimited for tool support detection
84
38
  // Metadata retrieval
85
39
  TAGS_TIMEOUT_MS: 15e3,
86
40
  // 15 seconds for /api/tags
87
- TAGS_CONNECT_TIMEOUT_S: 30,
88
- // 10 seconds connection timeout for tags
89
41
  MODEL_INFO_TIMEOUT_MS: 3e4,
90
- // 10 seconds for model info lookup
42
+ // 30 seconds for model info lookup
91
43
  // Provider API settings
92
44
  PROVIDER_TIMEOUT_MS: 999999,
93
- // 2 minutes for cloud provider API calls
45
+ // Effectively unlimited for cloud provider API calls
94
46
  PROVIDER_TOOL_TIMEOUT_MS: 12e4,
95
- // 60 seconds for tool usage tests on providers
47
+ // 120 seconds for tool usage tests on providers
96
48
  // Rate limiting
97
49
  TEST_DELAY_MS: 1e4
98
- // 30 seconds between tests to avoid rate limiting
50
+ // 10 seconds between tests to avoid rate limiting
99
51
  };
100
52
  var TOOL_SUPPORT_CACHE_DIR = path.join(os.homedir(), ".pi", "agent", "cache");
101
53
  var TOOL_SUPPORT_CACHE_PATH = path.join(TOOL_SUPPORT_CACHE_DIR, "tool_support.json");
54
+ var _toolSupportCacheInMemory = null;
102
55
  function readToolSupportCache() {
103
56
  try {
104
57
  if (fs.existsSync(TOOL_SUPPORT_CACHE_PATH)) {
@@ -116,69 +69,138 @@ function writeToolSupportCache(cache) {
116
69
  fs.writeFileSync(TOOL_SUPPORT_CACHE_PATH, JSON.stringify(cache, null, 2) + "\n", "utf-8");
117
70
  }
118
71
  function getCachedToolSupport(model) {
119
- const cache = readToolSupportCache();
72
+ const cache = _toolSupportCacheInMemory || readToolSupportCache();
73
+ if (!_toolSupportCacheInMemory) _toolSupportCacheInMemory = cache;
120
74
  const entry = cache[model];
121
75
  if (!entry) return null;
122
76
  if (!entry.support || !["native", "react", "none"].includes(entry.support)) return null;
123
77
  return entry;
124
78
  }
125
79
  function cacheToolSupport(model, support, family) {
126
- const cache = readToolSupportCache();
80
+ const cache = _toolSupportCacheInMemory || readToolSupportCache();
127
81
  cache[model] = {
128
82
  support,
129
83
  testedAt: (/* @__PURE__ */ new Date()).toISOString(),
130
84
  family
131
85
  };
86
+ _toolSupportCacheInMemory = cache;
132
87
  writeToolSupportCache(cache);
133
88
  }
134
89
  function model_test_temp_default(pi) {
135
- const OLLAMA_BASE = getOllamaBaseUrl();
90
+ function ollamaBase() {
91
+ return getOllamaBaseUrl();
92
+ }
136
93
  async function rateLimitDelay(lines) {
137
94
  if (CONFIG.TEST_DELAY_MS > 0) {
138
95
  lines.push(info(`Waiting ${msHuman(CONFIG.TEST_DELAY_MS)} to avoid rate limiting...`));
139
96
  await new Promise((r) => setTimeout(r, CONFIG.TEST_DELAY_MS));
140
97
  }
141
98
  }
99
+ function scoreReasoning(msg) {
100
+ const allNumbers = msg.match(/\b(\d+)\b/g) || [];
101
+ const answer = allNumbers.length > 0 ? allNumbers[allNumbers.length - 1] : "?";
102
+ const isCorrect = answer === "8";
103
+ const reasoningPatterns = [
104
+ "because",
105
+ "therefore",
106
+ "since",
107
+ "step",
108
+ "subtract",
109
+ "minus",
110
+ "each day",
111
+ "each night",
112
+ "slides",
113
+ "climbs",
114
+ "night",
115
+ "reaches",
116
+ "finally",
117
+ "last day"
118
+ ];
119
+ const hasReasoningWords = reasoningPatterns.some((w) => msg.toLowerCase().includes(w));
120
+ const hasNumberedSteps = /^\s*\d+\.\s/m.test(msg);
121
+ const hasReasoning = hasReasoningWords || hasNumberedSteps;
122
+ if (isCorrect && hasReasoning) return { score: "STRONG", pass: true };
123
+ if (isCorrect) return { score: "MODERATE", pass: true };
124
+ if (hasReasoning) return { score: "WEAK", pass: false };
125
+ return { score: "FAIL", pass: false };
126
+ }
127
+ function scoreNativeToolCall(fnName, args) {
128
+ const hasCorrectTool = fnName === "get_weather";
129
+ const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
130
+ const unitValid = args.unit === void 0 || typeof args.unit === "string" && ["celsius", "fahrenheit"].includes(args.unit.toLowerCase());
131
+ if (hasCorrectTool && hasLocation && unitValid) return { score: "STRONG", pass: true };
132
+ if (hasCorrectTool && hasLocation) return { score: "MODERATE", pass: true };
133
+ return { score: "WEAK", pass: false };
134
+ }
135
+ function scoreTextToolCall(fnName, args) {
136
+ const isWeatherTool = fnName === "get_weather";
137
+ const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
138
+ if (isWeatherTool && hasLocation) return { score: "STRONG", pass: true };
139
+ if (isWeatherTool) return { score: "MODERATE", pass: true };
140
+ return { score: "WEAK", pass: false };
141
+ }
142
+ function parseTextToolCall(content) {
143
+ const firstBrace = content.indexOf("{");
144
+ if (firstBrace === -1) return null;
145
+ const lastBrace = content.lastIndexOf("}");
146
+ if (lastBrace <= firstBrace) return null;
147
+ const jsonCandidate = content.slice(firstBrace, lastBrace + 1);
148
+ let textToolParsed = null;
149
+ try {
150
+ textToolParsed = JSON.parse(jsonCandidate);
151
+ } catch {
152
+ return null;
153
+ }
154
+ if (!textToolParsed || typeof textToolParsed.name !== "string") return null;
155
+ const rawArgs = textToolParsed.arguments || { ...textToolParsed };
156
+ const { name: _, ...fnArgs } = rawArgs;
157
+ return { fnName: textToolParsed.name, args: fnArgs };
158
+ }
142
159
  async function ollamaChat(model, messages, options = {}, timeoutMs = CONFIG.DEFAULT_TIMEOUT_MS, retries = CONFIG.MAX_RETRIES) {
143
160
  const body = { model, messages, stream: false, options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE, ...options } };
161
+ const url = `${ollamaBase()}/api/chat`;
144
162
  for (let attempt = 0; attempt <= retries; attempt++) {
145
163
  const start = Date.now();
164
+ const controller = new AbortController();
165
+ const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
146
166
  try {
147
- const result = await pi.exec("curl", [
148
- "-s",
149
- "--fail-with-body",
150
- "-X",
151
- "POST",
152
- "--connect-timeout",
153
- String(CONFIG.CONNECT_TIMEOUT_S),
154
- "--max-time",
155
- String(Math.ceil(timeoutMs / 1e3)),
156
- `${OLLAMA_BASE}/api/chat`,
157
- "-H",
158
- "Content-Type: application/json",
159
- "-d",
160
- JSON.stringify(body)
161
- ], { timeout: timeoutMs + CONFIG.EXEC_BUFFER_MS });
167
+ const res = await fetch(url, {
168
+ method: "POST",
169
+ headers: { "Content-Type": "application/json" },
170
+ body: JSON.stringify(body),
171
+ signal: controller.signal
172
+ });
162
173
  const elapsedMs = Date.now() - start;
163
- if (result.code !== 0) {
164
- const detail = result.stderr?.trim() || result.stdout?.trim() || "unknown error";
165
- throw new Error(`curl exited ${result.code}: ${detail}`);
174
+ if (!res.ok) {
175
+ const errorText = await res.text().catch(() => "unknown error");
176
+ throw new Error(`Ollama API returned ${res.status}: ${truncate(errorText, 200)}`);
166
177
  }
167
- if (!result.stdout.trim()) {
178
+ const text = await res.text();
179
+ if (!text.trim()) {
168
180
  if (attempt < retries) {
169
181
  await new Promise((r) => setTimeout(r, CONFIG.RETRY_DELAY_MS));
170
182
  continue;
171
183
  }
172
184
  throw new Error(`Empty response from Ollama after ${attempt + 1} attempt(s)`);
173
185
  }
174
- const parsed = JSON.parse(result.stdout);
186
+ const parsed = JSON.parse(text);
175
187
  return { response: parsed, elapsedMs };
176
188
  } catch (e) {
177
- if (attempt < retries && (e.message.includes("Empty response") || e.message.includes("timed out") || e.message.includes("curl exited 22") || e.message.includes("curl exited 28") || e.message.includes("curl exited 35") || e.message.includes("curl exited 52"))) {
189
+ const msg = e instanceof Error ? e.message : String(e);
190
+ if (e instanceof Error && e.name === "AbortError") {
191
+ if (attempt < retries) {
192
+ await new Promise((r) => setTimeout(r, CONFIG.RETRY_DELAY_MS));
193
+ continue;
194
+ }
195
+ throw new Error(`Ollama API timed out after ${msHuman(timeoutMs)}`);
196
+ }
197
+ if (attempt < retries && (msg.includes("Empty response") || msg.includes("ECONNREFUSED") || msg.includes("ECONNRESET") || msg.includes("fetch failed"))) {
178
198
  await new Promise((r) => setTimeout(r, CONFIG.RETRY_DELAY_MS));
179
199
  continue;
180
200
  }
181
201
  throw e;
202
+ } finally {
203
+ clearTimeout(timeoutId);
182
204
  }
183
205
  }
184
206
  throw new Error("Unreachable");
@@ -247,10 +269,8 @@ function model_test_temp_default(pi) {
247
269
  { role: "user", content: "Reply with exactly: PONG" }
248
270
  ], { maxTokens: 10, timeoutMs: 3e4 });
249
271
  const elapsedMs = Date.now() - start;
250
- const content = result.content.trim().toUpperCase();
251
272
  const reachable = true;
252
273
  const authValid = true;
253
- const hasPong = content.includes("PONG");
254
274
  return {
255
275
  pass: reachable && authValid,
256
276
  reachable,
@@ -259,7 +279,6 @@ function model_test_temp_default(pi) {
259
279
  elapsedMs
260
280
  };
261
281
  } catch (e) {
262
- const start = Date.now();
263
282
  let reachable = false;
264
283
  let authValid = false;
265
284
  const msg = e.message || "";
@@ -290,7 +309,6 @@ function model_test_temp_default(pi) {
290
309
  const prompt = `A snail climbs 3 feet up a wall each day, but slides back 2 feet each night. The wall is 10 feet tall. How many days does it take the snail to reach the top? Think step by step and give the final answer on its own line like: ANSWER: <number>`;
291
310
  try {
292
311
  let response, elapsedMs;
293
- let usedThinkingFallback = false;
294
312
  try {
295
313
  const result = await ollamaChat(model, [
296
314
  { role: "user", content: prompt }
@@ -309,7 +327,6 @@ function model_test_temp_default(pi) {
309
327
  ], { think: true });
310
328
  response = retry.response;
311
329
  elapsedMs = retry.elapsedMs;
312
- usedThinkingFallback = true;
313
330
  } else {
314
331
  throw firstErr;
315
332
  }
@@ -322,41 +339,7 @@ function model_test_temp_default(pi) {
322
339
  }
323
340
  const allNumbers = effectiveMsg.match(/\b(\d+)\b/g) || [];
324
341
  const answer = allNumbers.length > 0 ? allNumbers[allNumbers.length - 1] : "?";
325
- const isCorrect = answer === "8";
326
- const reasoningPatterns = [
327
- "because",
328
- "therefore",
329
- "since",
330
- "step",
331
- "subtract",
332
- "minus",
333
- "each day",
334
- "each night",
335
- "slides",
336
- "climbs",
337
- "night",
338
- "reaches",
339
- "finally",
340
- "last day"
341
- ];
342
- const hasReasoningWords = reasoningPatterns.some((w) => effectiveMsg.toLowerCase().includes(w));
343
- const hasNumberedSteps = /^\s*\d+\.\s/m.test(effectiveMsg);
344
- const hasReasoning = hasReasoningWords || hasNumberedSteps;
345
- let score;
346
- let pass;
347
- if (isCorrect && hasReasoning) {
348
- score = "STRONG";
349
- pass = true;
350
- } else if (isCorrect) {
351
- score = "MODERATE";
352
- pass = true;
353
- } else if (hasReasoning) {
354
- score = "WEAK";
355
- pass = false;
356
- } else {
357
- score = "FAIL";
358
- pass = false;
359
- }
342
+ const { score, pass } = scoreReasoning(effectiveMsg);
360
343
  const displayMsg = msg.trim().length > 0 ? effectiveMsg : `[thinking tokens] ${effectiveMsg}`;
361
344
  return { pass, score, reasoning: displayMsg, answer, elapsedMs };
362
345
  } catch (e) {
@@ -375,41 +358,7 @@ function model_test_temp_default(pi) {
375
358
  }
376
359
  const allNumbers = msg.match(/\b(\d+)\b/g) || [];
377
360
  const answer = allNumbers.length > 0 ? allNumbers[allNumbers.length - 1] : "?";
378
- const isCorrect = answer === "8";
379
- const reasoningPatterns = [
380
- "because",
381
- "therefore",
382
- "since",
383
- "step",
384
- "subtract",
385
- "minus",
386
- "each day",
387
- "each night",
388
- "slides",
389
- "climbs",
390
- "night",
391
- "reaches",
392
- "finally",
393
- "last day"
394
- ];
395
- const hasReasoningWords = reasoningPatterns.some((w) => msg.toLowerCase().includes(w));
396
- const hasNumberedSteps = /^\s*\d+\.\s/m.test(msg);
397
- const hasReasoning = hasReasoningWords || hasNumberedSteps;
398
- let score;
399
- let pass;
400
- if (isCorrect && hasReasoning) {
401
- score = "STRONG";
402
- pass = true;
403
- } else if (isCorrect) {
404
- score = "MODERATE";
405
- pass = true;
406
- } else if (hasReasoning) {
407
- score = "WEAK";
408
- pass = false;
409
- } else {
410
- score = "FAIL";
411
- pass = false;
412
- }
361
+ const { score, pass } = scoreReasoning(msg);
413
362
  return { pass, score, reasoning: msg, answer, elapsedMs: result.elapsedMs };
414
363
  } catch (e) {
415
364
  return { pass: false, score: "ERROR", reasoning: e.message, answer: "?", elapsedMs: 0 };
@@ -465,29 +414,24 @@ function model_test_temp_default(pi) {
465
414
  options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE }
466
415
  };
467
416
  try {
417
+ const controller = new AbortController();
418
+ const timeoutId = setTimeout(() => controller.abort(), CONFIG.TOOL_TEST_TIMEOUT_MS);
468
419
  const start = Date.now();
469
- const result = await pi.exec("curl", [
470
- "-s",
471
- "--fail-with-body",
472
- "-X",
473
- "POST",
474
- "--connect-timeout",
475
- String(CONFIG.CONNECT_TIMEOUT_S),
476
- "--max-time",
477
- String(CONFIG.TOOL_TEST_MAX_TIME_S),
478
- `${OLLAMA_BASE}/api/chat`,
479
- "-H",
480
- "Content-Type: application/json",
481
- "-d",
482
- JSON.stringify(body)
483
- ], { timeout: CONFIG.TOOL_TEST_TIMEOUT_MS });
420
+ const res = await fetch(`${ollamaBase()}/api/chat`, {
421
+ method: "POST",
422
+ headers: { "Content-Type": "application/json" },
423
+ body: JSON.stringify(body),
424
+ signal: controller.signal
425
+ });
484
426
  const elapsedMs = Date.now() - start;
485
- if (result.code !== 0) {
486
- const detail = result.stderr?.trim() || result.stdout?.trim() || "unknown error";
487
- return { pass: false, score: "ERROR", hasToolCalls: false, toolCall: `curl error: ${result.code}: ${detail}`, response: "", elapsedMs };
427
+ clearTimeout(timeoutId);
428
+ if (!res.ok) {
429
+ const errorText = await res.text().catch(() => "unknown error");
430
+ return { pass: false, score: "ERROR", hasToolCalls: false, toolCall: `fetch error: ${res.status}`, response: "", elapsedMs };
488
431
  }
489
- if (!result.stdout.trim()) throw new Error("Empty response from Ollama");
490
- const parsed = JSON.parse(result.stdout);
432
+ const text = await res.text();
433
+ if (!text.trim()) throw new Error("Empty response from Ollama");
434
+ const parsed = JSON.parse(text);
491
435
  const toolCalls = parsed?.message?.tool_calls;
492
436
  const content = parsed?.message?.content || "";
493
437
  if (toolCalls && toolCalls.length > 0) {
@@ -506,20 +450,7 @@ function model_test_temp_default(pi) {
506
450
  elapsedMs
507
451
  };
508
452
  }
509
- const hasCorrectTool = fn.name === "get_weather";
510
- const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
511
- const unitValid = args.unit === void 0 || typeof args.unit === "string" && ["celsius", "fahrenheit"].includes(args.unit.toLowerCase());
512
- let score;
513
- if (hasCorrectTool && hasLocation && unitValid) {
514
- score = "STRONG";
515
- } else if (hasCorrectTool && hasLocation) {
516
- score = "MODERATE";
517
- } else if (hasCorrectTool) {
518
- score = "WEAK";
519
- } else {
520
- score = "WEAK";
521
- }
522
- const pass = score !== "WEAK";
453
+ const { score, pass } = scoreNativeToolCall(fn.name || "", args);
523
454
  return {
524
455
  pass,
525
456
  score,
@@ -529,38 +460,14 @@ function model_test_temp_default(pi) {
529
460
  elapsedMs
530
461
  };
531
462
  }
532
- const firstBrace = content.indexOf("{");
533
- let textToolParsed = null;
534
- if (firstBrace !== -1) {
535
- const lastBrace = content.lastIndexOf("}");
536
- if (lastBrace > firstBrace) {
537
- const jsonCandidate = content.slice(firstBrace, lastBrace + 1);
538
- try {
539
- textToolParsed = JSON.parse(jsonCandidate);
540
- } catch {
541
- }
542
- }
543
- }
544
- if (textToolParsed && typeof textToolParsed.name === "string") {
545
- const fnName = textToolParsed.name;
546
- const rawArgs = textToolParsed.arguments || { ...textToolParsed };
547
- const { name: _, ...fnArgs } = rawArgs;
548
- const isWeatherTool = fnName === "get_weather";
549
- const hasLocation = typeof fnArgs.location === "string" && fnArgs.location.toLowerCase().includes("paris");
550
- let score;
551
- if (isWeatherTool && hasLocation) {
552
- score = "STRONG";
553
- } else if (isWeatherTool) {
554
- score = "MODERATE";
555
- } else {
556
- score = "WEAK";
557
- }
558
- const pass = score !== "WEAK";
463
+ const textParsed = parseTextToolCall(content);
464
+ if (textParsed) {
465
+ const { score, pass } = scoreTextToolCall(textParsed.fnName, textParsed.args);
559
466
  return {
560
467
  pass,
561
468
  score,
562
469
  hasToolCalls: true,
563
- toolCall: `${fnName}(${JSON.stringify(fnArgs)})`,
470
+ toolCall: `${textParsed.fnName}(${JSON.stringify(textParsed.args)})`,
564
471
  response: content,
565
472
  elapsedMs
566
473
  };
@@ -622,20 +529,7 @@ function model_test_temp_default(pi) {
622
529
  elapsedMs: result.elapsedMs
623
530
  };
624
531
  }
625
- const hasCorrectTool = fn.name === "get_weather";
626
- const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
627
- const unitValid = args.unit === void 0 || typeof args.unit === "string" && ["celsius", "fahrenheit"].includes(args.unit.toLowerCase());
628
- let score;
629
- if (hasCorrectTool && hasLocation && unitValid) {
630
- score = "STRONG";
631
- } else if (hasCorrectTool && hasLocation) {
632
- score = "MODERATE";
633
- } else if (hasCorrectTool) {
634
- score = "WEAK";
635
- } else {
636
- score = "WEAK";
637
- }
638
- const pass = score !== "WEAK";
532
+ const { score, pass } = scoreNativeToolCall(fn.name || "", args);
639
533
  return {
640
534
  pass,
641
535
  score,
@@ -645,38 +539,14 @@ function model_test_temp_default(pi) {
645
539
  elapsedMs: result.elapsedMs
646
540
  };
647
541
  }
648
- const firstBrace = content.indexOf("{");
649
- let textToolParsed = null;
650
- if (firstBrace !== -1) {
651
- const lastBrace = content.lastIndexOf("}");
652
- if (lastBrace > firstBrace) {
653
- const jsonCandidate = content.slice(firstBrace, lastBrace + 1);
654
- try {
655
- textToolParsed = JSON.parse(jsonCandidate);
656
- } catch {
657
- }
658
- }
659
- }
660
- if (textToolParsed && typeof textToolParsed.name === "string") {
661
- const fnName = textToolParsed.name;
662
- const rawArgs = textToolParsed.arguments || { ...textToolParsed };
663
- const { name: _, ...fnArgs } = rawArgs;
664
- const isWeatherTool = fnName === "get_weather";
665
- const hasLocation = typeof fnArgs.location === "string" && fnArgs.location.toLowerCase().includes("paris");
666
- let score;
667
- if (isWeatherTool && hasLocation) {
668
- score = "STRONG";
669
- } else if (isWeatherTool) {
670
- score = "MODERATE";
671
- } else {
672
- score = "WEAK";
673
- }
674
- const pass = score !== "WEAK";
542
+ const textParsed = parseTextToolCall(content);
543
+ if (textParsed) {
544
+ const { score, pass } = scoreTextToolCall(textParsed.fnName, textParsed.args);
675
545
  return {
676
546
  pass,
677
547
  score,
678
548
  hasToolCalls: true,
679
- toolCall: `${fnName}(${JSON.stringify(fnArgs)})`,
549
+ toolCall: `${textParsed.fnName}(${JSON.stringify(textParsed.args)})`,
680
550
  response: content,
681
551
  elapsedMs: result.elapsedMs
682
552
  };
@@ -713,29 +583,24 @@ function model_test_temp_default(pi) {
713
583
  options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE }
714
584
  };
715
585
  try {
586
+ const controller = new AbortController();
587
+ const timeoutId = setTimeout(() => controller.abort(), CONFIG.TOOL_TEST_TIMEOUT_MS);
716
588
  const start = Date.now();
717
- const result = await pi.exec("curl", [
718
- "-s",
719
- "--fail-with-body",
720
- "-X",
721
- "POST",
722
- "--connect-timeout",
723
- String(CONFIG.CONNECT_TIMEOUT_S),
724
- "--max-time",
725
- String(CONFIG.TOOL_TEST_MAX_TIME_S),
726
- `${OLLAMA_BASE}/api/chat`,
727
- "-H",
728
- "Content-Type: application/json",
729
- "-d",
730
- JSON.stringify(body)
731
- ], { timeout: CONFIG.TOOL_TEST_TIMEOUT_MS });
589
+ const res = await fetch(`${ollamaBase()}/api/chat`, {
590
+ method: "POST",
591
+ headers: { "Content-Type": "application/json" },
592
+ body: JSON.stringify(body),
593
+ signal: controller.signal
594
+ });
732
595
  const elapsedMs = Date.now() - start;
733
- if (result.code !== 0) {
734
- const detail = result.stderr?.trim() || result.stdout?.trim() || "unknown error";
735
- return { pass: false, score: "ERROR", toolCall: `curl error: ${result.code}: ${detail}`, thought: "", response: "", elapsedMs };
596
+ clearTimeout(timeoutId);
597
+ if (!res.ok) {
598
+ const errorText = await res.text().catch(() => "unknown error");
599
+ return { pass: false, score: "ERROR", toolCall: `fetch error: ${res.status}`, thought: "", response: "", elapsedMs };
736
600
  }
737
- if (!result.stdout.trim()) throw new Error("Empty response from Ollama");
738
- const parsed = JSON.parse(result.stdout);
601
+ const text = await res.text();
602
+ if (!text.trim()) throw new Error("Empty response from Ollama");
603
+ const parsed = JSON.parse(text);
739
604
  const content = (parsed?.message?.content || "").trim();
740
605
  if (!content) {
741
606
  return { pass: false, score: "FAIL", toolCall: "empty response", thought: "", response: "", elapsedMs };
@@ -744,20 +609,20 @@ function model_test_temp_default(pi) {
744
609
  const sharedParser = pi._reactParser;
745
610
  if (sharedParser?.ALL_DIALECT_PATTERNS) {
746
611
  for (const dp of sharedParser.ALL_DIALECT_PATTERNS) {
747
- const result2 = sharedParser.parseReactWithPatterns(content, dp, true);
748
- if (result2) {
749
- let toolName = result2.name;
612
+ const result = sharedParser.parseReactWithPatterns(content, dp, true);
613
+ if (result) {
614
+ let toolName = result.name;
750
615
  let argsStr;
751
- const rawArgs = result2.args ? JSON.stringify(result2.args) : "";
616
+ const rawArgs = result.args ? JSON.stringify(result.args) : "";
752
617
  if (rawArgs && rawArgs !== "{}") {
753
618
  argsStr = rawArgs;
754
- } else if (result2.raw) {
755
- const jsonStart = result2.raw.indexOf("{");
619
+ } else if (result.raw) {
620
+ const jsonStart = result.raw.indexOf("{");
756
621
  if (jsonStart !== -1) {
757
622
  let depth = 0, jsonEnd = -1;
758
- for (let i = jsonStart; i < result2.raw.length; i++) {
759
- if (result2.raw[i] === "{") depth++;
760
- else if (result2.raw[i] === "}") {
623
+ for (let i = jsonStart; i < result.raw.length; i++) {
624
+ if (result.raw[i] === "{") depth++;
625
+ else if (result.raw[i] === "}") {
761
626
  depth--;
762
627
  if (depth === 0) {
763
628
  jsonEnd = i;
@@ -765,14 +630,14 @@ function model_test_temp_default(pi) {
765
630
  }
766
631
  }
767
632
  }
768
- argsStr = jsonEnd !== -1 ? result2.raw.slice(jsonStart, jsonEnd + 1) : "";
633
+ argsStr = jsonEnd !== -1 ? result.raw.slice(jsonStart, jsonEnd + 1) : "";
769
634
  } else {
770
635
  argsStr = "";
771
636
  }
772
637
  } else {
773
638
  argsStr = "";
774
639
  }
775
- parsedResult = { name: toolName, args: argsStr, thought: result2.thought || "", dialect: result2.dialect };
640
+ parsedResult = { name: toolName, args: argsStr, thought: result.thought || "", dialect: result.dialect };
776
641
  break;
777
642
  }
778
643
  }
@@ -917,12 +782,30 @@ The JSON object must have exactly these 4 keys:
917
782
  parsed = JSON.parse(cleaned);
918
783
  } catch {
919
784
  const cleaned = msg.replace(/```json?\s*/gi, "").replace(/```/g, "").trim();
920
- const openBraces = (cleaned.match(/\{/g) || []).length;
921
- const closeBraces = (cleaned.match(/\}/g) || []).length;
922
- const openBrackets = (cleaned.match(/\[/g) || []).length;
923
- const closeBrackets = (cleaned.match(/\]/g) || []).length;
924
- if (openBraces > closeBraces || openBrackets > closeBrackets) {
925
- const repaired = cleaned + "}".repeat(Math.max(0, openBraces - closeBraces)) + "]".repeat(Math.max(0, openBrackets - closeBrackets));
785
+ let braceDepth = 0, bracketDepth = 0;
786
+ let inString = false, escapeNext = false;
787
+ for (let i = 0; i < cleaned.length; i++) {
788
+ const c = cleaned[i];
789
+ if (escapeNext) {
790
+ escapeNext = false;
791
+ continue;
792
+ }
793
+ if (c === "\\") {
794
+ if (inString) escapeNext = true;
795
+ continue;
796
+ }
797
+ if (c === '"') {
798
+ inString = !inString;
799
+ continue;
800
+ }
801
+ if (inString) continue;
802
+ if (c === "{") braceDepth++;
803
+ else if (c === "}") braceDepth = Math.max(0, braceDepth - 1);
804
+ else if (c === "[") bracketDepth++;
805
+ else if (c === "]") bracketDepth = Math.max(0, bracketDepth - 1);
806
+ }
807
+ if (braceDepth > 0 || bracketDepth > 0) {
808
+ const repaired = cleaned + "}".repeat(braceDepth) + "]".repeat(bracketDepth);
926
809
  try {
927
810
  parsed = JSON.parse(repaired);
928
811
  repairNote = " (repaired truncated JSON)";
@@ -976,12 +859,30 @@ The JSON object must have exactly these 4 keys:
976
859
  parsed = JSON.parse(cleaned);
977
860
  } catch {
978
861
  const cleaned = msg.replace(/```json?\s*/gi, "").replace(/```/g, "").trim();
979
- const openBraces = (cleaned.match(/\{/g) || []).length;
980
- const closeBraces = (cleaned.match(/\}/g) || []).length;
981
- const openBrackets = (cleaned.match(/\[/g) || []).length;
982
- const closeBrackets = (cleaned.match(/\]/g) || []).length;
983
- if (openBraces > closeBraces || openBrackets > closeBrackets) {
984
- const repaired = cleaned + "}".repeat(Math.max(0, openBraces - closeBraces)) + "]".repeat(Math.max(0, openBrackets - closeBrackets));
862
+ let braceDepth = 0, bracketDepth = 0;
863
+ let inString = false, escapeNext = false;
864
+ for (let i = 0; i < cleaned.length; i++) {
865
+ const c = cleaned[i];
866
+ if (escapeNext) {
867
+ escapeNext = false;
868
+ continue;
869
+ }
870
+ if (c === "\\") {
871
+ if (inString) escapeNext = true;
872
+ continue;
873
+ }
874
+ if (c === '"') {
875
+ inString = !inString;
876
+ continue;
877
+ }
878
+ if (inString) continue;
879
+ if (c === "{") braceDepth++;
880
+ else if (c === "}") braceDepth = Math.max(0, braceDepth - 1);
881
+ else if (c === "[") bracketDepth++;
882
+ else if (c === "]") bracketDepth = Math.max(0, bracketDepth - 1);
883
+ }
884
+ if (braceDepth > 0 || bracketDepth > 0) {
885
+ const repaired = cleaned + "}".repeat(braceDepth) + "]".repeat(bracketDepth);
985
886
  try {
986
887
  parsed = JSON.parse(repaired);
987
888
  repairNote = " (repaired truncated JSON)";
@@ -1057,29 +958,29 @@ The JSON object must have exactly these 4 keys:
1057
958
  };
1058
959
  try {
1059
960
  const start = Date.now();
1060
- const result = await pi.exec("curl", [
1061
- "-s",
1062
- "--fail-with-body",
1063
- "-X",
1064
- "POST",
1065
- "--connect-timeout",
1066
- "30",
1067
- "--max-time",
1068
- "120",
1069
- `${OLLAMA_BASE}/api/chat`,
1070
- "-H",
1071
- "Content-Type: application/json",
1072
- "-d",
1073
- JSON.stringify(body)
1074
- ], { timeout: 13e4 });
961
+ const controller = new AbortController();
962
+ const timeoutId = setTimeout(() => controller.abort(), 13e4);
963
+ const res = await fetch(`${ollamaBase()}/api/chat`, {
964
+ method: "POST",
965
+ headers: { "Content-Type": "application/json" },
966
+ body: JSON.stringify(body),
967
+ signal: controller.signal
968
+ });
1075
969
  const elapsedMs = Date.now() - start;
1076
- if (result.code !== 0 || !result.stdout.trim()) {
1077
- const detail = result.stderr?.trim() || result.stdout?.trim() || "empty response";
970
+ clearTimeout(timeoutId);
971
+ if (!res.ok) {
972
+ const detail = await res.text().catch(() => "unknown error");
1078
973
  const level2 = "none";
1079
974
  cacheToolSupport(model, level2, family);
1080
- return { level: level2, cached: false, evidence: `API error: ${truncate(detail, 100)}`, elapsedMs };
975
+ return { level: level2, cached: false, evidence: `API error ${res.status}: ${truncate(detail, 100)}`, elapsedMs };
1081
976
  }
1082
- const parsed = JSON.parse(result.stdout);
977
+ const text = await res.text();
978
+ if (!text.trim()) {
979
+ const level2 = "none";
980
+ cacheToolSupport(model, level2, family);
981
+ return { level: level2, cached: false, evidence: "empty response from Ollama", elapsedMs };
982
+ }
983
+ const parsed = JSON.parse(text);
1083
984
  const toolCalls = parsed?.message?.tool_calls;
1084
985
  const content = (parsed?.message?.content || "").trim();
1085
986
  if (toolCalls && Array.isArray(toolCalls) && toolCalls.length > 0) {
@@ -1175,9 +1076,9 @@ The JSON object must have exactly these 4 keys:
1175
1076
  }
1176
1077
  async function getOllamaModels() {
1177
1078
  try {
1178
- const result = await pi.exec("curl", ["-s", "--connect-timeout", "10", `${OLLAMA_BASE}/api/tags`], { timeout: 15e3 });
1179
- if (result.code !== 0 || !result.stdout.trim()) return [];
1180
- const data = JSON.parse(result.stdout);
1079
+ const res = await fetch(`${ollamaBase()}/api/tags`, { signal: AbortSignal.timeout(15e3) });
1080
+ if (!res.ok) return [];
1081
+ const data = await res.json();
1181
1082
  return (data.models || []).map((m) => m.name).filter(Boolean);
1182
1083
  } catch {
1183
1084
  return [];
@@ -1187,14 +1088,8 @@ The JSON object must have exactly these 4 keys:
1187
1088
  return ctx.model?.id;
1188
1089
  }
1189
1090
  function updateModelsJsonReasoning(model, hasReasoning) {
1190
- const agentDir = path.join(os.homedir(), ".pi", "agent");
1191
- const modelsJsonPath = path.join(agentDir, "models.json");
1192
- if (!fs.existsSync(modelsJsonPath)) {
1193
- return { updated: false, message: "models.json not found \u2014 skipped" };
1194
- }
1195
1091
  try {
1196
- const raw = fs.readFileSync(modelsJsonPath, "utf-8");
1197
- const config = JSON.parse(raw);
1092
+ const config = readModelsJson();
1198
1093
  let updated = false;
1199
1094
  for (const provider of Object.values(config.providers || {})) {
1200
1095
  const models = provider.models || [];
@@ -1214,7 +1109,7 @@ The JSON object must have exactly these 4 keys:
1214
1109
  if (!updated) {
1215
1110
  return { updated: false, message: `${model} not found in models.json \u2014 skipped` };
1216
1111
  }
1217
- fs.writeFileSync(modelsJsonPath, JSON.stringify(config, null, 2) + "\n", "utf-8");
1112
+ writeModelsJson(config);
1218
1113
  const action = hasReasoning ? "set reasoning: true" : "set reasoning: false";
1219
1114
  return { updated: true, message: `\u2705 Updated ${model}: ${action}` };
1220
1115
  } catch (e) {
@@ -1222,7 +1117,7 @@ The JSON object must have exactly these 4 keys:
1222
1117
  }
1223
1118
  }
1224
1119
  const branding = [
1225
- ` \u26A1 Pi Model Benchmark v1.0.9`,
1120
+ ` \u26A1 Pi Model Benchmark v${EXTENSION_VERSION}`,
1226
1121
  ` Written by VTSTech`,
1227
1122
  ` GitHub: https://github.com/VTSTech`,
1228
1123
  ` Website: www.vts-tech.org`
@@ -1243,7 +1138,7 @@ The JSON object must have exactly these 4 keys:
1243
1138
  }
1244
1139
  }
1245
1140
  lines.push(info(`API: ${apiMode}`));
1246
- const nativeContext = await fetchModelContextLength(OLLAMA_BASE, model);
1141
+ const nativeContext = await fetchModelContextLength(ollamaBase(), model);
1247
1142
  if (nativeContext !== void 0) {
1248
1143
  const ctxStr = nativeContext >= 1e3 ? `${(nativeContext / 1e3).toFixed(1)}k` : String(nativeContext);
1249
1144
  lines.push(info(`Context: ${ctxStr} tokens (native max)`));
@@ -1254,9 +1149,9 @@ The JSON object must have exactly these 4 keys:
1254
1149
  let modelQuant = "unknown";
1255
1150
  let modelModified = "unknown";
1256
1151
  try {
1257
- const tagsResult = await pi.exec("curl", ["-s", `${OLLAMA_BASE}/api/tags`], { timeout: 1e4 });
1258
- if (tagsResult.code === 0 && tagsResult.stdout.trim()) {
1259
- const tags = JSON.parse(tagsResult.stdout);
1152
+ const tagsRes = await fetch(`${ollamaBase()}/api/tags`, { signal: AbortSignal.timeout(1e4) });
1153
+ if (tagsRes.ok) {
1154
+ const tags = await tagsRes.json();
1260
1155
  const entry = (tags.models || []).find((m) => m.name === model);
1261
1156
  if (entry) {
1262
1157
  const details = entry.details || {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vtstech/pi-model-test",
3
- "version": "1.0.9",
3
+ "version": "1.1.1",
4
4
  "description": "Model benchmark/testing extension for Pi Coding Agent",
5
5
  "main": "model-test.js",
6
6
  "keywords": ["pi-extensions"],
@@ -14,7 +14,7 @@
14
14
  "url": "https://github.com/VTSTech/pi-coding-agent"
15
15
  },
16
16
  "dependencies": {
17
- "@vtstech/pi-shared": "1.0.9"
17
+ "@vtstech/pi-shared": "1.1.1"
18
18
  },
19
19
  "peerDependencies": {
20
20
  "@mariozechner/pi-coding-agent": ">=0.66"