@vtstech/pi-model-test 1.1.1 → 1.1.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (2)
  1. package/model-test.js +110 -544
  2. package/package.json +2 -2
package/model-test.js CHANGED
@@ -1,7 +1,4 @@
1
1
  // .build-npm/model-test/model-test.temp.ts
2
- import * as fs from "node:fs";
3
- import * as os from "node:os";
4
- import * as path from "node:path";
5
2
  import {
6
3
  section,
7
4
  ok,
@@ -13,79 +10,21 @@ import {
13
10
  sanitizeForReport
14
11
  } from "@vtstech/pi-shared/format";
15
12
  import { getOllamaBaseUrl, detectModelFamily, readModelsJson, writeModelsJson, fetchModelContextLength, EXTENSION_VERSION, detectProvider } from "@vtstech/pi-shared/ollama";
16
- var CONFIG = {
17
- // General API settings
18
- DEFAULT_TIMEOUT_MS: 999999,
19
- // ~16.7 minutes — effectively unlimited for slow models
20
- CONNECT_TIMEOUT_S: 60,
21
- // 60 seconds to establish connection
22
- MAX_RETRIES: 1,
23
- // Single retry for transient failures
24
- RETRY_DELAY_MS: 1e4,
25
- // 10 seconds between retries
26
- // Model generation settings
27
- NUM_PREDICT: 1024,
28
- // Max tokens in response
29
- TEMPERATURE: 0.1,
30
- // Low temperature for more deterministic output
31
- // Test-specific settings
32
- MIN_THINKING_LENGTH: 10,
33
- // Minimum chars to consider thinking tokens valid
34
- TOOL_TEST_TIMEOUT_MS: 999999,
35
- // Effectively unlimited for slow tool usage tests
36
- TOOL_SUPPORT_TIMEOUT_MS: 999999,
37
- // Effectively unlimited for tool support detection
38
- // Metadata retrieval
39
- TAGS_TIMEOUT_MS: 15e3,
40
- // 15 seconds for /api/tags
41
- MODEL_INFO_TIMEOUT_MS: 3e4,
42
- // 30 seconds for model info lookup
43
- // Provider API settings
44
- PROVIDER_TIMEOUT_MS: 999999,
45
- // Effectively unlimited for cloud provider API calls
46
- PROVIDER_TOOL_TIMEOUT_MS: 12e4,
47
- // 120 seconds for tool usage tests on providers
48
- // Rate limiting
49
- TEST_DELAY_MS: 1e4
50
- // 10 seconds between tests to avoid rate limiting
51
- };
52
- var TOOL_SUPPORT_CACHE_DIR = path.join(os.homedir(), ".pi", "agent", "cache");
53
- var TOOL_SUPPORT_CACHE_PATH = path.join(TOOL_SUPPORT_CACHE_DIR, "tool_support.json");
54
- var _toolSupportCacheInMemory = null;
55
- function readToolSupportCache() {
56
- try {
57
- if (fs.existsSync(TOOL_SUPPORT_CACHE_PATH)) {
58
- const raw = fs.readFileSync(TOOL_SUPPORT_CACHE_PATH, "utf-8");
59
- return JSON.parse(raw);
60
- }
61
- } catch {
62
- }
63
- return {};
64
- }
65
- function writeToolSupportCache(cache) {
66
- if (!fs.existsSync(TOOL_SUPPORT_CACHE_DIR)) {
67
- fs.mkdirSync(TOOL_SUPPORT_CACHE_DIR, { recursive: true });
68
- }
69
- fs.writeFileSync(TOOL_SUPPORT_CACHE_PATH, JSON.stringify(cache, null, 2) + "\n", "utf-8");
70
- }
71
- function getCachedToolSupport(model) {
72
- const cache = _toolSupportCacheInMemory || readToolSupportCache();
73
- if (!_toolSupportCacheInMemory) _toolSupportCacheInMemory = cache;
74
- const entry = cache[model];
75
- if (!entry) return null;
76
- if (!entry.support || !["native", "react", "none"].includes(entry.support)) return null;
77
- return entry;
78
- }
79
- function cacheToolSupport(model, support, family) {
80
- const cache = _toolSupportCacheInMemory || readToolSupportCache();
81
- cache[model] = {
82
- support,
83
- testedAt: (/* @__PURE__ */ new Date()).toISOString(),
84
- family
85
- };
86
- _toolSupportCacheInMemory = cache;
87
- writeToolSupportCache(cache);
88
- }
13
+ import {
14
+ ALL_DIALECT_PATTERNS,
15
+ parseReactWithPatterns
16
+ } from "@vtstech/pi-shared/react-parser";
17
+ import {
18
+ CONFIG,
19
+ WEATHER_TOOL_DEFINITION,
20
+ scoreReasoning,
21
+ getCachedToolSupport,
22
+ cacheToolSupport,
23
+ testToolUsageUnified,
24
+ testReasoningUnified,
25
+ testInstructionFollowingUnified,
26
+ TOOL_SUPPORT_CACHE_PATH
27
+ } from "@vtstech/pi-shared/model-test-utils";
89
28
  function model_test_temp_default(pi) {
90
29
  function ollamaBase() {
91
30
  return getOllamaBaseUrl();
@@ -96,65 +35,75 @@ function model_test_temp_default(pi) {
96
35
  await new Promise((r) => setTimeout(r, CONFIG.TEST_DELAY_MS));
97
36
  }
98
37
  }
99
- function scoreReasoning(msg) {
100
- const allNumbers = msg.match(/\b(\d+)\b/g) || [];
101
- const answer = allNumbers.length > 0 ? allNumbers[allNumbers.length - 1] : "?";
102
- const isCorrect = answer === "8";
103
- const reasoningPatterns = [
104
- "because",
105
- "therefore",
106
- "since",
107
- "step",
108
- "subtract",
109
- "minus",
110
- "each day",
111
- "each night",
112
- "slides",
113
- "climbs",
114
- "night",
115
- "reaches",
116
- "finally",
117
- "last day"
118
- ];
119
- const hasReasoningWords = reasoningPatterns.some((w) => msg.toLowerCase().includes(w));
120
- const hasNumberedSteps = /^\s*\d+\.\s/m.test(msg);
121
- const hasReasoning = hasReasoningWords || hasNumberedSteps;
122
- if (isCorrect && hasReasoning) return { score: "STRONG", pass: true };
123
- if (isCorrect) return { score: "MODERATE", pass: true };
124
- if (hasReasoning) return { score: "WEAK", pass: false };
125
- return { score: "FAIL", pass: false };
126
- }
127
- function scoreNativeToolCall(fnName, args) {
128
- const hasCorrectTool = fnName === "get_weather";
129
- const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
130
- const unitValid = args.unit === void 0 || typeof args.unit === "string" && ["celsius", "fahrenheit"].includes(args.unit.toLowerCase());
131
- if (hasCorrectTool && hasLocation && unitValid) return { score: "STRONG", pass: true };
132
- if (hasCorrectTool && hasLocation) return { score: "MODERATE", pass: true };
133
- return { score: "WEAK", pass: false };
38
+ function makeOllamaChatFn() {
39
+ return async (model, messages, _options) => {
40
+ const result = await ollamaChat(model, messages);
41
+ return {
42
+ content: result.response?.message?.content || "",
43
+ elapsedMs: result.elapsedMs,
44
+ raw: result.response
45
+ };
46
+ };
134
47
  }
135
- function scoreTextToolCall(fnName, args) {
136
- const isWeatherTool = fnName === "get_weather";
137
- const hasLocation = typeof args.location === "string" && args.location.toLowerCase().includes("paris");
138
- if (isWeatherTool && hasLocation) return { score: "STRONG", pass: true };
139
- if (isWeatherTool) return { score: "MODERATE", pass: true };
140
- return { score: "WEAK", pass: false };
48
+ function makeOllamaToolChatFn() {
49
+ return async (model, messages, options) => {
50
+ const tools = options?.tools || void 0;
51
+ const body = {
52
+ model,
53
+ messages,
54
+ stream: false,
55
+ options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE }
56
+ };
57
+ if (tools && tools.length > 0) {
58
+ body.tools = tools;
59
+ }
60
+ const controller = new AbortController();
61
+ const timeoutId = setTimeout(() => controller.abort(), CONFIG.TOOL_TEST_TIMEOUT_MS);
62
+ const start = Date.now();
63
+ try {
64
+ const res = await fetch(`${ollamaBase()}/api/chat`, {
65
+ method: "POST",
66
+ headers: { "Content-Type": "application/json" },
67
+ body: JSON.stringify(body),
68
+ signal: controller.signal
69
+ });
70
+ const elapsedMs = Date.now() - start;
71
+ clearTimeout(timeoutId);
72
+ if (!res.ok) {
73
+ const errorText = await res.text().catch(() => "unknown error");
74
+ throw new Error(`Ollama API returned ${res.status}: ${truncate(errorText, 200)}`);
75
+ }
76
+ const text = await res.text();
77
+ if (!text.trim()) throw new Error("Empty response from Ollama");
78
+ const parsed = JSON.parse(text);
79
+ const toolCalls = parsed?.message?.tool_calls;
80
+ const content = parsed?.message?.content || "";
81
+ return {
82
+ content,
83
+ toolCalls: toolCalls && toolCalls.length > 0 ? toolCalls : void 0,
84
+ elapsedMs,
85
+ raw: parsed
86
+ };
87
+ } catch (e) {
88
+ clearTimeout(timeoutId);
89
+ throw e;
90
+ }
91
+ };
141
92
  }
142
- function parseTextToolCall(content) {
143
- const firstBrace = content.indexOf("{");
144
- if (firstBrace === -1) return null;
145
- const lastBrace = content.lastIndexOf("}");
146
- if (lastBrace <= firstBrace) return null;
147
- const jsonCandidate = content.slice(firstBrace, lastBrace + 1);
148
- let textToolParsed = null;
149
- try {
150
- textToolParsed = JSON.parse(jsonCandidate);
151
- } catch {
152
- return null;
153
- }
154
- if (!textToolParsed || typeof textToolParsed.name !== "string") return null;
155
- const rawArgs = textToolParsed.arguments || { ...textToolParsed };
156
- const { name: _, ...fnArgs } = rawArgs;
157
- return { fnName: textToolParsed.name, args: fnArgs };
93
+ function makeProviderChatFn(providerInfo) {
94
+ return async (model, messages, options) => {
95
+ const result = await providerChat(providerInfo, model, messages, {
96
+ maxTokens: CONFIG.NUM_PREDICT,
97
+ tools: options?.tools,
98
+ timeoutMs: CONFIG.PROVIDER_TOOL_TIMEOUT_MS
99
+ });
100
+ return {
101
+ content: result.content,
102
+ toolCalls: result.toolCalls,
103
+ elapsedMs: result.elapsedMs,
104
+ raw: void 0
105
+ };
106
+ };
158
107
  }
159
108
  async function ollamaChat(model, messages, options = {}, timeoutMs = CONFIG.DEFAULT_TIMEOUT_MS, retries = CONFIG.MAX_RETRIES) {
160
109
  const body = { model, messages, stream: false, options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE, ...options } };
@@ -347,22 +296,7 @@ function model_test_temp_default(pi) {
347
296
  }
348
297
  }
349
298
  async function testReasoningProvider(providerInfo, model) {
350
- const prompt = `A snail climbs 3 feet up a wall each day, but slides back 2 feet each night. The wall is 10 feet tall. How many days does it take the snail to reach the top? Think step by step and give the final answer on its own line like: ANSWER: <number>`;
351
- try {
352
- const result = await providerChat(providerInfo, model, [
353
- { role: "user", content: prompt }
354
- ]);
355
- const msg = result.content.trim();
356
- if (msg.length === 0) {
357
- return { pass: false, score: "ERROR", reasoning: "Empty response from provider", answer: "?", elapsedMs: result.elapsedMs };
358
- }
359
- const allNumbers = msg.match(/\b(\d+)\b/g) || [];
360
- const answer = allNumbers.length > 0 ? allNumbers[allNumbers.length - 1] : "?";
361
- const { score, pass } = scoreReasoning(msg);
362
- return { pass, score, reasoning: msg, answer, elapsedMs: result.elapsedMs };
363
- } catch (e) {
364
- return { pass: false, score: "ERROR", reasoning: e.message, answer: "?", elapsedMs: 0 };
365
- }
299
+ return testReasoningUnified(makeProviderChatFn(providerInfo), model);
366
300
  }
367
301
  async function testThinking(model) {
368
302
  const prompt = "Multiply 37 by 43. Explain your reasoning step by step and give the final answer.";
@@ -386,182 +320,10 @@ function model_test_temp_default(pi) {
386
320
  }
387
321
  }
388
322
  async function testToolUsage(model) {
389
- const tools = [
390
- {
391
- type: "function",
392
- function: {
393
- name: "get_weather",
394
- description: "Get the current weather for a location",
395
- parameters: {
396
- type: "object",
397
- properties: {
398
- location: { type: "string", description: "City name" },
399
- unit: { type: "string", enum: ["celsius", "fahrenheit"] }
400
- },
401
- required: ["location"]
402
- }
403
- }
404
- }
405
- ];
406
- const body = {
407
- model,
408
- messages: [
409
- { role: "system", content: "You are a helpful assistant. Use the available tools when needed." },
410
- { role: "user", content: "What's the weather like in Paris right now?" }
411
- ],
412
- tools,
413
- stream: false,
414
- options: { num_predict: CONFIG.NUM_PREDICT, temperature: CONFIG.TEMPERATURE }
415
- };
416
- try {
417
- const controller = new AbortController();
418
- const timeoutId = setTimeout(() => controller.abort(), CONFIG.TOOL_TEST_TIMEOUT_MS);
419
- const start = Date.now();
420
- const res = await fetch(`${ollamaBase()}/api/chat`, {
421
- method: "POST",
422
- headers: { "Content-Type": "application/json" },
423
- body: JSON.stringify(body),
424
- signal: controller.signal
425
- });
426
- const elapsedMs = Date.now() - start;
427
- clearTimeout(timeoutId);
428
- if (!res.ok) {
429
- const errorText = await res.text().catch(() => "unknown error");
430
- return { pass: false, score: "ERROR", hasToolCalls: false, toolCall: `fetch error: ${res.status}`, response: "", elapsedMs };
431
- }
432
- const text = await res.text();
433
- if (!text.trim()) throw new Error("Empty response from Ollama");
434
- const parsed = JSON.parse(text);
435
- const toolCalls = parsed?.message?.tool_calls;
436
- const content = parsed?.message?.content || "";
437
- if (toolCalls && toolCalls.length > 0) {
438
- const call = toolCalls[0];
439
- const fn = call.function || {};
440
- let args = {};
441
- try {
442
- args = typeof fn.arguments === "string" ? JSON.parse(fn.arguments) : fn.arguments || {};
443
- } catch {
444
- return {
445
- pass: true,
446
- score: "WEAK",
447
- hasToolCalls: true,
448
- toolCall: `malformed args: ${String(fn.arguments)}`,
449
- response: content,
450
- elapsedMs
451
- };
452
- }
453
- const { score, pass } = scoreNativeToolCall(fn.name || "", args);
454
- return {
455
- pass,
456
- score,
457
- hasToolCalls: true,
458
- toolCall: `${fn.name}(${JSON.stringify(args)})`,
459
- response: content,
460
- elapsedMs
461
- };
462
- }
463
- const textParsed = parseTextToolCall(content);
464
- if (textParsed) {
465
- const { score, pass } = scoreTextToolCall(textParsed.fnName, textParsed.args);
466
- return {
467
- pass,
468
- score,
469
- hasToolCalls: true,
470
- toolCall: `${textParsed.fnName}(${JSON.stringify(textParsed.args)})`,
471
- response: content,
472
- elapsedMs
473
- };
474
- }
475
- return {
476
- pass: false,
477
- score: "FAIL",
478
- hasToolCalls: false,
479
- toolCall: "none",
480
- response: content,
481
- elapsedMs
482
- };
483
- } catch (e) {
484
- return { pass: false, score: "ERROR", hasToolCalls: false, toolCall: `error: ${e.message}`, response: "", elapsedMs: 0 };
485
- }
323
+ return testToolUsageUnified(makeOllamaToolChatFn(), model);
486
324
  }
487
325
  async function testToolUsageProvider(providerInfo, model) {
488
- const tools = [
489
- {
490
- type: "function",
491
- function: {
492
- name: "get_weather",
493
- description: "Get the current weather for a location",
494
- parameters: {
495
- type: "object",
496
- properties: {
497
- location: { type: "string", description: "City name" },
498
- unit: { type: "string", enum: ["celsius", "fahrenheit"] }
499
- },
500
- required: ["location"]
501
- }
502
- }
503
- }
504
- ];
505
- try {
506
- const result = await providerChat(providerInfo, model, [
507
- { role: "system", content: "You are a helpful assistant. Use the available tools when needed." },
508
- { role: "user", content: "What's the weather like in Paris right now?" }
509
- ], {
510
- maxTokens: CONFIG.NUM_PREDICT,
511
- tools,
512
- timeoutMs: CONFIG.PROVIDER_TOOL_TIMEOUT_MS
513
- });
514
- const content = result.content;
515
- const toolCalls = result.toolCalls;
516
- if (toolCalls && toolCalls.length > 0) {
517
- const call = toolCalls[0];
518
- const fn = call.function || {};
519
- let args = {};
520
- try {
521
- args = typeof fn.arguments === "string" ? JSON.parse(fn.arguments) : fn.arguments || {};
522
- } catch {
523
- return {
524
- pass: true,
525
- score: "WEAK",
526
- hasToolCalls: true,
527
- toolCall: `malformed args: ${String(fn.arguments)}`,
528
- response: content,
529
- elapsedMs: result.elapsedMs
530
- };
531
- }
532
- const { score, pass } = scoreNativeToolCall(fn.name || "", args);
533
- return {
534
- pass,
535
- score,
536
- hasToolCalls: true,
537
- toolCall: `${fn.name}(${JSON.stringify(args)})`,
538
- response: content,
539
- elapsedMs: result.elapsedMs
540
- };
541
- }
542
- const textParsed = parseTextToolCall(content);
543
- if (textParsed) {
544
- const { score, pass } = scoreTextToolCall(textParsed.fnName, textParsed.args);
545
- return {
546
- pass,
547
- score,
548
- hasToolCalls: true,
549
- toolCall: `${textParsed.fnName}(${JSON.stringify(textParsed.args)})`,
550
- response: content,
551
- elapsedMs: result.elapsedMs
552
- };
553
- }
554
- return {
555
- pass: false,
556
- score: "FAIL",
557
- hasToolCalls: false,
558
- toolCall: "none",
559
- response: content,
560
- elapsedMs: result.elapsedMs
561
- };
562
- } catch (e) {
563
- return { pass: false, score: "ERROR", hasToolCalls: false, toolCall: `error: ${e.message}`, response: "", elapsedMs: 0 };
564
- }
326
+ return testToolUsageUnified(makeProviderChatFn(providerInfo), model);
565
327
  }
566
328
  async function testReactParsing(model) {
567
329
  const systemPrompt = [
@@ -642,67 +404,35 @@ function model_test_temp_default(pi) {
642
404
  }
643
405
  }
644
406
  } else {
645
- const dialectDefs = [
646
- { name: "react", action: "Action:", input: "Action Input:" },
647
- { name: "function", action: "Function:", input: "Function Input:" },
648
- { name: "tool", action: "Tool:", input: "Tool Input:" },
649
- { name: "call", action: "Call:", input: "Input:" }
650
- ];
651
- for (const dd of dialectDefs) {
652
- const esc = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
653
- const aT = esc(dd.action);
654
- const iT = esc(dd.input);
655
- const primaryRe = new RegExp(`${aT}\\s*[\\x60"']?(\\w+)[\\x60"']?\\s*\\n?\\s*${iT}\\s*([\\s\\S]*?)(?=\\n\\s*(?:Observation:|Thought:|Final Answer:|${dd.action})|$)`, "is");
656
- const sameRe = new RegExp(`${aT}\\s*[\\x60"']?(\\w+)[\\x60"']?\\s+${iT}\\s*([\\s\\S]*?)(?=\\n\\s*(?:Observation:|Thought:|Final Answer:|${dd.action})|$)`, "is");
657
- const parenRe = new RegExp(`${aT}\\s*(\\w+)\\s*\\(([^)]*)\\)`, "i");
658
- let m = primaryRe.exec(content) || sameRe.exec(content);
659
- let isParen = false;
660
- if (!m) {
661
- m = parenRe.exec(content);
662
- isParen = true;
663
- }
664
- if (m) {
665
- const toolName = m[1].trim().replace(/[`"']/g, "");
666
- const rawArgs = m[2].trim().replace(/^```\w*\s*/gm, "").replace(/```\s*$/gm, "").trim();
667
- let argsStr = "";
668
- if (isParen && rawArgs && !rawArgs.startsWith("{")) {
669
- const pairs = rawArgs.match(/(\w+)\s*:\s*("[^"]*"|'[^']*'|\S+)/g);
670
- if (pairs) {
671
- const obj = {};
672
- for (const p of pairs) {
673
- const ci = p.indexOf(":");
674
- let v = p.slice(ci + 1).trim();
675
- if (v.startsWith('"') && v.endsWith('"') || v.startsWith("'") && v.endsWith("'")) v = v.slice(1, -1);
676
- obj[p.slice(0, ci).trim()] = v;
677
- }
678
- argsStr = JSON.stringify(obj);
679
- } else {
680
- argsStr = rawArgs;
681
- }
682
- } else {
683
- const js = rawArgs.indexOf("{");
684
- if (js !== -1) {
685
- let d = 0, je = -1;
686
- for (let i = js; i < rawArgs.length; i++) {
687
- if (rawArgs[i] === "{") d++;
688
- else if (rawArgs[i] === "}") {
689
- d--;
690
- if (d === 0) {
691
- je = i;
407
+ for (const dp of ALL_DIALECT_PATTERNS) {
408
+ const result = parseReactWithPatterns(content, dp, true);
409
+ if (result) {
410
+ let argsStr;
411
+ const rawArgs = result.args ? JSON.stringify(result.args) : "";
412
+ if (rawArgs && rawArgs !== "{}") {
413
+ argsStr = rawArgs;
414
+ } else if (result.raw) {
415
+ const jsonStart = result.raw.indexOf("{");
416
+ if (jsonStart !== -1) {
417
+ let depth = 0, jsonEnd = -1;
418
+ for (let i = jsonStart; i < result.raw.length; i++) {
419
+ if (result.raw[i] === "{") depth++;
420
+ else if (result.raw[i] === "}") {
421
+ depth--;
422
+ if (depth === 0) {
423
+ jsonEnd = i;
692
424
  break;
693
425
  }
694
426
  }
695
427
  }
696
- argsStr = je !== -1 ? rawArgs.slice(js, je + 1) : rawArgs;
428
+ argsStr = jsonEnd !== -1 ? result.raw.slice(jsonStart, jsonEnd + 1) : "";
697
429
  } else {
698
- argsStr = rawArgs;
430
+ argsStr = "";
699
431
  }
432
+ } else {
433
+ argsStr = "";
700
434
  }
701
- let thought = "";
702
- const thoughtRe = /Thought:\s*(.*?)(?=Action:|Function:|Tool:|Call:|Final Answer:|$)/is;
703
- const tm = thoughtRe.exec(content);
704
- if (tm) thought = tm[1].trim();
705
- parsedResult = { name: toolName, args: argsStr, thought, dialect: dd.name };
435
+ parsedResult = { name: result.name, args: argsStr, thought: result.thought || "", dialect: result.dialect };
706
436
  break;
707
437
  }
708
438
  }
@@ -763,158 +493,10 @@ function model_test_temp_default(pi) {
763
493
  }
764
494
  }
765
495
  async function testInstructionFollowing(model) {
766
- const prompt = `You must respond with ONLY a valid JSON object. No markdown, no explanation, no backticks, no extra text.
767
-
768
- The JSON object must have exactly these 4 keys:
769
- - "name" (string): your model name
770
- - "can_count" (boolean): true
771
- - "sum" (number): the result of 15 + 27
772
- - "language" (string): the language you are responding in`;
773
- try {
774
- const { response, elapsedMs } = await ollamaChat(model, [
775
- { role: "user", content: prompt }
776
- ], { num_predict: CONFIG.NUM_PREDICT });
777
- const msg = (response?.message?.content || "").trim();
778
- let parsed = null;
779
- let repairNote = "";
780
- try {
781
- const cleaned = msg.replace(/```json?\s*/gi, "").replace(/```/g, "").trim();
782
- parsed = JSON.parse(cleaned);
783
- } catch {
784
- const cleaned = msg.replace(/```json?\s*/gi, "").replace(/```/g, "").trim();
785
- let braceDepth = 0, bracketDepth = 0;
786
- let inString = false, escapeNext = false;
787
- for (let i = 0; i < cleaned.length; i++) {
788
- const c = cleaned[i];
789
- if (escapeNext) {
790
- escapeNext = false;
791
- continue;
792
- }
793
- if (c === "\\") {
794
- if (inString) escapeNext = true;
795
- continue;
796
- }
797
- if (c === '"') {
798
- inString = !inString;
799
- continue;
800
- }
801
- if (inString) continue;
802
- if (c === "{") braceDepth++;
803
- else if (c === "}") braceDepth = Math.max(0, braceDepth - 1);
804
- else if (c === "[") bracketDepth++;
805
- else if (c === "]") bracketDepth = Math.max(0, bracketDepth - 1);
806
- }
807
- if (braceDepth > 0 || bracketDepth > 0) {
808
- const repaired = cleaned + "}".repeat(braceDepth) + "]".repeat(bracketDepth);
809
- try {
810
- parsed = JSON.parse(repaired);
811
- repairNote = " (repaired truncated JSON)";
812
- } catch {
813
- }
814
- }
815
- }
816
- if (!parsed) {
817
- return { pass: false, score: "FAIL", output: sanitizeForReport(msg), elapsedMs };
818
- }
819
- const hasKeys = parsed.name && parsed.can_count !== void 0 && parsed.sum !== void 0 && parsed.language;
820
- const correctSum = parsed.sum === 42;
821
- const hasCorrectCount = parsed.can_count === true;
822
- let score;
823
- if (hasKeys && correctSum && hasCorrectCount) {
824
- score = "STRONG";
825
- } else if (hasKeys && (correctSum || hasCorrectCount)) {
826
- score = "MODERATE";
827
- } else if (parsed.sum !== void 0 || parsed.name) {
828
- score = "WEAK";
829
- } else {
830
- score = "FAIL";
831
- }
832
- return {
833
- pass: hasKeys,
834
- score,
835
- output: JSON.stringify(parsed) + repairNote,
836
- elapsedMs
837
- };
838
- } catch (e) {
839
- return { pass: false, score: "ERROR", output: e.message, elapsedMs: 0 };
840
- }
496
+ return testInstructionFollowingUnified(makeOllamaChatFn(), model);
841
497
  }
842
498
  async function testInstructionFollowingProvider(providerInfo, model) {
843
- const prompt = `You must respond with ONLY a valid JSON object. No markdown, no explanation, no backticks, no extra text.
844
-
845
- The JSON object must have exactly these 4 keys:
846
- - "name" (string): your model name
847
- - "can_count" (boolean): true
848
- - "sum" (number): the result of 15 + 27
849
- - "language" (string): the language you are responding in`;
850
- try {
851
- const result = await providerChat(providerInfo, model, [
852
- { role: "user", content: prompt }
853
- ]);
854
- const msg = result.content.trim();
855
- let parsed = null;
856
- let repairNote = "";
857
- try {
858
- const cleaned = msg.replace(/```json?\s*/gi, "").replace(/```/g, "").trim();
859
- parsed = JSON.parse(cleaned);
860
- } catch {
861
- const cleaned = msg.replace(/```json?\s*/gi, "").replace(/```/g, "").trim();
862
- let braceDepth = 0, bracketDepth = 0;
863
- let inString = false, escapeNext = false;
864
- for (let i = 0; i < cleaned.length; i++) {
865
- const c = cleaned[i];
866
- if (escapeNext) {
867
- escapeNext = false;
868
- continue;
869
- }
870
- if (c === "\\") {
871
- if (inString) escapeNext = true;
872
- continue;
873
- }
874
- if (c === '"') {
875
- inString = !inString;
876
- continue;
877
- }
878
- if (inString) continue;
879
- if (c === "{") braceDepth++;
880
- else if (c === "}") braceDepth = Math.max(0, braceDepth - 1);
881
- else if (c === "[") bracketDepth++;
882
- else if (c === "]") bracketDepth = Math.max(0, bracketDepth - 1);
883
- }
884
- if (braceDepth > 0 || bracketDepth > 0) {
885
- const repaired = cleaned + "}".repeat(braceDepth) + "]".repeat(bracketDepth);
886
- try {
887
- parsed = JSON.parse(repaired);
888
- repairNote = " (repaired truncated JSON)";
889
- } catch {
890
- }
891
- }
892
- }
893
- if (!parsed) {
894
- return { pass: false, score: "FAIL", output: sanitizeForReport(msg), elapsedMs: result.elapsedMs };
895
- }
896
- const hasKeys = parsed.name && parsed.can_count !== void 0 && parsed.sum !== void 0 && parsed.language;
897
- const correctSum = parsed.sum === 42;
898
- const hasCorrectCount = parsed.can_count === true;
899
- let score;
900
- if (hasKeys && correctSum && hasCorrectCount) {
901
- score = "STRONG";
902
- } else if (hasKeys && (correctSum || hasCorrectCount)) {
903
- score = "MODERATE";
904
- } else if (parsed.sum !== void 0 || parsed.name) {
905
- score = "WEAK";
906
- } else {
907
- score = "FAIL";
908
- }
909
- return {
910
- pass: hasKeys,
911
- score,
912
- output: JSON.stringify(parsed) + repairNote,
913
- elapsedMs: result.elapsedMs
914
- };
915
- } catch (e) {
916
- return { pass: false, score: "ERROR", output: e.message, elapsedMs: 0 };
917
- }
499
+ return testInstructionFollowingUnified(makeProviderChatFn(providerInfo), model);
918
500
  }
919
501
  async function testToolSupport(model, family) {
920
502
  const cached = getCachedToolSupport(model);
@@ -926,23 +508,7 @@ The JSON object must have exactly these 4 keys:
926
508
  elapsedMs: 0
927
509
  };
928
510
  }
929
- const tools = [
930
- {
931
- type: "function",
932
- function: {
933
- name: "get_weather",
934
- description: "Get the current weather for a location",
935
- parameters: {
936
- type: "object",
937
- properties: {
938
- location: { type: "string", description: "City name" },
939
- unit: { type: "string", enum: ["celsius", "fahrenheit"] }
940
- },
941
- required: ["location"]
942
- }
943
- }
944
- }
945
- ];
511
+ const tools = [WEATHER_TOOL_DEFINITION];
946
512
  const body = {
947
513
  model,
948
514
  messages: [
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vtstech/pi-model-test",
3
- "version": "1.1.1",
3
+ "version": "1.1.3",
4
4
  "description": "Model benchmark/testing extension for Pi Coding Agent",
5
5
  "main": "model-test.js",
6
6
  "keywords": ["pi-extensions"],
@@ -14,7 +14,7 @@
14
14
  "url": "https://github.com/VTSTech/pi-coding-agent"
15
15
  },
16
16
  "dependencies": {
17
- "@vtstech/pi-shared": "1.1.1"
17
+ "@vtstech/pi-shared": "1.1.3"
18
18
  },
19
19
  "peerDependencies": {
20
20
  "@mariozechner/pi-coding-agent": ">=0.66"