promptfoo 0.119.13 → 0.119.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/dist/package.json +28 -26
  2. package/dist/src/app/assets/index-eJ2lMe94.js +51 -0
  3. package/dist/src/app/assets/{source-map-support-Bnh0UQ2S.js → source-map-support-1v4oeb7P.js} +1 -1
  4. package/dist/src/app/assets/sync-CtLQRuC1.js +1 -0
  5. package/dist/src/app/assets/{vendor-charts-T60Uk0Z3.js → vendor-charts-DnVv66VV.js} +1 -1
  6. package/dist/src/app/assets/{vendor-markdown-DLig-KJh.js → vendor-markdown-DCpQIyMA.js} +1 -1
  7. package/dist/src/app/assets/{vendor-mui-core-5BLaiG3c.js → vendor-mui-core-Boqnpf9f.js} +1 -1
  8. package/dist/src/app/assets/{vendor-mui-icons-fn39Fu2e.js → vendor-mui-icons-B8MqoVbj.js} +1 -1
  9. package/dist/src/app/assets/vendor-mui-x-CGSS6QHF.js +45 -0
  10. package/dist/src/app/assets/{vendor-utils-DYBMEuwX.js → vendor-utils-DdfHIEy8.js} +1 -1
  11. package/dist/src/app/index.html +7 -7
  12. package/dist/src/assertions/guardrails.d.ts +1 -1
  13. package/dist/src/assertions/guardrails.js +18 -9
  14. package/dist/src/assertions/index.d.ts +1 -1
  15. package/dist/src/assertions/index.js +9 -3
  16. package/dist/src/assertions/searchRubric.d.ts +3 -0
  17. package/dist/src/assertions/searchRubric.js +18 -0
  18. package/dist/src/commands/eval.js +1 -1
  19. package/dist/src/commands/modelScan.d.ts +7 -1
  20. package/dist/src/commands/modelScan.js +121 -59
  21. package/dist/src/database/index.d.ts +6 -0
  22. package/dist/src/database/index.js +11 -0
  23. package/dist/src/database/tables.d.ts +46 -24
  24. package/dist/src/envars.d.ts +17 -0
  25. package/dist/src/generated/constants.js +1 -1
  26. package/dist/src/logger.d.ts +5 -0
  27. package/dist/src/logger.js +28 -0
  28. package/dist/src/main.js +17 -6
  29. package/dist/src/matchers.d.ts +1 -0
  30. package/dist/src/matchers.js +80 -0
  31. package/dist/src/models/eval.d.ts +2 -1
  32. package/dist/src/models/eval.js +44 -2
  33. package/dist/src/prompts/grading.d.ts +1 -0
  34. package/dist/src/prompts/grading.js +26 -1
  35. package/dist/src/prompts/index.d.ts +1 -0
  36. package/dist/src/prompts/index.js +4 -1
  37. package/dist/src/providers/adaline.gateway.js +2 -2
  38. package/dist/src/providers/anthropic/defaults.d.ts +1 -1
  39. package/dist/src/providers/anthropic/defaults.js +15 -0
  40. package/dist/src/providers/azure/chat.d.ts +3 -1
  41. package/dist/src/providers/azure/chat.js +16 -3
  42. package/dist/src/providers/azure/defaults.js +660 -141
  43. package/dist/src/providers/azure/responses.d.ts +5 -0
  44. package/dist/src/providers/azure/responses.js +33 -4
  45. package/dist/src/providers/azure/types.d.ts +4 -0
  46. package/dist/src/providers/bedrock/agents.d.ts +1 -1
  47. package/dist/src/providers/bedrock/agents.js +2 -2
  48. package/dist/src/providers/bedrock/base.d.ts +40 -0
  49. package/dist/src/providers/bedrock/base.js +171 -0
  50. package/dist/src/providers/bedrock/converse.d.ts +146 -0
  51. package/dist/src/providers/bedrock/converse.js +1044 -0
  52. package/dist/src/providers/bedrock/index.d.ts +1 -34
  53. package/dist/src/providers/bedrock/index.js +4 -159
  54. package/dist/src/providers/bedrock/knowledgeBase.d.ts +1 -1
  55. package/dist/src/providers/bedrock/knowledgeBase.js +2 -2
  56. package/dist/src/providers/bedrock/nova-sonic.d.ts +2 -1
  57. package/dist/src/providers/bedrock/nova-sonic.js +2 -2
  58. package/dist/src/providers/claude-agent-sdk.d.ts +58 -1
  59. package/dist/src/providers/claude-agent-sdk.js +22 -1
  60. package/dist/src/providers/defaults.js +4 -0
  61. package/dist/src/providers/github/defaults.js +6 -6
  62. package/dist/src/providers/google/types.d.ts +25 -0
  63. package/dist/src/providers/google/util.d.ts +2 -0
  64. package/dist/src/providers/google/vertex.js +78 -22
  65. package/dist/src/providers/{groq.d.ts → groq/chat.d.ts} +26 -20
  66. package/dist/src/providers/groq/chat.js +79 -0
  67. package/dist/src/providers/groq/index.d.ts +5 -0
  68. package/dist/src/providers/groq/index.js +24 -0
  69. package/dist/src/providers/groq/responses.d.ts +106 -0
  70. package/dist/src/providers/groq/responses.js +64 -0
  71. package/dist/src/providers/groq/types.d.ts +44 -0
  72. package/dist/src/providers/groq/types.js +3 -0
  73. package/dist/src/providers/groq/util.d.ts +15 -0
  74. package/dist/src/providers/groq/util.js +28 -0
  75. package/dist/src/providers/mcp/client.d.ts +8 -0
  76. package/dist/src/providers/mcp/client.js +60 -10
  77. package/dist/src/providers/mcp/types.d.ts +21 -0
  78. package/dist/src/providers/openai/chatkit-pool.d.ts +114 -0
  79. package/dist/src/providers/openai/chatkit-pool.js +548 -0
  80. package/dist/src/providers/openai/chatkit-types.d.ts +73 -0
  81. package/dist/src/providers/openai/chatkit-types.js +3 -0
  82. package/dist/src/providers/openai/chatkit.d.ts +76 -0
  83. package/dist/src/providers/openai/chatkit.js +879 -0
  84. package/dist/src/providers/openai/codex-sdk.d.ts +109 -0
  85. package/dist/src/providers/openai/codex-sdk.js +346 -0
  86. package/dist/src/providers/openai/defaults.d.ts +2 -0
  87. package/dist/src/providers/openai/defaults.js +10 -4
  88. package/dist/src/providers/registry.js +48 -9
  89. package/dist/src/providers/responses/types.d.ts +1 -1
  90. package/dist/src/providers/sagemaker.d.ts +2 -2
  91. package/dist/src/providers/webSearchUtils.d.ts +17 -0
  92. package/dist/src/providers/webSearchUtils.js +169 -0
  93. package/dist/src/providers/xai/chat.d.ts +61 -0
  94. package/dist/src/providers/xai/chat.js +68 -3
  95. package/dist/src/providers/xai/responses.d.ts +189 -0
  96. package/dist/src/providers/xai/responses.js +268 -0
  97. package/dist/src/redteam/constants/plugins.d.ts +1 -1
  98. package/dist/src/redteam/constants/plugins.js +1 -1
  99. package/dist/src/redteam/constants/strategies.d.ts +1 -1
  100. package/dist/src/redteam/constants/strategies.js +1 -0
  101. package/dist/src/redteam/plugins/vlguard.d.ts +53 -4
  102. package/dist/src/redteam/plugins/vlguard.js +362 -46
  103. package/dist/src/redteam/providers/constants.d.ts +2 -2
  104. package/dist/src/redteam/providers/constants.js +2 -2
  105. package/dist/src/redteam/providers/crescendo/index.d.ts +1 -1
  106. package/dist/src/redteam/providers/crescendo/index.js +5 -3
  107. package/dist/src/redteam/providers/hydra/index.js +1 -1
  108. package/dist/src/server/routes/modelAudit.js +4 -4
  109. package/dist/src/share.js +4 -2
  110. package/dist/src/telemetry.js +44 -8
  111. package/dist/src/types/env.d.ts +3 -0
  112. package/dist/src/types/env.js +1 -0
  113. package/dist/src/types/index.d.ts +896 -615
  114. package/dist/src/types/index.js +1 -0
  115. package/dist/src/types/providers.d.ts +1 -0
  116. package/dist/src/types/tracing.d.ts +3 -0
  117. package/dist/src/util/database.d.ts +6 -4
  118. package/dist/src/util/file.js +6 -4
  119. package/dist/src/util/modelAuditCliParser.d.ts +4 -4
  120. package/dist/src/util/xlsx.js +52 -26
  121. package/dist/src/validators/providers.d.ts +142 -122
  122. package/dist/src/validators/providers.js +4 -6
  123. package/dist/src/validators/redteam.d.ts +36 -28
  124. package/dist/src/validators/redteam.js +9 -3
  125. package/dist/tsconfig.tsbuildinfo +1 -1
  126. package/package.json +28 -26
  127. package/dist/drizzle/CLAUDE.md +0 -65
  128. package/dist/src/app/assets/index-DifT6VGT.js +0 -51
  129. package/dist/src/app/assets/sync-Oo-W_Rbj.js +0 -1
  130. package/dist/src/app/assets/vendor-mui-x-C2xF-yiO.js +0 -45
  131. package/dist/src/providers/groq.js +0 -48
@@ -0,0 +1,879 @@
1
+ "use strict";
2
+ /**
3
+ * OpenAI ChatKit Provider
4
+ *
5
+ * Evaluates ChatKit workflows deployed via Agent Builder using Playwright
6
+ * to interact with the ChatKit web component.
7
+ *
8
+ * ChatKit workflows created in OpenAI's Agent Builder don't expose a direct
9
+ * REST API for sending messages. Instead, they require interaction through
10
+ * the ChatKit web component, which this provider automates using Playwright.
11
+ *
12
+ * Prerequisites:
13
+ * - Playwright installed: npm install playwright && npx playwright install chromium
14
+ * - OPENAI_API_KEY environment variable set
15
+ *
16
+ * Usage:
17
+ * providers:
18
+ * - id: openai:chatkit:wf_68ffb83dbfc88190a38103c2bb9f421003f913035dbdb131
19
+ * config:
20
+ * version: '3' # Optional: workflow version
21
+ * timeout: 120000 # Optional: response timeout in ms (default: 120000)
22
+ * headless: true # Optional: run browser headless (default: true)
23
+ *
24
+ * Performance Notes:
25
+ * - Each evaluation spawns a browser instance, so it's slower than REST APIs
26
+ * - For reliable results, use --max-concurrency 1 to avoid resource contention
27
+ * - First test may be slower due to browser launch and ChatKit initialization
28
+ *
29
+ * Troubleshooting:
30
+ * - "Playwright not found": Run `npx playwright install chromium`
31
+ * - Timeout errors: Increase timeout config or use --max-concurrency 1
32
+ * - Empty responses: The workflow may not generate text for some inputs
33
+ */
34
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
35
+ if (k2 === undefined) k2 = k;
36
+ var desc = Object.getOwnPropertyDescriptor(m, k);
37
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
38
+ desc = { enumerable: true, get: function() { return m[k]; } };
39
+ }
40
+ Object.defineProperty(o, k2, desc);
41
+ }) : (function(o, m, k, k2) {
42
+ if (k2 === undefined) k2 = k;
43
+ o[k2] = m[k];
44
+ }));
45
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
46
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
47
+ }) : function(o, v) {
48
+ o["default"] = v;
49
+ });
50
+ var __importStar = (this && this.__importStar) || (function () {
51
+ var ownKeys = function(o) {
52
+ ownKeys = Object.getOwnPropertyNames || function (o) {
53
+ var ar = [];
54
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
55
+ return ar;
56
+ };
57
+ return ownKeys(o);
58
+ };
59
+ return function (mod) {
60
+ if (mod && mod.__esModule) return mod;
61
+ var result = {};
62
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
63
+ __setModuleDefault(result, mod);
64
+ return result;
65
+ };
66
+ })();
67
+ var __importDefault = (this && this.__importDefault) || function (mod) {
68
+ return (mod && mod.__esModule) ? mod : { "default": mod };
69
+ };
70
+ Object.defineProperty(exports, "__esModule", { value: true });
71
+ exports.OpenAiChatKitProvider = void 0;
72
+ const playwright_1 = require("playwright");
73
+ const http = __importStar(require("http"));
74
+ const logger_1 = __importDefault(require("../../logger"));
75
+ const providerRegistry_1 = require("../providerRegistry");
76
+ const index_1 = require("./index");
77
+ const chatkit_pool_1 = require("./chatkit-pool");
78
// Tunable defaults and fixed waits for the ChatKit provider (all durations in milliseconds).
/** Response timeout applied when `config.timeout` is not provided. */
const DEFAULT_TIMEOUT_MS = 120 * 1000;
/** Cap on approval steps applied when `config.maxApprovals` is not provided. */
const DEFAULT_MAX_APPROVALS = 5;
/** Pool size used when neither `config.poolSize` nor a numeric PROMPTFOO_MAX_CONCURRENCY is available. */
const DEFAULT_POOL_SIZE = 4;
/** Upper bound on waiting for the hosted page to report `window.__state.ready`. */
const CHATKIT_READY_TIMEOUT_MS = 60 * 1000;
/** Pause after a new response arrives so the DOM can finish updating. */
const DOM_SETTLE_DELAY_MS = 2 * 1000;
/** Pause before each scan for approval buttons. */
const APPROVAL_PROCESS_DELAY_MS = 500;
/** Pause after an approval button has been clicked. */
const APPROVAL_CLICK_DELAY_MS = 1000;
/** Pause between successive response-extraction attempts. */
const RESPONSE_EXTRACT_RETRY_DELAY_MS = 500;
// Note: MIN_RESPONSE_LENGTH (20), MIN_MESSAGE_LENGTH (30), MAX_INIT_ATTEMPTS (100),
// and INIT_POLL_INTERVAL_MS (100) have no constants here: their values are written
// inline in the HTML template string and in DOM evaluation functions, where
// module-level constants cannot be easily passed.
90
/**
 * Report whether `url` points at OpenAI's CDN host.
 *
 * The URL is parsed and its hostname compared for exact equality, so a string
 * that merely contains the CDN hostname somewhere (path, query, subdomain of
 * another site) is rejected.
 *
 * @param {string} url - Candidate URL.
 * @returns {boolean} true only when the hostname is exactly `cdn.platform.openai.com`;
 *   false for any other host and for input that cannot be parsed as a URL.
 */
function isOpenAICdnUrl(url) {
    let hostname;
    try {
        ({ hostname } = new URL(url));
    }
    catch {
        // Unparseable input is never treated as a CDN URL.
        return false;
    }
    return hostname === 'cdn.platform.openai.com';
}
103
/**
 * Guard against script injection by accepting only well-formed workflow ids.
 *
 * @param {string} workflowId - Must be `wf_` followed by one or more ASCII letters or digits.
 * @throws {Error} when the value is empty/missing or does not match that shape.
 */
function validateWorkflowId(workflowId) {
    const workflowIdPattern = /^wf_[a-zA-Z0-9]+$/;
    const isWellFormed = Boolean(workflowId) && workflowIdPattern.test(workflowId);
    if (isWellFormed) {
        return;
    }
    throw new Error(`Invalid workflowId format: ${workflowId}. Expected format: wf_<alphanumeric>`);
}
111
/**
 * Guard against script injection by restricting the characters of a workflow version.
 *
 * @param {string} version - One or more ASCII letters, digits, dots, dashes or underscores.
 * @throws {Error} when the value contains any other character or is empty.
 */
function validateVersion(version) {
    const versionPattern = /^[a-zA-Z0-9._-]+$/;
    if (versionPattern.test(version)) {
        return;
    }
    throw new Error(`Invalid version format: ${version}. Only alphanumeric, dot, dash, and underscore allowed.`);
}
119
/**
 * Guard against script injection by restricting the characters of a user id.
 *
 * @param {string} userId - One or more ASCII letters, digits, dots, dashes, underscores or `@`.
 * @throws {Error} when the value contains any other character or is empty.
 */
function validateUserId(userId) {
    const userIdPattern = /^[a-zA-Z0-9._@-]+$/;
    if (userIdPattern.test(userId)) {
        return;
    }
    throw new Error(`Invalid userId format: ${userId}. Only alphanumeric, dot, dash, underscore, and @ allowed.`);
}
127
/**
 * Escape a value for embedding inside a single-quoted JavaScript string
 * literal that lives in an inline <script> element.
 *
 * Backslashes, single quotes, line terminators and angle brackets are emitted
 * as \uXXXX escapes, so the value can neither close the string literal nor
 * terminate the surrounding <script> element. Values made only of other
 * characters are returned unchanged.
 *
 * @param {unknown} value - Value to embed; it is converted with String().
 * @returns {string} Text that is safe to place between single quotes.
 */
function escapeForInlineScriptString(value) {
    return String(value).replace(/[\\'<>\n\r\u2028\u2029]/g, (ch) => `\\u${ch.charCodeAt(0).toString(16).padStart(4, '0')}`);
}
/**
 * Generate the HTML page that hosts the ChatKit component.
 *
 * The page loads chatkit.js from OpenAI's CDN, waits (up to 100 polls, 100 ms
 * apart) for the <openai-chatkit> element to expose setOptions, and configures
 * it with a getClientSecret callback that creates a ChatKit session through
 * the OpenAI API. Progress is published on `window.__state`
 * ({ ready, responses, threadId, error }) and the element on `window.__chatkit`.
 *
 * @param {string} apiKey - OpenAI API key placed in the Authorization header of the
 *   session request. It is escaped before being embedded in the inline script.
 * @param {string} workflowId - Workflow id; must pass validateWorkflowId.
 * @param {string} [version] - Optional workflow version; validated when provided.
 * @param {string} userId - User identifier sent with the session request; required.
 * @returns {string} The complete HTML document.
 * @throws {Error} when workflowId, version or userId is invalid, or userId is missing.
 */
function generateChatKitHTML(apiKey, workflowId, version, userId) {
    // Validate inputs to prevent script injection
    validateWorkflowId(workflowId);
    if (version) {
        validateVersion(version);
    }
    // userId is required - caller must provide it
    if (!userId) {
        throw new Error('userId is required for ChatKit HTML generation');
    }
    validateUserId(userId);
    // The API key has no fixed format to validate against, so it is escaped
    // instead; a key with a quote, backslash, newline or "</script>" would
    // otherwise break out of the string literal or of the script element.
    const safeApiKey = escapeForInlineScriptString(apiKey);
    const versionClause = version ? `, version: '${version}'` : '';
    return `<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <title>ChatKit Eval</title>
</head>
<body>
  <openai-chatkit id="chatkit"></openai-chatkit>

  <script src="https://cdn.platform.openai.com/deployments/chatkit/chatkit.js"></script>

  <script>
    window.__state = { ready: false, responses: [], threadId: null, error: null };

    async function init() {
      const chatkit = document.getElementById('chatkit');

      // Wait for element to be ready
      let attempts = 0;
      while (typeof chatkit.setOptions !== 'function' && attempts < 100) {
        await new Promise(r => setTimeout(r, 100));
        attempts++;
      }

      if (typeof chatkit.setOptions !== 'function') {
        window.__state.error = 'ChatKit component failed to initialize';
        return;
      }

      let cachedSecret = null;

      chatkit.setOptions({
        api: {
          getClientSecret: async (existing) => {
            if (existing) return existing;
            if (cachedSecret) return cachedSecret;

            const res = await fetch('https://api.openai.com/v1/chatkit/sessions', {
              method: 'POST',
              headers: {
                'Authorization': 'Bearer ${safeApiKey}',
                'Content-Type': 'application/json',
                'OpenAI-Beta': 'chatkit_beta=v1'
              },
              body: JSON.stringify({
                workflow: { id: '${workflowId}'${versionClause} },
                user: '${userId}'
              })
            });

            if (!res.ok) {
              const text = await res.text();
              throw new Error('Session failed: ' + res.status + ' ' + text);
            }

            const data = await res.json();
            cachedSecret = data.client_secret;
            return cachedSecret;
          }
        },
        header: { enabled: false },
        history: { enabled: false },
      });

      chatkit.addEventListener('chatkit.ready', () => {
        window.__state.ready = true;
      });

      chatkit.addEventListener('chatkit.error', (e) => {
        window.__state.error = e.detail.error?.message || 'Unknown error';
      });

      chatkit.addEventListener('chatkit.thread.change', (e) => {
        window.__state.threadId = e.detail.threadId;
      });

      chatkit.addEventListener('chatkit.response.end', () => {
        window.__state.responses.push({ timestamp: Date.now() });
      });

      window.__chatkit = chatkit;
    }

    init().catch(e => {
      window.__state.error = e.message;
    });
  </script>
</body>
</html>`;
}
232
/**
 * Read the most recent assistant reply out of the ChatKit iframe.
 *
 * Only frames whose URL passes isOpenAICdnUrl are inspected. Inside such a
 * frame a cascade of DOM heuristics picks one candidate text; that text is
 * then cleaned on the Node side (inline-script residue removed, "You said:"
 * echoes dropped, a leading JSON object stripped when substantial text
 * follows it). The whole scan is repeated up to `maxRetries` times with a
 * pause of RESPONSE_EXTRACT_RETRY_DELAY_MS between rounds, since the DOM may
 * still be updating after the response.end event.
 *
 * @param page - Page whose frames() are scanned and whose waitForTimeout() is used between rounds.
 * @param {number} [maxRetries=3] - Number of scan rounds.
 * @returns {Promise<string>} The cleaned reply, or '' when no round produced one.
 */
async function extractResponseFromFrame(page, maxRetries = 3) {
    // Executed inside the iframe through frame.evaluate, so it must stay
    // self-contained: nothing from this module's scope is available there.
    const pickCandidateText = () => {
        const textOf = (node) => node.textContent?.trim() || '';
        // An element counts as a user message when its class, data-role or data-testid says so
        const looksLikeUser = (node) => {
            const cls = node.className?.toString().toLowerCase() || '';
            const role = node.getAttribute('data-role') || '';
            const testId = node.getAttribute('data-testid') || '';
            if (cls.includes('user')) {
                return true;
            }
            return role === 'user' || testId.includes('user');
        };
        // Walk the parent chain (stopping at <body>) looking for user indicators
        const hasUserAncestor = (node) => {
            for (let up = node.parentElement; up && up !== document.body; up = up.parentElement) {
                if (looksLikeUser(up)) {
                    return true;
                }
            }
            return false;
        };
        // 1) Assistant-specific selectors are the most reliable: take the last match of each
        const dedicatedSelectors = [
            '[data-testid="assistant-message"]',
            '[data-role="assistant"]',
            '[class*="assistant"]:not([class*="user"])',
        ];
        for (const selector of dedicatedSelectors) {
            const hits = document.querySelectorAll(selector);
            const text = hits.length > 0 ? textOf(hits[hits.length - 1]) : '';
            // Any non-empty assistant message is accepted (no minimum length)
            if (text.length > 0) {
                return { text, source: selector, isAssistant: true };
            }
        }
        // 2) Generic message containers: newest non-empty one that is not a user message
        const messageNodes = Array.from(document.querySelectorAll('[class*="message"]'));
        for (const node of messageNodes.reverse()) {
            const text = textOf(node);
            if (text.length > 0 && !looksLikeUser(node)) {
                return { text, source: 'last-non-user', isAssistant: true };
            }
        }
        // 3) Markdown content (often the formatted response) outside any user message
        const markdownNodes = Array.from(document.querySelectorAll('.markdown, [class*="markdown"]'));
        for (const node of markdownNodes.reverse()) {
            if (hasUserAncestor(node)) {
                continue;
            }
            const text = textOf(node);
            if (text.length > 0) {
                return { text, source: 'markdown', isAssistant: true };
            }
        }
        // 4) Response-specific containers
        const responseNodes = Array.from(document.querySelectorAll('[class*="response"], [class*="reply"], [class*="answer"]'));
        for (const node of responseNodes.reverse()) {
            if (looksLikeUser(node)) {
                continue;
            }
            const text = textOf(node);
            if (text.length > 0) {
                return { text, source: 'response-container', isAssistant: true };
            }
        }
        // 5) Fallback: divs with a bounded amount of text (to avoid grabbing the
        //    entire page) that are neither user messages nor nested inside one
        const candidates = Array.from(document.querySelectorAll('div'))
            .map((el) => ({ text: textOf(el), el }))
            .filter(({ text, el }) => text.length > 0 &&
            text.length < 5000 &&
            !looksLikeUser(el) &&
            !hasUserAncestor(el));
        if (candidates.length > 0) {
            // Prefer divs that do not wrap other message-like elements; take the most recent
            const leaves = candidates.filter(({ el }) => el.querySelectorAll('[class*="message"]').length === 0);
            if (leaves.length > 0) {
                return { text: leaves[leaves.length - 1].text, source: 'leaf-div' };
            }
            return { text: candidates[candidates.length - 1].text, source: 'fallback-div' };
        }
        // 6) Last resort: full body text
        return { text: document.body?.textContent?.trim() || '', source: 'body' };
    };
    for (let attempt = 0; attempt < maxRetries; attempt += 1) {
        for (const frame of page.frames()) {
            const url = frame.url();
            if (!isOpenAICdnUrl(url)) {
                continue;
            }
            try {
                const result = await frame.evaluate(pickCandidateText);
                const raw = result.text;
                if (!raw || !(raw.length > 0)) {
                    continue;
                }
                // Remove Cloudflare scripts and other inline-script noise
                let cleaned = raw.replace(/\(function\(\)\{.*?\}\)\(\);?/gs, '').trim();
                // Text that starts with "You said:" is an echo of the user, i.e. the
                // wrong element was picked: discard it so a later frame/round can succeed
                if (/^You said:([\s\S]*)/i.test(cleaned)) {
                    logger_1.default.debug('[ChatKitProvider] Detected user echo, discarding', {
                        preview: cleaned.substring(0, 100),
                    });
                    cleaned = '';
                }
                // Cut off a "You said:" echo appearing anywhere later in the text
                cleaned = cleaned.replace(/You said:[\s\S]*/gi, '').trim();
                // A leading JSON object is stripped only when substantial text follows;
                // JSON on its own might be the intended response
                const jsonThenText = /^(\{[^}]+\})\s+(.+)/s.exec(cleaned);
                const trailing = jsonThenText ? jsonThenText[2].trim() : '';
                if (trailing.length > 50) {
                    cleaned = trailing;
                }
                if (cleaned.length === 0) {
                    // Only user content was found; fall through to the next frame / round
                    logger_1.default.debug('[ChatKitProvider] No assistant content found after cleaning', {
                        originalLength: result.text.length,
                        source: result.source,
                    });
                    continue;
                }
                logger_1.default.debug('[ChatKitProvider] Extracted response', {
                    source: result.source,
                    length: cleaned.length,
                    preview: cleaned.substring(0, 100),
                });
                return cleaned;
            }
            catch (e) {
                logger_1.default.debug('[ChatKitProvider] Could not access frame', { url, error: e, attempt });
            }
        }
        // Wait before the next round (no wait after the final one)
        if (attempt < maxRetries - 1) {
            await page.waitForTimeout(RESPONSE_EXTRACT_RETRY_DELAY_MS);
        }
    }
    return '';
}
419
/**
 * Click the approve/reject button of a workflow approval step, if one is showing.
 *
 * Each frame whose URL passes isOpenAICdnUrl is searched: first with a list of
 * selectors (the first visible match is clicked), then with an in-frame scan
 * of all buttons for one whose text contains the label. After a click the
 * function waits APPROVAL_CLICK_DELAY_MS so the approval can be processed.
 * Errors raised while searching a frame are logged and that frame is skipped.
 *
 * @param page - Page whose frames() are searched and whose waitForTimeout() is used after a click.
 * @param {string} action - 'auto-approve' targets the "Approve" button; any other value targets "Reject".
 * @returns {Promise<boolean>} true if a button was clicked, false if none was found.
 */
async function handleApproval(page, action) {
    const label = action === 'auto-approve' ? 'Approve' : 'Reject';
    const selectors = [
        `button:has-text("${label}")`,
        `[role="button"]:has-text("${label}")`,
        `[data-testid="${label.toLowerCase()}-button"]`,
    ];
    // Executed inside the iframe through frame.evaluate: considers only the first
    // button-like element whose text contains the label (case-insensitive).
    const clickFirstMatchingButton = (btnText) => {
        const needle = btnText.toLowerCase();
        for (const candidate of document.querySelectorAll('button, [role="button"]')) {
            if (!candidate.textContent?.toLowerCase().includes(needle)) {
                continue;
            }
            if (candidate instanceof HTMLElement) {
                candidate.click();
                return true;
            }
            return false;
        }
        return false;
    };
    for (const frame of page.frames()) {
        if (!isOpenAICdnUrl(frame.url())) {
            continue;
        }
        try {
            // Look for approval buttons in the ChatKit iframe
            for (const selector of selectors) {
                const button = await frame.$(selector);
                if (!button || !(await button.isVisible())) {
                    continue;
                }
                logger_1.default.debug('[ChatKitProvider] Found approval button, clicking', {
                    action,
                    selector,
                });
                await button.click();
                // Give the approval time to be processed
                await page.waitForTimeout(APPROVAL_CLICK_DELAY_MS);
                return true;
            }
            // Alternative: scan the DOM for approval UI patterns
            const clicked = await frame.evaluate(clickFirstMatchingButton, label);
            if (clicked) {
                logger_1.default.debug('[ChatKitProvider] Clicked approval button via evaluate', { action });
                await page.waitForTimeout(APPROVAL_CLICK_DELAY_MS);
                return true;
            }
        }
        catch (e) {
            logger_1.default.debug('[ChatKitProvider] Error checking for approval buttons', { error: e });
        }
    }
    return false;
}
475
/**
 * Keep handling approval steps until none is found, a follow-up response does
 * not arrive in time, or `maxApprovals` steps have been handled.
 *
 * Each round waits APPROVAL_PROCESS_DELAY_MS, calls handleApproval, and after a
 * handled approval waits (for at most half of `timeout`) until the page's
 * `window.__state.responses` holds more entries than approvals handled so far,
 * followed by DOM_SETTLE_DELAY_MS for the DOM to settle.
 *
 * @param page - Page providing waitForTimeout() and waitForFunction().
 * @param {string} approvalHandling - 'skip' disables processing; any other value is passed to handleApproval.
 * @param {number} maxApprovals - Upper bound on approvals handled.
 * @param {number} timeout - Overall response timeout in ms; half of it bounds each follow-up wait.
 * @returns {Promise<number>} Number of approvals handled.
 */
async function processApprovals(page, approvalHandling, maxApprovals, timeout) {
    if (approvalHandling === 'skip') {
        return 0;
    }
    let handledSoFar = 0;
    while (handledSoFar < maxApprovals) {
        // Small delay to let the UI settle before looking for buttons
        await page.waitForTimeout(APPROVAL_PROCESS_DELAY_MS);
        const wasHandled = await handleApproval(page, approvalHandling);
        if (!wasHandled) {
            return handledSoFar;
        }
        handledSoFar += 1;
        logger_1.default.debug('[ChatKitProvider] Processed approval', {
            count: handledSoFar,
            max: maxApprovals,
        });
        try {
            // Wait for the next response after the approval
            await page.waitForFunction((prevCount) => window.__state?.responses?.length > prevCount, handledSoFar, { timeout: timeout / 2 });
            // Let the DOM settle after the new response
            await page.waitForTimeout(DOM_SETTLE_DELAY_MS);
        }
        catch {
            // No response arrived in time after the approval - it might have been the final response
            return handledSoFar;
        }
    }
    return handledSoFar;
}
509
+ class OpenAiChatKitProvider extends index_1.OpenAiGenericProvider {
510
+ static getDefaultUserId() {
511
+ if (!OpenAiChatKitProvider.defaultUserId) {
512
+ // Generate once per process to ensure template consistency
513
+ OpenAiChatKitProvider.defaultUserId = `promptfoo-eval-${Date.now()}`;
514
+ }
515
+ return OpenAiChatKitProvider.defaultUserId;
516
+ }
517
+ constructor(workflowId, options = {}) {
518
+ super(workflowId, options);
519
+ this.browser = null;
520
+ this.context = null;
521
+ this.page = null;
522
+ this.server = null;
523
+ this.serverPort = 0;
524
+ this.initialized = false;
525
+ // Default poolSize to PROMPTFOO_MAX_CONCURRENCY env var if set, otherwise DEFAULT_POOL_SIZE
526
+ const envPoolSize = process.env.PROMPTFOO_MAX_CONCURRENCY
527
+ ? parseInt(process.env.PROMPTFOO_MAX_CONCURRENCY, 10)
528
+ : NaN;
529
+ const defaultPoolSize = Number.isNaN(envPoolSize) ? DEFAULT_POOL_SIZE : envPoolSize;
530
+ this.chatKitConfig = {
531
+ workflowId: options.config?.workflowId || workflowId,
532
+ version: options.config?.version,
533
+ // Use consistent default userId to ensure template stability during concurrent execution
534
+ userId: options.config?.userId || OpenAiChatKitProvider.getDefaultUserId(),
535
+ timeout: options.config?.timeout || DEFAULT_TIMEOUT_MS,
536
+ headless: options.config?.headless ?? true,
537
+ serverPort: options.config?.serverPort || 0,
538
+ usePool: options.config?.usePool ?? true, // Pool mode by default for better performance
539
+ poolSize: options.config?.poolSize ?? defaultPoolSize,
540
+ approvalHandling: options.config?.approvalHandling ?? 'auto-approve',
541
+ maxApprovals: options.config?.maxApprovals ?? DEFAULT_MAX_APPROVALS,
542
+ stateful: options.config?.stateful ?? false,
543
+ };
544
+ }
545
+ id() {
546
+ const version = this.chatKitConfig.version ? `:${this.chatKitConfig.version}` : '';
547
+ return `openai:chatkit:${this.chatKitConfig.workflowId}${version}`;
548
+ }
549
+ toString() {
550
+ return `[OpenAI ChatKit Provider ${this.chatKitConfig.workflowId}]`;
551
+ }
552
+ /**
553
+ * Initialize the browser and ChatKit page
554
+ */
555
+ async initialize() {
556
+ if (this.initialized) {
557
+ return;
558
+ }
559
+ const apiKey = this.getApiKey();
560
+ if (!apiKey) {
561
+ throw new Error('OpenAI API key is required for ChatKit provider');
562
+ }
563
+ const workflowId = this.chatKitConfig.workflowId;
564
+ if (!workflowId) {
565
+ throw new Error('ChatKit workflowId is required');
566
+ }
567
+ logger_1.default.debug('[ChatKitProvider] Initializing', {
568
+ workflowId,
569
+ version: this.chatKitConfig.version,
570
+ });
571
+ // Create HTTP server to serve the ChatKit HTML
572
+ const html = generateChatKitHTML(apiKey, workflowId, this.chatKitConfig.version, this.chatKitConfig.userId);
573
+ this.server = http.createServer((_req, res) => {
574
+ res.writeHead(200, { 'Content-Type': 'text/html' });
575
+ res.end(html);
576
+ });
577
+ await new Promise((resolve, reject) => {
578
+ this.server.once('error', (err) => {
579
+ reject(new Error(`Failed to start ChatKit server: ${err.message}`));
580
+ });
581
+ this.server.listen(this.chatKitConfig.serverPort, () => {
582
+ const address = this.server.address();
583
+ this.serverPort = typeof address === 'object' ? address?.port || 0 : 0;
584
+ logger_1.default.debug('[ChatKitProvider] Server started', { port: this.serverPort });
585
+ resolve();
586
+ });
587
+ });
588
+ // Launch browser with helpful error for missing Playwright
589
+ try {
590
+ this.browser = await playwright_1.chromium.launch({
591
+ headless: this.chatKitConfig.headless,
592
+ });
593
+ }
594
+ catch (launchError) {
595
+ const errorMessage = launchError instanceof Error ? launchError.message : String(launchError);
596
+ if (errorMessage.includes("Executable doesn't exist") ||
597
+ errorMessage.includes('browserType.launch')) {
598
+ throw new Error('Playwright browser not installed. Run: npx playwright install chromium\n' +
599
+ `Original error: ${errorMessage}`);
600
+ }
601
+ throw launchError;
602
+ }
603
+ this.context = await this.browser.newContext({
604
+ viewport: { width: 800, height: 600 },
605
+ });
606
+ this.page = await this.context.newPage();
607
+ // Navigate to our HTML page
608
+ await this.page.goto(`http://localhost:${this.serverPort}`, {
609
+ waitUntil: 'domcontentloaded',
610
+ });
611
+ // Wait for ChatKit to be ready
612
+ logger_1.default.debug('[ChatKitProvider] Waiting for ChatKit ready');
613
+ await this.page.waitForFunction(() => window.__state?.ready === true, {
614
+ timeout: CHATKIT_READY_TIMEOUT_MS,
615
+ });
616
+ this.initialized = true;
617
+ // Register for cleanup on process exit (non-pool mode only)
618
+ // Pool mode has its own cleanup mechanism
619
+ if (!this.chatKitConfig.usePool) {
620
+ providerRegistry_1.providerRegistry.register(this);
621
+ }
622
+ logger_1.default.debug('[ChatKitProvider] Initialized successfully');
623
+ }
624
+ /**
625
+ * Shutdown method for providerRegistry cleanup
626
+ */
627
+ async shutdown() {
628
+ await this.cleanup();
629
+ }
630
+ /**
631
+ * Clean up browser resources
632
+ */
633
+ async cleanup() {
634
+ if (this.context) {
635
+ await this.context.close();
636
+ this.context = null;
637
+ this.page = null;
638
+ }
639
+ if (this.browser) {
640
+ await this.browser.close();
641
+ this.browser = null;
642
+ }
643
+ if (this.server) {
644
+ this.server.close();
645
+ this.server = null;
646
+ }
647
+ this.initialized = false;
648
+ }
649
+ /**
650
+ * Call the ChatKit workflow with the given prompt
651
+ */
652
+ async callApi(prompt, _context, _callApiOptions) {
653
+ // Stateful mode requires sequential processing, so disable pool mode
654
+ const usePool = this.chatKitConfig.usePool && !this.chatKitConfig.stateful;
655
+ logger_1.default.debug('[ChatKitProvider] Starting call', {
656
+ prompt: prompt.substring(0, 100),
657
+ workflowId: this.chatKitConfig.workflowId,
658
+ usePool,
659
+ stateful: this.chatKitConfig.stateful,
660
+ });
661
+ // Use pool-based execution for better concurrency (not available in stateful mode)
662
+ if (usePool) {
663
+ return this.callApiWithPool(prompt);
664
+ }
665
+ const startTime = Date.now();
666
+ try {
667
+ await this.initialize();
668
+ if (!this.page) {
669
+ throw new Error('Browser page not initialized');
670
+ }
671
+ // For stateful mode, don't reload the page to maintain conversation state
672
+ // For non-stateful mode, refresh to get clean state for each evaluation
673
+ if (!this.chatKitConfig.stateful) {
674
+ await this.page.reload({ waitUntil: 'domcontentloaded' });
675
+ // Wait for ChatKit to be ready again after reload
676
+ await this.page.waitForFunction(() => window.__state?.ready === true, {
677
+ timeout: CHATKIT_READY_TIMEOUT_MS,
678
+ });
679
+ }
680
+ // For stateful mode, check if this is a follow-up message (responses already exist)
681
+ // Use newThread: false for follow-ups to continue the conversation
682
+ const responseCount = await this.page.evaluate(() => window.__state?.responses?.length || 0);
683
+ const isFollowUp = this.chatKitConfig.stateful && responseCount > 0;
684
+ logger_1.default.debug('[ChatKitProvider] Sending message', {
685
+ stateful: this.chatKitConfig.stateful,
686
+ isFollowUp,
687
+ responseCount,
688
+ });
689
+ // Send the message
690
+ await this.page.evaluate(({ text, newThread }) => {
691
+ return window.__chatkit.sendUserMessage({
692
+ text,
693
+ newThread,
694
+ });
695
+ }, { text: prompt, newThread: !isFollowUp });
696
+ // Wait for response - in stateful mode, wait for a NEW response
697
+ logger_1.default.debug('[ChatKitProvider] Waiting for response');
698
+ const expectedResponseCount = responseCount + 1;
699
+ await this.page.waitForFunction((expected) => window.__state?.responses?.length >= expected, expectedResponseCount, { timeout: this.chatKitConfig.timeout });
700
+ // Allow DOM to settle - ChatKit iframe needs time to render the response
701
+ await this.page.waitForTimeout(DOM_SETTLE_DELAY_MS);
702
+ // Handle any approval steps in the workflow
703
+ const approvalsHandled = await processApprovals(this.page, this.chatKitConfig.approvalHandling ?? 'auto-approve', this.chatKitConfig.maxApprovals ?? DEFAULT_MAX_APPROVALS, this.chatKitConfig.timeout ?? DEFAULT_TIMEOUT_MS);
704
+ if (approvalsHandled > 0) {
705
+ logger_1.default.debug('[ChatKitProvider] Processed approvals', { count: approvalsHandled });
706
+ }
707
+ // Extract response from iframe
708
+ const responseText = await extractResponseFromFrame(this.page);
709
+ // Get thread ID
710
+ const threadId = await this.page.evaluate(() => window.__state.threadId);
711
+ // Get final response count for turn tracking
712
+ const finalResponseCount = await this.page.evaluate(() => window.__state?.responses?.length || 0);
713
+ const latencyMs = Date.now() - startTime;
714
+ logger_1.default.debug('[ChatKitProvider] Response received', {
715
+ threadId,
716
+ textLength: responseText.length,
717
+ turnNumber: finalResponseCount,
718
+ latencyMs,
719
+ });
720
+ return {
721
+ output: responseText,
722
+ cached: false, // ChatKit responses are never cached (browser-based)
723
+ latencyMs,
724
+ // Use sessionId for consistency with HTTP provider's stateful handling
725
+ sessionId: threadId,
726
+ // Token usage not available from ChatKit, but track request count
727
+ tokenUsage: { numRequests: 1 },
728
+ metadata: {
729
+ workflowId: this.chatKitConfig.workflowId,
730
+ version: this.chatKitConfig.version,
731
+ stateful: this.chatKitConfig.stateful,
732
+ turnNumber: finalResponseCount,
733
+ },
734
+ };
735
+ }
736
+ catch (error) {
737
+ const errorMessage = error instanceof Error ? error.message : String(error);
738
+ logger_1.default.error('[ChatKitProvider] Call failed', { error: errorMessage });
739
+ // Check for ChatKit-specific errors in page state
740
+ if (this.page) {
741
+ try {
742
+ const stateError = await this.page.evaluate(() => window.__state?.error);
743
+ if (stateError) {
744
+ return {
745
+ error: `ChatKit workflow error: ${stateError}`,
746
+ };
747
+ }
748
+ }
749
+ catch {
750
+ // Page may be in bad state, continue with general error
751
+ }
752
+ }
753
+ // Provide helpful error messages for common issues
754
+ if (errorMessage.includes('Timeout') || errorMessage.includes('timeout')) {
755
+ return {
756
+ error: `ChatKit response timeout after ${this.chatKitConfig.timeout}ms. ` +
757
+ 'Try increasing timeout in config or use --max-concurrency 1 for more reliable results.',
758
+ };
759
+ }
760
+ if (errorMessage.includes('API key')) {
761
+ return {
762
+ error: 'OpenAI API key is required. Set OPENAI_API_KEY environment variable.',
763
+ };
764
+ }
765
+ if (errorMessage.includes('Playwright') || errorMessage.includes('browser')) {
766
+ return {
767
+ error: `Browser error: ${errorMessage}. Ensure Playwright is installed: npx playwright install chromium`,
768
+ };
769
+ }
770
+ return {
771
+ error: `ChatKit provider error: ${errorMessage}`,
772
+ };
773
+ }
774
+ }
775
+ /**
776
+ * Pool-based callApi for better concurrency support.
777
+ * Uses a shared browser with multiple contexts instead of separate browsers.
778
+ */
779
+ async callApiWithPool(prompt) {
780
+ const apiKey = this.getApiKey();
781
+ if (!apiKey) {
782
+ return {
783
+ error: 'OpenAI API key is required. Set OPENAI_API_KEY environment variable.',
784
+ };
785
+ }
786
+ const workflowId = this.chatKitConfig.workflowId;
787
+ if (!workflowId) {
788
+ return {
789
+ error: 'ChatKit workflowId is required',
790
+ };
791
+ }
792
+ // Get or create the pool
793
+ const pool = chatkit_pool_1.ChatKitBrowserPool.getInstance({
794
+ maxConcurrency: this.chatKitConfig.poolSize,
795
+ headless: this.chatKitConfig.headless,
796
+ });
797
+ // Generate a unique template key for this workflow configuration
798
+ // This ensures different workflows get isolated pages in the pool
799
+ const templateKey = chatkit_pool_1.ChatKitBrowserPool.generateTemplateKey(workflowId, this.chatKitConfig.version, this.chatKitConfig.userId);
800
+ // Register the HTML template for this workflow
801
+ const html = generateChatKitHTML(apiKey, workflowId, this.chatKitConfig.version, this.chatKitConfig.userId);
802
+ pool.setTemplate(templateKey, html);
803
+ let pooledPage = null;
804
+ const startTime = Date.now();
805
+ try {
806
+ // Acquire a page from the pool for this specific template
807
+ pooledPage = await pool.acquirePage(templateKey);
808
+ const page = pooledPage.page;
809
+ logger_1.default.debug('[ChatKitProvider] Acquired page from pool', {
810
+ stats: pool.getStats(),
811
+ });
812
+ // Send the message
813
+ await page.evaluate((text) => {
814
+ return window.__chatkit.sendUserMessage({
815
+ text,
816
+ newThread: true,
817
+ });
818
+ }, prompt);
819
+ // Wait for response
820
+ await page.waitForFunction(() => window.__state?.responses?.length > 0, {
821
+ timeout: this.chatKitConfig.timeout,
822
+ });
823
+ // Allow DOM to settle
824
+ await page.waitForTimeout(DOM_SETTLE_DELAY_MS);
825
+ // Handle any approval steps in the workflow
826
+ const approvalsHandled = await processApprovals(page, this.chatKitConfig.approvalHandling ?? 'auto-approve', this.chatKitConfig.maxApprovals ?? DEFAULT_MAX_APPROVALS, this.chatKitConfig.timeout ?? DEFAULT_TIMEOUT_MS);
827
+ if (approvalsHandled > 0) {
828
+ logger_1.default.debug('[ChatKitProvider] Pool processed approvals', { count: approvalsHandled });
829
+ }
830
+ // Extract response from iframe
831
+ const responseText = await extractResponseFromFrame(page);
832
+ // Get thread ID
833
+ const threadId = await page.evaluate(() => window.__state.threadId);
834
+ const latencyMs = Date.now() - startTime;
835
+ logger_1.default.debug('[ChatKitProvider] Pool response received', {
836
+ threadId,
837
+ textLength: responseText.length,
838
+ latencyMs,
839
+ });
840
+ return {
841
+ output: responseText,
842
+ cached: false, // ChatKit responses are never cached (browser-based)
843
+ latencyMs,
844
+ // Use sessionId for consistency with HTTP provider's stateful handling
845
+ sessionId: threadId,
846
+ // Token usage not available from ChatKit, but track request count
847
+ tokenUsage: { numRequests: 1 },
848
+ metadata: {
849
+ workflowId: this.chatKitConfig.workflowId,
850
+ version: this.chatKitConfig.version,
851
+ poolMode: true,
852
+ },
853
+ };
854
+ }
855
+ catch (error) {
856
+ const errorMessage = error instanceof Error ? error.message : String(error);
857
+ logger_1.default.error('[ChatKitProvider] Pool call failed', { error: errorMessage });
858
+ if (errorMessage.includes('Timeout') || errorMessage.includes('timeout')) {
859
+ return {
860
+ error: `ChatKit response timeout after ${this.chatKitConfig.timeout}ms. ` +
861
+ 'Try increasing timeout or reducing concurrency.',
862
+ };
863
+ }
864
+ return {
865
+ error: `ChatKit provider error: ${errorMessage}`,
866
+ };
867
+ }
868
+ finally {
869
+ // Release the page back to the pool
870
+ if (pooledPage) {
871
+ await pool.releasePage(pooledPage);
872
+ }
873
+ }
874
+ }
875
+ }
876
+ exports.OpenAiChatKitProvider = OpenAiChatKitProvider;
877
+ // Static userId for consistent template keys across concurrent evaluations
878
+ OpenAiChatKitProvider.defaultUserId = null;
879
+ //# sourceMappingURL=chatkit.js.map