@cutleryapp/agent 1.0.17 → 1.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/mcp-executor.js +125 -56
- package/package.json +1 -1
package/dist/mcp-executor.js
CHANGED
|
@@ -248,9 +248,62 @@ async function tryClick(page, nameRe, label) {
|
|
|
248
248
|
// AI vision fallback
|
|
249
249
|
return await aiClickFallback(page, label);
|
|
250
250
|
}
|
|
251
|
+
/**
 * Build the instruction prompt sent (together with a screenshot) to the vision
 * model for one planning round of the AI agent fallback.
 *
 * @param {string} stepText - The natural-language test step the agent must accomplish.
 * @param {number} round - Zero-based round index; shown to the model as a 1-based number.
 * @param {Date} [now] - Reference date used to derive a non-expired test card expiry.
 *   Defaults to the current date; exposed mainly so the output is reproducible.
 * @returns {string} The full prompt text.
 */
function buildAgentPrompt(stepText, round, now = new Date()) {
    // JSON.stringify keeps the surrounding double quotes for ordinary text while
    // escaping embedded quotes/newlines so the GOAL line cannot be broken up.
    const goal = JSON.stringify(String(stepText));
    // A fixed expiry such as "12/25" eventually lies in the past and is rejected by
    // forms that validate card dates, so always advertise one a few years ahead.
    const expiryYear = String((now.getFullYear() + 3) % 100).padStart(2, '0');
    const cardExpiry = `12/${expiryYear}`;
    return `You are an intelligent browser test agent with vision. Your job is to look at the current screen, understand the test step intent, and decide what actions to perform.

GOAL: ${goal}
ROUND: ${round + 1}

## YOUR CAPABILITIES
1. SCREEN UNDERSTANDING — Identify every visible UI element, form field, button, link, and label on screen.
2. INTENT MAPPING — Understand what the test step MEANS even if the wording is vague or high-level (e.g. "checkout the product" = navigate to cart → fill shipping info → complete purchase).
3. DATA SIMULATION — If a form needs data that is not specified, INVENT realistic test data:
- Names: "John Smith" or "Test User"
- Email: "testuser@example.com"
- Phone: "555-0100"
- Address: "123 Test St, Springfield"
- Postal/ZIP: "12345"
- Credit card: "4111111111111111", expiry "${cardExpiry}", CVV "123"
- Password: "Test@1234"
- Any other field: invent plausible data based on the field label

## RETURN FORMAT
Return ONLY valid JSON — no markdown, no explanation:
{
"reasoning": "What I see on screen and what I plan to do",
"done": false,
"actions": [
{"action": "click", "selector": "SELECTOR"},
{"action": "fill", "selector": "SELECTOR", "value": "VALUE"},
{"action": "select", "selector": "SELECTOR", "value": "OPTION_LABEL"},
{"action": "verify", "text": "TEXT_TO_CHECK", "not": false},
{"action": "scroll", "selector": "OPTIONAL_SELECTOR"},
{"action": "wait", "ms": 500}
]
}

Set "done": true with empty "actions" array when the goal is fully accomplished.

## SELECTOR RULES
- Prefer visible text: button:has-text("Checkout"), a:has-text("Login")
- Use id when visible: #first-name, #postal-code
- Use data attributes: [data-test="checkout-button"], [data-testid="..."]
- Use placeholder for inputs: input[placeholder="First Name"]
- Use name attribute: input[name="firstName"]
- NEVER use position or coordinates

## SMART RULES
- Look at ALL visible form fields and fill them ALL in one round
- If you see a multi-step form, complete this step fully then click continue/next/submit
- If the goal is "checkout": cart → fill info → continue → finish
- If the goal is "register" or "sign up": fill all fields + submit
- If the goal is already accomplished (correct page shown), set done: true immediately
- Maximum 5 actions per round`;
}
|
|
251
302
|
/**
 * Universal AI agentic fallback.
 *
 * Each round takes a screenshot of the current page, sends it to GPT-4o together
 * with the agent prompt for the step, and executes the sequence of actions the
 * model returns. A fresh screenshot is taken at the start of every round (not
 * after every single action), so the model can verify progress and adapt.
 *
 * @param {object} page - Browser page handle; used here via screenshot(), locator(),
 *   textContent(), evaluate(), waitForTimeout() and waitForLoadState().
 * @param {string} stepText - Natural-language test step to accomplish.
 * @returns {Promise<void>} Resolves once the model reports the goal as done
 *   (or returns no actions).
 * @throws {Error} If OPENAI_API_KEY is unset, the model reply is empty or not a
 *   JSON object, or the goal is not reached within the round limit.
 */
async function aiStepFallback(page, stepText) {
    const openaiKey = process.env.OPENAI_API_KEY;
    if (!openaiKey)
        throw new Error(`No OPENAI_API_KEY — cannot use AI fallback for: "${stepText}"`);
    const { default: OpenAI } = await import('openai');
    const openai = new OpenAI({ apiKey: openaiKey });
    const MAX_ROUNDS = 6; // prevent infinite loops
    for (let round = 0; round < MAX_ROUNDS; round++) {
        const screenshotBuffer = await page.screenshot({ type: 'png' });
        const base64 = screenshotBuffer.toString('base64');
        const response = await openai.chat.completions.create({
            model: 'gpt-4o',
            max_tokens: 800,
            messages: [{
                    role: 'user',
                    content: [
                        { type: 'text', text: buildAgentPrompt(stepText, round) },
                        { type: 'image_url', image_url: { url: `data:image/png;base64,${base64}` } }
                    ]
                }]
        });
        // The model is told to return bare JSON, but strip Markdown fences in case it adds them.
        const raw = (response.choices[0]?.message?.content || '')
            .trim()
            .replace(/```json\n?/gi, '')
            .replace(/```/g, '')
            .trim();
        if (!raw)
            throw new Error(`AI returned empty response for: "${stepText}"`);
        const plan = parseAgentPlan(raw, stepText);
        console.log(` 🤖 AI round ${round + 1} — ${plan.reasoning}`);
        // Only iterate a real array; anything else is treated as "no actions".
        const actions = Array.isArray(plan.actions) ? plan.actions : [];
        if (plan.done || actions.length === 0) {
            console.log(` ✅ AI agent completed: "${stepText}"`);
            return;
        }
        // Execute each action in this round
        for (const act of actions) {
            console.log(` 🤖 Executing: ${JSON.stringify(act)}`);
            try {
                await executeAgentAction(page, act);
            }
            catch (e) {
                console.log(` ⚠️ Action failed (${e.message}), continuing to next round...`);
                // The remaining actions were planned assuming this one succeeded;
                // stop here so the next round re-plans from a fresh screenshot.
                break;
            }
        }
        // Wait for page to settle before next round
        await page.waitForLoadState('domcontentloaded').catch(() => { });
        await page.waitForTimeout(300);
    }
    throw new Error(`AI agent could not complete "${stepText}" within ${MAX_ROUNDS} rounds`);
}
/** Parse the model reply into a plan object, failing with a descriptive error when it is not a JSON object. */
function parseAgentPlan(raw, stepText) {
    let plan;
    try {
        plan = JSON.parse(raw);
    }
    catch (e) {
        throw new Error(`AI returned invalid JSON for: "${stepText}" (${e.message})`, { cause: e });
    }
    if (plan === null || typeof plan !== 'object' || Array.isArray(plan))
        throw new Error(`AI returned a non-object plan for: "${stepText}"`);
    return plan;
}
/** Perform a single agent action (click, fill, select, verify, wait or scroll) against the page. */
async function executeAgentAction(page, act) {
    switch (act?.action) {
        case 'click':
            await page.locator(act.selector).first().click({ timeout: 10000 });
            await page.waitForTimeout(500); // brief settle
            break;
        case 'fill':
            await page.locator(act.selector).first().fill(act.value || '');
            break;
        case 'select':
            await page.locator(act.selector).first().selectOption({ label: act.value });
            break;
        case 'verify': {
            const content = await page.textContent('body') || '';
            const found = content.includes(act.text);
            if (act.not && found)
                throw new Error(`Text "${act.text}" should NOT be visible`);
            if (!act.not && !found)
                throw new Error(`Expected text not found: "${act.text}"`);
            break;
        }
        case 'wait':
            await page.waitForTimeout(act.ms || 1000);
            break;
        case 'scroll':
            if (act.selector) {
                await page.locator(act.selector).first().scrollIntoViewIfNeeded();
            }
            else {
                await page.evaluate(() => window.scrollBy(0, 400));
            }
            break;
        default:
            // Unrecognised action types are ignored, as before.
            break;
    }
}
|
|
317
386
|
/** Use OpenAI vision to identify the element and generate a selector, then click it */
|
|
318
387
|
async function aiClickFallback(page, description) {
|