@cutleryapp/agent 1.0.17 → 1.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -249,8 +249,10 @@ async function tryClick(page, nameRe, label) {
249
249
  return await aiClickFallback(page, label);
250
250
  }
251
251
  /**
252
- * Universal AI fallback — takes a screenshot + the raw step text and asks GPT-4o
253
- * what to do (click, fill, verify, select, etc.) and returns a JSON action to execute.
252
+ * Universal AI agentic fallback.
253
+ * GPT-4o sees the current page screenshot, understands the GOAL of the step,
254
+ * and returns a SEQUENCE of actions to accomplish it — then executes them one by one.
255
+ * After each action it re-screenshots so the AI can verify progress and adapt.
254
256
  */
255
257
  async function aiStepFallback(page, stepText) {
256
258
  const openaiKey = process.env.OPENAI_API_KEY;
@@ -258,61 +260,99 @@ async function aiStepFallback(page, stepText) {
258
260
  throw new Error(`No OPENAI_API_KEY — cannot use AI fallback for: "${stepText}"`);
259
261
  const { default: OpenAI } = await import('openai');
260
262
  const openai = new OpenAI({ apiKey: openaiKey });
261
- const screenshotBuffer = await page.screenshot({ type: 'png' });
262
- const base64 = screenshotBuffer.toString('base64');
263
- const response = await openai.chat.completions.create({
264
- model: 'gpt-4o',
265
- max_tokens: 200,
266
- messages: [{
267
- role: 'user',
268
- content: [
269
- {
270
- type: 'text',
271
- text: `You are a Playwright test automation AI. Analyse this screenshot and the test step below, then return a JSON action to execute.
263
+ const MAX_ROUNDS = 6; // prevent infinite loops
264
+ for (let round = 0; round < MAX_ROUNDS; round++) {
265
+ const screenshotBuffer = await page.screenshot({ type: 'png' });
266
+ const base64 = screenshotBuffer.toString('base64');
267
+ const response = await openai.chat.completions.create({
268
+ model: 'gpt-4o',
269
+ max_tokens: 600,
270
+ messages: [{
271
+ role: 'user',
272
+ content: [
273
+ {
274
+ type: 'text',
275
+ text: `You are an autonomous browser test agent. Your goal is to accomplish this test step:
276
+
277
+ "${stepText}"
272
278
 
273
- Test step: "${stepText}"
279
+ Look at the current screenshot and decide what actions are needed RIGHT NOW to make progress toward the goal.
280
+
281
+ Return ONLY valid JSON (no markdown) with this structure:
282
+ {
283
+ "reasoning": "brief explanation of what you see and what needs to happen",
284
+ "done": false,
285
+ "actions": [
286
+ {"action": "click", "selector": "CSS_OR_TEXT_SELECTOR"},
287
+ {"action": "fill", "selector": "CSS_OR_TEXT_SELECTOR", "value": "TEXT"},
288
+ {"action": "select", "selector": "CSS_OR_TEXT_SELECTOR", "value": "OPTION_TEXT"},
289
+ {"action": "verify", "text": "EXPECTED_TEXT", "not": false},
290
+ {"action": "wait", "ms": 500}
291
+ ]
292
+ }
274
293
 
275
- Return ONLY valid JSON (no markdown, no explanation) in one of these formats:
276
- - Click: {"action":"click","selector":"CSS_SELECTOR"}
277
- - Fill: {"action":"fill","selector":"CSS_SELECTOR","value":"TEXT"}
278
- - Select: {"action":"select","selector":"CSS_SELECTOR","value":"OPTION"}
279
- - Verify: {"action":"verify","text":"EXPECTED_TEXT","not":false}
280
- - Wait: {"action":"wait","ms":1000}
294
+ Set "done": true and empty "actions" array when the goal has been fully accomplished.
281
295
 
282
296
  Rules:
283
- - Use the most specific selector you can see (data-testid, id, aria-label, class, text)
284
- - For verify steps, set "not":true if the step says "not displayed/visible"
285
- - Return NOT_FOUND if you cannot determine the action`
286
- },
287
- { type: 'image_url', image_url: { url: `data:image/png;base64,${base64}` } }
288
- ]
289
- }]
290
- });
291
- const raw2 = (response.choices[0]?.message?.content || '').trim().replace(/```json\n?/gi, '').replace(/```/g, '').trim();
292
- if (!raw2 || raw2 === 'NOT_FOUND')
293
- throw new Error(`AI could not determine action for: "${stepText}"`);
294
- const action = JSON.parse(raw2);
295
- console.log(` 🤖 AI action: ${JSON.stringify(action)}`);
296
- if (action.action === 'click') {
297
- await page.locator(action.selector).first().click({ timeout: 10000 });
298
- }
299
- else if (action.action === 'fill') {
300
- await page.locator(action.selector).first().fill(action.value);
301
- }
302
- else if (action.action === 'select') {
303
- await page.locator(action.selector).first().selectOption({ label: action.value });
304
- }
305
- else if (action.action === 'verify') {
306
- const content = await page.textContent('body') || '';
307
- const found = content.includes(action.text);
308
- if (action.not && found)
309
- throw new Error(`Text "${action.text}" should NOT be visible`);
310
- if (!action.not && !found)
311
- throw new Error(`Expected text not found: "${action.text}"`);
312
- }
313
- else if (action.action === 'wait') {
314
- await page.waitForTimeout(action.ms || 1000);
297
+ - Return only the actions needed for THIS screenshot — after executing them you will see the next state
298
+ - Use text-based selectors when possible: button:has-text("Checkout"), [data-test="..."], #id
299
+ - For "checkout" goals: look for cart/checkout buttons and click them in sequence
300
+ - For "login" goals: fill username, fill password, click login
301
+ - For "add to cart" goals: find and click the Add to cart button
302
+ - Maximum 3 actions per round to stay precise`
303
+ },
304
+ { type: 'image_url', image_url: { url: `data:image/png;base64,${base64}` } }
305
+ ]
306
+ }]
307
+ });
308
+ const raw = (response.choices[0]?.message?.content || '')
309
+ .trim()
310
+ .replace(/```json\n?/gi, '')
311
+ .replace(/```/g, '')
312
+ .trim();
313
+ if (!raw)
314
+ throw new Error(`AI returned empty response for: "${stepText}"`);
315
+ const plan = JSON.parse(raw);
316
+ console.log(` 🤖 AI round ${round + 1} — ${plan.reasoning}`);
317
+ if (plan.done || !plan.actions?.length) {
318
+ console.log(` ✅ AI agent completed: "${stepText}"`);
319
+ return;
320
+ }
321
+ // Execute each action in this round
322
+ for (const act of plan.actions) {
323
+ console.log(` 🤖 Executing: ${JSON.stringify(act)}`);
324
+ try {
325
+ if (act.action === 'click') {
326
+ await page.locator(act.selector).first().click({ timeout: 10000 });
327
+ await page.waitForTimeout(500); // brief settle
328
+ }
329
+ else if (act.action === 'fill') {
330
+ await page.locator(act.selector).first().fill(act.value || '');
331
+ }
332
+ else if (act.action === 'select') {
333
+ await page.locator(act.selector).first().selectOption({ label: act.value });
334
+ }
335
+ else if (act.action === 'verify') {
336
+ const content = await page.textContent('body') || '';
337
+ const found = content.includes(act.text);
338
+ if (act.not && found)
339
+ throw new Error(`Text "${act.text}" should NOT be visible`);
340
+ if (!act.not && !found)
341
+ throw new Error(`Expected text not found: "${act.text}"`);
342
+ }
343
+ else if (act.action === 'wait') {
344
+ await page.waitForTimeout(act.ms || 1000);
345
+ }
346
+ }
347
+ catch (e) {
348
+ console.log(` ⚠️ Action failed (${e.message}), continuing to next round...`);
349
+ }
350
+ }
351
+ // Wait for page to settle before next round
352
+ await page.waitForLoadState('domcontentloaded').catch(() => { });
353
+ await page.waitForTimeout(300);
315
354
  }
355
+ throw new Error(`AI agent could not complete "${stepText}" within ${MAX_ROUNDS} rounds`);
316
356
  }
317
357
  /** Use OpenAI vision to identify the element and generate a selector, then click it */
318
358
  async function aiClickFallback(page, description) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@cutleryapp/agent",
3
- "version": "1.0.17",
3
+ "version": "1.0.18",
4
4
  "description": "Local agent that connects your machine to the Cutlery QA platform and runs UI tests via Playwright",
5
5
  "main": "dist/cli.js",
6
6
  "bin": {