@cutleryapp/agent 1.0.18 → 1.0.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -248,6 +248,57 @@ async function tryClick(page, nameRe, label) {
248
248
  // AI vision fallback
249
249
  return await aiClickFallback(page, label);
250
250
  }
251
/**
 * Build the text part of the message sent to the vision model for one round
 * of the agentic step fallback (the caller attaches the page screenshot as a
 * separate image part).
 *
 * @param {string} stepText - Natural-language test step the agent must accomplish.
 * @param {number} [round=0] - Zero-based round index; presented to the model one-based.
 * @returns {string} The complete prompt text.
 */
function buildAgentPrompt(stepText, round = 0) {
  // JSON-encode the goal so quotes or newlines inside the step text cannot
  // break out of the quoted GOAL line. Plain text renders exactly as before.
  const goal = JSON.stringify(String(stepText));
  // Derive the card expiry from the clock: a fixed date eventually lapses and
  // then fails expiry validation on the forms under test.
  const expiryYear = String((new Date().getFullYear() + 3) % 100).padStart(2, '0');

  return `You are an intelligent browser test agent with vision. Your job is to look at the current screen, understand the test step intent, and decide what actions to perform.

GOAL: ${goal}
ROUND: ${round + 1}

## YOUR CAPABILITIES
1. SCREEN UNDERSTANDING — Identify every visible UI element, form field, button, link, and label on screen.
2. INTENT MAPPING — Understand what the test step MEANS even if the wording is vague or high-level (e.g. "checkout the product" = navigate to cart → fill shipping info → complete purchase).
3. DATA SIMULATION — If a form needs data that is not specified, INVENT realistic test data:
- Names: "John Smith" or "Test User"
- Email: "testuser@example.com"
- Phone: "555-0100"
- Address: "123 Test St, Springfield"
- Postal/ZIP: "12345"
- Credit card: "4111111111111111", expiry "12/${expiryYear}", CVV "123"
- Password: "Test@1234"
- Any other field: invent plausible data based on the field label

## RETURN FORMAT
Return ONLY valid JSON — no markdown, no explanation:
{
"reasoning": "What I see on screen and what I plan to do",
"done": false,
"actions": [
{"action": "click", "selector": "SELECTOR"},
{"action": "fill", "selector": "SELECTOR", "value": "VALUE"},
{"action": "select", "selector": "SELECTOR", "value": "OPTION_LABEL"},
{"action": "verify", "text": "TEXT_TO_CHECK", "not": false},
{"action": "scroll", "selector": "SELECTOR"},
{"action": "wait", "ms": 500}
]
}

Set "done": true with empty "actions" array when the goal is fully accomplished.

## SELECTOR RULES
- Prefer visible text: button:has-text("Checkout"), a:has-text("Login")
- Use id when visible: #first-name, #postal-code
- Use data attributes: [data-test="checkout-button"], [data-testid="..."]
- Use placeholder for inputs: input[placeholder="First Name"]
- Use name attribute: input[name="firstName"]
- NEVER use position or coordinates

## SMART RULES
- Look at ALL visible form fields and fill them ALL in one round
- If you see a multi-step form, complete this step fully then click continue/next/submit
- If the goal is "checkout": cart → fill info → continue → finish
- If the goal is "register" or "sign up": fill all fields + submit
- If the goal is already accomplished (correct page shown), set done: true immediately
- If the element you need is off screen, use "scroll" with its selector to bring it into view, or "scroll" without a selector to move the page down
- Maximum 5 actions per round`;
}
251
302
  /**
252
303
  * Universal AI agentic fallback.
253
304
  * GPT-4o sees the current page screenshot, understands the GOAL of the step,
@@ -266,41 +317,11 @@ async function aiStepFallback(page, stepText) {
266
317
  const base64 = screenshotBuffer.toString('base64');
267
318
  const response = await openai.chat.completions.create({
268
319
  model: 'gpt-4o',
269
- max_tokens: 600,
320
+ max_tokens: 800,
270
321
  messages: [{
271
322
  role: 'user',
272
323
  content: [
273
- {
274
- type: 'text',
275
- text: `You are an autonomous browser test agent. Your goal is to accomplish this test step:
276
-
277
- "${stepText}"
278
-
279
- Look at the current screenshot and decide what actions are needed RIGHT NOW to make progress toward the goal.
280
-
281
- Return ONLY valid JSON (no markdown) with this structure:
282
- {
283
- "reasoning": "brief explanation of what you see and what needs to happen",
284
- "done": false,
285
- "actions": [
286
- {"action": "click", "selector": "CSS_OR_TEXT_SELECTOR"},
287
- {"action": "fill", "selector": "CSS_OR_TEXT_SELECTOR", "value": "TEXT"},
288
- {"action": "select", "selector": "CSS_OR_TEXT_SELECTOR", "value": "OPTION_TEXT"},
289
- {"action": "verify", "text": "EXPECTED_TEXT", "not": false},
290
- {"action": "wait", "ms": 500}
291
- ]
292
- }
293
-
294
- Set "done": true and empty "actions" array when the goal has been fully accomplished.
295
-
296
- Rules:
297
- - Return only the actions needed for THIS screenshot — after executing them you will see the next state
298
- - Use text-based selectors when possible: button:has-text("Checkout"), [data-test="..."], #id
299
- - For "checkout" goals: look for cart/checkout buttons and click them in sequence
300
- - For "login" goals: fill username, fill password, click login
301
- - For "add to cart" goals: find and click the Add to cart button
302
- - Maximum 3 actions per round to stay precise`
303
- },
324
+ { type: 'text', text: buildAgentPrompt(stepText, round) },
304
325
  { type: 'image_url', image_url: { url: `data:image/png;base64,${base64}` } }
305
326
  ]
306
327
  }]
@@ -343,6 +364,14 @@ Rules:
343
364
  else if (act.action === 'wait') {
344
365
  await page.waitForTimeout(act.ms || 1000);
345
366
  }
367
+ else if (act.action === 'scroll') {
368
+ if (act.selector) {
369
+ await page.locator(act.selector).first().scrollIntoViewIfNeeded();
370
+ }
371
+ else {
372
+ await page.evaluate(() => window.scrollBy(0, 400));
373
+ }
374
+ }
346
375
  }
347
376
  catch (e) {
348
377
  console.log(` ⚠️ Action failed (${e.message}), continuing to next round...`);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@cutleryapp/agent",
3
- "version": "1.0.18",
3
+ "version": "1.0.19",
4
4
  "description": "Local agent that connects your machine to the Cutlery QA platform and runs UI tests via Playwright",
5
5
  "main": "dist/cli.js",
6
6
  "bin": {