@cutleryapp/agent 1.0.18 → 1.0.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2):
  1. package/dist/mcp-executor.js +67 -130
  2. package/package.json +1 -1
@@ -60,9 +60,9 @@ class TestExecutor {
60
60
  });
61
61
  let stepError;
62
62
  try {
63
+ // Navigate is handled directly — URL extraction doesn't need vision
63
64
  if (lower.includes("navigate to") || lower.includes("go to")) {
64
- // Extract URL wherever it appears in the step (handles emoji/symbol prefixes)
65
- const urlMatch = raw.match(/(?:navigate to|go to)\s+(https?:\/\/\S+|\/\S*|\S+\.\S+)/i);
65
+ const urlMatch = raw.match(/(?:navigate\s+to|go\s+to)\s+(https?:\/\/\S+|\/\S*|\S+\.\S+)/i);
66
66
  if (urlMatch) {
67
67
  let url = urlMatch[1].trim();
68
68
  if (url.startsWith("/") && this.options.baseUrl) {
@@ -71,110 +71,18 @@ class TestExecutor {
71
71
  await page.goto(url, { waitUntil: "domcontentloaded", timeout: 30000 });
72
72
  }
73
73
  else if (this.options.baseUrl) {
74
- // Step says "navigate to" but no URL found — go to baseUrl
75
74
  await page.goto(this.options.baseUrl, { waitUntil: "domcontentloaded", timeout: 30000 });
76
75
  }
77
76
  }
78
- else if (lower.includes("click")) {
79
- const labelMatch = raw.match(/click\s+(?:on\s+)?(?:the\s+)?"?([^"]+?)"?(?:\s+(?:button|link|tab))?$/i);
80
- let label = labelMatch?.[1]?.trim();
81
- if (label) {
82
- // Split "Add to cart under Sauce Labs Bike Light product" into target + scope
83
- const scopeMatch = label.match(/^(.+?)\s+(?:under|inside|within|in the|in)\s+(.+)$/i);
84
- const target = scopeMatch ? scopeMatch[1].trim() : label;
85
- const scope = scopeMatch ? scopeMatch[2].trim() : null;
86
- const nameRe = new RegExp(escapeRegex(target), 'i');
87
- const clicked = scope
88
- ? await tryClickScoped(page, nameRe, target, scope)
89
- : await tryClick(page, nameRe, target);
90
- if (!clicked)
91
- throw new Error(`Could not find clickable element: "${label}"`);
92
- }
93
- }
94
- else if (lower.includes("fill") || lower.includes("type") || lower.includes("enter")) {
95
- // Support both quoted and unquoted formats:
96
- // Fill "standard_user" in "Username"
97
- // Fill standard_user in Username field
98
- // Split on first " in " / " into " to separate value from field
99
- const match = raw.match(/(?:enter|fill|type)\s+"([^"]+)"\s+(?:in|into)\s+(?:the\s+)?"?([^"]+?)"?\s*(?:field|input|box|area)?\s*$/i) ||
100
- raw.match(/(?:enter|fill|type)\s+(\S+)\s+(?:in|into)\s+(?:the\s+)?(.+?)\s*(?:field|input|box|area)?\s*$/i);
101
- if (match) {
102
- const value = match[1].trim();
103
- const fieldLabel = match[2].trim();
104
- const looksLikeCss = (s) => /[#.\[\]:>]/.test(s);
105
- if (looksLikeCss(fieldLabel)) {
106
- await page.waitForSelector(fieldLabel, { state: "visible", timeout: 5000 });
107
- await page.fill(fieldLabel, value);
108
- }
109
- else {
110
- await tryFill(page, fieldLabel, value);
111
- }
112
- }
113
- }
114
- else if (lower.includes("wait") && lower.includes("second")) {
115
- const ms = raw.match(/wait\s+(\d+)\s*sec/i);
116
- if (ms)
117
- await page.waitForTimeout(parseInt(ms[1]) * 1000);
118
- }
119
- else if (lower.includes("wait for") && !lower.includes("second")) {
120
- const sel = extractSelector(raw, /wait for\s+"?([^"]+)"?\s+to be/i);
121
- if (sel)
122
- await page.waitForSelector(sel, { state: "visible", timeout: 15000 });
123
- }
124
- else if (lower.includes("verify") || lower.includes("check") || lower.includes("assert") || lower.includes("should")) {
125
- // Support: Verify "text", Verify I see text Foo, Verify text Foo is not displayed
126
- const isNegative = /not\s+(?:displayed|visible|present)/i.test(raw);
127
- const textMatch = raw.match(/"([^"]+)"/) ||
128
- raw.match(/(?:verify|check|assert)\s+(?:i\s+see\s+(?:text\s+)?|text\s+)?(.+?)(?:\s+is\s+(?:not\s+)?(?:displayed|visible|present))?$/i);
129
- if (textMatch) {
130
- const expected = textMatch[1].trim();
131
- if (isNegative) {
132
- const content = await page.textContent('body') || '';
133
- if (content.includes(expected))
134
- throw new Error(`Text "${expected}" should NOT be visible but was found`);
135
- }
136
- else {
137
- try {
138
- await page.waitForFunction((text) => document.body.innerText.includes(text), expected, { timeout: 10000 });
139
- }
140
- catch {
141
- throw new Error(`Expected text not found: "${expected}"`);
142
- }
143
- }
144
- }
145
- }
146
- else if (lower.includes("select") || lower.includes("choose")) {
147
- const selMatch = raw.match(/select\s+"?([^"]+?)"?\s+(?:from|in)\s+"?([^"]+?)"?\s*(?:dropdown|select|field)?$/i);
148
- if (selMatch) {
149
- try {
150
- await page.selectOption(selMatch[2].trim(), { label: selMatch[1].trim() });
151
- }
152
- catch {
153
- // fallback: click the dropdown then click the option
154
- await tryClick(page, new RegExp(escapeRegex(selMatch[2].trim()), 'i'), selMatch[2].trim());
155
- await tryClick(page, new RegExp(escapeRegex(selMatch[1].trim()), 'i'), selMatch[1].trim());
156
- }
157
- }
158
- else {
159
- await aiStepFallback(page, raw);
160
- }
161
- }
162
77
  else {
163
- // Unknown step let AI interpret and execute it
78
+ // ALL other steps: AI reads the screen and performs the action
79
+ console.log(` 🤖 AI executing: "${raw}"`);
164
80
  await aiStepFallback(page, raw);
165
81
  }
166
82
  }
167
83
  catch (err) {
168
- // If a recognised handler threw, try AI fallback before marking as failed
169
- console.log(` ⚠️ Step failed (${err.message}), trying AI fallback...`);
170
- try {
171
- await aiStepFallback(page, raw);
172
- stepError = undefined; // AI recovered it
173
- }
174
- catch (aiErr) {
175
- stepError = err.message; // Report original error
176
- result.success = false;
177
- }
84
+ stepError = err.message;
85
+ result.success = false;
178
86
  }
179
87
  // Screenshot after each step
180
88
  let screenshotB64 = "";
@@ -248,6 +156,57 @@ async function tryClick(page, nameRe, label) {
248
156
  // AI vision fallback
249
157
  return await aiClickFallback(page, label);
250
158
  }
159
+ function buildAgentPrompt(stepText, round) {
160
+ return `You are an intelligent browser test agent with vision. Your job is to look at the current screen, understand the test step intent, and decide what actions to perform.
161
+
162
+ GOAL: "${stepText}"
163
+ ROUND: ${round + 1}
164
+
165
+ ## YOUR CAPABILITIES
166
+ 1. SCREEN UNDERSTANDING — Identify every visible UI element, form field, button, link, and label on screen.
167
+ 2. INTENT MAPPING — Understand what the test step MEANS even if the wording is vague or high-level (e.g. "checkout the product" = navigate to cart → fill shipping info → complete purchase).
168
+ 3. DATA SIMULATION — If a form needs data that is not specified, INVENT realistic test data:
169
+ - Names: "John Smith" or "Test User"
170
+ - Email: "testuser@example.com"
171
+ - Phone: "555-0100"
172
+ - Address: "123 Test St, Springfield"
173
+ - Postal/ZIP: "12345"
174
+ - Credit card: "4111111111111111", expiry "12/25", CVV "123"
175
+ - Password: "Test@1234"
176
+ - Any other field: invent plausible data based on the field label
177
+
178
+ ## RETURN FORMAT
179
+ Return ONLY valid JSON — no markdown, no explanation:
180
+ {
181
+ "reasoning": "What I see on screen and what I plan to do",
182
+ "done": false,
183
+ "actions": [
184
+ {"action": "click", "selector": "SELECTOR"},
185
+ {"action": "fill", "selector": "SELECTOR", "value": "VALUE"},
186
+ {"action": "select", "selector": "SELECTOR", "value": "OPTION_LABEL"},
187
+ {"action": "verify", "text": "TEXT_TO_CHECK", "not": false},
188
+ {"action": "wait", "ms": 500}
189
+ ]
190
+ }
191
+
192
+ Set "done": true with empty "actions" array when the goal is fully accomplished.
193
+
194
+ ## SELECTOR RULES
195
+ - Prefer visible text: button:has-text("Checkout"), a:has-text("Login")
196
+ - Use id when visible: #first-name, #postal-code
197
+ - Use data attributes: [data-test="checkout-button"], [data-testid="..."]
198
+ - Use placeholder for inputs: input[placeholder="First Name"]
199
+ - Use name attribute: input[name="firstName"]
200
+ - NEVER use position or coordinates
201
+
202
+ ## SMART RULES
203
+ - Look at ALL visible form fields and fill them ALL in one round
204
+ - If you see a multi-step form, complete this step fully then click continue/next/submit
205
+ - If the goal is "checkout": cart → fill info → continue → finish
206
+ - If the goal is "register" or "sign up": fill all fields + submit
207
+ - If the goal is already accomplished (correct page shown), set done: true immediately
208
+ - Maximum 5 actions per round`;
209
+ }
251
210
  /**
252
211
  * Universal AI agentic fallback.
253
212
  * GPT-4o sees the current page screenshot, understands the GOAL of the step,
@@ -266,41 +225,11 @@ async function aiStepFallback(page, stepText) {
266
225
  const base64 = screenshotBuffer.toString('base64');
267
226
  const response = await openai.chat.completions.create({
268
227
  model: 'gpt-4o',
269
- max_tokens: 600,
228
+ max_tokens: 800,
270
229
  messages: [{
271
230
  role: 'user',
272
231
  content: [
273
- {
274
- type: 'text',
275
- text: `You are an autonomous browser test agent. Your goal is to accomplish this test step:
276
-
277
- "${stepText}"
278
-
279
- Look at the current screenshot and decide what actions are needed RIGHT NOW to make progress toward the goal.
280
-
281
- Return ONLY valid JSON (no markdown) with this structure:
282
- {
283
- "reasoning": "brief explanation of what you see and what needs to happen",
284
- "done": false,
285
- "actions": [
286
- {"action": "click", "selector": "CSS_OR_TEXT_SELECTOR"},
287
- {"action": "fill", "selector": "CSS_OR_TEXT_SELECTOR", "value": "TEXT"},
288
- {"action": "select", "selector": "CSS_OR_TEXT_SELECTOR", "value": "OPTION_TEXT"},
289
- {"action": "verify", "text": "EXPECTED_TEXT", "not": false},
290
- {"action": "wait", "ms": 500}
291
- ]
292
- }
293
-
294
- Set "done": true and empty "actions" array when the goal has been fully accomplished.
295
-
296
- Rules:
297
- - Return only the actions needed for THIS screenshot — after executing them you will see the next state
298
- - Use text-based selectors when possible: button:has-text("Checkout"), [data-test="..."], #id
299
- - For "checkout" goals: look for cart/checkout buttons and click them in sequence
300
- - For "login" goals: fill username, fill password, click login
301
- - For "add to cart" goals: find and click the Add to cart button
302
- - Maximum 3 actions per round to stay precise`
303
- },
232
+ { type: 'text', text: buildAgentPrompt(stepText, round) },
304
233
  { type: 'image_url', image_url: { url: `data:image/png;base64,${base64}` } }
305
234
  ]
306
235
  }]
@@ -343,6 +272,14 @@ Rules:
343
272
  else if (act.action === 'wait') {
344
273
  await page.waitForTimeout(act.ms || 1000);
345
274
  }
275
+ else if (act.action === 'scroll') {
276
+ if (act.selector) {
277
+ await page.locator(act.selector).first().scrollIntoViewIfNeeded();
278
+ }
279
+ else {
280
+ await page.evaluate(() => window.scrollBy(0, 400));
281
+ }
282
+ }
346
283
  }
347
284
  catch (e) {
348
285
  console.log(` ⚠️ Action failed (${e.message}), continuing to next round...`);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@cutleryapp/agent",
3
- "version": "1.0.18",
3
+ "version": "1.0.20",
4
4
  "description": "Local agent that connects your machine to the Cutlery QA platform and runs UI tests via Playwright",
5
5
  "main": "dist/cli.js",
6
6
  "bin": {