elementus-ai 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +292 -0
  3. package/elementus.js +1288 -0
  4. package/package.json +42 -0
package/elementus.js ADDED
@@ -0,0 +1,1288 @@
1
+ /**
2
+ * ╔════════════════════════════════════════════════════════════════════╗
3
+ * ║ Elementus — Self-healing element resolution for Playwright & WDIO ║
4
+ * ╚════════════════════════════════════════════════════════════════════╝
5
+ *
6
+ * When a selector breaks, elementus uses AI to find the element by
7
+ * natural-language description. Works with any action (click, fill, hover)
8
+ * and any assertion (toHaveText, toBeVisible). Supports local LLMs via
9
+ * LM Studio and cloud LLMs via Google Gemini API.
10
+ *
11
+ * ─────────────────────────────────────────────────────────────────────────
12
+ * 1. INSTALLATION
13
+ * ─────────────────────────────────────────────────────────────────────────
14
+ *
15
+ * npm install elementus
16
+ *
17
+ * ─────────────────────────────────────────────────────────────────────────
18
+ * 2. LLM PROVIDER SETUP (choose one)
19
+ * ─────────────────────────────────────────────────────────────────────────
20
+ *
21
+ * Option A — Local LLM via LM Studio (free, private, no API key):
22
+ * 1. Download LM Studio from https://lmstudio.ai
23
+ * 2. Load a vision-capable model (e.g., gemma-4-26b-a4b-it)
24
+ * 3. Start the local server (default: http://localhost:1234)
25
+ * 4. Configure:
26
+ * const el = createElementus({
27
+ * provider: 'lmstudio',
28
+ * lmStudioUrl: 'http://localhost:1234/v1/chat/completions',
29
+ * model: 'gemma-4-26b-a4b-it',
30
+ * })
31
+ *
32
+ * Option B — Google Gemini API (cloud, fast, better vision):
33
+ * 1. Get an API key from https://aistudio.google.com/apikey
34
+ * 2. Configure:
35
+ * const el = createElementus({
36
+ * provider: 'gemini',
37
+ * geminiApiKey: 'AIza...', // or set GEMINI_API_KEY env var
38
+ * geminiModel: 'gemini-2.5-flash',
39
+ * })
40
+ *
41
+ * ─────────────────────────────────────────────────────────────────────────
42
+ * 3. FRAMEWORK SETUP
43
+ * ─────────────────────────────────────────────────────────────────────────
44
+ *
45
+ * Playwright — wrap page once, add { ai } to any locator:
46
+ *
47
+ * const { createElementus } = require('elementus')
48
+ * const el = createElementus({ provider: 'gemini', geminiApiKey: '...' })
49
+ *
50
+ * // In test or fixture:
51
+ * const p = el.wrapPage(page)
52
+ * await p.locator('#btn', { ai: 'Submit order button' }).click()
53
+ * await p.locator('#email', { ai: 'Email input field' }).fill('test@test.com')
54
+ *
55
+ * // Locators WITHOUT { ai } work normally — zero overhead:
56
+ * await p.locator('#always-stable').click()
57
+ *
58
+ * Playwright fixture (recommended — wrap once for all tests):
59
+ *
60
+ * // fixtures.js
61
+ * const { test: base } = require('@playwright/test')
62
+ * const { createElementus } = require('elementus')
63
+ * const el = createElementus({ provider: 'gemini', geminiApiKey: '...' })
64
+ *
65
+ * module.exports = base.extend({
66
+ * page: async ({ page }, use) => {
67
+ * await use(el.wrapPage(page))
68
+ * }
69
+ * })
70
+ *
71
+ * // In tests — page is already wrapped:
72
+ * test('example', async ({ page }) => {
73
+ * await page.locator('#btn', { ai: 'Submit button' }).click()
74
+ * })
75
+ *
76
+ * WDIO — wrap browser once, add { ai } to any $() selector:
77
+ *
78
+ * const { createElementus } = require('elementus')
79
+ * const el = createElementus({ provider: 'lmstudio' })
80
+ *
81
+ * // In before hook or config:
82
+ * const b = el.wrapBrowser(browser)
83
+ * await b.$('#btn', { ai: 'Submit order button' }).click()
84
+ * await b.$('#email', { ai: 'Email input field' }).setValue('test@test.com')
85
+ *
86
+ * // $() calls WITHOUT { ai } work normally:
87
+ * await b.$('#always-stable').click()
88
+ *
89
+ * Appium (native Android/iOS/Flutter) — same wrapBrowser pattern:
90
+ *
91
+ * const { createElementus } = require('elementus')
92
+ * const el = createElementus({ provider: 'gemini', geminiApiKey: '...' })
93
+ *
94
+ * // In before hook:
95
+ * const d = el.wrapBrowser(driver)
96
+ * await d.$('~loginButton', { ai: 'Login button on welcome screen' }).click()
97
+ * await d.$('~emailField', { ai: 'Email input' }).setValue('test@test.com')
98
+ *
99
+ * // Works with Flutter, React Native, native Android/iOS — any Appium driver.
100
+ * // Instead of DOM scanning, Elementus parses the native element tree
101
+ * // from driver.getPageSource() (XML) and applies the same AI scoring.
102
+ *
103
+ * ─────────────────────────────────────────────────────────────────────────
104
+ * 4. API REFERENCE
105
+ * ─────────────────────────────────────────────────────────────────────────
106
+ *
107
+ * el.wrapPage(page)
108
+ * Wraps a Playwright page. Returns a proxy where page.locator(selector,
109
+ * { ai: 'description' }) auto-creates AI-fallback locators.
110
+ * Locators without { ai } pass through unchanged.
111
+ *
112
+ * el.wrapBrowser(browser)
113
+ * Wraps a WDIO browser. Returns a proxy where browser.$(selector,
114
+ * { ai: 'description' }) auto-creates AI-fallback elements.
115
+ * $() calls without { ai } pass through unchanged.
116
+ *
117
+ * el.wrap(context, locator, description)
118
+ * Low-level: wraps any single locator/element with AI fallback.
119
+ * Use wrapPage/wrapBrowser instead for cleaner code.
120
+ * - context: page (Playwright) or browser (WDIO)
121
+ * - locator: Playwright Locator or WDIO Element
122
+ * - description: natural-language element description
123
+ * Returns a Proxy that tries the original, falls back to AI on failure.
124
+ *
125
+ * el.find(context, description)
126
+ * Find element by description only (no locator needed).
127
+ * Returns a real Playwright Locator / WDIO Element for any action.
128
+ * - context: page (Playwright) or browser (WDIO)
129
+ * - description: natural-language element description
130
+ * Example:
131
+ * const btn = await el.find(page, 'Submit button')
133
+ * await btn.click()
134
+ * await expect(btn).toHaveText('Submit')
134
+ *
135
+ * el.locate(context, locator, description)
136
+ * Try locator first, fall back to AI if locator fails.
137
+ * Returns a Playwright Locator / WDIO Element.
138
+ * Respects your framework's configured action timeout.
139
+ * Example:
140
+ * const btn = await el.locate(page, page.locator('#btn'), 'Submit')
141
+ * await btn.click()
142
+ *
143
+ * el.click(context, locator, description)
144
+ * Click with optimized fallback: uses page.goto() for links (avoids
145
+ * hover/overlay issues) and JS click for buttons (no mouse movement).
146
+ * Use this for navigation clicks. For other actions, use wrap/find.
147
+ * Respects your framework's configured action timeout.
148
+ * Example:
149
+ * await el.click(page, page.locator('#nav'), 'Blog page link')
150
+ *
151
+ * ─────────────────────────────────────────────────────────────────────────
152
+ * 5. CONFIGURATION OPTIONS
153
+ * ─────────────────────────────────────────────────────────────────────────
154
+ *
155
+ * createElementus({
156
+ * // LLM Provider
157
+ * provider: 'lmstudio', // 'lmstudio' | 'gemini'
158
+ *
159
+ * // LM Studio (when provider = 'lmstudio')
160
+ * lmStudioUrl: 'http://localhost:1234/v1/chat/completions',
161
+ * model: 'gemma-4-26b-a4b-it',
162
+ *
163
+ * // Gemini (when provider = 'gemini')
164
+ * geminiApiKey: null, // or GEMINI_API_KEY env var
165
+ * geminiModel: 'gemini-2.5-flash',
166
+ *
167
+ * // Behavior
168
+ * maxCandidates: 20, // max elements sent to LLM for disambiguation
169
+ * visionMaxWidth: 1280, // max screenshot width (px) sent to vision LLM
170
+ *
171
+ * // Debugging
172
+ * debug: false, // save screenshots to debugDir
173
+ * debugDir: null, // directory for debug screenshots, e.g. './debug' (nothing is saved while this is null)
174
+ *
175
+ * // Custom stop words (replace the defaults when provided)
176
+ * stopWords: null, // Set of words to ignore in descriptions
177
+ * })
178
+ *
179
+ * ─────────────────────────────────────────────────────────────────────────
180
+ * 6. HOW IT WORKS (fallback pipeline)
181
+ * ─────────────────────────────────────────────────────────────────────────
182
+ *
183
+ * When a locator/selector fails, elementus runs this pipeline:
184
+ *
185
+ * Step 1: Locator/Selector
186
+ * Try the original selector. If it works, done — zero overhead.
187
+ *
188
+ * Step 2: DOM Scoring
189
+ * Scan all interactive elements on the page. Score each by keyword
190
+ * and phrase relevance to the description. If one clear winner, use it.
191
+ * If multiple tied: send top candidates to LLM for disambiguation.
192
+ * If all identical (e.g., 10x "Edit" buttons): use positional LLM
193
+ * with coordinates ("first Edit button near the top").
194
+ *
195
+ * Step 3: Vision (last resort)
196
+ * Take a full-page screenshot with a 3x3 labeled grid overlay.
197
+ * Ask the vision LLM which region contains the target element.
198
+ * Scroll to that region, re-scan DOM. If still unresolved,
199
+ * ask LLM for precise pixel coordinates.
200
+ *
201
+ * ─────────────────────────────────────────────────────────────────────────
202
+ * 7. TIPS FOR WRITING DESCRIPTIONS
203
+ * ─────────────────────────────────────────────────────────────────────────
204
+ *
205
+ * Good descriptions use words that appear in or near the element:
206
+ * 'Submit order button' — matches <button>Submit</button>
207
+ * 'Email input field' — matches <input> near "Email" label
208
+ * 'Privacy Policy footer link' — matches <a>Privacy Policy</a>
209
+ *
210
+ * For identical elements, add positional context:
211
+ * 'first Edit button near the top'
212
+ * 'Delete button in the third row'
213
+ * 'Add to Cart for the last product'
214
+ *
215
+ * Avoid vague descriptions:
216
+ * BAD: 'the button' — matches every button
217
+ * BAD: 'click here' — no useful keywords
218
+ * GOOD: 'Save Changes button' — specific, matchable text
219
+ *
220
+ * ─────────────────────────────────────────────────────────────────────────
221
+ * 8. WHICH API TO USE FOR WHAT
222
+ * ─────────────────────────────────────────────────────────────────────────
223
+ *
224
+ * "I want to click a link/button":
225
+ * → Use wrapPage + { ai } or el.click
226
+ * await p.locator('#btn', { ai: 'Submit order' }).click()
227
+ *
228
+ * "I want to fill an input":
229
+ * → Use wrapPage + { ai }
230
+ * await p.locator('#email', { ai: 'Email input' }).fill('a@b.com')
231
+ *
232
+ * "I want to read text or an attribute":
233
+ * → Use wrapPage + { ai } or el.find
234
+ * const text = await p.locator('#price', { ai: 'Product price' }).textContent()
235
+ * const href = await p.locator('#link', { ai: 'Blog link' }).getAttribute('href')
236
+ *
237
+ * "I want to assert (expect) on an element":
238
+ * → Use el.find — returns a real locator you can pass to expect()
239
+ * const btn = await el.find(page, 'Submit order button')
240
+ * await expect(btn).toBeVisible()
241
+ * await expect(btn).toHaveText('Submit')
242
+ * await expect(btn).toHaveAttribute('href', '/checkout')
243
+ *
244
+ * "I want to check isVisible / isEnabled (boolean result)":
245
+ * → Just use { ai } — handled automatically. Boolean query methods
246
+ * (isVisible, isEnabled, isChecked, isHidden, isEditable) are detected
247
+ * by the Proxy and resolve via AI first, then query the real element.
248
+ * const vis = await p.locator('#gone', { ai: 'Submit' }).isVisible() // true
249
+ * const on = await p.locator('#gone', { ai: 'Checkbox' }).isChecked() // true/false
250
+ *
251
+ * "I want to navigate to a page via link click":
252
+ * → Use el.click — it uses page.goto(href) which is faster and avoids
253
+ * CSS hover/overlay issues that regular click() can trigger.
254
+ * await el.click(page, page.locator('#blog'), 'Blog page link')
255
+ *
256
+ * ─────────────────────────────────────────────────────────────────────────
257
+ */
258
+
259
+ const fs = require('fs')
260
+ const path = require('path')
261
+
262
+ // ─────────────────────────────────────────────────────────────────────────────
263
+ // Defaults & constants
264
+ // ─────────────────────────────────────────────────────────────────────────────
265
+
266
/**
 * Baseline configuration. createElementus() spreads the caller's options
 * over this object, so any key given by the caller wins.
 */
const DEFAULTS = {
  // LLM provider selection: 'lmstudio' | 'gemini'
  provider: 'lmstudio',

  // LM Studio settings
  lmStudioUrl: 'http://localhost:1234/v1/chat/completions',
  model: 'gemma-4-26b-a4b-it',

  // Gemini settings
  geminiApiKey: null,
  geminiModel: 'gemini-2.5-flash',

  // Behaviour
  maxCandidates: 20,

  // Debugging
  debug: false,
  debugDir: null,

  // Description parsing
  stopWords: null,

  // Vision
  visionMaxWidth: 1280,
}

/**
 * Words dropped from a description before keyword scoring: common English
 * function words plus generic UI vocabulary ("button", "link", ...).
 */
const DEFAULT_STOP_WORDS = new Set([
  ...'the a an and or but in on at to for of'.split(' '),
  ...'with by from is it its this that be are was'.split(' '),
  ...'were has have had do does did will would not'.split(' '),
  ...'link button click press navigate navigation nav'.split(' '),
  ...'page menu top bottom footer header sidebar bar'.split(' '),
  ...'find locate element item icon label text section'.split(' '),
])

// Tag names and ARIA roles treated as interactive.
const INTERACTIVE_TAGS = 'a button input select textarea label summary'.split(' ')
const INTERACTIVE_ROLES = [
  'button', 'link',
  'menuitem', 'menuitemcheckbox', 'menuitemradio',
  'tab', 'checkbox', 'radio', 'option', 'combobox',
  'switch', 'treeitem', 'gridcell',
]

// CSS selector list used for the fast DOM pass: plain tags first, then role attributes.
const INTERACTIVE_SELECTORS = [
  ...['a', 'button', 'input', 'select', 'textarea'],
  ...['button', 'link', 'menuitem', 'tab', 'checkbox', 'radio'].map(role => `[role="${role}"]`),
].join(', ')

// 3x3 grid of region names, indexed as REGION_LABELS[row][column].
const REGION_LABELS = ['top', 'middle', 'bottom'].map(
  row => ['left', 'center', 'right'].map(column => `${row}-${column}`)
)
298
+
299
+ // ─────────────────────────────────────────────────────────────────────────────
300
+ // Factory
301
+ // ─────────────────────────────────────────────────────────────────────────────
302
+
303
+ /**
304
+ * Create an Elementus instance with the given configuration.
305
+ *
306
+ * @param {Object} userConfig
307
+ * @param {'lmstudio'|'gemini'} [userConfig.provider='lmstudio'] - LLM provider
308
+ * @param {string} [userConfig.lmStudioUrl='http://localhost:1234/v1/chat/completions'] - LM Studio endpoint
309
+ * @param {string} [userConfig.model='gemma-4-26b-a4b-it'] - LM Studio model name
310
+ * @param {string|null} [userConfig.geminiApiKey=null] - Google Gemini API key (or GEMINI_API_KEY env var)
311
+ * @param {string} [userConfig.geminiModel='gemini-2.5-flash'] - Gemini model ID
312
+ * @param {number} [userConfig.maxCandidates=20] - max elements sent to LLM for disambiguation
313
+ * @param {boolean} [userConfig.debug=false] - save debug screenshots
314
+ * @param {string|null} [userConfig.debugDir=null] - directory for debug screenshots
315
+ * @param {Set<string>|null} [userConfig.stopWords=null] - custom stop words (replaces defaults)
316
+ * @param {number} [userConfig.visionMaxWidth=1280] - max screenshot width (px) sent to vision LLM
317
+ * @returns {{ wrap, wrapPage, wrapBrowser, locate, find, click }}
318
+ */
319
+ function createElementus(userConfig = {}) {
320
+ const config = { ...DEFAULTS, ...userConfig }
321
+ const stopWords = config.stopWords || DEFAULT_STOP_WORDS
322
+
323
+ // ── Driver adapter — auto-detects Playwright vs WDIO ─────────────────
324
+
325
/**
 * Run `fn` through the driver's script runner.
 * Uses ctx.evaluate() when present (Playwright), otherwise ctx.execute() (WDIO).
 * `args` is forwarded only when it is not `undefined`.
 *
 * @param {Object} ctx - object exposing evaluate() or execute()
 * @param {Function} fn - function handed to the runner
 * @param {*} [args] - optional single argument for `fn`
 * @returns {*} whatever the runner returns
 * @throws {Error} when ctx has neither evaluate() nor execute()
 */
function _eval(ctx, fn, args) {
  const runner = ['evaluate', 'execute'].find(name => typeof ctx[name] === 'function')
  if (!runner) {
    throw new Error('Context must have evaluate() (Playwright) or execute() (WDIO)')
  }
  return args === undefined ? ctx[runner](fn) : ctx[runner](fn, args)
}
335
+
336
/**
 * Capture a PNG screenshot through whichever API the context offers.
 * - ctx.screenshot() (Playwright): called with { type: 'png', fullPage, scale: 'css' };
 *   its result is used as the buffer and base64-encoded.
 * - ctx.takeScreenshot() (WDIO): called without arguments, so `fullPage` is ignored;
 *   its result is treated as a base64 string.
 *
 * @param {Object} ctx - page or browser object
 * @param {boolean} [fullPage=false] - forwarded to ctx.screenshot() only
 * @returns {Promise<{buffer: Buffer, base64: string}>}
 * @throws {Error} when ctx has neither screenshot() nor takeScreenshot()
 */
async function _screenshot(ctx, fullPage = false) {
  const usePlaywright = typeof ctx.screenshot === 'function'
  if (!usePlaywright && typeof ctx.takeScreenshot !== 'function') {
    throw new Error('Context must have screenshot() (Playwright) or takeScreenshot() (WDIO)')
  }

  if (usePlaywright) {
    const png = await ctx.screenshot({ type: 'png', fullPage, scale: 'css' })
    return { buffer: png, base64: png.toString('base64') }
  }

  const encoded = await ctx.takeScreenshot()
  return { buffer: Buffer.from(encoded, 'base64'), base64: encoded }
}
349
+
350
/**
 * Navigate the context to `url`.
 * Order of preference: ctx.goto(url, { waitUntil: 'load' }) (Playwright),
 * then ctx.url(url) (WDIO). If neither exists and the context is classified
 * as native by _isNative(), the call is a silent no-op.
 *
 * @param {Object} ctx - page, browser or driver
 * @param {string} url - destination
 * @returns {Promise<*>} result of the driver's navigation call, or undefined for native contexts
 * @throws {Error} when no navigation method exists and the context is not native
 */
async function _goto(ctx, url) {
  const supports = (method) => typeof ctx[method] === 'function'

  if (supports('goto')) return ctx.goto(url, { waitUntil: 'load' })
  if (supports('url')) return ctx.url(url)

  // Native apps have no URL navigation; anything else is a usage error.
  if (!_isNative(ctx)) {
    throw new Error('Context must have goto() (Playwright) or url() (WDIO)')
  }
}
357
+
358
/**
 * Pause for `ms` milliseconds using ctx.waitForTimeout() if available,
 * otherwise ctx.pause(). Resolves immediately when the context has neither.
 *
 * @param {Object} ctx - page or browser object
 * @param {number} ms - duration passed to the driver
 * @returns {Promise<*>}
 */
async function _wait(ctx, ms) {
  for (const method of ['waitForTimeout', 'pause']) {
    if (typeof ctx[method] === 'function') return ctx[method](ms)
  }
}
362
+
363
/**
 * Build an element handle for `selector`: ctx.locator(selector) when the
 * context has locator() (Playwright), otherwise ctx.$(selector) (WDIO).
 *
 * @param {Object} ctx - page or browser object
 * @param {string} selector - selector passed through unchanged
 * @returns {Promise<*>} the driver's locator/element
 * @throws {Error} when ctx has neither locator() nor $()
 */
async function _makeLocator(ctx, selector) {
  const hasLocator = typeof ctx.locator === 'function'
  if (hasLocator) return ctx.locator(selector)

  const hasDollar = typeof ctx.$ === 'function'
  if (hasDollar) return ctx.$(selector)

  throw new Error('Context must have locator() (Playwright) or $() (WDIO)')
}
369
+
370
/**
 * Report whether the context looks like a Playwright page, judged solely by
 * the presence of an evaluate() method.
 *
 * @param {Object} ctx - page or browser object
 * @returns {boolean}
 */
function _isPlaywright(ctx) {
  const evaluateKind = typeof ctx.evaluate
  return evaluateKind === 'function'
}
373
+
374
/**
 * Heuristic for "Appium native session": the context exposes getPageSource()
 * and does not expose evaluate().
 *
 * NOTE(review): the check does not look at execute(); a regular WDIO web
 * session may also expose getPageSource() - confirm web sessions cannot be
 * classified as native here.
 *
 * @param {Object} ctx - page, browser or driver
 * @returns {boolean}
 */
function _isNative(ctx) {
  if (typeof ctx.getPageSource !== 'function') return false
  return typeof ctx.evaluate !== 'function'
}
380
+
381
+ // ── LLM helpers — multi-provider ─────────────────────────────────────
382
+
383
/**
 * Send a text-only chat completion request to config.lmStudioUrl and return
 * the trimmed content of the first choice.
 *
 * @param {string} prompt - user message content
 * @param {number} maxTokens - sent as max_tokens
 * @returns {Promise<string>}
 * @throws {Error} `LM Studio <status>: <body>` when the response is not ok
 */
async function _lmStudioText(prompt, maxTokens) {
  const payload = {
    model: config.model,
    messages: [{ role: 'user', content: prompt }],
    max_tokens: maxTokens,
    temperature: 0,
  }
  const response = await fetch(config.lmStudioUrl, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  })
  if (!response.ok) {
    const detail = await response.text()
    throw new Error(`LM Studio ${response.status}: ${detail}`)
  }
  const data = await response.json()
  return data.choices[0].message.content.trim()
}
396
+
397
/**
 * Send a prompt plus one PNG image (as a data URL) to config.lmStudioUrl and
 * return the trimmed content of the first choice.
 *
 * @param {string} prompt - text part of the user message
 * @param {string} base64Image - base64 PNG data, without the data-URL prefix
 * @param {number} maxTokens - sent as max_tokens
 * @returns {Promise<string>}
 * @throws {Error} `LM Studio <status>: <body>` when the response is not ok
 */
async function _lmStudioVision(prompt, base64Image, maxTokens) {
  const textPart = { type: 'text', text: prompt }
  const imagePart = { type: 'image_url', image_url: { url: `data:image/png;base64,${base64Image}` } }
  const payload = {
    model: config.model,
    messages: [{ role: 'user', content: [textPart, imagePart] }],
    max_tokens: maxTokens,
    temperature: 0,
  }
  const response = await fetch(config.lmStudioUrl, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  })
  if (!response.ok) {
    const detail = await response.text()
    throw new Error(`LM Studio ${response.status}: ${detail}`)
  }
  const data = await response.json()
  return data.choices[0].message.content.trim()
}
413
+
414
/**
 * Build the Gemini generateContent endpoint URL for config.geminiModel.
 * The API key comes from config.geminiApiKey, falling back to the
 * GEMINI_API_KEY environment variable.
 *
 * The key is percent-encoded so that a value containing reserved characters
 * (space, '&', '=', '#', ...) cannot corrupt the query string. URL-safe keys
 * are emitted unchanged.
 *
 * @returns {string} fully-qualified request URL including the key
 * @throws {Error} when no API key is configured
 */
function _geminiUrl() {
  const key = config.geminiApiKey || process.env.GEMINI_API_KEY
  if (!key) throw new Error('Gemini API key required: set geminiApiKey or GEMINI_API_KEY env var')
  return `https://generativelanguage.googleapis.com/v1beta/models/${config.geminiModel}:generateContent?key=${encodeURIComponent(key)}`
}
419
+
420
/**
 * Pull the answer text out of a Gemini generateContent response.
 * Scans the first candidate's parts from last to first and returns the
 * trimmed text of the last part that has text and is not flagged `thought`.
 * If every text part is a thought, the last text part of any kind is used.
 *
 * @param {Object} data - parsed response JSON
 * @returns {string} trimmed text
 * @throws {Error} 'Unexpected Gemini response structure' when candidates[0].content.parts is missing
 *   (the first 500 characters of the raw response are logged first)
 * @throws {Error} 'No text in Gemini response' when no part carries text
 */
function _geminiExtractText(data) {
  const parts = data.candidates?.[0]?.content?.parts
  if (!parts) {
    console.log(`[LLM] Gemini raw response: ${JSON.stringify(data).slice(0, 500)}`)
    throw new Error('Unexpected Gemini response structure')
  }

  // Walk backwards so the final matching part wins.
  const lastTextWhere = (accept) => {
    for (let idx = parts.length - 1; idx >= 0; idx -= 1) {
      const part = parts[idx]
      if (accept(part)) return part.text.trim()
    }
    return null
  }

  // Thinking models return thought + output parts: prefer real output, then any text.
  const answer = lastTextWhere(part => part.text && !part.thought) ??
    lastTextWhere(part => part.text)
  if (answer === null) throw new Error('No text in Gemini response')
  return answer
}
436
+
437
/**
 * Send a text-only generateContent request to the URL from _geminiUrl() and
 * return the text extracted by _geminiExtractText().
 * The request asks for JSON output (responseMimeType) with temperature 0 and
 * a thinking budget of 0.
 *
 * @param {string} prompt - prompt text
 * @param {number} maxTokens - sent as generationConfig.maxOutputTokens
 * @returns {Promise<string>}
 * @throws {Error} `Gemini <status>: <body>` when the response is not ok
 */
async function _geminiText(prompt, maxTokens) {
  const generationConfig = {
    maxOutputTokens: maxTokens,
    temperature: 0,
    responseMimeType: 'application/json',
    thinkingConfig: { thinkingBudget: 0 },
  }
  const payload = { contents: [{ parts: [{ text: prompt }] }], generationConfig }

  const response = await fetch(_geminiUrl(), {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  })
  if (response.ok) return _geminiExtractText(await response.json())

  const detail = await response.text()
  throw new Error(`Gemini ${response.status}: ${detail}`)
}
449
+
450
/**
 * Send a prompt plus one inline PNG image to the URL from _geminiUrl() and
 * return the text extracted by _geminiExtractText().
 * Uses the same generation settings as the text request: JSON output,
 * temperature 0, thinking budget 0.
 *
 * @param {string} prompt - prompt text
 * @param {string} base64Image - base64 PNG data
 * @param {number} maxTokens - sent as generationConfig.maxOutputTokens
 * @returns {Promise<string>}
 * @throws {Error} `Gemini <status>: <body>` when the response is not ok
 */
async function _geminiVision(prompt, base64Image, maxTokens) {
  const generationConfig = {
    maxOutputTokens: maxTokens,
    temperature: 0,
    responseMimeType: 'application/json',
    thinkingConfig: { thinkingBudget: 0 },
  }
  const imagePart = { inline_data: { mime_type: 'image/png', data: base64Image } }
  const payload = { contents: [{ parts: [{ text: prompt }, imagePart] }], generationConfig }

  const response = await fetch(_geminiUrl(), {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  })
  if (response.ok) return _geminiExtractText(await response.json())

  const detail = await response.text()
  throw new Error(`Gemini ${response.status}: ${detail}`)
}
465
+
466
/**
 * Ask the configured provider a text-only question and log the round-trip time.
 * Routes to Gemini when config.provider === 'gemini', otherwise to LM Studio.
 *
 * @param {string} prompt - prompt text
 * @param {number} [maxTokens=131072] - output token cap forwarded to the provider
 * @returns {Promise<string>} provider's text answer
 */
async function askLLMText(prompt, maxTokens = 131072) {
  const startedAt = Date.now()
  const send = config.provider === 'gemini' ? _geminiText : _lmStudioText
  const answer = await send(prompt, maxTokens)
  console.log(`[LLM] Text response: ${Date.now() - startedAt}ms`)
  return answer
}
472
+
473
/**
 * Ask the configured provider a question about an image and log the
 * round-trip time. Routes to Gemini when config.provider === 'gemini',
 * otherwise to LM Studio.
 *
 * @param {string} prompt - prompt text
 * @param {string} base64Image - base64 PNG data
 * @param {number} [maxTokens=131072] - output token cap forwarded to the provider
 * @returns {Promise<string>} provider's text answer
 */
async function askLLMVision(prompt, base64Image, maxTokens = 131072) {
  const startedAt = Date.now()
  const send = config.provider === 'gemini' ? _geminiVision : _lmStudioVision
  const answer = await send(prompt, base64Image, maxTokens)
  console.log(`[LLM] Vision response: ${Date.now() - startedAt}ms`)
  return answer
}
479
+
480
/**
 * Extract and parse the first JSON object embedded in `content`.
 * Scanning starts at the first '{' and stops at its matching '}', so
 * surrounding prose or markdown fences are ignored.
 *
 * Braces inside JSON string literals are skipped (including after escaped
 * quotes), so a value such as {"text": "close } brace"} is extracted whole
 * instead of being cut at the brace inside the string.
 *
 * @param {string} content - raw LLM output
 * @returns {*} the parsed object
 * @throws {Error} 'No JSON found in: ...' when content has no '{'
 * @throws {Error} 'Unbalanced JSON in: ...' when the first object never closes
 * @throws {SyntaxError} from JSON.parse when the extracted span is not valid JSON
 */
function parseJSON(content) {
  const start = content.indexOf('{')
  if (start === -1) throw new Error(`No JSON found in: ${content}`)

  let depth = 0
  let inString = false
  let escaped = false
  for (let i = start; i < content.length; i++) {
    const ch = content[i]

    if (inString) {
      // Inside a string only an unescaped quote matters; braces are plain text.
      if (escaped) escaped = false
      else if (ch === '\\') escaped = true
      else if (ch === '"') inString = false
      continue
    }

    if (ch === '"') {
      inString = true
    } else if (ch === '{') {
      depth++
    } else if (ch === '}') {
      depth--
      if (depth === 0) return JSON.parse(content.slice(start, i + 1))
    }
  }
  throw new Error(`Unbalanced JSON in: ${content}`)
}
493
+
494
/**
 * Write a debug artifact into config.debugDir, creating the directory
 * (recursively) if needed. Does nothing unless config.debug is truthy and
 * config.debugDir is set.
 *
 * @param {string} filename - file name inside debugDir
 * @param {Buffer|string} buffer - data handed to fs.writeFileSync
 * @returns {void}
 */
function saveDebug(filename, buffer) {
  const enabled = Boolean(config.debug && config.debugDir)
  if (!enabled) return

  const target = path.join(config.debugDir, filename)
  fs.mkdirSync(config.debugDir, { recursive: true })
  fs.writeFileSync(target, buffer)
}
499
+
500
/**
 * Downscale a screenshot to at most config.visionMaxWidth pixels wide.
 * Resizing is done inside the page (via _eval) by drawing the PNG onto a
 * canvas, so no image library is needed on the Node side.
 *
 * The in-page promise rejects if the image fails to load. Without that, a
 * bad PNG would leave the promise pending forever and stall the fallback
 * pipeline.
 *
 * @param {Object} ctx - page or browser used to run the in-page resize
 * @param {{base64: string}} shot - screenshot as returned by _screenshot()
 * @param {number} origWidth - width of the screenshot in pixels
 * @param {number} origHeight - height of the screenshot in pixels
 * @returns {Promise<{base64: string, scale: number}>} scale is origWidth / visionMaxWidth,
 *   or 1 when the screenshot is already narrow enough and is returned unchanged
 */
async function _resizeScreenshot(ctx, shot, origWidth, origHeight) {
  const maxW = config.visionMaxWidth
  if (origWidth <= maxW) return { base64: shot.base64, scale: 1 }

  const scale = origWidth / maxW
  const newH = Math.round(origHeight / scale)

  // This callback runs in the page, so it must not reference outer variables.
  const resized = await _eval(ctx, ({ b64, w, h }) => {
    const img = new Image()
    const canvas = document.createElement('canvas')
    canvas.width = w; canvas.height = h
    return new Promise((resolve, reject) => {
      img.onload = () => {
        canvas.getContext('2d').drawImage(img, 0, 0, w, h)
        resolve(canvas.toDataURL('image/png').split(',')[1])
      }
      img.onerror = () => reject(new Error('Failed to load screenshot for resizing'))
      img.src = 'data:image/png;base64,' + b64
    })
  }, { b64: shot.base64, w: maxW, h: newH })

  console.log(`[Vision] Resized screenshot: ${origWidth}×${origHeight} → ${maxW}×${newH} (scale ${scale.toFixed(2)}x)`)
  return { base64: resized, scale }
}
520
+
521
+ // ── Native app XML parsing (Appium) ───────────────────────────────
522
+
523
// Lower-cased native element type names treated as interactive when parsing
// Appium page source. Matched against the lower-cased XML tag name.
const NATIVE_INTERACTIVE = new Set([
  // Android widgets
  ...[
    'button', 'imagebutton', 'edittext', 'checkbox', 'radiobutton', 'switch',
    'togglebutton', 'spinner', 'imageview', 'textview',
  ].map(widget => `android.widget.${widget}`),
  'android.view.view',
  // iOS XCUIElementType* names
  ...[
    'button', 'textfield', 'securetextfield', 'switch', 'picker', 'image',
    'statictext', 'other', 'cell', 'link',
  ].map(kind => `xcuielementtype${kind}`),
])
536
+
537
/**
 * Lightweight parser for Appium's getPageSource() XML. Returns one record
 * per element that has text, a usable position and counts as interactive.
 *
 * Changes from the previous implementation:
 * - Attribute lookup respects name boundaries. A plain indexOf(name + '="')
 *   let `x` match inside `index="..."` and `clickable` match inside
 *   `long-clickable="..."`, producing wrong coordinates or flags depending on
 *   attribute order.
 * - An unterminated attribute value yields null instead of an arbitrary slice.
 * - XML character entities in attribute values (&amp;, &quot;, &#10;, ...) are
 *   decoded, so text and identifiers match what the driver reports.
 *
 * @param {string} xmlSource - XML returned by the driver
 * @returns {Array<{text: string, tag: string, role: string, href: null,
 *   docX: number, docY: number, _resourceId: (string|null),
 *   _accessibilityId: (string|null), _xpath: null}>}
 */
function _parseNativeXml(xmlSource) {
  const elements = []
  // Match self-closing and open tags with attributes
  const tagRegex = /<([a-zA-Z0-9._]+)\s([^>]*?)\/?>|<([a-zA-Z0-9._]+)\s([^>]*?)>[^<]*<\/\3>/g

  const NAMED_ENTITIES = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" }
  // Single-pass decode so "&amp;lt;" becomes "&lt;" and is not decoded twice.
  const decodeEntities = (value) => value.replace(
    /&(#x[0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g,
    (whole, body) => {
      if (body[0] !== '#') return NAMED_ENTITIES[body]
      const isHex = body[1] === 'x' || body[1] === 'X'
      const code = isHex ? Number.parseInt(body.slice(2), 16) : Number.parseInt(body.slice(1), 10)
      // Leave out-of-range references untouched rather than throwing.
      return code <= 0x10FFFF ? String.fromCodePoint(code) : whole
    }
  )

  let match
  while ((match = tagRegex.exec(xmlSource)) !== null) {
    const tagName = (match[1] || match[3] || '').toLowerCase()
    const attrs = match[2] || match[4] || ''

    // Read one attribute value; the name must start the attribute list or follow whitespace.
    const get = (name) => {
      const needle = name + '="'
      let from = 0
      for (;;) {
        const i = attrs.indexOf(needle, from)
        if (i === -1) return null
        if (i === 0 || /\s/.test(attrs[i - 1])) {
          const start = i + needle.length
          const end = attrs.indexOf('"', start)
          if (end === -1) return null
          return decodeEntities(attrs.substring(start, end))
        }
        from = i + 1
      }
    }

    // Get text from various attribute names across platforms
    const text = (get('text') || get('content-desc') || get('label') ||
      get('name') || get('value') || '').trim()
    if (!text) continue

    // Get bounds — Android: bounds="[x1,y1][x2,y2]", iOS: x,y,width,height attrs
    let docX = 0
    let docY = 0
    const bounds = get('bounds')
    if (bounds) {
      const bm = bounds.match(/\[(\d+),(\d+)\]\[(\d+),(\d+)\]/)
      if (bm) {
        docX = Math.round((+bm[1] + +bm[3]) / 2)
        docY = Math.round((+bm[2] + +bm[4]) / 2)
      }
    } else {
      const x = +(get('x') || 0)
      const y = +(get('y') || 0)
      const w = +(get('width') || 0)
      const h = +(get('height') || 0)
      if (w === 0 || h === 0) continue
      docX = Math.round(x + w / 2)
      docY = Math.round(y + h / 2)
    }

    // A centre at (or before) the origin means no usable position was found.
    if (docX <= 0 && docY <= 0) continue

    // Determine if interactive (by type or clickable attribute).
    // NOTE(review): enabled="true" alone also qualifies an element here, which
    // presumably admits most elements on screen - confirm this is intended.
    const clickable = get('clickable') === 'true' || get('enabled') === 'true'
    const isInteractive = NATIVE_INTERACTIVE.has(tagName) || clickable
    if (!isInteractive) continue

    elements.push({
      text: text.replace(/\s+/g, ' '),
      tag: tagName.split('.').pop(), // 'android.widget.Button' → 'button'
      role: get('class') || tagName,
      href: null, // native apps don't have hrefs
      docX,
      docY,
      // Native-specific: store identifiers for locator building
      _resourceId: get('resource-id') || null,
      _accessibilityId: get('content-desc') || get('accessibility-id') || get('label') || null,
      _xpath: null, // set later if needed
    })
  }

  return elements
}
606
+
607
/**
 * Fetch the native page source from the driver and parse it into element
 * records with _parseNativeXml(). Logs how many elements were parsed.
 *
 * @param {Object} ctx - driver exposing getPageSource()
 * @returns {Promise<Array<Object>>} parsed element records
 */
async function getAllElementsNative(ctx) {
  const parsed = _parseNativeXml(await ctx.getPageSource())
  console.log(`[Native] Parsed ${parsed.length} interactive elements from page source`)
  return parsed
}
613
+
614
+ // Build an Appium locator from native element data (no DOM attribute stamping)
615
/**
 * Turn a parsed native element record into a driver element via ctx.$().
 * Selector priority:
 *   1. `~<accessibilityId>` when the record has _accessibilityId
 *   2. Android UiSelector resourceId(...) when it has _resourceId
 *   3. `~<text>` if that lookup succeeds and the element exists
 *   4. Android UiSelector text(...) with double quotes escaped
 *
 * @param {Object} ctx - driver exposing $()
 * @param {Object} element - record produced by the native XML parser
 * @returns {Promise<*>} the element returned by ctx.$()
 */
async function markByElementNative(ctx, element) {
  const { _accessibilityId: accessibilityId, _resourceId: resourceId } = element

  if (accessibilityId) {
    console.log(`[Resolve] Native: accessibility-id "${accessibilityId}"`)
    return ctx.$(`~${accessibilityId}`)
  }

  if (resourceId) {
    console.log(`[Resolve] Native: resource-id "${resourceId}"`)
    return ctx.$(`android=new UiSelector().resourceId("${resourceId}")`)
  }

  // No stable identifier: fall back to the visible text.
  console.log(`[Resolve] Native: text "${element.text}"`)
  const quotedText = element.text.replace(/"/g, '\\"')

  // The text may double as an accessibility id (cross-platform); a failed lookup is not fatal.
  const byAccessibility = await ctx.$(`~${element.text}`).catch(() => null)
  const usable = byAccessibility && await byAccessibility.isExisting()
  if (usable) return byAccessibility

  // Android UiSelector fallback
  return ctx.$(`android=new UiSelector().text("${quotedText}")`)
}
634
+
635
+ // ── DOM scanning (web) ───────────────────────────────────────────────
636
+
637
/**
 * Collect candidate elements from the current screen.
 * Native contexts (per _isNative) are delegated to getAllElementsNative().
 * For web contexts a scan runs inside the page via _eval and returns, for
 * every qualifying element: text, tag, role, href and its centre in document
 * coordinates (docX, docY).
 *
 * An element qualifies when it has a non-zero bounding box, its horizontal
 * centre lies within [0, window.innerWidth], and its trimmed textContent is
 * not empty.
 *
 * NOTE(review): text comes from textContent only, so elements without text
 * nodes (e.g. <input>) are dropped - confirm that is intended.
 *
 * @param {Object} ctx - page, browser or native driver
 * @returns {Promise<Array<Object>>} element records; interactive-selector matches first,
 *   then cursor:pointer elements
 */
async function getAllElements(ctx) {
  // Dispatch: native app → parse XML, web → evaluate JS in browser
  if (_isNative(ctx)) return getAllElementsNative(ctx)

  // Runs in the page: must be self-contained (no references to outer scope).
  const scanPage = ({ selectors }) => {
    const describe = (node) => {
      const box = node.getBoundingClientRect()
      if (box.width === 0 || box.height === 0) return null

      const centerX = Math.round(box.left + window.scrollX + box.width / 2)
      if (centerX < 0 || centerX > window.innerWidth) return null

      const label = node.textContent.trim().replace(/\s+/g, ' ')
      if (!label) return null

      const centerY = Math.round(box.top + window.scrollY + box.height / 2)
      return {
        text: label,
        tag: node.tagName.toLowerCase(),
        role: node.getAttribute('role') || null,
        href: node.getAttribute('href') || null,
        docX: centerX,
        docY: centerY,
      }
    }

    const visited = new Set()
    const found = []
    const collect = (node) => {
      const info = describe(node)
      if (info) found.push(info)
    }

    // Fast pass: interactive selectors + onclick + tabindex (no getComputedStyle)
    for (const node of document.querySelectorAll(selectors + ',[onclick],[tabindex]')) {
      if (visited.has(node)) continue
      visited.add(node)
      collect(node)
    }

    // Slow pass: cursor:pointer elements not caught by selectors
    for (const node of document.querySelectorAll('*')) {
      if (visited.has(node)) continue
      if (window.getComputedStyle(node).cursor !== 'pointer') continue
      collect(node)
    }

    return found
  }

  return _eval(ctx, scanPage, { selectors: INTERACTIVE_SELECTORS })
}
677
+
678
+ // ── Scoring ──────────────────────────────────────────────────────────
679
+
680
/**
 * Split a description into lower-case tokens.
 * Every character other than a-z, 0-9, '&' and space is replaced by a
 * space; tokens of a single character (or empty) are discarded.
 *
 * @param {string} description - natural-language element description
 * @returns {string[]} tokens in their original order
 */
function _normalizeWords(description) {
  const cleaned = description.toLowerCase().replace(/[^a-z0-9& ]/g, ' ')
  const tokens = []
  for (const token of cleaned.split(/\s+/)) {
    if (token.length > 1) tokens.push(token)
  }
  return tokens
}
683
+
684
/**
 * Derive scoring terms from a description.
 * - keywords: every normalized token that is not a stop word
 * - phrases: each run of two or more consecutive non-stop-word tokens,
 *   joined with single spaces (a stop word ends the current run)
 *
 * @param {string} description - natural-language element description
 * @returns {{keywords: string[], phrases: string[]}}
 */
function extractKeywordsAndPhrases(description) {
  const keywords = []
  const phrases = []
  let streak = []

  // Close the current run, keeping it only if it spans at least two words.
  const flush = () => {
    if (streak.length >= 2) phrases.push(streak.join(' '))
    streak = []
  }

  for (const token of _normalizeWords(description)) {
    if (stopWords.has(token)) {
      flush()
      continue
    }
    keywords.push(token)
    streak.push(token)
  }
  flush()

  return { keywords, phrases }
}
695
+
696
/**
 * Score an element against the description terms.
 * Each phrase found in the element's lower-cased text or href adds 3;
 * each keyword found adds 1.
 *
 * @param {{_ltext: string, _lhref: string}} el - element with pre-lowercased text/href
 * @param {string[]} keywords - single-word terms
 * @param {string[]} phrases - multi-word terms
 * @returns {number} total score (0 when nothing matches)
 */
function scoreCandidate(el, keywords, phrases) {
  const appears = (term) => el._ltext.includes(term) || el._lhref.includes(term)

  let total = 0
  for (const phrase of phrases) {
    if (appears(phrase)) total += 3
  }
  for (const keyword of keywords) {
    if (appears(keyword)) total += 1
  }
  return total
}
700
+
701
+ // ── Element resolution ───────────────────────────────────────────────
702
+
703
/**
 * Resolve a description to one element record using keyword scoring,
 * calling the text LLM only to break ties.
 *
 * @param {Object} ctx - driver context passed through to the helpers
 * @param {string} description - natural-language element description
 * @param {?{x1:number,y1:number,x2:number,y2:number}} regionBounds - optional
 *   document-coordinate box; when given, only elements inside it are considered
 * @returns {Promise<?Object>} the chosen element record, or null when the
 *   DOM pass cannot decide (callers then fall back to vision)
 */
async function findElementInDOM(ctx, description, regionBounds = null) {
  let elements = await getAllElements(ctx)

  // Give the page up to three one-second chances to render something.
  for (let attempt = 1; attempt <= 3 && elements.length === 0; attempt++) {
    console.log(`[DOM] 0 elements \u2014 waiting for render (${attempt}/3)`)
    await _wait(ctx, 1000)
    elements = await getAllElements(ctx)
  }

  // Drop duplicates that share text and document position.
  const seenKeys = new Set()
  elements = elements.filter(({ text, docX, docY }) => {
    const key = `${text}|${docX}|${docY}`
    if (seenKeys.has(key)) return false
    seenKeys.add(key)
    return true
  })

  if (regionBounds) {
    const { x1, y1, x2, y2 } = regionBounds
    const inRegion = e => e.docX >= x1 && e.docX <= x2 && e.docY >= y1 && e.docY <= y2
    elements = elements.filter(inRegion)
    console.log(`[DOM] ${elements.length} elements in region`)
  } else {
    console.log(`[DOM] ${elements.length} elements on page`)
  }

  if (elements.length === 0) return null

  // Cache lowercase text/href once so scoring does not redo it per term.
  for (const e of elements) {
    e._ltext = e.text.toLowerCase()
    e._lhref = (e.href || '').toLowerCase()
  }

  const { keywords, phrases } = extractKeywordsAndPhrases(description)
  console.log(`[DOM] Keywords: [${keywords.join(', ')}] | Phrases: [${phrases.join(', ')}]`)

  const scored = []
  for (const e of elements) {
    const score = scoreCandidate(e, keywords, phrases)
    if (score > 0) scored.push({ ...e, score })
  }
  scored.sort((a, b) => b.score - a.score)

  if (scored.length === 0) {
    if (!regionBounds) {
      console.log(`[DOM] No matches \u2014 signalling vision`)
      return null
    }
    // Inside a region the candidate set is small enough to hand to the LLM.
    const capped = elements.slice(0, config.maxCandidates)
    console.log(`[DOM] No matches in region \u2014 sending ${capped.length} to LLM`)
    return disambiguateWithLLM(capped, description)
  }

  const topScore = scored[0].score
  const topMatches = scored.filter(e => e.score === topScore)
  console.log(`[DOM] Top score: ${topScore} | Top matches: ${topMatches.length}`)
  for (const e of topMatches.slice(0, 5)) {
    console.log(` [${e.score}] <${e.role || e.tag}> "${e.text}"`)
  }

  if (topMatches.length === 1) {
    const [winner] = topMatches
    console.log(`[DOM] Clear match: "${winner.text}"`)
    return winner
  }

  // A tie covering more than 40% of the page means the keywords say nothing.
  if (!regionBounds && topMatches.length / elements.length > 0.4) {
    console.log(`[DOM] Keyword too generic \u2014 signalling vision`)
    return null
  }

  // "Identical" = same href and same text up to the shortest text length.
  const [first] = topMatches
  const firstHref = first.href || ''
  const shortestLen = Math.min(...topMatches.map(e => e.text.length))
  const prefixOf = e => e.text.slice(0, shortestLen).toLowerCase()
  const firstPrefix = prefixOf(first)
  const differs = e => prefixOf(e) !== firstPrefix || (e.href || '') !== firstHref
  if (!topMatches.some(differs)) {
    console.log(`[DOM] ${topMatches.length} identical ("${firstPrefix}") \u2014 positional LLM`)
    return disambiguateWithPosition(topMatches, description)
  }

  const capped = topMatches.slice(0, config.maxCandidates)
  console.log(`[DOM] ${capped.length} tied \u2014 LLM disambiguating...`)
  return disambiguateWithLLM(capped, description)
}
778
+
779
/**
 * Ask the text LLM to pick one candidate by index.
 *
 * @param {Object[]} candidates - element records (tag/role, text, optional href)
 * @param {string} description - natural-language element description
 * @returns {Promise<?Object>} the chosen candidate, or null when the LLM call
 *   fails or its answer is not a valid index into `candidates`
 */
async function disambiguateWithLLM(candidates, description) {
  const lines = []
  candidates.forEach((cand, i) => {
    const hint = cand.href ? ` \u2192 ${cand.href}` : ''
    lines.push(`[${i}] <${cand.role || cand.tag}> "${cand.text}"${hint}`)
  })
  const list = lines.join('\n')

  let content
  try {
    const prompt = `I need to click: "${description}"\n\nCandidates:\n${list}\n\nReturn ONLY JSON: {"index": <number>}`
    content = await askLLMText(prompt)
  } catch (err) {
    console.log(`[DOM] LLM failed: ${err.message}`)
    return null
  }
  console.log(`[DOM] LLM response: ${content}`)

  // An unparsable reply leaves `parsed` null and is reported just below.
  let parsed = null
  try {
    const { index } = parseJSON(content)
    if (typeof index === 'number' && isFinite(index)) parsed = Math.round(index)
  } catch {}

  const valid = parsed !== null && parsed >= 0 && parsed < candidates.length
  if (!valid) {
    console.log(`[DOM] Invalid index (${parsed}) \u2014 signalling vision`)
    return null
  }

  const chosen = candidates[parsed]
  console.log(`[DOM] Chose [${parsed}]: "${chosen.text}" at doc(${chosen.docX}, ${chosen.docY})`)
  return chosen
}
799
+
800
/**
 * Ask the text LLM to choose among look-alike candidates using their
 * document positions (the prompt lists x/y for each one).
 *
 * At most `config.maxCandidates` candidates are offered.
 *
 * @param {Object[]} candidates - element records with tag/role, text, docX, docY
 * @param {string} description - natural-language element description
 * @returns {Promise<?Object>} the chosen candidate, or null when the LLM call
 *   fails or its answer is not a valid index
 */
async function disambiguateWithPosition(candidates, description) {
  const capped = candidates.slice(0, config.maxCandidates)
  const list = capped.map((e, i) =>
    `[${i}] <${e.role || e.tag}> "${e.text}" at position (x=${e.docX}, y=${e.docY})`
  ).join('\n')

  let content
  try {
    content = await askLLMText(
      `I need to click: "${description}"\n\n` +
      `Identical elements at different positions. Smaller y = higher on page.\n\n` +
      `${list}\n\nReturn ONLY JSON: {"index": <number>}`)
  } catch (err) {
    console.log(`[DOM] Positional LLM failed: ${err.message}`)
    return null
  }
  console.log(`[DOM] Positional LLM: ${content}`)

  // An unparsable reply leaves `parsed` null and is reported just below.
  let parsed = null
  try {
    const { index } = parseJSON(content)
    if (typeof index === 'number' && isFinite(index)) parsed = Math.round(index)
  } catch {}

  if (parsed === null || parsed < 0 || parsed >= capped.length) {
    // Log the rejection (as disambiguateWithLLM does) so a bad reply is traceable.
    console.log(`[DOM] Positional: invalid index (${parsed}) \u2014 signalling vision`)
    return null
  }

  const chosen = capped[parsed]
  console.log(`[DOM] Positional: chose [${parsed}] at doc(${chosen.docX}, ${chosen.docY})`)
  return chosen
}
820
+
821
+ // ── Vision ───────────────────────────────────────────────────────────
822
+
823
/**
 * Ask the vision LLM which cell of a 3x3 grid contains the described element.
 *
 * A labelled grid canvas is appended to the page, a screenshot is taken, and
 * the canvas is removed again, including when the screenshot or debug save fails.
 *
 * @param {Object} ctx - driver context passed through to the helpers
 * @param {string} description - natural-language element description
 * @returns {Promise<{x1:number,y1:number,x2:number,y2:number}>} document-space
 *   bounds of the chosen cell, widened by 20% per side and clamped to the page
 * @throws {Error} when the LLM reply has no recognised region label
 */
async function identifyRegionViaVision(ctx, description) {
  // One evaluation both measures the page and paints the overlay.
  const { viewWidth, docHeight } = await _eval(ctx, ({ labels }) => {
    const w = window.innerWidth
    const h = document.body.scrollHeight
    const canvas = document.createElement('canvas')
    canvas.id = '__vision_grid__'
    canvas.style.cssText = 'position:absolute;top:0;left:0;z-index:999999;pointer-events:none;'
    canvas.width = w
    canvas.height = h
    document.body.appendChild(canvas)
    const g = canvas.getContext('2d')
    const cw = w / 3
    const ch = h / 3
    const fontSize = Math.max(16, Math.min(cw, ch) * 0.08)
    for (let r = 0; r < 3; r++) {
      for (let c = 0; c < 3; c++) {
        const x = c * cw
        const y = r * ch
        g.strokeStyle = 'rgba(255,50,50,0.7)'
        g.lineWidth = 2
        g.strokeRect(x + 1, y + 1, cw - 2, ch - 2)
        g.font = `bold ${fontSize}px sans-serif`
        g.textAlign = 'center'
        g.textBaseline = 'middle'
        const tw = g.measureText(labels[r][c]).width
        // Dark backing box so the label stays readable over page content.
        g.fillStyle = 'rgba(0,0,0,0.6)'
        g.fillRect(x + cw / 2 - tw / 2 - 4, y + ch / 2 - fontSize / 2 - 3, tw + 8, fontSize + 6)
        g.fillStyle = 'white'
        g.fillText(labels[r][c], x + cw / 2, y + ch / 2)
      }
    }
    return { viewWidth: w, docHeight: h }
  }, { labels: REGION_LABELS })

  let shot
  try {
    shot = await _screenshot(ctx, true)
    saveDebug('debug_region.png', shot.buffer)
  } finally {
    // Always take the overlay down, otherwise a failed screenshot would
    // leave the grid painted over the page under test.
    await _eval(ctx, () => document.getElementById('__vision_grid__')?.remove())
  }

  const regionImg = await _resizeScreenshot(ctx, shot, viewWidth, docHeight)
  const content = await askLLMVision(
    `The screenshot shows a full webpage with a 3x3 grid:\n` +
    `${REGION_LABELS.map(r => r.join(' | ')).join('\n')}\n\n` +
    `Which region contains: "${description}"?\n` +
    `Return ONLY JSON: {"region": "<label>"}\nValid: ${REGION_LABELS.flat().join(', ')}`,
    regionImg.base64)
  console.log(`[Vision] Region: ${content}`)

  const raw = parseJSON(content)?.region
  if (typeof raw !== 'string') throw new Error(`Unknown region: "${raw}"`)
  const region = raw.toLowerCase().trim()
  const row = REGION_LABELS.findIndex(r => r.includes(region))
  const col = row >= 0 ? REGION_LABELS[row].indexOf(region) : -1
  if (row < 0 || col < 0) throw new Error(`Unknown region: "${raw}"`)

  const cw = viewWidth / 3
  const ch = docHeight / 3
  const OV = 0.20 // overlap added on every side of the chosen cell
  return {
    x1: Math.max(0, col * cw - cw * OV),
    y1: Math.max(0, row * ch - ch * OV),
    x2: Math.min(viewWidth, (col + 1) * cw + cw * OV),
    y2: Math.min(docHeight, (row + 1) * ch + ch * OV),
  }
}
872
+
873
/**
 * Ask the vision LLM for the centre point of the described element on a
 * full-page screenshot and convert it back to document coordinates.
 *
 * @param {Object} ctx - driver context passed through to the helpers
 * @param {string} description - natural-language element description
 * @returns {Promise<{docX:number, docY:number}>} integer document coordinates,
 *   clamped to [0, viewWidth-1] x [0, docHeight-1]
 * @throws {Error} when the LLM reply lacks finite numeric `x` and `y`
 */
async function locatePreciseViaVision(ctx, description) {
  const { viewWidth, docHeight } = await _eval(ctx, () => ({
    viewWidth: window.innerWidth, docHeight: document.body.scrollHeight
  }))
  const shot = await _screenshot(ctx, true)
  saveDebug('debug_precise.png', shot.buffer)

  const { base64: resizedB64, scale } = await _resizeScreenshot(ctx, shot, viewWidth, docHeight)
  // The prompt states the resized dimensions, so answers are in resized pixels.
  const resizedW = Math.round(viewWidth / scale)
  const resizedH = Math.round(docHeight / scale)
  // NOTE(review): the trailing 30 is passed straight to askLLMVision; its meaning is not visible here.
  const content = await askLLMVision(
    `Screenshot: ${resizedW}\u00d7${resizedH}px (full page). Origin (0,0) = top-left.\n\n` +
    `Find the CENTER of: "${description}"\n\n` +
    `Return ONLY JSON: {"x": <number>, "y": <number>}`, resizedB64, 30)
  console.log(`[Vision] Coordinates: ${content}`)

  const point = parseJSON(content)
  const x = point?.x
  const y = point?.y
  // Without this check a missing field would become NaN coordinates.
  if (!Number.isFinite(x) || !Number.isFinite(y)) {
    throw new Error(`Vision returned invalid coordinates: ${content}`)
  }
  return {
    docX: Math.max(0, Math.min(viewWidth - 1, Math.round(x * scale))),
    docY: Math.max(0, Math.min(docHeight - 1, Math.round(y * scale)))
  }
}
894
+
895
+ // ── Scroll, mark, click helpers ──────────────────────────────────────
896
+
897
/**
 * Scroll the page so a document Y coordinate sits in the middle of the
 * viewport, but only when it is currently outside the visible band.
 * Does nothing for native contexts.
 *
 * @param {Object} ctx - driver context
 * @param {number} docY - document-space Y coordinate to reveal
 * @returns {Promise<void>}
 */
async function scrollIntoView(ctx, docY) {
  if (_isNative(ctx)) return // native apps handle scrolling differently — skip
  const viewport = await _eval(ctx, () => ({
    scrollY: window.scrollY, viewHeight: window.innerHeight
  }))
  const bandTop = viewport.scrollY
  const bandBottom = bandTop + viewport.viewHeight
  const outside = docY < bandTop || docY > bandBottom
  if (!outside) return
  await _eval(ctx, top => window.scrollTo({ top, behavior: 'instant' }), docY - viewport.viewHeight / 2)
}
906
+
907
/**
 * Tag the live DOM node that corresponds to an element record with a unique
 * `data-elementus` attribute and return a locator for it.
 *
 * Matching nodes share the record's tag (and href, when present) and have
 * identical whitespace-collapsed text. Nodes not vertically clipped by an
 * overflow-hidden ancestor win over clipped ones; among those, the node
 * nearest (Manhattan distance) to the recorded position wins.
 *
 * @param {Object} ctx - driver context
 * @param {{tag:string,text:string,href?:string,docX:number,docY:number}} element
 * @returns {Promise<Object>} result of `_makeLocator` for the tagged node
 * @throws {Error} when no matching node is found
 */
async function markByElement(ctx, element) {
  if (_isNative(ctx)) return markByElementNative(ctx, element)
  await scrollIntoView(ctx, element.docY)
  const uid = `sr-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
  const payload = {
    tag: element.tag, text: element.text, href: element.href,
    docX: element.docX, docY: element.docY, uid
  }

  const marked = await _eval(ctx, ({ tag, text, href, docX, docY, uid }) => {
    const hidesOverflow = cs =>
      cs.overflow === 'hidden' || cs.overflow === 'clip' || cs.overflowY === 'hidden'

    // True when an overflow-hiding ancestor cuts the node off vertically.
    function isClippedByParent(el) {
      const rect = el.getBoundingClientRect()
      for (let p = el.parentElement; p && p !== document.body; p = p.parentElement) {
        if (!hidesOverflow(window.getComputedStyle(p))) continue
        const pr = p.getBoundingClientRect()
        if (rect.bottom > pr.bottom + 1 || rect.top < pr.top - 1) return true
      }
      return false
    }

    const selector = href ? tag + '[href="' + CSS.escape(href) + '"]' : tag
    const candidates = []
    for (const el of document.querySelectorAll(selector)) {
      if (el.textContent.trim().replace(/\s+/g, ' ') !== text) continue
      const rect = el.getBoundingClientRect()
      if (rect.width === 0 || rect.height === 0) continue
      const cx = Math.round(rect.left + window.scrollX + rect.width / 2)
      const cy = Math.round(rect.top + window.scrollY + rect.height / 2)
      candidates.push({
        el,
        dist: Math.abs(cx - docX) + Math.abs(cy - docY),
        visible: !isClippedByParent(el)
      })
    }
    // Unclipped nodes first, then nearest to the recorded position.
    candidates.sort((a, b) => {
      if (a.visible === b.visible) return a.dist - b.dist
      return a.visible ? -1 : 1
    })
    if (candidates.length === 0) return null
    const winner = candidates[0].el
    winner.setAttribute('data-elementus', uid)
    return winner.tagName.toLowerCase()
  }, payload)

  if (!marked) throw new Error(`Could not mark <${element.tag}> "${element.text}"`)
  console.log(`[Resolve] Marked <${marked}> "${element.text}" at doc(${element.docX}, ${element.docY})`)
  return _makeLocator(ctx, `[data-elementus="${uid}"]`)
}
951
+
952
/**
 * Tag whatever sits at a document coordinate with a unique `data-elementus`
 * attribute and return a locator for it.
 *
 * The nearest ancestor-or-self matching `INTERACTIVE_SELECTORS` is preferred,
 * first from the topmost node and then from every node stacked at that point;
 * if none matches, the topmost node itself is tagged.
 *
 * @param {Object} ctx - driver context
 * @param {number} docX - document-space X coordinate
 * @param {number} docY - document-space Y coordinate
 * @returns {Promise<Object>} result of `_makeLocator` for the tagged node
 * @throws {Error} when no element exists at the point
 */
async function markAtCoordinates(ctx, docX, docY) {
  if (!_isNative(ctx)) await scrollIntoView(ctx, docY)
  const uid = `sr-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`

  const marked = await _eval(ctx, ({ docX, docY, uid, selectors }) => {
    // Convert document coordinates to viewport coordinates.
    const vx = docX - window.scrollX
    const vy = docY - window.scrollY
    const top = document.elementFromPoint(vx, vy)
    if (!top) return null

    let target = top.closest(selectors)
    if (!target && typeof document.elementsFromPoint === 'function') {
      // Look through overlays for an interactive node underneath.
      for (const layer of document.elementsFromPoint(vx, vy)) {
        const hit = layer.matches(selectors) ? layer : layer.closest(selectors)
        if (hit) {
          target = hit
          break
        }
      }
    }

    const final = target || top
    final.setAttribute('data-elementus', uid)
    return final.tagName.toLowerCase()
  }, { docX, docY, uid, selectors: INTERACTIVE_SELECTORS })

  if (!marked) throw new Error(`No element at doc(${docX}, ${docY})`)
  console.log(`[Resolve] Marked <${marked}> at doc(${docX}, ${docY})`)
  return _makeLocator(ctx, `[data-elementus="${uid}"]`)
}
974
+
975
/**
 * Activate an element record found by the DOM pass.
 *
 * Native contexts resolve the element via `markByElementNative` and click it.
 * On the web, anchors with an href are followed with `_goto`; anything else is
 * clicked by script at its viewport position.
 *
 * @param {Object} ctx - driver context
 * @param {{tag:string,text:string,href?:string,docX:number,docY:number}} element
 * @returns {Promise<void>}
 */
async function scrollAndClick(ctx, element) {
  const { docX, docY, text } = element

  if (_isNative(ctx)) {
    console.log(`\u2713 Tapping "${text}" \u2014 native (${docX}, ${docY})`)
    const nativeEl = await markByElementNative(ctx, element)
    await nativeEl.click()
    return
  }

  await scrollIntoView(ctx, docY)
  const point = await _eval(ctx, ({ docX, docY }) => ({
    vx: docX - window.scrollX, vy: docY - window.scrollY
  }), { docX, docY })
  console.log(`\u2713 Clicking "${text}" \u2014 doc(${docX}, ${docY})`)

  const isLink = element.href && element.tag === 'a'
  if (isLink) {
    await _goto(ctx, element.href)
    console.log(`[Click] Navigated to: ${element.href}`)
    return
  }

  const clicked = await _eval(ctx, ({ x, y }) => {
    const node = document.elementFromPoint(x, y)
    if (!node) return 'null'
    if (typeof node.click === 'function') {
      node.click()
    } else {
      node.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true }))
    }
    return node.tagName + ':' + (node.textContent?.trim().slice(0, 40) || '')
  }, { x: point.vx, y: point.vy })
  console.log(`[Click] JS click: ${clicked}`)
}
1002
+
1003
/**
 * Activate whatever sits at vision-derived document coordinates.
 *
 * Native contexts get a touch pointer action at the coordinates. On the web,
 * a point inside an anchor with an href is followed with `_goto`; otherwise
 * the node at the point is clicked by script.
 *
 * @param {Object} ctx - driver context
 * @param {{docX:number, docY:number}} coords - document-space coordinates
 * @returns {Promise<void>}
 */
async function clickAtCoords(ctx, coords) {
  const { docX, docY } = coords

  if (_isNative(ctx)) {
    // Native app: tap at absolute coordinates via Appium action
    console.log(`[Vision] Native tap at (${docX}, ${docY})`)
    await ctx.action('pointer', { parameters: { pointerType: 'touch' } })
      .move({ x: docX, y: docY })
      .down().up()
      .perform()
    return
  }

  await scrollIntoView(ctx, docY)
  const { vx, vy } = await _eval(ctx, ({ docX, docY }) => ({
    vx: docX - window.scrollX, vy: docY - window.scrollY
  }), { docX, docY })
  const viewportPoint = { x: vx, y: vy }

  // Check whether the point lands inside a link before deciding how to click.
  const info = await _eval(ctx, ({ x, y }) => {
    const node = document.elementFromPoint(x, y)
    if (!node) return null
    const anchor = node.closest('a')
    return { href: anchor?.getAttribute('href') || null, isAnchor: !!anchor }
  }, viewportPoint)

  if (info?.href && info.isAnchor) {
    await _goto(ctx, info.href)
    console.log(`[Vision] Navigated to: ${info.href}`)
    return
  }

  await _eval(ctx, ({ x, y }) => {
    const node = document.elementFromPoint(x, y)
    if (!node) return
    if (typeof node.click === 'function') {
      node.click()
    } else {
      node.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true }))
    }
  }, viewportPoint)
  console.log(`[Vision] JS click at (${vx}, ${vy})`)
}
1036
+
1037
+ // ── Vision fallback (shared) ─────────────────────────────────────────
1038
+
1039
/**
 * Vision-assisted resolution used when the DOM pass returns null.
 *
 * Picks a page region via the vision LLM, scrolls so the region's vertical
 * midpoint is centred in the viewport, retries the DOM search inside that
 * region, and finally asks the vision LLM for exact coordinates.
 *
 * @param {Object} ctx - driver context
 * @param {string} description - natural-language element description
 * @returns {Promise<{element: ?Object, coords: ?{docX:number,docY:number}}>}
 *   exactly one of `element` / `coords` is non-null
 */
async function visionFallback(ctx, description) {
  console.log(`[Vision] DOM returned null \u2014 activating vision`)
  const region = await identifyRegionViaVision(ctx, description)

  const vh = await _eval(ctx, () => window.innerHeight)
  const regionMidY = (region.y1 + region.y2) / 2
  await _eval(ctx, top => window.scrollTo({ top, behavior: 'instant' }), regionMidY - vh / 2)

  const element = await findElementInDOM(ctx, description, region)
  if (!element) {
    console.log(`[Vision] DOM unresolved \u2014 precise coordinates...`)
    const coords = await locatePreciseViaVision(ctx, description)
    return { element: null, coords }
  }
  return { element, coords: null }
}
1050
+
1051
+ // ── Public API ───────────────────────────────────────────────────────
1052
+
1053
/**
 * Resolve a description to a framework locator: DOM search first, then the
 * vision fallback (region-scoped DOM search, then exact coordinates).
 *
 * @param {Object} ctx - driver context
 * @param {string} description - natural-language element description
 * @returns {Promise<Object>} locator produced by markByElement / markAtCoordinates
 * @throws {Error} "All fallback paths exhausted ..." when the vision path or
 *   the marking that follows it fails; the underlying error is kept as `cause`
 */
async function _findByDescription(ctx, description) {
  const element = await findElementInDOM(ctx, description)
  if (element) return markByElement(ctx, element)
  try {
    const result = await visionFallback(ctx, description)
    // `return await` keeps marking failures inside this try so they are wrapped
    // like every other fallback failure instead of escaping unwrapped.
    if (result.element) return await markByElement(ctx, result.element)
    return await markAtCoordinates(ctx, result.coords.docX, result.coords.docY)
  } catch (err) {
    throw new Error(`All fallback paths exhausted for "${description}": ${err.message}`, { cause: err })
  }
}
1064
+
1065
/**
 * Resolve an element, preferring the supplied locator and falling back to the
 * AI description search when the locator does not resolve.
 * The result is a framework-native locator/element usable for any action or
 * assertion.
 *
 * @param {Object} ctx - page (Playwright) or browser (WDIO)
 * @param {Object} locator - Playwright Locator or WDIO Element to try first
 * @param {string} description - natural-language element description for AI fallback
 * @returns {Promise<Object>} Playwright Locator or WDIO Element
 */
async function locate(ctx, locator, description) {
  // Each framework has its own "wait until present" call.
  const waitUntilPresent = () =>
    _isPlaywright(ctx) ? locator.waitFor({ state: 'attached' }) : locator.waitForExist()

  let found = false
  try {
    await waitUntilPresent()
    console.log(`\u2713 Located via locator`)
    found = true
  } catch {
    console.log(`\u2717 Locator failed \u2014 searching for: "${description}"`)
  }
  return found ? locator : _findByDescription(ctx, description)
}
1088
+
1089
/**
 * Find an element by natural-language description only (no locator needed).
 * Searches the page DOM, uses LLM disambiguation, and vision as last resort.
 * Returns a framework-native locator/element usable for any action or assertion.
 *
 * @param {Object} ctx - page (Playwright) or browser (WDIO)
 * @param {string} description - natural-language element description
 * @returns {Promise<Object>} Playwright Locator or WDIO Element
 * @throws {Error} when every resolution path fails
 *
 * @example
 * // `el` is the object returned by createElementus(); use a different name
 * // for the result so it does not shadow `el` inside its own initializer.
 * const submit = await el.find(page, 'Submit order button')
 * await submit.click()
 * await expect(submit).toHaveText('Submit')
 */
async function find(ctx, description) {
  console.log(`[Find] "${description}"`)
  return _findByDescription(ctx, description)
}
1107
+
1108
/**
 * Click with locator-first fallback + optimized click strategy.
 * Uses page.goto() for <a href> links (avoids hover/overlay issues)
 * and JS elementFromPoint click for buttons (no mouse cursor movement).
 * Best for navigation actions. For fill/hover/assert, use wrap or find.
 *
 * @param {Object} ctx - page (Playwright) or browser (WDIO)
 * @param {Object} locator - Playwright Locator or WDIO Element to try first
 * @param {string} description - natural-language element description for AI fallback
 * @returns {Promise<void>}
 * @throws {Error} "All fallback paths exhausted ..." when the vision path
 *   fails; the underlying error is available as `cause`
 *
 * @example
 * await el.click(page, page.locator('#nav-blog'), 'Blog page link')
 */
async function click(ctx, locator, description) {
  try {
    await locator.click()
    console.log(`\u2713 Clicked via locator`)
    return
  } catch {
    // Any locator failure triggers the description-based search below.
    console.log(`\u2717 Locator failed \u2014 searching for: "${description}"`)
  }

  const element = await findElementInDOM(ctx, description)
  if (element) {
    await scrollAndClick(ctx, element)
    return
  }

  try {
    const result = await visionFallback(ctx, description)
    if (result.element) {
      await scrollAndClick(ctx, result.element)
      return
    }
    await clickAtCoords(ctx, result.coords)
  } catch (err) {
    // Keep the original error reachable for debugging via `cause`.
    throw new Error(`All fallback paths exhausted for "${description}": ${err.message}`, { cause: err })
  }
}
1140
+
1141
/**
 * Wrap one locator/element in a Proxy that adds AI fallback to its methods.
 *
 * Every function-valued property is returned as an async wrapper: the original
 * method runs first and, if it throws, the element is resolved from
 * `description` (once, then cached) and the same method is retried on it.
 * Non-function properties, symbols and promise/primitive hooks pass through.
 *
 * For cleaner code, prefer wrapPage() or wrapBrowser() over calling this directly.
 *
 * @param {Object} driverContext - page (Playwright) or browser (WDIO)
 * @param {Object} locator - Playwright Locator or WDIO Element
 * @param {string} description - natural-language element description
 * @returns {Proxy} Proxy that behaves like the original locator/element
 *
 * @example
 * const btn = el.wrap(page, page.locator('#old-btn'), 'Submit button')
 * await btn.click() // tries #old-btn → fail → AI finds Submit → click
 * await btn.textContent() // same fallback for any method
 */
function wrap(driverContext, locator, description) {
  const PASSTHROUGH = new Set([
    'then', 'catch', 'finally', 'toString', 'valueOf', 'toJSON',
    Symbol.toPrimitive, Symbol.toStringTag, Symbol.iterator, Symbol.asyncIterator,
  ])
  // Boolean query methods (isVisible, isEnabled, etc.) return false instead
  // of throwing on missing elements, so failure cannot be detected from the
  // result; they are always answered by the AI-resolved element.
  const BOOL_QUERIES = new Set(['isVisible', 'isEnabled', 'isChecked', 'isHidden', 'isEditable'])
  // Methods whose options object is the first argument ...
  const OPTIONS_FIRST = new Set(['hover', 'tap', 'check', 'uncheck'])
  // ... and those whose options object follows a value argument.
  const OPTIONS_SECOND = new Set(['fill', 'type', 'selectOption', 'press'])
  const CLICK_LIKE = new Set(['click', 'dblclick'])

  const forced = options => ({ ...(options || {}), force: true })

  let cached = null
  const resolveOnce = async () => {
    if (!cached) cached = await _findByDescription(driverContext, description)
    return cached
  }

  async function answerBoolQuery(prop, args) {
    if (!cached) {
      console.log(`[AI] ${prop}() \u2014 resolving via AI first for "${description}"`)
    }
    const resolved = await resolveOnce()
    return resolved[prop](...args)
  }

  async function retryOnResolved(prop, args, firstError) {
    console.log(`[AI] ${String(prop)}() failed \u2014 AI fallback for "${description}"`)
    const resolved = await resolveOnce()

    const resolvedMethod = resolved[prop]
    if (typeof resolvedMethod !== 'function') {
      if (prop in resolved) return resolved[prop]
      throw firstError
    }

    if (CLICK_LIKE.has(prop)) {
      // Links are followed by navigation instead of a simulated click.
      const href = await resolved.getAttribute('href').catch(() => null)
      if (href) {
        await _goto(driverContext, href)
        console.log(`[AI] Navigated to: ${href}`)
        return
      }
      return resolvedMethod.call(resolved, forced(args[0]))
    }

    const retryArgs = [...args]
    if (OPTIONS_FIRST.has(prop)) {
      retryArgs[0] = forced(retryArgs[0])
    } else if (OPTIONS_SECOND.has(prop)) {
      retryArgs[1] = forced(retryArgs[1])
    }
    return resolvedMethod.apply(resolved, retryArgs)
  }

  return new Proxy(locator, {
    get(target, prop, receiver) {
      if (typeof prop === 'symbol' || PASSTHROUGH.has(prop)) {
        return Reflect.get(target, prop, receiver)
      }
      const original = target[prop]
      if (typeof original !== 'function') return original

      return async function (...args) {
        if (BOOL_QUERIES.has(prop)) return answerBoolQuery(prop, args)
        try {
          return await original.apply(target, args)
        } catch (firstError) {
          return retryOnResolved(prop, args, firstError)
        }
      }
    }
  })
}
1223
+
1224
/**
 * Proxy a Playwright page so `page.locator(selector, { ai: 'description' })`
 * yields an AI-fallback locator. Calls without `ai` return the page's own
 * locator untouched, and every other page property is forwarded as-is.
 *
 * Call once per test (or in a fixture for all tests).
 *
 * @param {Object} pageObj - Playwright Page object
 * @returns {Proxy} Proxied page with enhanced locator() method
 *
 * @example
 * const p = el.wrapPage(page)
 * await p.locator('#btn', { ai: 'Submit button' }).click() // AI fallback
 * await p.locator('#btn').click() // normal, no AI
 */
function wrapPage(pageObj) {
  const handler = {
    get(target, prop, receiver) {
      if (prop !== 'locator') return Reflect.get(target, prop, receiver)

      return function (selector, options = {}) {
        // Strip our own `ai` key; only real locator options reach Playwright.
        const { ai, ...locatorOptions } = options
        const hasOptions = Object.keys(locatorOptions).length > 0
        const loc = hasOptions
          ? target.locator(selector, locatorOptions)
          : target.locator(selector)
        if (!ai) return loc
        return wrap(target, loc, ai)
      }
    }
  }
  return new Proxy(pageObj, handler)
}
1254
+
1255
/**
 * Proxy a WDIO browser so `browser.$(selector, { ai: 'description' })` yields
 * an AI-fallback element. `$()` calls without `ai` return the browser's own
 * element untouched, and every other browser property is forwarded as-is.
 *
 * Call once in before() hook or wdio.conf.js.
 *
 * @param {Object} browserObj - WDIO Browser object
 * @returns {Proxy} Proxied browser with enhanced $() method
 *
 * @example
 * const b = el.wrapBrowser(browser)
 * await b.$('#btn', { ai: 'Submit button' }).click() // AI fallback
 * await b.$('#btn').click() // normal, no AI
 */
function wrapBrowser(browserObj) {
  const handler = {
    get(target, prop, receiver) {
      if (prop !== '$') return Reflect.get(target, prop, receiver)

      return function (selector, options = {}) {
        // Strip our own `ai` key; forward any remaining options to $().
        const { ai, ...rest } = options
        const hasRest = Object.keys(rest).length > 0
        const el = hasRest ? target.$(selector, rest) : target.$(selector)
        if (!ai) return el
        return wrap(target, el, ai)
      }
    }
  }
  return new Proxy(browserObj, handler)
}
1284
+
1285
+ return { wrap, wrapPage, wrapBrowser, locate, find, click }
1286
+ }
1287
+
1288
+ module.exports = { createElementus }