elementus-ai 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +292 -0
- package/elementus.js +1288 -0
- package/package.json +42 -0
package/elementus.js
ADDED
|
@@ -0,0 +1,1288 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ╔════════════════════════════════════════════════════════════════════╗
|
|
3
|
+
* ║ Elementus — Self-healing element resolution for Playwright & WDIO ║
|
|
4
|
+
* ╚════════════════════════════════════════════════════════════════════╝
|
|
5
|
+
*
|
|
6
|
+
* When a selector breaks, elementus uses AI to find the element by
|
|
7
|
+
* natural-language description. Works with any action (click, fill, hover)
|
|
8
|
+
* and any assertion (toHaveText, toBeVisible). Supports local LLMs via
|
|
9
|
+
* LM Studio and cloud LLMs via Google Gemini API.
|
|
10
|
+
*
|
|
11
|
+
* ─────────────────────────────────────────────────────────────────────────
|
|
12
|
+
* 1. INSTALLATION
|
|
13
|
+
* ─────────────────────────────────────────────────────────────────────────
|
|
14
|
+
*
|
|
15
|
+
* npm install elementus
|
|
16
|
+
*
|
|
17
|
+
* ─────────────────────────────────────────────────────────────────────────
|
|
18
|
+
* 2. LLM PROVIDER SETUP (choose one)
|
|
19
|
+
* ─────────────────────────────────────────────────────────────────────────
|
|
20
|
+
*
|
|
21
|
+
* Option A — Local LLM via LM Studio (free, private, no API key):
|
|
22
|
+
* 1. Download LM Studio from https://lmstudio.ai
|
|
23
|
+
* 2. Load a vision-capable model (e.g., gemma-4-26b-a4b-it)
|
|
24
|
+
* 3. Start the local server (default: http://localhost:1234)
|
|
25
|
+
* 4. Configure:
|
|
26
|
+
* const el = createElementus({
|
|
27
|
+
* provider: 'lmstudio',
|
|
28
|
+
* lmStudioUrl: 'http://localhost:1234/v1/chat/completions',
|
|
29
|
+
* model: 'gemma-4-26b-a4b-it',
|
|
30
|
+
* })
|
|
31
|
+
*
|
|
32
|
+
* Option B — Google Gemini API (cloud, fast, better vision):
|
|
33
|
+
* 1. Get an API key from https://aistudio.google.com/apikey
|
|
34
|
+
* 2. Configure:
|
|
35
|
+
* const el = createElementus({
|
|
36
|
+
* provider: 'gemini',
|
|
37
|
+
* geminiApiKey: 'AIza...', // or set GEMINI_API_KEY env var
|
|
38
|
+
* geminiModel: 'gemini-2.5-flash',
|
|
39
|
+
* })
|
|
40
|
+
*
|
|
41
|
+
* ─────────────────────────────────────────────────────────────────────────
|
|
42
|
+
* 3. FRAMEWORK SETUP
|
|
43
|
+
* ─────────────────────────────────────────────────────────────────────────
|
|
44
|
+
*
|
|
45
|
+
* Playwright — wrap page once, add { ai } to any locator:
|
|
46
|
+
*
|
|
47
|
+
* const { createElementus } = require('elementus')
|
|
48
|
+
* const el = createElementus({ provider: 'gemini', geminiApiKey: '...' })
|
|
49
|
+
*
|
|
50
|
+
* // In test or fixture:
|
|
51
|
+
* const p = el.wrapPage(page)
|
|
52
|
+
* await p.locator('#btn', { ai: 'Submit order button' }).click()
|
|
53
|
+
* await p.locator('#email', { ai: 'Email input field' }).fill('test@test.com')
|
|
54
|
+
*
|
|
55
|
+
* // Locators WITHOUT { ai } work normally — zero overhead:
|
|
56
|
+
* await p.locator('#always-stable').click()
|
|
57
|
+
*
|
|
58
|
+
* Playwright fixture (recommended — wrap once for all tests):
|
|
59
|
+
*
|
|
60
|
+
* // fixtures.js
|
|
61
|
+
* const { test: base } = require('@playwright/test')
|
|
62
|
+
* const { createElementus } = require('elementus')
|
|
63
|
+
* const el = createElementus({ provider: 'gemini', geminiApiKey: '...' })
|
|
64
|
+
*
|
|
65
|
+
* module.exports = base.extend({
|
|
66
|
+
* page: async ({ page }, use) => {
|
|
67
|
+
* await use(el.wrapPage(page))
|
|
68
|
+
* }
|
|
69
|
+
* })
|
|
70
|
+
*
|
|
71
|
+
* // In tests — page is already wrapped:
|
|
72
|
+
* test('example', async ({ page }) => {
|
|
73
|
+
* await page.locator('#btn', { ai: 'Submit button' }).click()
|
|
74
|
+
* })
|
|
75
|
+
*
|
|
76
|
+
* WDIO — wrap browser once, add { ai } to any $() selector:
|
|
77
|
+
*
|
|
78
|
+
* const { createElementus } = require('elementus')
|
|
79
|
+
* const el = createElementus({ provider: 'lmstudio' })
|
|
80
|
+
*
|
|
81
|
+
* // In before hook or config:
|
|
82
|
+
* const b = el.wrapBrowser(browser)
|
|
83
|
+
* await b.$('#btn', { ai: 'Submit order button' }).click()
|
|
84
|
+
* await b.$('#email', { ai: 'Email input field' }).setValue('test@test.com')
|
|
85
|
+
*
|
|
86
|
+
* // $() calls WITHOUT { ai } work normally:
|
|
87
|
+
* await b.$('#always-stable').click()
|
|
88
|
+
*
|
|
89
|
+
* Appium (native Android/iOS/Flutter) — same wrapBrowser pattern:
|
|
90
|
+
*
|
|
91
|
+
* const { createElementus } = require('elementus')
|
|
92
|
+
* const el = createElementus({ provider: 'gemini', geminiApiKey: '...' })
|
|
93
|
+
*
|
|
94
|
+
* // In before hook:
|
|
95
|
+
* const d = el.wrapBrowser(driver)
|
|
96
|
+
* await d.$('~loginButton', { ai: 'Login button on welcome screen' }).click()
|
|
97
|
+
* await d.$('~emailField', { ai: 'Email input' }).setValue('test@test.com')
|
|
98
|
+
*
|
|
99
|
+
* // Works with Flutter, React Native, native Android/iOS — any Appium driver.
|
|
100
|
+
* // Instead of DOM scanning, Elementus parses the native element tree
|
|
101
|
+
* // from driver.getPageSource() (XML) and applies the same AI scoring.
|
|
102
|
+
*
|
|
103
|
+
* ─────────────────────────────────────────────────────────────────────────
|
|
104
|
+
* 4. API REFERENCE
|
|
105
|
+
* ─────────────────────────────────────────────────────────────────────────
|
|
106
|
+
*
|
|
107
|
+
* el.wrapPage(page)
|
|
108
|
+
* Wraps a Playwright page. Returns a proxy where page.locator(selector,
|
|
109
|
+
* { ai: 'description' }) auto-creates AI-fallback locators.
|
|
110
|
+
* Locators without { ai } pass through unchanged.
|
|
111
|
+
*
|
|
112
|
+
* el.wrapBrowser(browser)
|
|
113
|
+
* Wraps a WDIO browser. Returns a proxy where browser.$(selector,
|
|
114
|
+
* { ai: 'description' }) auto-creates AI-fallback elements.
|
|
115
|
+
* $() calls without { ai } pass through unchanged.
|
|
116
|
+
*
|
|
117
|
+
* el.wrap(context, locator, description)
|
|
118
|
+
* Low-level: wraps any single locator/element with AI fallback.
|
|
119
|
+
* Use wrapPage/wrapBrowser instead for cleaner code.
|
|
120
|
+
* - context: page (Playwright) or browser (WDIO)
|
|
121
|
+
* - locator: Playwright Locator or WDIO Element
|
|
122
|
+
* - description: natural-language element description
|
|
123
|
+
* Returns a Proxy that tries the original, falls back to AI on failure.
|
|
124
|
+
*
|
|
125
|
+
* el.find(context, description)
|
|
126
|
+
* Find element by description only (no locator needed).
|
|
127
|
+
* Returns a real Playwright Locator / WDIO Element for any action.
|
|
128
|
+
* - context: page (Playwright) or browser (WDIO)
|
|
129
|
+
* - description: natural-language element description
|
|
130
|
+
* Example:
|
|
131
|
+
* const button = await el.find(page, 'Submit button')
|
|
132
|
+
* await button.click()
|
|
133
|
+
* await expect(button).toHaveText('Submit')
|
|
134
|
+
*
|
|
135
|
+
* el.locate(context, locator, description)
|
|
136
|
+
* Try locator first, fall back to AI if locator fails.
|
|
137
|
+
* Returns a Playwright Locator / WDIO Element.
|
|
138
|
+
* Respects your framework's configured action timeout.
|
|
139
|
+
* Example:
|
|
140
|
+
* const button = await el.locate(page, page.locator('#btn'), 'Submit')
|
|
141
|
+
* await button.click()
|
|
142
|
+
*
|
|
143
|
+
* el.click(context, locator, description)
|
|
144
|
+
* Click with optimized fallback: uses page.goto() for links (avoids
|
|
145
|
+
* hover/overlay issues) and JS click for buttons (no mouse movement).
|
|
146
|
+
* Use this for navigation clicks. For other actions, use wrap/find.
|
|
147
|
+
* Respects your framework's configured action timeout.
|
|
148
|
+
* Example:
|
|
149
|
+
* await el.click(page, page.locator('#nav'), 'Blog page link')
|
|
150
|
+
*
|
|
151
|
+
* ─────────────────────────────────────────────────────────────────────────
|
|
152
|
+
* 5. CONFIGURATION OPTIONS
|
|
153
|
+
* ─────────────────────────────────────────────────────────────────────────
|
|
154
|
+
*
|
|
155
|
+
* createElementus({
|
|
156
|
+
* // LLM Provider
|
|
157
|
+
* provider: 'lmstudio', // 'lmstudio' | 'gemini'
|
|
158
|
+
*
|
|
159
|
+
* // LM Studio (when provider = 'lmstudio')
|
|
160
|
+
* lmStudioUrl: 'http://localhost:1234/v1/chat/completions',
|
|
161
|
+
* model: 'gemma-4-26b-a4b-it',
|
|
162
|
+
*
|
|
163
|
+
* // Gemini (when provider = 'gemini')
|
|
164
|
+
* geminiApiKey: null, // or GEMINI_API_KEY env var
|
|
165
|
+
* geminiModel: 'gemini-2.5-flash',
|
|
166
|
+
*
|
|
167
|
+
* // Behavior
|
|
168
|
+
* maxCandidates: 20, // max elements sent to LLM for disambiguation
|
|
169
|
+
* visionMaxWidth: 1280, // max screenshot width (px) sent to vision LLM
|
|
170
|
+
*
|
|
171
|
+
* // Debugging
|
|
172
|
+
* debug: false, // save screenshots to debugDir
|
|
173
|
+
* debugDir: null, // directory for debug screenshots (nothing is written unless this is set)
|
|
174
|
+
*
|
|
175
|
+
* // Custom stop words (replace the built-in defaults when provided)
|
|
176
|
+
* stopWords: null, // Set of words to ignore in descriptions
|
|
177
|
+
* })
|
|
178
|
+
*
|
|
179
|
+
* ─────────────────────────────────────────────────────────────────────────
|
|
180
|
+
* 6. HOW IT WORKS (fallback pipeline)
|
|
181
|
+
* ─────────────────────────────────────────────────────────────────────────
|
|
182
|
+
*
|
|
183
|
+
* When a locator/selector fails, elementus runs this pipeline:
|
|
184
|
+
*
|
|
185
|
+
* Step 1: Locator/Selector
|
|
186
|
+
* Try the original selector. If it works, done — zero overhead.
|
|
187
|
+
*
|
|
188
|
+
* Step 2: DOM Scoring
|
|
189
|
+
* Scan all interactive elements on the page. Score each by keyword
|
|
190
|
+
* and phrase relevance to the description. If one clear winner, use it.
|
|
191
|
+
* If multiple tied: send top candidates to LLM for disambiguation.
|
|
192
|
+
* If all identical (e.g., 10x "Edit" buttons): use positional LLM
|
|
193
|
+
* with coordinates ("first Edit button near the top").
|
|
194
|
+
*
|
|
195
|
+
* Step 3: Vision (last resort)
|
|
196
|
+
* Take a full-page screenshot with a 3x3 labeled grid overlay.
|
|
197
|
+
* Ask the vision LLM which region contains the target element.
|
|
198
|
+
* Scroll to that region, re-scan DOM. If still unresolved,
|
|
199
|
+
* ask LLM for precise pixel coordinates.
|
|
200
|
+
*
|
|
201
|
+
* ─────────────────────────────────────────────────────────────────────────
|
|
202
|
+
* 7. TIPS FOR WRITING DESCRIPTIONS
|
|
203
|
+
* ─────────────────────────────────────────────────────────────────────────
|
|
204
|
+
*
|
|
205
|
+
* Good descriptions use words that appear in or near the element:
|
|
206
|
+
* 'Submit order button' — matches <button>Submit</button>
|
|
207
|
+
* 'Email input field' — matches <input> near "Email" label
|
|
208
|
+
* 'Privacy Policy footer link' — matches <a>Privacy Policy</a>
|
|
209
|
+
*
|
|
210
|
+
* For identical elements, add positional context:
|
|
211
|
+
* 'first Edit button near the top'
|
|
212
|
+
* 'Delete button in the third row'
|
|
213
|
+
* 'Add to Cart for the last product'
|
|
214
|
+
*
|
|
215
|
+
* Avoid vague descriptions:
|
|
216
|
+
* BAD: 'the button' — matches every button
|
|
217
|
+
* BAD: 'click here' — no useful keywords
|
|
218
|
+
* GOOD: 'Save Changes button' — specific, matchable text
|
|
219
|
+
*
|
|
220
|
+
* ─────────────────────────────────────────────────────────────────────────
|
|
221
|
+
* 8. WHICH API TO USE FOR WHAT
|
|
222
|
+
* ─────────────────────────────────────────────────────────────────────────
|
|
223
|
+
*
|
|
224
|
+
* "I want to click a link/button":
|
|
225
|
+
* → Use wrapPage + { ai } or el.click
|
|
226
|
+
* await p.locator('#btn', { ai: 'Submit order' }).click()
|
|
227
|
+
*
|
|
228
|
+
* "I want to fill an input":
|
|
229
|
+
* → Use wrapPage + { ai }
|
|
230
|
+
* await p.locator('#email', { ai: 'Email input' }).fill('a@b.com')
|
|
231
|
+
*
|
|
232
|
+
* "I want to read text or an attribute":
|
|
233
|
+
* → Use wrapPage + { ai } or el.find
|
|
234
|
+
* const text = await p.locator('#price', { ai: 'Product price' }).textContent()
|
|
235
|
+
* const href = await p.locator('#link', { ai: 'Blog link' }).getAttribute('href')
|
|
236
|
+
*
|
|
237
|
+
* "I want to assert (expect) on an element":
|
|
238
|
+
* → Use el.find — returns a real locator you can pass to expect()
|
|
239
|
+
* const button = await el.find(page, 'Submit order button')
|
|
240
|
+
* await expect(button).toBeVisible()
|
|
241
|
+
* await expect(button).toHaveText('Submit')
|
|
242
|
+
* await expect(button).toHaveAttribute('href', '/checkout')
|
|
243
|
+
*
|
|
244
|
+
* "I want to check isVisible / isEnabled (boolean result)":
|
|
245
|
+
* → Just use { ai } — handled automatically. Boolean query methods
|
|
246
|
+
* (isVisible, isEnabled, isChecked, isHidden, isEditable) are detected
|
|
247
|
+
* by the Proxy and resolve via AI first, then query the real element.
|
|
248
|
+
* const vis = await p.locator('#gone', { ai: 'Submit' }).isVisible() // true
|
|
249
|
+
* const on = await p.locator('#gone', { ai: 'Checkbox' }).isChecked() // true/false
|
|
250
|
+
*
|
|
251
|
+
* "I want to navigate to a page via link click":
|
|
252
|
+
* → Use el.click — it uses page.goto(href) which is faster and avoids
|
|
253
|
+
* CSS hover/overlay issues that regular click() can trigger.
|
|
254
|
+
* await el.click(page, page.locator('#blog'), 'Blog page link')
|
|
255
|
+
*
|
|
256
|
+
* ─────────────────────────────────────────────────────────────────────────
|
|
257
|
+
*/
|
|
258
|
+
|
|
259
|
+
const fs = require('fs')
|
|
260
|
+
const path = require('path')
|
|
261
|
+
|
|
262
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
263
|
+
// Defaults & constants
|
|
264
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
265
|
+
|
|
266
|
+
// Baseline configuration; createElementus() spreads the caller's options over these.
const DEFAULTS = {
  // LLM provider: 'lmstudio' or 'gemini'
  provider: 'lmstudio',

  // LM Studio endpoint and model name
  lmStudioUrl: 'http://localhost:1234/v1/chat/completions',
  model: 'gemma-4-26b-a4b-it',

  // Gemini API key (GEMINI_API_KEY env var is the fallback) and model id
  geminiApiKey: null,
  geminiModel: 'gemini-2.5-flash',

  // Behavior
  maxCandidates: 20,

  // Debug screenshots are only written when debug is true AND debugDir is set
  debug: false,
  debugDir: null,

  // A user-supplied Set here is used instead of DEFAULT_STOP_WORDS
  stopWords: null,

  // Maximum screenshot width (px) sent to the vision LLM
  visionMaxWidth: 1280,
}

// Words ignored in element descriptions, grouped by kind.
const DEFAULT_STOP_WORDS = new Set([
  // articles, conjunctions, prepositions
  ...'the a an and or but in on at to for of with by from'.split(' '),
  // pronouns and auxiliary verbs
  ...'is it its this that be are was were has have had do does did will would not'.split(' '),
  // action / navigation vocabulary
  ...'link button click press navigate navigation nav'.split(' '),
  // layout vocabulary
  ...'page menu top bottom footer header sidebar bar'.split(' '),
  // generic element vocabulary
  ...'find locate element item icon label text section'.split(' '),
])

const INTERACTIVE_TAGS = [
  'a',
  'button',
  'input',
  'select',
  'textarea',
  'label',
  'summary',
]

const INTERACTIVE_ROLES = [
  'button',
  'link',
  'menuitem',
  'menuitemcheckbox',
  'menuitemradio',
  'tab',
  'checkbox',
  'radio',
  'option',
  'combobox',
  'switch',
  'treeitem',
  'gridcell',
]

// CSS selector list: native interactive tags plus a subset of ARIA roles.
const INTERACTIVE_SELECTORS = [
  'a',
  'button',
  'input',
  'select',
  'textarea',
  '[role="button"]',
  '[role="link"]',
  '[role="menuitem"]',
  '[role="tab"]',
  '[role="checkbox"]',
  '[role="radio"]',
].join(', ')

// 3x3 grid of region names, indexed as REGION_LABELS[row][column].
const REGION_LABELS = ['top', 'middle', 'bottom'].map((row) =>
  ['left', 'center', 'right'].map((column) => `${row}-${column}`)
)
|
|
298
|
+
|
|
299
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
300
|
+
// Factory
|
|
301
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
302
|
+
|
|
303
|
+
/**
|
|
304
|
+
* Create an Elementus instance with the given configuration.
|
|
305
|
+
*
|
|
306
|
+
* @param {Object} userConfig
|
|
307
|
+
* @param {'lmstudio'|'gemini'} [userConfig.provider='lmstudio'] - LLM provider
|
|
308
|
+
* @param {string} [userConfig.lmStudioUrl='http://localhost:1234/v1/chat/completions'] - LM Studio endpoint
|
|
309
|
+
* @param {string} [userConfig.model='gemma-4-26b-a4b-it'] - LM Studio model name
|
|
310
|
+
* @param {string|null} [userConfig.geminiApiKey=null] - Google Gemini API key (or GEMINI_API_KEY env var)
|
|
311
|
+
* @param {string} [userConfig.geminiModel='gemini-2.5-flash'] - Gemini model ID
|
|
312
|
+
* @param {number} [userConfig.maxCandidates=20] - max elements sent to LLM for disambiguation
|
|
313
|
+
* @param {boolean} [userConfig.debug=false] - save debug screenshots
|
|
314
|
+
* @param {string|null} [userConfig.debugDir=null] - directory for debug screenshots
|
|
315
|
+
* @param {Set<string>|null} [userConfig.stopWords=null] - custom stop words (replaces defaults)
|
|
316
|
+
* @param {number} [userConfig.visionMaxWidth=1280] - max screenshot width (px) sent to vision LLM
|
|
317
|
+
* @returns {{ wrap, wrapPage, wrapBrowser, locate, find, click }}
|
|
318
|
+
*/
|
|
319
|
+
function createElementus(userConfig = {}) {
|
|
320
|
+
const config = { ...DEFAULTS, ...userConfig }
|
|
321
|
+
const stopWords = config.stopWords || DEFAULT_STOP_WORDS
|
|
322
|
+
|
|
323
|
+
// ── Driver adapter — auto-detects Playwright vs WDIO ─────────────────
|
|
324
|
+
|
|
325
|
+
/**
 * Run `fn` through whichever script-execution method `ctx` exposes:
 * evaluate() (Playwright) is preferred, execute() (WDIO) is the fallback.
 *
 * @param {Object} ctx - page/browser object
 * @param {Function} fn - function handed to the driver
 * @param {*} [args] - optional single argument; omitted from the call when undefined
 * @returns {*} whatever the driver method returns
 * @throws {Error} when ctx has neither evaluate() nor execute()
 */
function _eval(ctx, fn, args) {
  let runner = null
  if (typeof ctx.evaluate === 'function') {
    runner = 'evaluate'
  } else if (typeof ctx.execute === 'function') {
    runner = 'execute'
  }

  if (runner === null) {
    throw new Error('Context must have evaluate() (Playwright) or execute() (WDIO)')
  }

  // Only forward `args` when the caller actually supplied one.
  if (args === undefined) return ctx[runner](fn)
  return ctx[runner](fn, args)
}
|
|
335
|
+
|
|
336
|
+
/**
 * Capture a PNG screenshot through whichever API `ctx` exposes.
 *
 * @param {Object} ctx - object with screenshot() (Playwright) or takeScreenshot() (WDIO)
 * @param {boolean} [fullPage=false] - forwarded to screenshot(); not used on the takeScreenshot() path
 * @returns {Promise<{buffer: Buffer, base64: string}>} the image as both a Buffer and a base64 string
 * @throws {Error} when ctx has neither method
 */
async function _screenshot(ctx, fullPage = false) {
  // Playwright path: screenshot() yields a Buffer.
  if (typeof ctx.screenshot === 'function') {
    const png = await ctx.screenshot({ type: 'png', fullPage, scale: 'css' })
    const encoded = png.toString('base64')
    return { buffer: png, base64: encoded }
  }

  if (typeof ctx.takeScreenshot !== 'function') {
    throw new Error('Context must have screenshot() (Playwright) or takeScreenshot() (WDIO)')
  }

  // WDIO path: takeScreenshot() yields a base64 string.
  const encoded = await ctx.takeScreenshot()
  const png = Buffer.from(encoded, 'base64')
  return { buffer: png, base64: encoded }
}
|
|
349
|
+
|
|
350
|
+
/**
 * Navigate `ctx` to `url` using goto() (Playwright) or url() (WDIO).
 * When neither method exists and the context is native, this is a no-op.
 *
 * @param {Object} ctx - page/browser/driver object
 * @param {string} url - destination
 * @returns {Promise<*>} the driver's navigation result, or undefined on the native no-op path
 * @throws {Error} when ctx has neither goto() nor url() and is not native
 */
async function _goto(ctx, url) {
  const hasGoto = typeof ctx.goto === 'function'
  if (hasGoto) {
    return ctx.goto(url, { waitUntil: 'load' })
  }

  const hasUrl = typeof ctx.url === 'function'
  if (hasUrl) {
    return ctx.url(url)
  }

  if (!_isNative(ctx)) {
    throw new Error('Context must have goto() (Playwright) or url() (WDIO)')
  }
  // Native apps have no URL navigation, so there is nothing to do.
}
|
|
357
|
+
|
|
358
|
+
/**
 * Pause for `ms` milliseconds via waitForTimeout() or, failing that, pause().
 * Resolves to undefined without waiting when ctx has neither method.
 *
 * @param {Object} ctx - page/browser object
 * @param {number} ms - delay passed straight to the driver method
 * @returns {Promise<*>}
 */
async function _wait(ctx, ms) {
  for (const method of ['waitForTimeout', 'pause']) {
    if (typeof ctx[method] === 'function') {
      return ctx[method](ms)
    }
  }
}
|
|
362
|
+
|
|
363
|
+
/**
 * Build an element handle for `selector` with locator() (Playwright) or $() (WDIO).
 *
 * @param {Object} ctx - page/browser object
 * @param {string} selector - selector passed through unchanged
 * @returns {Promise<*>} the driver's locator/element
 * @throws {Error} when ctx has neither locator() nor $()
 */
async function _makeLocator(ctx, selector) {
  const hasLocator = typeof ctx.locator === 'function'
  if (hasLocator) {
    return ctx.locator(selector)
  }

  const hasDollar = typeof ctx.$ === 'function'
  if (!hasDollar) {
    throw new Error('Context must have locator() (Playwright) or $() (WDIO)')
  }
  return ctx.$(selector)
}
|
|
369
|
+
|
|
370
|
+
/**
 * Report whether `ctx` exposes an evaluate() method, which this module
 * treats as the marker of a Playwright context.
 *
 * @param {Object} ctx
 * @returns {boolean}
 */
function _isPlaywright(ctx) {
  const evaluateKind = typeof ctx.evaluate
  return evaluateKind === 'function'
}
|
|
373
|
+
|
|
374
|
+
/**
 * Decide whether `ctx` should be treated as an Appium native-app driver.
 *
 * The baseline heuristic is "has getPageSource() and no evaluate()". A plain
 * WDIO *web* browser also satisfies that, so a context that explicitly says
 * it is not native is honoured first: `isNativeContext === false` or
 * `isMobile === false`.
 * NOTE(review): these are assumed to be the WebdriverIO session flags —
 * confirm against the WDIO version in use. Contexts that do not define them
 * keep the original heuristic unchanged.
 *
 * @param {Object} ctx - page/browser/driver object
 * @returns {boolean} true when the native XML pipeline should be used
 */
function _isNative(ctx) {
  if (typeof ctx.getPageSource !== 'function') return false
  if (typeof ctx.evaluate === 'function') return false
  // Strict `=== false` so an undefined flag never changes the outcome.
  if (ctx.isNativeContext === false || ctx.isMobile === false) return false
  return true
}
|
|
380
|
+
|
|
381
|
+
// ── LLM helpers — multi-provider ─────────────────────────────────────
|
|
382
|
+
|
|
383
|
+
/**
 * Send a text-only prompt to the configured LM Studio chat-completions
 * endpoint and return the trimmed reply.
 *
 * @param {string} prompt - user message content
 * @param {number} maxTokens - sent as `max_tokens`
 * @returns {Promise<string>} trimmed `choices[0].message.content`
 * @throws {Error} on a non-OK HTTP status, or when the response body carries
 *   no string content (reported explicitly rather than as an opaque TypeError)
 */
async function _lmStudioText(prompt, maxTokens) {
  const res = await fetch(config.lmStudioUrl, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model: config.model,
      messages: [{ role: 'user', content: prompt }],
      max_tokens: maxTokens, temperature: 0
    })
  })
  if (!res.ok) throw new Error(`LM Studio ${res.status}: ${await res.text()}`)

  const data = await res.json()
  const content = data?.choices?.[0]?.message?.content
  if (typeof content !== 'string') {
    // Include a bounded excerpt of the payload so the failure is diagnosable.
    throw new Error(`Unexpected LM Studio response structure: ${String(JSON.stringify(data)).slice(0, 500)}`)
  }
  return content.trim()
}
|
|
396
|
+
|
|
397
|
+
/**
 * Send a prompt plus a PNG image to the configured LM Studio
 * chat-completions endpoint and return the trimmed reply.
 *
 * @param {string} prompt - text part of the user message
 * @param {string} base64Image - base64 PNG data, embedded as a data URL
 * @param {number} maxTokens - sent as `max_tokens`
 * @returns {Promise<string>} trimmed `choices[0].message.content`
 * @throws {Error} on a non-OK HTTP status, or when the response body carries
 *   no string content (reported explicitly rather than as an opaque TypeError)
 */
async function _lmStudioVision(prompt, base64Image, maxTokens) {
  const res = await fetch(config.lmStudioUrl, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model: config.model,
      messages: [{ role: 'user', content: [
        { type: 'text', text: prompt },
        { type: 'image_url', image_url: { url: `data:image/png;base64,${base64Image}` } }
      ]}],
      max_tokens: maxTokens, temperature: 0
    })
  })
  if (!res.ok) throw new Error(`LM Studio ${res.status}: ${await res.text()}`)

  const data = await res.json()
  const content = data?.choices?.[0]?.message?.content
  if (typeof content !== 'string') {
    // Include a bounded excerpt of the payload so the failure is diagnosable.
    throw new Error(`Unexpected LM Studio response structure: ${String(JSON.stringify(data)).slice(0, 500)}`)
  }
  return content.trim()
}
|
|
413
|
+
|
|
414
|
+
/**
 * Build the Gemini `generateContent` URL for the configured model.
 * The key comes from config.geminiApiKey, falling back to the
 * GEMINI_API_KEY environment variable.
 *
 * @returns {string} endpoint URL with the key in the `key` query parameter
 * @throws {Error} when no API key is available
 */
function _geminiUrl() {
  const apiKey = config.geminiApiKey || process.env.GEMINI_API_KEY
  if (apiKey) {
    const modelsRoot = 'https://generativelanguage.googleapis.com/v1beta/models'
    return `${modelsRoot}/${config.geminiModel}:generateContent?key=${apiKey}`
  }
  throw new Error('Gemini API key required: set geminiApiKey or GEMINI_API_KEY env var')
}
|
|
419
|
+
|
|
420
|
+
/**
 * Pull the answer text out of a Gemini `generateContent` response.
 *
 * Parts are scanned from last to first. A text part not flagged `thought`
 * wins; if none exists, the last part with any text is used.
 *
 * @param {Object} data - parsed response body
 * @returns {string} trimmed text of the selected part
 * @throws {Error} when `candidates[0].content.parts` is missing (the first
 *   500 characters of the payload are logged first), or when no part has text
 */
function _geminiExtractText(data) {
  const parts = data.candidates?.[0]?.content?.parts
  if (!parts) {
    console.log(`[LLM] Gemini raw response: ${JSON.stringify(data).slice(0, 500)}`)
    throw new Error('Unexpected Gemini response structure')
  }

  const newestFirst = Array.from(parts).reverse()
  // Prefer real output over "thought" parts emitted by thinking models.
  const chosen =
    newestFirst.find((part) => part.text && !part.thought) ||
    newestFirst.find((part) => part.text)

  if (!chosen) throw new Error('No text in Gemini response')
  return chosen.text.trim()
}
|
|
436
|
+
|
|
437
|
+
/**
 * Send a text-only prompt to Gemini and return the extracted answer text.
 * The request asks for a JSON response (`responseMimeType`) with the
 * thinking budget set to 0.
 *
 * @param {string} prompt - prompt text
 * @param {number} maxTokens - sent as `maxOutputTokens`
 * @returns {Promise<string>} result of _geminiExtractText() on the response body
 * @throws {Error} on a non-OK HTTP status (plus anything _geminiUrl/_geminiExtractText throw)
 */
async function _geminiText(prompt, maxTokens) {
  const generationConfig = {
    maxOutputTokens: maxTokens,
    temperature: 0,
    responseMimeType: 'application/json',
    thinkingConfig: { thinkingBudget: 0 },
  }
  const payload = {
    contents: [{ parts: [{ text: prompt }] }],
    generationConfig,
  }

  const response = await fetch(_geminiUrl(), {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  })

  if (response.ok) {
    return _geminiExtractText(await response.json())
  }
  throw new Error(`Gemini ${response.status}: ${await response.text()}`)
}
|
|
449
|
+
|
|
450
|
+
/**
 * Send a prompt plus a PNG image to Gemini and return the extracted answer text.
 * The request asks for a JSON response (`responseMimeType`) with the
 * thinking budget set to 0.
 *
 * @param {string} prompt - prompt text
 * @param {string} base64Image - base64 PNG data, sent as `inline_data`
 * @param {number} maxTokens - sent as `maxOutputTokens`
 * @returns {Promise<string>} result of _geminiExtractText() on the response body
 * @throws {Error} on a non-OK HTTP status (plus anything _geminiUrl/_geminiExtractText throw)
 */
async function _geminiVision(prompt, base64Image, maxTokens) {
  const textPart = { text: prompt }
  const imagePart = { inline_data: { mime_type: 'image/png', data: base64Image } }
  const generationConfig = {
    maxOutputTokens: maxTokens,
    temperature: 0,
    responseMimeType: 'application/json',
    thinkingConfig: { thinkingBudget: 0 },
  }
  const payload = {
    contents: [{ parts: [textPart, imagePart] }],
    generationConfig,
  }

  const response = await fetch(_geminiUrl(), {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  })

  if (response.ok) {
    return _geminiExtractText(await response.json())
  }
  throw new Error(`Gemini ${response.status}: ${await response.text()}`)
}
|
|
465
|
+
|
|
466
|
+
/**
 * Ask the configured LLM a text-only question and log how long it took.
 * Provider 'gemini' goes to Gemini; any other value goes to LM Studio.
 *
 * @param {string} prompt - prompt text
 * @param {number} [maxTokens=131072] - output token limit forwarded to the provider
 * @returns {Promise<string>} the provider's reply text
 */
async function askLLMText(prompt, maxTokens = 131072) {
  const startedAt = Date.now()
  const backend = config.provider === 'gemini' ? _geminiText : _lmStudioText
  const answer = await backend(prompt, maxTokens)
  console.log(`[LLM] Text response: ${Date.now() - startedAt}ms`)
  return answer
}
|
|
472
|
+
|
|
473
|
+
/**
 * Ask the configured LLM a question about an image and log how long it took.
 * Provider 'gemini' goes to Gemini; any other value goes to LM Studio.
 *
 * @param {string} prompt - prompt text
 * @param {string} base64Image - base64-encoded PNG
 * @param {number} [maxTokens=131072] - output token limit forwarded to the provider
 * @returns {Promise<string>} the provider's reply text
 */
async function askLLMVision(prompt, base64Image, maxTokens = 131072) {
  const startedAt = Date.now()
  const backend = config.provider === 'gemini' ? _geminiVision : _lmStudioVision
  const answer = await backend(prompt, base64Image, maxTokens)
  console.log(`[LLM] Vision response: ${Date.now() - startedAt}ms`)
  return answer
}
|
|
479
|
+
|
|
480
|
+
/**
 * Extract and parse the first JSON object embedded in `content`.
 *
 * Scanning starts at the first `{` and stops at its matching `}`. Braces
 * inside string literals (including after escaped quotes) are ignored, so a
 * value such as `"a } b"` no longer ends the object early.
 *
 * @param {string} content - text that contains a JSON object somewhere
 * @returns {*} the parsed object
 * @throws {Error} when there is no `{`, or the braces never balance
 * @throws {SyntaxError} when the balanced span is not valid JSON
 */
function parseJSON(content) {
  const start = content.indexOf('{')
  if (start === -1) throw new Error(`No JSON found in: ${content}`)

  let depth = 0
  let inString = false
  let escaped = false

  for (let i = start; i < content.length; i++) {
    const ch = content[i]

    if (inString) {
      // Inside a string only an unescaped quote matters.
      if (escaped) escaped = false
      else if (ch === '\\') escaped = true
      else if (ch === '"') inString = false
      continue
    }

    if (ch === '"') {
      inString = true
    } else if (ch === '{') {
      depth++
    } else if (ch === '}') {
      depth--
      if (depth === 0) return JSON.parse(content.slice(start, i + 1))
    }
  }
  throw new Error(`Unbalanced JSON in: ${content}`)
}
|
|
493
|
+
|
|
494
|
+
/**
 * Write a debug artifact into config.debugDir, creating the directory
 * (recursively) first. Does nothing unless config.debug is truthy and
 * config.debugDir is set.
 *
 * @param {string} filename - file name inside debugDir
 * @param {Buffer|string} buffer - data passed to fs.writeFileSync
 * @returns {void}
 */
function saveDebug(filename, buffer) {
  const enabled = config.debug && config.debugDir
  if (enabled) {
    fs.mkdirSync(config.debugDir, { recursive: true })
    const target = path.join(config.debugDir, filename)
    fs.writeFileSync(target, buffer)
  }
}
|
|
499
|
+
|
|
500
|
+
/**
 * Downscale a screenshot to at most config.visionMaxWidth pixels wide.
 *
 * The resize runs inside the page via _eval(): the PNG is decoded into an
 * Image, drawn onto a canvas of the target size, and re-encoded as PNG.
 * Screenshots that are already narrow enough are returned untouched.
 *
 * @param {Object} ctx - context handed to _eval()
 * @param {{base64: string}} shot - screenshot as produced by _screenshot()
 * @param {number} origWidth - width of `shot` in pixels
 * @param {number} origHeight - height of `shot` in pixels
 * @returns {Promise<{base64: string, scale: number}>} image data plus the
 *   factor by which original coordinates exceed resized ones (1 = unchanged)
 * @throws {Error} when the in-page image fails to load. Previously the
 *   promise never settled in that case, hanging the caller.
 */
async function _resizeScreenshot(ctx, shot, origWidth, origHeight) {
  const maxW = config.visionMaxWidth
  if (origWidth <= maxW) return { base64: shot.base64, scale: 1 }

  const scale = origWidth / maxW
  const newH = Math.round(origHeight / scale)

  // The callback executes in the browser and must stay self-contained.
  const resized = await _eval(ctx, ({ b64, w, h }) => {
    const img = new Image()
    const canvas = document.createElement('canvas')
    canvas.width = w
    canvas.height = h
    return new Promise((resolve, reject) => {
      img.onload = () => {
        canvas.getContext('2d').drawImage(img, 0, 0, w, h)
        resolve(canvas.toDataURL('image/png').split(',')[1])
      }
      // Without this handler a decode failure would leave the promise pending forever.
      img.onerror = () => reject(new Error('Screenshot image failed to load for resizing'))
      img.src = 'data:image/png;base64,' + b64
    })
  }, { b64: shot.base64, w: maxW, h: newH })

  console.log(`[Vision] Resized screenshot: ${origWidth}×${origHeight} → ${maxW}×${newH} (scale ${scale.toFixed(2)}x)`)
  return { base64: resized, scale }
}
|
|
520
|
+
|
|
521
|
+
// ── Native app XML parsing (Appium) ───────────────────────────────
|
|
522
|
+
|
|
523
|
+
// Element types (lower-cased tag names) that are always treated as
// interactive when found in an Android or iOS page-source tree.
const NATIVE_INTERACTIVE = new Set([
  // Android
  'android.widget.button', 'android.widget.imagebutton', 'android.widget.edittext',
  'android.widget.checkbox', 'android.widget.radiobutton', 'android.widget.switch',
  'android.widget.togglebutton', 'android.widget.spinner', 'android.widget.imageview',
  'android.widget.textview', 'android.view.view',
  // iOS
  'xcuielementtypebutton', 'xcuielementtypetextfield', 'xcuielementtypesecuretextfield',
  'xcuielementtypeswitch', 'xcuielementtypepicker', 'xcuielementtypeimage',
  'xcuielementtypestatictext', 'xcuielementtypeother', 'xcuielementtypecell',
  'xcuielementtypelink',
])

/**
 * Lightweight scan of Appium page-source XML that extracts labelled,
 * positioned, interactive elements.
 *
 * An element is kept when it has non-empty text (first non-empty of `text`,
 * `content-desc`, `label`, `name`, `value`), a usable centre point, and is
 * interactive: its tag is in NATIVE_INTERACTIVE, or `clickable`/`enabled`
 * is "true".
 *
 * Attributes are parsed into a name → value map per tag, so:
 *  - lookups match whole attribute names (`x` no longer matches the tail of
 *    `index="…"`, which used to corrupt iOS coordinates);
 *  - an attribute with no closing quote is ignored instead of yielding garbage;
 *  - XML entities in values (`&amp;`, `&quot;`, `&#10;`, …) are decoded.
 *
 * @param {string} xmlSource - XML returned by getPageSource()
 * @returns {Array<Object>} records with text, tag, role, href (always null),
 *   docX/docY (element centre), _resourceId, _accessibilityId and _xpath (null)
 */
function _parseNativeXml(xmlSource) {
  const elements = []
  // Opening or self-closing tags that carry at least one attribute.
  const tagRegex = /<([a-zA-Z0-9._]+)\s([^>]*?)\/?>/g
  const attrRegex = /([A-Za-z_:][\w:.-]*)\s*=\s*"([^"]*)"/g
  const entityRegex = /&(#x[0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g
  const namedEntities = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" }

  const decodeXml = (value) => value.replace(entityRegex, (whole, ref) => {
    if (ref[0] !== '#') return namedEntities[ref]
    const code = ref[1] === 'x' ? parseInt(ref.slice(2), 16) : parseInt(ref.slice(1), 10)
    // Leave out-of-range references untouched rather than throwing.
    return code <= 0x10ffff ? String.fromCodePoint(code) : whole
  })

  for (const match of String(xmlSource).matchAll(tagRegex)) {
    const tagName = match[1].toLowerCase()

    // Parse the attribute list once; the first occurrence of a name wins.
    const attrs = new Map()
    for (const [, name, raw] of match[2].matchAll(attrRegex)) {
      if (!attrs.has(name)) attrs.set(name, decodeXml(raw))
    }
    const get = (name) => (attrs.has(name) ? attrs.get(name) : null)

    // Text lives under different attribute names depending on the platform.
    const text = (get('text') || get('content-desc') || get('label') ||
      get('name') || get('value') || '').trim()
    if (!text) continue

    // Bounds — Android: bounds="[x1,y1][x2,y2]"; iOS: x, y, width, height attributes.
    let docX = 0
    let docY = 0
    const bounds = get('bounds')
    if (bounds) {
      const bm = bounds.match(/\[(\d+),(\d+)\]\[(\d+),(\d+)\]/)
      if (bm) {
        docX = Math.round((+bm[1] + +bm[3]) / 2)
        docY = Math.round((+bm[2] + +bm[4]) / 2)
      }
    } else {
      const x = +(get('x') || 0)
      const y = +(get('y') || 0)
      const w = +(get('width') || 0)
      const h = +(get('height') || 0)
      if (w === 0 || h === 0) continue
      docX = Math.round(x + w / 2)
      docY = Math.round(y + h / 2)
    }

    // No usable position (also covers a `bounds` value that failed to parse).
    if (docX <= 0 && docY <= 0) continue

    const clickable = get('clickable') === 'true' || get('enabled') === 'true'
    if (!NATIVE_INTERACTIVE.has(tagName) && !clickable) continue

    elements.push({
      text: text.replace(/\s+/g, ' '),
      tag: tagName.split('.').pop(), // 'android.widget.button' → 'button'
      role: get('class') || tagName,
      href: null, // native apps don't have hrefs
      docX,
      docY,
      // Native-specific identifiers used later to build a locator
      _resourceId: get('resource-id') || null,
      _accessibilityId: get('content-desc') || get('accessibility-id') || get('label') || null,
      _xpath: null, // set later if needed
    })
  }

  return elements
}
|
|
606
|
+
|
|
607
|
+
/**
 * Fetch the native page source from `ctx` and parse it into element records.
 *
 * @param {Object} ctx - driver exposing getPageSource()
 * @returns {Promise<Array<Object>>} output of _parseNativeXml()
 */
async function getAllElementsNative(ctx) {
  const parsed = _parseNativeXml(await ctx.getPageSource())
  console.log(`[Native] Parsed ${parsed.length} interactive elements from page source`)
  return parsed
}
|
|
613
|
+
|
|
614
|
+
/**
 * Build an Appium element handle from a parsed native element record
 * (no DOM attribute stamping).
 *
 * Lookup priority:
 *   1. accessibility id  → `~<id>`
 *   2. resource id       → Android UiSelector `resourceId("…")`
 *   3. text              → `~<text>` if such an element exists,
 *                          otherwise Android UiSelector `text("…")`
 *
 * Values placed inside UiSelector string literals have backslashes and double
 * quotes escaped. Previously only quotes were escaped (and only for text), so
 * a trailing backslash or a quoted resource id broke the selector.
 *
 * @param {Object} ctx - driver exposing $()
 * @param {Object} element - record with text, _accessibilityId and _resourceId
 * @returns {Promise<*>} whatever ctx.$() yields for the chosen selector
 */
async function markByElementNative(ctx, element) {
  // Escape backslashes first so the quote escapes are not doubled afterwards.
  const quoteForUiSelector = (value) =>
    String(value).replace(/\\/g, '\\\\').replace(/"/g, '\\"')

  if (element._accessibilityId) {
    console.log(`[Resolve] Native: accessibility-id "${element._accessibilityId}"`)
    return ctx.$(`~${element._accessibilityId}`)
  }
  if (element._resourceId) {
    console.log(`[Resolve] Native: resource-id "${element._resourceId}"`)
    return ctx.$(`android=new UiSelector().resourceId("${quoteForUiSelector(element._resourceId)}")`)
  }

  // Fallback: find by text content.
  console.log(`[Resolve] Native: text "${element.text}"`)

  // Try the text as an accessibility id first (works cross-platform). This
  // probe is deliberately best-effort: any failure, including a synchronous
  // throw from $(), just moves on to the UiSelector fallback.
  let found = null
  try {
    found = await ctx.$(`~${element.text}`)
  } catch {
    found = null
  }
  if (found && await found.isExisting()) return found

  // Android UiSelector fallback
  return ctx.$(`android=new UiSelector().text("${quoteForUiSelector(element.text)}")`)
}
|
|
634
|
+
|
|
635
|
+
// ── DOM scanning (web) ───────────────────────────────────────────────
|
|
636
|
+
|
|
637
|
+
// Collect candidate elements for the current context.
// Native contexts are delegated to getAllElementsNative. For web contexts the
// collector below is handed to _eval together with the selector list, so it
// only relies on its `selectors` argument and on page globals.
// Each web entry is { text, tag, role, href, docX, docY } where docX/docY is
// the element's center in document coordinates.
async function getAllElements(ctx) {
  if (_isNative(ctx)) return getAllElementsNative(ctx)

  const collectInPage = ({ selectors }) => {
    // Describe one node; null when it has no box, its center lies outside
    // [0, innerWidth] horizontally, or it has no text.
    const describe = node => {
      const box = node.getBoundingClientRect()
      if (box.width === 0 || box.height === 0) return null
      const centerX = Math.round(box.left + window.scrollX + box.width / 2)
      if (centerX < 0 || centerX > window.innerWidth) return null
      const label = node.textContent.trim().replace(/\s+/g, ' ')
      if (!label) return null
      const centerY = Math.round(box.top + window.scrollY + box.height / 2)
      return {
        text: label,
        tag: node.tagName.toLowerCase(),
        role: node.getAttribute('role') || null,
        href: node.getAttribute('href') || null,
        docX: centerX,
        docY: centerY,
      }
    }

    const visited = new Set()
    const found = []

    // Fast pass: interactive selectors + onclick + tabindex (no getComputedStyle)
    document.querySelectorAll(selectors + ',[onclick],[tabindex]').forEach(node => {
      if (visited.has(node)) return
      visited.add(node)
      const info = describe(node)
      if (info) found.push(info)
    })

    // Slow pass: cursor:pointer elements not caught by selectors
    document.querySelectorAll('*').forEach(node => {
      if (visited.has(node)) return
      if (window.getComputedStyle(node).cursor !== 'pointer') return
      const info = describe(node)
      if (info) found.push(info)
    })

    return found
  }

  return _eval(ctx, collectInPage, { selectors: INTERACTIVE_SELECTORS })
}
|
|
677
|
+
|
|
678
|
+
// ── Scoring ──────────────────────────────────────────────────────────
|
|
679
|
+
|
|
680
|
+
// Lower-case a description and split it into word tokens.
//
// Letters, combining marks and digits of any script are kept, plus "&";
// every other character acts as a separator. The class used to be ASCII-only,
// which cut non-English words apart ("zurück" -> "zur", "ck").
// One-character tokens are dropped. A null/undefined description yields [].
function _normalizeWords(description) {
  return String(description ?? '')
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}& ]/gu, ' ')
    .split(/\s+/)
    .filter(word => word.length > 1)
}
|
|
683
|
+
|
|
684
|
+
// Split a description into scoring inputs.
//   keywords - every normalized word that is not a stop word, in order
//   phrases  - each run of two or more consecutive non-stop words, joined
//              with single spaces (a stop word ends the current run)
function extractKeywordsAndPhrases(description) {
  const keywords = []
  const phrases = []
  let streak = []

  // Close the current run, keeping it only when it spans at least two words.
  const closeStreak = () => {
    if (streak.length >= 2) phrases.push(streak.join(' '))
    streak = []
  }

  for (const token of _normalizeWords(description)) {
    if (stopWords.has(token)) {
      closeStreak()
      continue
    }
    keywords.push(token)
    streak.push(token)
  }
  closeStreak()

  return { keywords, phrases }
}
|
|
695
|
+
|
|
696
|
+
// Score a candidate against a description: 3 points for every phrase and
// 1 point for every keyword that occurs in the candidate's lower-cased text
// (el._ltext) or lower-cased href (el._lhref).
function scoreCandidate(el, keywords, phrases) {
  const mentions = needle => el._ltext.includes(needle) || el._lhref.includes(needle)
  let total = 0
  for (const phrase of phrases) {
    if (mentions(phrase)) total += 3
  }
  for (const keyword of keywords) {
    if (mentions(keyword)) total += 1
  }
  return total
}
|
|
700
|
+
|
|
701
|
+
// ── Element resolution ───────────────────────────────────────────────
|
|
702
|
+
|
|
703
|
+
async function findElementInDOM(ctx, description, regionBounds = null) {
|
|
704
|
+
let elements = await getAllElements(ctx)
|
|
705
|
+
|
|
706
|
+
if (elements.length === 0) {
|
|
707
|
+
for (let attempt = 0; attempt < 3; attempt++) {
|
|
708
|
+
console.log(`[DOM] 0 elements \u2014 waiting for render (${attempt + 1}/3)`)
|
|
709
|
+
await _wait(ctx, 1000)
|
|
710
|
+
elements = await getAllElements(ctx)
|
|
711
|
+
if (elements.length > 0) break
|
|
712
|
+
}
|
|
713
|
+
}
|
|
714
|
+
|
|
715
|
+
const seen = new Set()
|
|
716
|
+
elements = elements.filter(e => {
|
|
717
|
+
const key = `${e.text}|${e.docX}|${e.docY}`
|
|
718
|
+
return seen.has(key) ? false : seen.add(key)
|
|
719
|
+
})
|
|
720
|
+
|
|
721
|
+
if (regionBounds) {
|
|
722
|
+
const { x1, y1, x2, y2 } = regionBounds
|
|
723
|
+
elements = elements.filter(e => e.docX >= x1 && e.docX <= x2 && e.docY >= y1 && e.docY <= y2)
|
|
724
|
+
console.log(`[DOM] ${elements.length} elements in region`)
|
|
725
|
+
} else {
|
|
726
|
+
console.log(`[DOM] ${elements.length} elements on page`)
|
|
727
|
+
}
|
|
728
|
+
|
|
729
|
+
if (elements.length === 0) return null
|
|
730
|
+
|
|
731
|
+
// Pre-compute lowercase text/href for scoring (avoids repeated toLowerCase per element)
|
|
732
|
+
elements.forEach(e => { e._ltext = e.text.toLowerCase(); e._lhref = (e.href || '').toLowerCase() })
|
|
733
|
+
|
|
734
|
+
const { keywords, phrases } = extractKeywordsAndPhrases(description)
|
|
735
|
+
console.log(`[DOM] Keywords: [${keywords.join(', ')}] | Phrases: [${phrases.join(', ')}]`)
|
|
736
|
+
|
|
737
|
+
const scored = elements
|
|
738
|
+
.map(e => ({ ...e, score: scoreCandidate(e, keywords, phrases) }))
|
|
739
|
+
.filter(e => e.score > 0)
|
|
740
|
+
.sort((a, b) => b.score - a.score)
|
|
741
|
+
|
|
742
|
+
if (scored.length === 0) {
|
|
743
|
+
if (!regionBounds) { console.log(`[DOM] No matches \u2014 signalling vision`); return null }
|
|
744
|
+
const capped = elements.slice(0, config.maxCandidates)
|
|
745
|
+
console.log(`[DOM] No matches in region \u2014 sending ${capped.length} to LLM`)
|
|
746
|
+
return disambiguateWithLLM(capped, description)
|
|
747
|
+
}
|
|
748
|
+
|
|
749
|
+
const topScore = scored[0].score
|
|
750
|
+
const topMatches = scored.filter(e => e.score === topScore)
|
|
751
|
+
console.log(`[DOM] Top score: ${topScore} | Top matches: ${topMatches.length}`)
|
|
752
|
+
topMatches.slice(0, 5).forEach(e => console.log(` [${e.score}] <${e.role || e.tag}> "${e.text}"`))
|
|
753
|
+
|
|
754
|
+
if (topMatches.length === 1) {
|
|
755
|
+
console.log(`[DOM] Clear match: "${topMatches[0].text}"`)
|
|
756
|
+
return topMatches[0]
|
|
757
|
+
}
|
|
758
|
+
|
|
759
|
+
if (!regionBounds && topMatches.length / elements.length > 0.4) {
|
|
760
|
+
console.log(`[DOM] Keyword too generic \u2014 signalling vision`); return null
|
|
761
|
+
}
|
|
762
|
+
|
|
763
|
+
const firstHref = topMatches[0].href || ''
|
|
764
|
+
const shortestLen = Math.min(...topMatches.map(e => e.text.length))
|
|
765
|
+
const firstPrefix = topMatches[0].text.slice(0, shortestLen).toLowerCase()
|
|
766
|
+
const allIdentical = topMatches.every(e =>
|
|
767
|
+
e.text.slice(0, shortestLen).toLowerCase() === firstPrefix && (e.href || '') === firstHref
|
|
768
|
+
)
|
|
769
|
+
if (allIdentical) {
|
|
770
|
+
console.log(`[DOM] ${topMatches.length} identical ("${firstPrefix}") \u2014 positional LLM`)
|
|
771
|
+
return disambiguateWithPosition(topMatches, description)
|
|
772
|
+
}
|
|
773
|
+
|
|
774
|
+
const capped = topMatches.slice(0, config.maxCandidates)
|
|
775
|
+
console.log(`[DOM] ${capped.length} tied \u2014 LLM disambiguating...`)
|
|
776
|
+
return disambiguateWithLLM(capped, description)
|
|
777
|
+
}
|
|
778
|
+
|
|
779
|
+
// Ask the text LLM to choose one of several differing candidates.
// The prompt lists every candidate as `[i] <role-or-tag> "text"` (plus its
// href when present) and requests a JSON reply of the form {"index": n}.
// Resolves to the chosen candidate, or null when the LLM call fails or the
// reply does not contain a usable in-range index.
async function disambiguateWithLLM(candidates, description) {
  const lines = []
  candidates.forEach((cand, idx) => {
    const hint = cand.href ? ` \u2192 ${cand.href}` : ''
    lines.push(`[${idx}] <${cand.role || cand.tag}> "${cand.text}"${hint}`)
  })
  const list = lines.join('\n')

  let reply
  try {
    reply = await askLLMText(
      `I need to click: "${description}"\n\nCandidates:\n${list}\n\nReturn ONLY JSON: {"index": <number>}`)
  } catch (err) {
    console.log(`[DOM] LLM failed: ${err.message}`)
    return null
  }
  console.log(`[DOM] LLM response: ${reply}`)

  let pick = null
  try {
    const { index } = parseJSON(reply)
    if (typeof index === 'number' && isFinite(index)) pick = Math.round(index)
  } catch {
    // Unparsable reply: pick stays null and is reported as invalid below.
  }

  const inRange = pick !== null && pick >= 0 && pick < candidates.length
  if (!inRange) {
    console.log(`[DOM] Invalid index (${pick}) \u2014 signalling vision`)
    return null
  }

  const winner = candidates[pick]
  console.log(`[DOM] Chose [${pick}]: "${winner.text}" at doc(${winner.docX}, ${winner.docY})`)
  return winner
}
|
|
799
|
+
|
|
800
|
+
// Ask the text LLM to choose among look-alike candidates by position.
// At most config.maxCandidates entries are offered, each listed with its
// document coordinates. Resolves to the chosen candidate, or null when the
// LLM call fails or the reply has no usable in-range index.
async function disambiguateWithPosition(candidates, description) {
  const offered = candidates.slice(0, config.maxCandidates)
  const lines = []
  offered.forEach((cand, idx) => {
    lines.push(`[${idx}] <${cand.role || cand.tag}> "${cand.text}" at position (x=${cand.docX}, y=${cand.docY})`)
  })
  const list = lines.join('\n')

  let reply
  try {
    reply = await askLLMText(
      `I need to click: "${description}"\n\n` +
      `Identical elements at different positions. Smaller y = higher on page.\n\n` +
      `${list}\n\nReturn ONLY JSON: {"index": <number>}`)
  } catch (err) {
    console.log(`[DOM] Positional LLM failed: ${err.message}`)
    return null
  }
  console.log(`[DOM] Positional LLM: ${reply}`)

  let pick = null
  try {
    const { index } = parseJSON(reply)
    if (typeof index === 'number' && isFinite(index)) pick = Math.round(index)
  } catch {
    // Unparsable reply: pick stays null and the function resolves to null.
  }

  const inRange = pick !== null && pick >= 0 && pick < offered.length
  if (!inRange) return null

  const winner = offered[pick]
  console.log(`[DOM] Positional: chose [${pick}] at doc(${winner.docX}, ${winner.docY})`)
  return winner
}
|
|
820
|
+
|
|
821
|
+
// ── Vision ───────────────────────────────────────────────────────────
|
|
822
|
+
|
|
823
|
+
// Ask the vision model which cell of a 3x3 grid contains the described element.
//
// Steps: draw a labelled grid overlay in the page, take a screenshot, remove
// the overlay, send the resized screenshot to the vision LLM, then translate
// the returned label into a bounding box.
//
// Returns { x1, y1, x2, y2 } in document coordinates: the chosen cell grown
// by 20% of a cell on every side and clamped to the page.
// Throws when the model's answer is not one of REGION_LABELS, and re-throws
// any screenshot error after the overlay has been taken down again.
async function identifyRegionViaVision(ctx, description) {
  // Combined eval: get dimensions + draw grid overlay in one round trip.
  // The callback is handed to _eval, so it only uses its `labels` argument.
  const { viewWidth, docHeight } = await _eval(ctx, ({ labels }) => {
    const w = window.innerWidth, h = document.body.scrollHeight
    const canvas = document.createElement('canvas')
    canvas.id = '__vision_grid__'
    canvas.style.cssText = 'position:absolute;top:0;left:0;z-index:999999;pointer-events:none;'
    canvas.width = w; canvas.height = h
    const pen = canvas.getContext('2d'), cw = w / 3, ch = h / 3
    const fontSize = Math.max(16, Math.min(cw, ch) * 0.08)
    for (let r = 0; r < 3; r++) for (let c = 0; c < 3; c++) {
      const x = c * cw, y = r * ch
      pen.strokeStyle = 'rgba(255,50,50,0.7)'; pen.lineWidth = 2
      pen.strokeRect(x + 1, y + 1, cw - 2, ch - 2)
      pen.font = `bold ${fontSize}px sans-serif`; pen.textAlign = 'center'; pen.textBaseline = 'middle'
      const tw = pen.measureText(labels[r][c]).width
      pen.fillStyle = 'rgba(0,0,0,0.6)'
      pen.fillRect(x + cw/2 - tw/2 - 4, y + ch/2 - fontSize/2 - 3, tw + 8, fontSize + 6)
      pen.fillStyle = 'white'; pen.fillText(labels[r][c], x + cw / 2, y + ch / 2)
    }
    // Attach only once drawing succeeded, so a drawing failure cannot leave a
    // half-built overlay in the page.
    document.body.appendChild(canvas)
    return { viewWidth: w, docHeight: h }
  }, { labels: REGION_LABELS })

  const removeGrid = async () => _eval(ctx, () => document.getElementById('__vision_grid__')?.remove())

  let shot
  try {
    shot = await _screenshot(ctx, true)
    saveDebug('debug_region.png', shot.buffer)
  } catch (err) {
    // Best-effort cleanup; the screenshot failure is the error worth reporting.
    await removeGrid().catch(() => {})
    throw err
  }
  await removeGrid()

  const regionImg = await _resizeScreenshot(ctx, shot, viewWidth, docHeight)
  const content = await askLLMVision(
    `The screenshot shows a full webpage with a 3x3 grid:\n` +
    `${REGION_LABELS.map(r => r.join(' | ')).join('\n')}\n\n` +
    `Which region contains: "${description}"?\n` +
    `Return ONLY JSON: {"region": "<label>"}\nValid: ${REGION_LABELS.flat().join(', ')}`,
    regionImg.base64)
  console.log(`[Vision] Region: ${content}`)

  // A missing or non-string "region" must surface as "Unknown region", not as
  // a TypeError from calling toLowerCase on undefined.
  const raw = parseJSON(content)?.region
  const region = String(raw ?? '').toLowerCase().trim()
  const row = REGION_LABELS.findIndex(r => r.includes(region))
  const col = row >= 0 ? REGION_LABELS[row].indexOf(region) : -1
  if (row < 0 || col < 0) throw new Error(`Unknown region: "${raw}"`)

  const cw = viewWidth / 3, ch = docHeight / 3, OV = 0.20
  return {
    x1: Math.max(0, col * cw - cw * OV), y1: Math.max(0, row * ch - ch * OV),
    x2: Math.min(viewWidth, (col + 1) * cw + cw * OV), y2: Math.min(docHeight, (row + 1) * ch + ch * OV),
  }
}
|
|
872
|
+
|
|
873
|
+
// Ask the vision model for the center point of the described element on a
// full-page screenshot.
//
// Returns { docX, docY }: the model's coordinates multiplied by the resize
// scale and clamped to [0, viewWidth - 1] x [0, docHeight - 1].
// Throws when the reply does not contain finite numeric x and y values, so
// NaN coordinates never reach the click/mark helpers.
async function locatePreciseViaVision(ctx, description) {
  const { viewWidth, docHeight } = await _eval(ctx, () => ({
    viewWidth: window.innerWidth, docHeight: document.body.scrollHeight
  }))
  const shot = await _screenshot(ctx, true)
  saveDebug('debug_precise.png', shot.buffer)

  const { base64: resizedB64, scale } = await _resizeScreenshot(ctx, shot, viewWidth, docHeight)
  // Size of the image the model actually sees.
  const resizedW = Math.round(viewWidth / scale), resizedH = Math.round(docHeight / scale)
  const content = await askLLMVision(
    `Screenshot: ${resizedW}\u00d7${resizedH}px (full page). Origin (0,0) = top-left.\n\n` +
    `Find the CENTER of: "${description}"\n\n` +
    `Return ONLY JSON: {"x": <number>, "y": <number>}`, resizedB64, 30)
  console.log(`[Vision] Coordinates: ${content}`)

  // Accept numbers and numeric strings ("120"); everything else is NaN.
  const toNumber = value => {
    if (typeof value === 'number') return value
    if (typeof value === 'string' && value.trim() !== '') return Number(value)
    return NaN
  }
  const reply = parseJSON(content)
  const x = toNumber(reply?.x), y = toNumber(reply?.y)
  if (!Number.isFinite(x) || !Number.isFinite(y)) {
    throw new Error(`Vision returned no usable coordinates: ${content}`)
  }

  return {
    docX: Math.max(0, Math.min(viewWidth - 1, Math.round(x * scale))),
    docY: Math.max(0, Math.min(docHeight - 1, Math.round(y * scale)))
  }
}
|
|
894
|
+
|
|
895
|
+
// ── Scroll, mark, click helpers ──────────────────────────────────────
|
|
896
|
+
|
|
897
|
+
// Scroll the page so that document-Y `docY` is vertically centered, but only
// when it currently lies above or below the viewport. Does nothing for native
// contexts.
async function scrollIntoView(ctx, docY) {
  if (_isNative(ctx)) return // native apps handle scrolling differently — skip

  const viewport = await _eval(ctx, () => ({
    scrollY: window.scrollY, viewHeight: window.innerHeight
  }))
  const top = viewport.scrollY
  const bottom = viewport.scrollY + viewport.viewHeight

  const outsideViewport = docY < top || docY > bottom
  if (!outsideViewport) return

  const centeredTop = docY - viewport.viewHeight / 2
  await _eval(ctx, top => window.scrollTo({ top, behavior: 'instant' }), centeredTop)
}
|
|
906
|
+
|
|
907
|
+
// Turn a scanned candidate into a locator.
// Native contexts are delegated to markByElementNative. On the web the page
// is scrolled to the candidate, the matching DOM node is stamped with a
// unique data-elementus attribute, and a locator for that attribute is
// returned. Throws when no node with the same tag/href/text is found.
async function markByElement(ctx, element) {
  if (_isNative(ctx)) return markByElementNative(ctx, element)

  await scrollIntoView(ctx, element.docY)
  const uid = `sr-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`

  // Handed to _eval: only uses its argument and page globals.
  const stampInPage = ({ tag, text, href, docX, docY, uid }) => {
    // True when an ancestor that hides vertical overflow cuts the node off
    // at its top or bottom edge (1px tolerance).
    const isClippedByParent = node => {
      const box = node.getBoundingClientRect()
      for (let parent = node.parentElement; parent && parent !== document.body; parent = parent.parentElement) {
        const style = window.getComputedStyle(parent)
        const hidesOverflow = style.overflow === 'hidden' || style.overflow === 'clip' || style.overflowY === 'hidden'
        if (!hidesOverflow) continue
        const parentBox = parent.getBoundingClientRect()
        if (box.bottom > parentBox.bottom + 1 || box.top < parentBox.top - 1) return true
      }
      return false
    }

    const selector = href ? tag + '[href="' + CSS.escape(href) + '"]' : tag
    const matches = []
    document.querySelectorAll(selector).forEach(node => {
      if (node.textContent.trim().replace(/\s+/g, ' ') !== text) return
      const box = node.getBoundingClientRect()
      if (box.width === 0 || box.height === 0) return
      const centerX = Math.round(box.left + window.scrollX + box.width / 2)
      const centerY = Math.round(box.top + window.scrollY + box.height / 2)
      matches.push({
        el: node,
        dist: Math.abs(centerX - docX) + Math.abs(centerY - docY),
        visible: !isClippedByParent(node),
      })
    })

    // Unclipped nodes first, then the one nearest to the recorded center.
    matches.sort((a, b) => {
      if (a.visible !== b.visible) return a.visible ? -1 : 1
      return a.dist - b.dist
    })
    if (matches.length === 0) return null

    const winner = matches[0].el
    winner.setAttribute('data-elementus', uid)
    return winner.tagName.toLowerCase()
  }

  const marked = await _eval(ctx, stampInPage, {
    tag: element.tag,
    text: element.text,
    href: element.href,
    docX: element.docX,
    docY: element.docY,
    uid,
  })

  if (!marked) throw new Error(`Could not mark <${element.tag}> "${element.text}"`)
  console.log(`[Resolve] Marked <${marked}> "${element.text}" at doc(${element.docX}, ${element.docY})`)
  return _makeLocator(ctx, `[data-elementus="${uid}"]`)
}
|
|
951
|
+
|
|
952
|
+
// Stamp whatever sits at a document coordinate and return a locator for it.
// Preference order inside the page: the nearest interactive ancestor of the
// topmost node, then the first interactive node (or ancestor) in the stack at
// that point, then the topmost node itself.
// Throws when there is no element at the point.
async function markAtCoordinates(ctx, docX, docY) {
  if (!_isNative(ctx)) await scrollIntoView(ctx, docY)
  const uid = `sr-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`

  // Handed to _eval: only uses its argument and page globals.
  const stampInPage = ({ docX, docY, uid, selectors }) => {
    const vx = docX - window.scrollX, vy = docY - window.scrollY
    const topmost = document.elementFromPoint(vx, vy)
    if (!topmost) return null

    let interactive = topmost.closest(selectors)
    if (!interactive && typeof document.elementsFromPoint === 'function') {
      for (const layer of document.elementsFromPoint(vx, vy)) {
        interactive = layer.matches(selectors) ? layer : layer.closest(selectors)
        if (interactive) break
      }
    }

    const chosen = interactive || topmost
    chosen.setAttribute('data-elementus', uid)
    return chosen.tagName.toLowerCase()
  }

  const marked = await _eval(ctx, stampInPage, { docX, docY, uid, selectors: INTERACTIVE_SELECTORS })
  if (!marked) throw new Error(`No element at doc(${docX}, ${docY})`)
  console.log(`[Resolve] Marked <${marked}> at doc(${docX}, ${docY})`)
  return _makeLocator(ctx, `[data-elementus="${uid}"]`)
}
|
|
974
|
+
|
|
975
|
+
// Click a candidate found by the DOM scan.
//
// Native: resolve the element through markByElementNative and click it.
// Web:    scroll to the candidate, then
//         - for an <a> whose href resolves to an http(s)/file URL, navigate
//           there through _goto;
//         - otherwise dispatch a JS click on the element at the candidate's
//           center point.
//
// The href is resolved against the page's base URI before navigating: the
// scan stores the raw attribute, which may be relative ("../blog"). Fragment
// links ("#top") and non-navigable schemes (javascript:, mailto:, ...) are
// clicked instead of being passed to _goto.
async function scrollAndClick(ctx, element) {
  if (_isNative(ctx)) {
    // Native app: resolve via markByElementNative, then click the element
    console.log(`\u2713 Tapping "${element.text}" \u2014 native (${element.docX}, ${element.docY})`)
    const nativeEl = await markByElementNative(ctx, element)
    await nativeEl.click()
    return
  }

  await scrollIntoView(ctx, element.docY)

  // One round trip: viewport coordinates plus the resolved navigation target.
  const { vx, vy, url } = await _eval(ctx, ({ docX, docY, href }) => {
    let url = null
    const raw = typeof href === 'string' ? href.trim() : ''
    if (raw && !raw.startsWith('#')) {
      try {
        const resolved = new URL(raw, document.baseURI)
        if (['http:', 'https:', 'file:'].includes(resolved.protocol)) url = resolved.href
      } catch {
        // Unparsable href: leave url null so the element is clicked instead.
      }
    }
    return { vx: docX - window.scrollX, vy: docY - window.scrollY, url }
  }, {
    docX: element.docX,
    docY: element.docY,
    href: element.tag === 'a' ? element.href : null,
  })

  console.log(`\u2713 Clicking "${element.text}" \u2014 doc(${element.docX}, ${element.docY})`)
  if (url) {
    await _goto(ctx, url)
    console.log(`[Click] Navigated to: ${url}`)
    return
  }

  const clicked = await _eval(ctx, ({ x, y }) => {
    const el = document.elementFromPoint(x, y)
    if (!el) return 'null'
    if (typeof el.click === 'function') el.click()
    else el.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true }))
    return el.tagName + ':' + (el.textContent?.trim().slice(0, 40) || '')
  }, { x: vx, y: vy })
  console.log(`[Click] JS click: ${clicked}`)
}
|
|
1002
|
+
|
|
1003
|
+
// Click at document coordinates produced by the vision model.
//
// Native: perform a touch tap at (coords.docX, coords.docY).
// Web:    scroll to the point, then
//         - if the element there sits inside an <a> whose href resolves to
//           an http(s)/file URL, navigate there through _goto;
//         - otherwise dispatch a JS click on the element at the point.
//
// The href is resolved against the page's base URI before navigating, since
// the raw attribute may be relative. Fragment links and non-navigable schemes
// (javascript:, mailto:, ...) are clicked instead of being passed to _goto.
async function clickAtCoords(ctx, coords) {
  if (_isNative(ctx)) {
    // Native app: tap at absolute coordinates via Appium action
    console.log(`[Vision] Native tap at (${coords.docX}, ${coords.docY})`)
    await ctx.action('pointer', { parameters: { pointerType: 'touch' } })
      .move({ x: coords.docX, y: coords.docY })
      .down().up()
      .perform()
    return
  }

  await scrollIntoView(ctx, coords.docY)

  // One round trip: viewport coordinates plus the resolved navigation target.
  const { vx, vy, url } = await _eval(ctx, ({ docX, docY }) => {
    const vx = docX - window.scrollX, vy = docY - window.scrollY
    const anchor = document.elementFromPoint(vx, vy)?.closest('a')
    const raw = (anchor?.getAttribute('href') || '').trim()
    let url = null
    if (raw && !raw.startsWith('#')) {
      try {
        const resolved = new URL(raw, document.baseURI)
        if (['http:', 'https:', 'file:'].includes(resolved.protocol)) url = resolved.href
      } catch {
        // Unparsable href: leave url null so the element is clicked instead.
      }
    }
    return { vx, vy, url }
  }, { docX: coords.docX, docY: coords.docY })

  if (url) {
    await _goto(ctx, url)
    console.log(`[Vision] Navigated to: ${url}`)
    return
  }

  await _eval(ctx, ({ x, y }) => {
    const el = document.elementFromPoint(x, y)
    if (!el) return
    if (typeof el.click === 'function') el.click()
    else el.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true }))
  }, { x: vx, y: vy })
  console.log(`[Vision] JS click at (${vx}, ${vy})`)
}
|
|
1036
|
+
|
|
1037
|
+
// ── Vision fallback (shared) ─────────────────────────────────────────
|
|
1038
|
+
|
|
1039
|
+
// Vision-assisted lookup used when the page-wide DOM search produced nothing.
// 1. Ask the vision model for the grid region holding the element.
// 2. Scroll so that the region's vertical middle is centered in the viewport.
// 3. Re-run the DOM search restricted to that region.
// 4. If that also fails, ask the vision model for exact coordinates.
// Resolves to { element, coords } with exactly one of the two set.
async function visionFallback(ctx, description) {
  console.log(`[Vision] DOM returned null \u2014 activating vision`)
  const region = await identifyRegionViaVision(ctx, description)

  const viewportHeight = await _eval(ctx, () => window.innerHeight)
  const regionMiddle = (region.y1 + region.y2) / 2
  await _eval(ctx, top => window.scrollTo({ top, behavior: 'instant' }), regionMiddle - viewportHeight / 2)

  const inRegion = await findElementInDOM(ctx, description, region)
  if (inRegion) return { element: inRegion, coords: null }

  console.log(`[Vision] DOM unresolved \u2014 precise coordinates...`)
  return { element: null, coords: await locatePreciseViaVision(ctx, description) }
}
|
|
1050
|
+
|
|
1051
|
+
// ── Public API ───────────────────────────────────────────────────────
|
|
1052
|
+
|
|
1053
|
+
// Resolve a description to a framework locator.
// Tries the page-wide DOM search first; when that yields nothing, falls back
// to vision (region search, then exact coordinates).
//
// Any failure on the vision path, including a failure to mark the element
// that vision found, is re-thrown as "All fallback paths exhausted ...", with
// the underlying error kept as `cause`.
async function _findByDescription(ctx, description) {
  const direct = await findElementInDOM(ctx, description)
  if (direct) return markByElement(ctx, direct)

  try {
    const { element, coords } = await visionFallback(ctx, description)
    // `return await` keeps rejections from the marking step inside this try;
    // a bare `return promise` would let them bypass the catch below.
    if (element) return await markByElement(ctx, element)
    return await markAtCoordinates(ctx, coords.docX, coords.docY)
  } catch (err) {
    throw new Error(`All fallback paths exhausted for "${description}": ${err.message}`, { cause: err })
  }
}
|
|
1064
|
+
|
|
1065
|
+
/**
 * Locator-first lookup with AI fallback.
 * Waits for the given locator to be attached (Playwright) or to exist (WDIO).
 * If that wait fails for any reason, the element is searched for by its
 * natural-language description instead.
 *
 * @param {Object} ctx - page (Playwright) or browser (WDIO)
 * @param {Object} locator - Playwright Locator or WDIO Element to try first
 * @param {string} description - natural-language element description for AI fallback
 * @returns {Promise<Object>} the original locator when it resolved, otherwise
 *   the locator/element produced by the description search
 */
async function locate(ctx, locator, description) {
  let attached = true
  try {
    await (_isPlaywright(ctx) ? locator.waitFor({ state: 'attached' }) : locator.waitForExist())
    console.log(`\u2713 Located via locator`)
  } catch {
    // The reason does not matter here: any failure triggers the AI search.
    attached = false
    console.log(`\u2717 Locator failed \u2014 searching for: "${description}"`)
  }
  return attached ? locator : _findByDescription(ctx, description)
}
|
|
1088
|
+
|
|
1089
|
+
/**
 * Look an element up purely by natural-language description (no locator).
 * Uses the DOM search first, LLM disambiguation for ties, and vision as the
 * last resort. The result is a framework-native locator/element that can be
 * used for any action or assertion.
 *
 * @param {Object} ctx - page (Playwright) or browser (WDIO)
 * @param {string} description - natural-language element description
 * @returns {Promise<Object>} Playwright Locator or WDIO Element
 *
 * @example
 * const submit = await el.find(page, 'Submit order button')
 * await submit.click()
 * await expect(submit).toHaveText('Submit')
 */
async function find(ctx, description) {
  console.log(`[Find] "${description}"`)
  const resolved = await _findByDescription(ctx, description)
  return resolved
}
|
|
1107
|
+
|
|
1108
|
+
/**
 * Click with locator-first fallback + optimized click strategy.
 * The locator's own click() is tried first. If it fails, the element is
 * searched for in the DOM by description and clicked through scrollAndClick;
 * if the DOM search finds nothing, vision is used (region search, then a
 * click at exact coordinates).
 * Best for navigation actions. For fill/hover/assert, use wrap or find.
 *
 * @param {Object} ctx - page (Playwright) or browser (WDIO)
 * @param {Object} locator - Playwright Locator or WDIO Element to try first
 * @param {string} description - natural-language element description for AI fallback
 * @returns {Promise<void>}
 * @throws {Error} "All fallback paths exhausted ..." when the vision path
 *   fails; the underlying error is available as `error.cause`
 *
 * @example
 * await el.click(page, page.locator('#nav-blog'), 'Blog page link')
 */
async function click(ctx, locator, description) {
  try {
    await locator.click()
    console.log(`\u2713 Clicked via locator`)
    return
  } catch {
    // Any locator failure switches to the description-based search.
    console.log(`\u2717 Locator failed \u2014 searching for: "${description}"`)
  }

  const element = await findElementInDOM(ctx, description)
  if (element) { await scrollAndClick(ctx, element); return }

  try {
    const result = await visionFallback(ctx, description)
    if (result.element) { await scrollAndClick(ctx, result.element); return }
    await clickAtCoords(ctx, result.coords)
  } catch (err) {
    // Keep the original error reachable for debugging instead of only its message.
    throw new Error(`All fallback paths exhausted for "${description}": ${err.message}`, { cause: err })
  }
}
|
|
1140
|
+
|
|
1141
|
+
/**
 * Wrap a single locator/element with AI fallback. Returns a Proxy around the
 * locator whose methods are replaced by async wrappers:
 *
 *  - Boolean queries (isVisible, isEnabled, isChecked, isHidden, isEditable)
 *    always resolve the element via AI first and query the resolved element.
 *  - Every other method is tried on the original locator; if it throws, the
 *    element is resolved via AI (once, then cached) and the call is retried
 *    there. click/dblclick on an element that has an href navigate through
 *    _goto instead; otherwise retries pass `force: true` where an options
 *    argument is known to exist.
 *
 * Non-function properties, symbols, promise hooks (then/catch/finally),
 * toString/valueOf/toJSON and `constructor` are read straight from the
 * original locator. `constructor` must not be wrapped: the wrapper is an
 * anonymous function, so code that identifies the object through
 * `constructor.name` would otherwise see an empty name.
 * NOTE(review): Playwright's expect() is believed to check
 * `receiver.constructor.name === 'Locator'` — confirm against its source.
 *
 * Because every wrapped method is async, methods that normally return
 * synchronously hand back a Promise when called through the wrapper.
 *
 * For cleaner code, prefer wrapPage() or wrapBrowser() over calling this directly.
 *
 * @param {Object} driverContext - page (Playwright) or browser (WDIO)
 * @param {Object} locator - Playwright Locator or WDIO Element
 * @param {string} description - natural-language element description
 * @returns {Proxy} Proxy that behaves like the original locator/element
 *
 * @example
 * const btn = el.wrap(page, page.locator('#old-btn'), 'Submit button')
 * await btn.click()        // tries #old-btn → fail → AI finds Submit → click
 * await btn.textContent()  // same fallback for any method
 */
function wrap(driverContext, locator, description) {
  const PASSTHROUGH = new Set([
    'then', 'catch', 'finally', 'toString', 'valueOf', 'toJSON', 'constructor',
  ])
  // Boolean query methods (isVisible, isEnabled, etc.) return false instead
  // of throwing on missing elements. We can't detect failure from the return
  // value, so resolve via AI first, then query the real element.
  const BOOL_QUERIES = new Set(['isVisible', 'isEnabled', 'isChecked', 'isHidden', 'isEditable'])
  // Methods whose first argument is an options object.
  const FORCE_FIRST_ARG = new Set(['hover', 'tap', 'check', 'uncheck'])
  // Methods whose options object sits at the given argument index. A Map is
  // used so inherited names such as "hasOwnProperty" can never match.
  const FORCE_ARG_INDEX = new Map([['fill', 1], ['type', 1], ['selectOption', 1], ['press', 1]])
  let _resolved = null

  return new Proxy(locator, {
    get(target, prop, receiver) {
      if (typeof prop === 'symbol' || PASSTHROUGH.has(prop)) {
        return Reflect.get(target, prop, receiver)
      }
      const original = target[prop]
      if (typeof original !== 'function') return original

      return async function (...args) {
        if (BOOL_QUERIES.has(prop)) {
          if (!_resolved) {
            console.log(`[AI] ${prop}() \u2014 resolving via AI first for "${description}"`)
            _resolved = await _findByDescription(driverContext, description)
          }
          return _resolved[prop](...args)
        }

        try {
          return await original.apply(target, args)
        } catch (firstError) {
          console.log(`[AI] ${String(prop)}() failed \u2014 AI fallback for "${description}"`)
          if (!_resolved) _resolved = await _findByDescription(driverContext, description)

          const resolvedMethod = _resolved[prop]
          if (typeof resolvedMethod !== 'function') {
            if (prop in _resolved) return _resolved[prop]
            // The resolved element has nothing under this name: report the
            // original failure.
            throw firstError
          }

          if (prop === 'click' || prop === 'dblclick') {
            const href = await _resolved.getAttribute('href').catch(() => null)
            if (href) {
              await _goto(driverContext, href)
              console.log(`[AI] Navigated to: ${href}`)
              return
            }
            return resolvedMethod.call(_resolved, { ...(args[0] || {}), force: true })
          }

          const retryArgs = [...args]
          if (FORCE_FIRST_ARG.has(prop)) {
            retryArgs[0] = { ...(retryArgs[0] || {}), force: true }
          } else if (FORCE_ARG_INDEX.has(prop)) {
            const at = FORCE_ARG_INDEX.get(prop)
            retryArgs[at] = { ...(retryArgs[at] || {}), force: true }
          }
          return resolvedMethod.apply(_resolved, retryArgs)
        }
      }
    }
  })
}
|
|
1223
|
+
|
|
1224
|
+
/**
 * Proxy a Playwright page so that page.locator(selector, { ai: 'description' })
 * yields a locator with AI fallback (see wrap). The `ai` key is stripped before
 * the remaining options are forwarded to the real page.locator(). Calls
 * without `ai` return the plain locator, and every other page property is
 * forwarded untouched.
 *
 * Call once per test (or in a fixture for all tests).
 *
 * @param {Object} pageObj - Playwright Page object
 * @returns {Proxy} Proxied page with enhanced locator() method
 *
 * @example
 * const p = el.wrapPage(page)
 * await p.locator('#btn', { ai: 'Submit button' }).click()  // AI fallback
 * await p.locator('#btn').click()                            // normal, no AI
 */
function wrapPage(pageObj) {
  // Build the replacement for page.locator bound to the real page.
  const makeLocatorFn = page => function (selector, options = {}) {
    const { ai: hint, ...forwarded } = options
    const hasForwarded = Object.keys(forwarded).length > 0
    const base = hasForwarded ? page.locator(selector, forwarded) : page.locator(selector)
    if (!hint) return base
    return wrap(page, base, hint)
  }

  const handler = {
    get(target, prop, receiver) {
      return prop === 'locator' ? makeLocatorFn(target) : Reflect.get(target, prop, receiver)
    },
  }
  return new Proxy(pageObj, handler)
}
|
|
1254
|
+
|
|
1255
|
+
/**
 * Proxy a WDIO browser so that browser.$(selector, { ai: 'description' })
 * yields an element with AI fallback (see wrap). The `ai` key is stripped
 * before any remaining options are forwarded to the real browser.$(). Calls
 * without `ai` return the plain element, and every other browser property is
 * forwarded untouched.
 *
 * Call once in before() hook or wdio.conf.js.
 *
 * @param {Object} browserObj - WDIO Browser object
 * @returns {Proxy} Proxied browser with enhanced $() method
 *
 * @example
 * const b = el.wrapBrowser(browser)
 * await b.$('#btn', { ai: 'Submit button' }).click()  // AI fallback
 * await b.$('#btn').click()                            // normal, no AI
 */
function wrapBrowser(browserObj) {
  // Build the replacement for browser.$ bound to the real browser.
  const makeQueryFn = browser => function (selector, options = {}) {
    const { ai: hint, ...forwarded } = options
    const hasForwarded = Object.keys(forwarded).length > 0
    const base = hasForwarded ? browser.$(selector, forwarded) : browser.$(selector)
    if (!hint) return base
    return wrap(browser, base, hint)
  }

  const handler = {
    get(target, prop, receiver) {
      return prop === '$' ? makeQueryFn(target) : Reflect.get(target, prop, receiver)
    },
  }
  return new Proxy(browserObj, handler)
}
|
|
1284
|
+
|
|
1285
|
+
return { wrap, wrapPage, wrapBrowser, locate, find, click }
|
|
1286
|
+
}
|
|
1287
|
+
|
|
1288
|
+
module.exports = { createElementus }
|