browserwire 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +113 -0
- package/cli/api/bridge.js +64 -0
- package/cli/api/openapi.js +175 -0
- package/cli/api/router.js +280 -0
- package/cli/api/swagger-ui.js +26 -0
- package/cli/discovery/classify.js +304 -0
- package/cli/discovery/compile.js +392 -0
- package/cli/discovery/enrich.js +376 -0
- package/cli/discovery/entities.js +356 -0
- package/cli/discovery/llm-client.js +352 -0
- package/cli/discovery/locators.js +326 -0
- package/cli/discovery/perceive.js +476 -0
- package/cli/discovery/session.js +930 -0
- package/cli/discovery/synthesize-workflows.js +295 -0
- package/cli/index.js +63 -0
- package/cli/manifest-store.js +140 -0
- package/cli/server.js +539 -0
- package/extension/background.js +1512 -0
- package/extension/content-script.js +491 -0
- package/extension/discovery.js +495 -0
- package/extension/executor.js +392 -0
- package/extension/icons/icon-128.png +0 -0
- package/extension/icons/icon-16.png +0 -0
- package/extension/icons/icon-48.png +0 -0
- package/extension/manifest.json +33 -0
- package/extension/shared/protocol.js +50 -0
- package/extension/sidepanel.html +277 -0
- package/extension/sidepanel.js +211 -0
- package/extension/vendor/LICENSE +22 -0
- package/extension/vendor/rrweb-record.min.js +84 -0
- package/package.json +49 -0
|
@@ -0,0 +1,476 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* perceive.js — Vision LLM Perception Module
|
|
3
|
+
*
|
|
4
|
+
* Takes a page skeleton + annotated screenshot and uses a vision LLM to
|
|
5
|
+
* identify the 15-25 most meaningful elements and their semantic roles.
|
|
6
|
+
*
|
|
7
|
+
* Input: { skeleton[], screenshot (base64 JPEG), pageText, url, title }
|
|
8
|
+
* Output: { domain, domainDescription, pageState, views[], entities[], actions[], compositeActions[] }
|
|
9
|
+
*
|
|
10
|
+
* All scanIds in the output are validated against the input skeleton.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { getLLMConfig, callLLM, callLLMWithVision } from "./llm-client.js";
|
|
14
|
+
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
// System prompt
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
|
|
19
|
+
// System prompt sent with every perception request. It describes the two
// inputs (annotated screenshot + HTML skeleton), the JSON schema the model
// must answer with, and the selector / locator / state-signal rules.
// Enumerated values in the schema are written as a single "a|b|c" string.
const SYSTEM_PROMPT = `You are a web application analyst with computer vision capabilities. You are given:
1. An annotated screenshot of a web page where interactable elements are highlighted with orange boxes labeled with their s-ID (e.g., "s11" = scanId 11)
2. A compact HTML skeleton listing all labeled elements (some elements have state-* attributes showing live runtime state)

Your task: understand what this page does and identify the 15-25 most meaningful interactive elements, semantic regions, views (readable data), and the page/route context.

## Views (Read Operations)

Think of this page as a REST API endpoint. What data does it DISPLAY?

Identify views — structured data visible on the page:
- Lists (e.g., event list, message inbox, search results)
- Detail views (e.g., event details, user profile)
- Status displays (e.g., notification count, account balance)

For each view, provide CSS selectors for data extraction:
- containerSelector: CSS selector for the data region
- itemSelector: for lists, CSS selector for each repeating item (relative to container)
- fields: for each data field, a CSS selector relative to the item/container

## Pages (Routes)

What page/route is this? Identify:
- Route pattern (generalize IDs: "/events/:id" not "/events/123")
- Which views are visible
- Which actions are available

## Output Format

Respond with ONLY valid JSON (no markdown fences, no explanation):
{
"domain": "string (e.g. event_management, messaging, email_client)",
"domainDescription": "string (1-2 sentences describing what this page/site does)",
"pageState": {
"name": "string (human name for this page)",
"routePattern": "string (e.g. /events, /events/:id, /login)",
"description": "string (1 sentence describing this page)",
"stateSignals": [
{ "kind": "selector_exists|text_match|url_pattern", "value": "string", "selector": "string (only for text_match: element whose text to test)", "weight": 0.8 }
]
},
"entities": [
{
"name": "snake_case_entity_name (noun describing the region)",
"scanIds": [/* numbers: the s-IDs of elements belonging to this region */],
"description": "string"
}
],
"views": [
{
"name": "snake_case_view_name (e.g. event_list, user_profile)",
"description": "string",
"isList": true,
"isDynamic": false,
"containerSelector": "CSS selector for the data region",
"itemSelector": "CSS selector for each repeating item (relative to container, omit if isList=false)",
"fields": [
{ "name": "field_name", "type": "string|number|boolean|date", "selector": "CSS selector relative to item/container" }
],
"entityScanIds": [/* numbers: s-IDs of elements within this view's data region */]
}
],
"actions": [
{
"scanId": /* number: the s-ID from the screenshot/skeleton */,
"semanticName": "snake_case_verb_noun (e.g. create_event, submit_login, search_events)",
"interactionKind": "click|type|select|navigate",
"description": "string",
"preconditions": [
{ "description": "string", "stateField": "optional field name that must be non-empty" }
],
"locator": {
"kind": "xpath|css",
"value": "semantic selector referencing surrounding labels/context",
"reasoning": "brief explanation of why this selector was chosen"
}
}
],
"compositeActions": [
{
"name": "snake_case_workflow_name",
"description": "string",
"stepScanIds": [/* ordered scanIds of the steps */],
"inputs": [{ "name": "string", "type": "string", "description": "string" }]
}
]
}

## Rules
- Only reference scanIds that appear in the HTML skeleton (id="s{scanId}")
- Use snake_case for all names. Semantic names must be developer-friendly verbs/nouns
- NEVER use "generic", "unknown", "element", or raw numbers in names
- Focus on the 15-25 most meaningful elements — skip decorative/redundant ones
- Group related elements into entities (UI regions: forms, nav bars, cards, dialogs)
- Create composite actions for multi-step workflows (search box + button, login form, filters)
- All interactable elements with orange boxes in the screenshot should appear in "actions"
- For views: use specific CSS selectors that would work for runtime extraction without LLM
- Generalize route patterns: use :id for numeric/UUID path segments (e.g., /events/:id not /events/123)
- NEVER use skeleton scan IDs (like #s10, #s12, div#s5) as CSS selectors for views — these are temporary labels that do NOT exist in the real DOM. Use semantic selectors instead: class names, data-* attributes, ARIA roles, tag+attribute combos
- Field selectors MUST be specific enough to distinguish different fields within an item — never use just "div" or the same selector for multiple fields
- If you cannot determine a specific selector for a view field, omit that field entirely

## Dynamic Content Rules

Mark views as \`isDynamic: true\` when their content is server-driven and changes over time:
- Lists fed from a database (event lists, message inboxes, search results, activity feeds) → \`isDynamic: true\`
- Detail views that show different records per URL parameter → \`isDynamic: true\`
- Status counters, notification badges, live metrics → \`isDynamic: true\`
- Static nav/chrome/fixed labels/headings → \`isDynamic: false\`

For dynamic views, NEVER hardcode specific values (names, dates, IDs, usernames) in CSS selectors or field selectors.
Use structural selectors: class names, data-* attributes, tag+attribute combos, ARIA roles.
A selector like \`.event-title\` is correct. A selector like \`[data-id="123"]\` is wrong.

## Page State Signal Rules

For each pageState, provide 2–4 \`stateSignals\` to identify this specific UI state, especially for SPAs where URLs don't change:
- \`selector_exists\`: A CSS selector that is present ONLY on this page state (not shared across all pages). Example: \`[data-page="event-detail"]\`, \`.event-form\`, \`#login-panel\`. Weight: 0.8–0.9
- \`text_match\`: A regex pattern matched against the text content of a specific element. Use \`selector\` to specify which element (e.g., \`h1\`, \`.breadcrumb\`, \`[role="tab"][aria-selected="true"]\`). Example: \`value: "^Event Details$"\`, \`selector: "h1"\`. Weight: 0.7–0.8
- \`url_pattern\`: A regex matched against the pathname. Only use as a tiebreaker (weight 0.5–0.6), not as the primary signal.

Rules:
- Provide the most specific signals you can observe from the screenshot/DOM
- NEVER use dynamic values (usernames, dates, IDs) in signal values
- Prefer heading text, breadcrumb text, or active tab labels for text_match signals
- Prefer data-page, data-view, or section-specific class selectors for selector_exists

## Locator Rules (for each action)

For each action, provide a "locator" field with a semantic selector:
- Prefer XPath when you need ancestor/sibling context (e.g. label associations)
- Prefer CSS when stable attributes (data-testid, name, aria-label) are available
- NEVER use absolute paths like /html/body/div[3]/... or positional indexes like //div[5]
- Reference nearby labels, headings, or ARIA relationships for ambiguous elements
- A checkbox near "Remember me" → //label[contains(.,'Remember me')]/following-sibling::input[@type='checkbox']
- An input with placeholder "Search..." → input[placeholder='Search...'] or //input[@placeholder='Search...']
- A button labeled "Submit" → //button[normalize-space()='Submit'] or button[type='submit']
- Use name, aria-label, data-testid, placeholder, or visible text before resorting to structural selectors
- If no reliable semantic locator can be determined, omit the locator field entirely`;
|
|
158
|
+
|
|
159
|
+
// ---------------------------------------------------------------------------
|
|
160
|
+
// HTML skeleton builder
|
|
161
|
+
// ---------------------------------------------------------------------------
|
|
162
|
+
|
|
163
|
+
// Attributes copied from `entry.attributes` into the skeleton, in output
// order, each with the maximum number of characters kept (Infinity = no cap).
const SKELETON_ATTRIBUTE_LIMITS = [
  ["href", 80],
  ["type", Infinity],
  ["placeholder", 60],
  ["aria-label", 60],
  ["name", 40],
  ["data-testid", 40],
  ["class", Infinity]
];

/**
 * Build a compact HTML skeleton string from skeleton entries.
 * Uses id="s{scanId}" as the element marker for LLM reference.
 *
 * One line is emitted per entry. `input` and `select` are written
 * self-closing; every other tag gets an explicit close tag, wrapping up to
 * 60 characters of `entry.text` when present. Live element state is exposed
 * through `state-*` attributes.
 *
 * Attribute values are coerced to strings before truncation (so a numeric
 * `selectedOption` or attribute value cannot throw) and double quotes are
 * escaped as `&quot;` so a value can never terminate its own attribute.
 *
 * @param {object[]} skeleton entries with `scanId`, `tagName` and optional
 *   `role`, `attributes`, `state`, `text`
 * @returns {string} newline-joined skeleton markup ("" for an empty list)
 */
const buildHtmlSkeleton = (skeleton) => {
  // Truncate the raw value first, then escape, so an entity is never cut in half.
  const formatAttr = (name, value, maxLength = Infinity) => {
    const clipped = String(value).slice(0, maxLength).replace(/"/g, "&quot;");
    return `${name}="${clipped}"`;
  };

  const lines = [];

  for (const entry of skeleton) {
    const attrs = [`id="s${entry.scanId}"`];

    if (entry.role) attrs.push(formatAttr("role", entry.role));

    for (const [name, maxLength] of SKELETON_ATTRIBUTE_LIMITS) {
      const value = entry.attributes?.[name];
      if (value) attrs.push(formatAttr(name, value, maxLength));
    }

    // Include element state as state-* attributes for LLM context
    const { state } = entry;
    if (state) {
      if (state.value != null) attrs.push(formatAttr("state-value", state.value, 60));
      if (state.checked != null) attrs.push(formatAttr("state-checked", state.checked));
      if (state.selectedOption != null) {
        attrs.push(formatAttr("state-selectedOption", state.selectedOption, 40));
      }
      if (state.disabled) attrs.push(`state-disabled="true"`);
      if (state.expanded != null) attrs.push(formatAttr("state-expanded", state.expanded));
      if (state.selected != null) attrs.push(formatAttr("state-selected", state.selected));
    }

    const attrsStr = attrs.join(" ");
    const text = entry.text ? String(entry.text).slice(0, 60) : "";
    const tag = entry.tagName;

    // Self-closing for void / replaced elements
    if (tag === "input" || tag === "select") {
      lines.push(`<${tag} ${attrsStr} />`);
    } else {
      lines.push(`<${tag} ${attrsStr}>${text}</${tag}>`);
    }
  }

  return lines.join("\n");
};
|
|
221
|
+
|
|
222
|
+
// ---------------------------------------------------------------------------
|
|
223
|
+
// Validation
|
|
224
|
+
// ---------------------------------------------------------------------------
|
|
225
|
+
|
|
226
|
+
/**
 * Detect selectors that reference the temporary skeleton labels (s<number>),
 * which exist only in the skeleton sent to the LLM and not in the real DOM.
 *
 * Matches `#s12` attached to any tag or combinator (`div#s5`, `span#s5`,
 * `ul>#s5`) and id attribute tests in any quoting style (`[id="s5"]`,
 * `[id='s5']`, `[id=s5]`, `@id='s5'`). Longer identifiers that merely start
 * with the same characters (`#s3-panel`, `#sidebar`) and other attributes
 * ending in "id" (`data-id="s7"`) are not matched.
 *
 * @param {string} sel CSS or XPath selector text
 * @returns {boolean} true when the selector depends on a skeleton label
 */
const isSkeletonSelector = (sel) => {
  // id attribute form; the backreference requires matching (or absent) quotes.
  if (/(?<![\w-])id\s*=\s*(["']?)s\d+\1(?![\w-])/.test(sel)) return true;

  // Blank out quoted strings so a URL fragment inside an attribute value
  // (e.g. a[href="#s5"]) is not mistaken for an id selector.
  const unquoted = sel.replace(/(["'])(?:\\.|(?!\1)[^\\])*\1/g, '""');
  return /#s\d+(?![\w-])/.test(unquoted);
};
|
|
227
|
+
/**
 * Report whether a selector is nothing more than a bare, common tag name
 * (surrounding whitespace ignored), which cannot tell one field from another.
 *
 * @param {string} sel selector text
 * @returns {boolean} true for exactly div, span, p, a, li, ul, section or article
 */
const isTooGeneric = (sel) => {
  const bareTags = new Set(["div", "span", "p", "a", "li", "ul", "section", "article"]);
  return bareTags.has(sel.trim());
};
|
|
228
|
+
|
|
229
|
+
// Enumerations accepted from the LLM output.
const STATE_SIGNAL_KINDS = ["selector_exists", "text_match", "url_pattern"];
const VIEW_FIELD_TYPES = ["string", "number", "boolean", "date"];
const INTERACTION_KINDS = ["click", "type", "select", "navigate"];
const LOCATOR_KINDS = ["xpath", "css"];

/** Return `value` when it is a string, otherwise "". */
const textOrEmpty = (value) => (typeof value === "string" ? value : "");

/** Keep only numeric scan IDs that are present in the skeleton. */
const filterKnownScanIds = (ids, validScanIds) =>
  Array.isArray(ids)
    ? ids.filter((id) => typeof id === "number" && validScanIds.has(id))
    : [];

/**
 * Extract and parse the JSON payload from a raw LLM reply. Accepts bare JSON,
 * JSON inside a markdown fence, and — as a fallback — JSON surrounded by
 * prose or an unterminated fence (the span from the first "{" to the last "}").
 * Throws when nothing parseable is found.
 */
const parsePerceptionJson = (rawResponse) => {
  const trimmed = rawResponse.trim();
  const fenceMatch = trimmed.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
  const candidate = fenceMatch ? fenceMatch[1].trim() : trimmed;
  try {
    return JSON.parse(candidate);
  } catch (error) {
    const start = candidate.indexOf("{");
    const end = candidate.lastIndexOf("}");
    if (start === -1 || end <= start) throw error;
    return JSON.parse(candidate.slice(start, end + 1));
  }
};

/** Keep entities that have a name and at least one known scanId. */
const sanitizeEntities = (rawEntities, validScanIds) => {
  const entities = [];
  if (!Array.isArray(rawEntities)) return entities;

  for (const e of rawEntities) {
    if (!e || typeof e.name !== "string" || !Array.isArray(e.scanIds)) continue;
    const scanIds = filterKnownScanIds(e.scanIds, validScanIds);
    if (scanIds.length === 0) continue;
    entities.push({ name: e.name, scanIds, description: textOrEmpty(e.description) });
  }
  return entities;
};

/** Normalize pageState; null unless both name and routePattern are strings. */
const sanitizePageState = (ps) => {
  if (!ps || typeof ps !== "object") return null;
  if (typeof ps.name !== "string" || typeof ps.routePattern !== "string") return null;

  const rawSignals = Array.isArray(ps.stateSignals) ? ps.stateSignals : [];
  const stateSignals = rawSignals
    .filter(
      (s) =>
        s &&
        STATE_SIGNAL_KINDS.includes(s.kind) &&
        typeof s.value === "string" &&
        s.value.trim().length > 0 &&
        typeof s.weight === "number" &&
        s.weight > 0 &&
        s.weight <= 1
    )
    .map((s) => {
      const signal = { kind: s.kind, value: s.value.trim() };
      // `selector` is only meaningful for text_match signals.
      if (s.kind === "text_match" && typeof s.selector === "string" && s.selector.trim()) {
        signal.selector = s.selector.trim();
      }
      signal.weight = s.weight;
      return signal;
    });

  return {
    name: ps.name,
    routePattern: ps.routePattern,
    description: textOrEmpty(ps.description),
    stateSignals
  };
};

/**
 * Keep views whose container, item and field selectors are usable at runtime.
 * A view is dropped when its container or item selector references a skeleton
 * label, or when no field survives filtering.
 */
const sanitizeViews = (rawViews, validScanIds) => {
  const views = [];
  if (!Array.isArray(rawViews)) return views;

  for (const v of rawViews) {
    if (!v || typeof v.name !== "string" || typeof v.containerSelector !== "string") continue;
    if (isSkeletonSelector(v.containerSelector)) continue;
    if (!Array.isArray(v.fields) || v.fields.length === 0) continue;

    // Skeleton labels do not exist in the real DOM, so an item selector built
    // on one is as unusable as a container selector built on one.
    let itemSelector = null;
    if (typeof v.itemSelector === "string") {
      if (isSkeletonSelector(v.itemSelector)) continue;
      itemSelector = v.itemSelector;
    }

    const fields = v.fields
      .filter(
        (f) =>
          f &&
          typeof f.name === "string" &&
          typeof f.selector === "string" &&
          f.selector.trim() !== "" &&
          !isSkeletonSelector(f.selector) &&
          !isTooGeneric(f.selector)
      )
      .map((f) => ({
        name: f.name,
        type: VIEW_FIELD_TYPES.includes(f.type) ? f.type : "string",
        selector: f.selector
      }));
    if (fields.length === 0) continue;

    views.push({
      name: v.name,
      description: textOrEmpty(v.description),
      isList: v.isList === true,
      isDynamic: v.isDynamic === true,
      containerSelector: v.containerSelector,
      itemSelector,
      fields,
      entityScanIds: filterKnownScanIds(v.entityScanIds, validScanIds)
    });
  }
  return views;
};

/** Normalize the optional LLM-generated locator; null when unusable. */
const sanitizeLocator = (rawLocator) => {
  if (!rawLocator || typeof rawLocator !== "object") return null;
  const { kind, value } = rawLocator;
  if (!LOCATOR_KINDS.includes(kind)) return null;
  if (typeof value !== "string" || value.trim().length === 0) return null;
  return { kind, value: value.trim(), reasoning: textOrEmpty(rawLocator.reasoning) };
};

/** Keep actions that target a known scanId; unknown kinds fall back to "click". */
const sanitizeActions = (rawActions, validScanIds) => {
  const actions = [];
  if (!Array.isArray(rawActions)) return actions;

  for (const a of rawActions) {
    if (!a || typeof a.scanId !== "number" || typeof a.semanticName !== "string") continue;
    if (!validScanIds.has(a.scanId)) {
      console.warn(`[browserwire-cli] vision LLM referenced unknown scanId: ${a.scanId}`);
      continue;
    }

    const preconditions = Array.isArray(a.preconditions)
      ? a.preconditions
          .filter((p) => p && typeof p.description === "string")
          .map((p) => ({
            description: p.description,
            stateField: typeof p.stateField === "string" ? p.stateField : null
          }))
      : [];

    const actionEntry = {
      scanId: a.scanId,
      semanticName: a.semanticName,
      interactionKind: INTERACTION_KINDS.includes(a.interactionKind) ? a.interactionKind : "click",
      description: textOrEmpty(a.description),
      preconditions
    };
    const locator = sanitizeLocator(a.locator);
    if (locator) actionEntry.locator = locator;
    actions.push(actionEntry);
  }
  return actions;
};

/** Keep composite actions that still have at least two known steps. */
const sanitizeCompositeActions = (rawComposites, validScanIds) => {
  const compositeActions = [];
  if (!Array.isArray(rawComposites)) return compositeActions;

  for (const ca of rawComposites) {
    if (!ca || typeof ca.name !== "string" || !Array.isArray(ca.stepScanIds)) continue;
    const stepScanIds = filterKnownScanIds(ca.stepScanIds, validScanIds);
    if (stepScanIds.length < 2) continue;
    compositeActions.push({
      name: ca.name,
      description: textOrEmpty(ca.description),
      stepScanIds,
      inputs: Array.isArray(ca.inputs)
        ? ca.inputs.filter((i) => i && typeof i.name === "string")
        : []
    });
  }
  return compositeActions;
};

/**
 * Parse and validate the LLM perception output.
 * Rejects any scanIds not present in the skeleton and any view selector that
 * references a skeleton label.
 *
 * @param {string} rawResponse raw text returned by the LLM
 * @param {Set<number>} validScanIds scanIds present in the input skeleton
 * @returns {object|null} `{ domain, domainDescription, pageState, views,
 *   entities, actions, compositeActions }`, or null when the response is not
 *   parseable JSON or its top level is not a plain object
 */
const validatePerception = (rawResponse, validScanIds) => {
  let parsed;
  try {
    parsed = parsePerceptionJson(rawResponse);
  } catch (error) {
    console.warn("[browserwire-cli] vision LLM returned unparseable JSON:", error.message);
    return null;
  }

  // A top-level array or primitive is not a perception document.
  if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) return null;

  return {
    domain: typeof parsed.domain === "string" ? parsed.domain : "unknown",
    domainDescription: textOrEmpty(parsed.domainDescription),
    pageState: sanitizePageState(parsed.pageState),
    views: sanitizeViews(parsed.views, validScanIds),
    entities: sanitizeEntities(parsed.entities, validScanIds),
    actions: sanitizeActions(parsed.actions, validScanIds),
    compositeActions: sanitizeCompositeActions(parsed.compositeActions, validScanIds)
  };
};
|
|
403
|
+
|
|
404
|
+
// ---------------------------------------------------------------------------
|
|
405
|
+
// Public API
|
|
406
|
+
// ---------------------------------------------------------------------------
|
|
407
|
+
|
|
408
|
+
/**
 * Perceive a page snapshot using the vision LLM.
 *
 * Builds the HTML skeleton, sends it (with the screenshot when one is
 * available, text-only otherwise) to the configured LLM, and validates the
 * reply against the skeleton's scanIds. Every failure path logs and resolves
 * to null instead of throwing: no LLM configuration, a missing payload or
 * empty / non-array skeleton, a failed LLM call, an empty or non-string
 * reply, or a reply that fails validation.
 *
 * @param {{ skeleton: object[], screenshot: string|null, pageText: string, url: string, title: string }} payload
 * @returns {Promise<object|null>} perception result or null on failure/no-LLM
 */
export const perceiveSnapshot = async (payload) => {
  const config = getLLMConfig();
  if (!config) {
    console.log("[browserwire-cli] LLM not configured, skipping perception");
    return null;
  }

  // Tolerate a missing payload and a null / non-array skeleton: both mean
  // there is nothing to perceive.
  const { skeleton, screenshot, pageText, url, title } = payload ?? {};

  if (!Array.isArray(skeleton) || skeleton.length === 0) {
    console.warn("[browserwire-cli] empty skeleton, skipping perception");
    return null;
  }

  const htmlSkeleton = buildHtmlSkeleton(skeleton);
  const validScanIds = new Set(skeleton.map((e) => e.scanId));

  // Skeleton size in KB, rounded to one decimal, for the progress log.
  const skeletonKB = Math.round(htmlSkeleton.length / 1024 * 10) / 10;
  console.log(
    `[browserwire-cli] perceiving: ${skeleton.length} skeleton elements (${skeletonKB}KB html), ` +
    `screenshot=${screenshot ? "yes" : "no"}`
  );

  const context = [
    `URL: ${url}`,
    title ? `Title: ${title}` : "",
    typeof pageText === "string" && pageText
      ? `\nPage text (excerpt): ${pageText.slice(0, 500)}`
      : ""
  ].filter(Boolean).join("\n");

  const userContent = `${context}\n\nHTML Skeleton:\n${htmlSkeleton}`;

  let rawResponse;
  try {
    // Text-only fallback when no screenshot is available
    rawResponse = screenshot
      ? await callLLMWithVision(SYSTEM_PROMPT, screenshot, userContent, config)
      : await callLLM(SYSTEM_PROMPT, userContent, config);
  } catch (error) {
    console.warn(`[browserwire-cli] vision LLM call failed: ${error.message}`);
    return null;
  }

  // A non-string reply is treated like an empty one rather than crashing on .trim().
  if (typeof rawResponse !== "string" || rawResponse.trim().length === 0) {
    console.warn("[browserwire-cli] vision LLM returned empty response");
    return null;
  }

  const perception = validatePerception(rawResponse, validScanIds);
  if (!perception) {
    console.warn("[browserwire-cli] vision LLM output failed validation");
    return null;
  }

  console.log(
    `[browserwire-cli] perception complete: domain="${perception.domain}" ` +
    `entities=${perception.entities.length} actions=${perception.actions.length} ` +
    `views=${perception.views.length} composites=${perception.compositeActions.length}`
  );

  return perception;
};
|