@mcp-b/smart-dom-reader 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +441 -0
- package/dist/bundle-string.d.ts +11 -0
- package/dist/bundle-string.js +8 -0
- package/dist/bundle-string.js.map +1 -0
- package/dist/index.d.ts +467 -0
- package/dist/index.js +1823 -0
- package/dist/index.js.map +1 -0
- package/package.json +82 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 mcp-b contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,441 @@
|
|
|
1
|
+
# Smart DOM Reader
|
|
2
|
+
|
|
3
|
+
A stateless, token-efficient TypeScript library for extracting DOM information optimized for AI-powered userscript generation. Combines wisdom from multiple DOM extraction approaches to provide intelligent, context-aware element extraction.
|
|
4
|
+
|
|
5
|
+
## Key Features
|
|
6
|
+
|
|
7
|
+
- **Two extraction approaches**: Progressive (step-by-step) and Full (single-pass)
|
|
8
|
+
- **Stateless architecture**: All functions accept document/element parameters
|
|
9
|
+
- **Multiple selector strategies**: CSS, XPath, text-based, data-testid
|
|
10
|
+
- **Smart content detection**: Automatically identifies main content areas
|
|
11
|
+
- **Context preservation**: Maintains element relationships and semantic context
|
|
12
|
+
- **Shadow DOM & iframe support**: Traverses complex DOM structures
|
|
13
|
+
- **Token-efficient**: Optimized for LLM context windows
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
npm install @mcp-b/smart-dom-reader
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Two Extraction Approaches
|
|
22
|
+
|
|
23
|
+
### 1. Full Extraction (SmartDOMReader)
|
|
24
|
+
|
|
25
|
+
**When to use:** You need all information upfront and have sufficient token budget for processing the complete output. Best for automation, testing, and scenarios where you know exactly what you need.
|
|
26
|
+
|
|
27
|
+
```typescript
|
|
28
|
+
import { SmartDOMReader } from '@mcp-b/smart-dom-reader';
|
|
29
|
+
|
|
30
|
+
// Pass document explicitly - no window dependency
|
|
31
|
+
const doc = document; // or any Document object
|
|
32
|
+
|
|
33
|
+
// Interactive mode - extract only interactive elements
|
|
34
|
+
const interactiveData = SmartDOMReader.extractInteractive(doc);
|
|
35
|
+
|
|
36
|
+
// Full mode - extract interactive + semantic elements
|
|
37
|
+
const fullData = SmartDOMReader.extractFull(doc);
|
|
38
|
+
|
|
39
|
+
// Custom options
|
|
40
|
+
const customData = SmartDOMReader.extractInteractive(doc, {
|
|
41
|
+
mainContentOnly: true,
|
|
42
|
+
viewportOnly: true,
|
|
43
|
+
includeHidden: false
|
|
44
|
+
});
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### 2. Progressive Extraction (ProgressiveExtractor)
|
|
48
|
+
|
|
49
|
+
**When to use:** Working with AI/LLMs where token efficiency is critical. Allows making intelligent decisions at each step rather than extracting everything upfront.
|
|
50
|
+
|
|
51
|
+
```typescript
|
|
52
|
+
import { ProgressiveExtractor } from '@mcp-b/smart-dom-reader';
|
|
53
|
+
|
|
54
|
+
// Step 1: Get high-level page structure (minimal tokens)
|
|
55
|
+
// Structure can be extracted from the whole document or a specific container element
|
|
56
|
+
const structure = ProgressiveExtractor.extractStructure(document);
|
|
57
|
+
console.log(structure.summary); // Quick stats about the page
|
|
58
|
+
console.log(structure.regions); // Map of page regions
|
|
59
|
+
console.log(structure.suggestions); // AI-friendly hints
|
|
60
|
+
|
|
61
|
+
// Step 2: Extract details from specific region based on structure
|
|
62
|
+
const mainContent = ProgressiveExtractor.extractRegion(
|
|
63
|
+
structure.summary.mainContentSelector || 'main',
|
|
64
|
+
document,
|
|
65
|
+
{ mode: 'interactive' }
|
|
66
|
+
);
|
|
67
|
+
|
|
68
|
+
// Step 3: Extract readable content from a region
|
|
69
|
+
const articleText = ProgressiveExtractor.extractContent(
|
|
70
|
+
'article.main-article',
|
|
71
|
+
document,
|
|
72
|
+
{ includeHeadings: true, includeLists: true }
|
|
73
|
+
);
|
|
74
|
+
|
|
75
|
+
// Structure scoped to a container (e.g., navigation only)
|
|
76
|
+
const nav = document.querySelector('nav');
|
|
77
|
+
if (nav) {
|
|
78
|
+
const navOutline = ProgressiveExtractor.extractStructure(nav);
|
|
79
|
+
// navOutline.regions will only include elements within <nav>
|
|
80
|
+
}
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Extraction Modes
|
|
84
|
+
|
|
85
|
+
### Interactive Mode
|
|
86
|
+
Focuses on elements users can interact with:
|
|
87
|
+
- Buttons and button-like elements
|
|
88
|
+
- Links
|
|
89
|
+
- Form inputs (text, select, textarea)
|
|
90
|
+
- Clickable elements with handlers
|
|
91
|
+
- Form structures and associations
|
|
92
|
+
|
|
93
|
+
### Full Mode
|
|
94
|
+
Includes everything from interactive mode plus:
|
|
95
|
+
- Semantic HTML elements (articles, sections, nav)
|
|
96
|
+
- Headings hierarchy
|
|
97
|
+
- Images with alt text
|
|
98
|
+
- Tables and lists
|
|
99
|
+
- Content structure and relationships
|
|
100
|
+
|
|
101
|
+
## API Comparison
|
|
102
|
+
|
|
103
|
+
### Full Extraction API
|
|
104
|
+
|
|
105
|
+
```typescript
|
|
106
|
+
// Class-based with options
|
|
107
|
+
const reader = new SmartDOMReader({
|
|
108
|
+
mode: 'interactive',
|
|
109
|
+
mainContentOnly: true,
|
|
110
|
+
viewportOnly: false
|
|
111
|
+
});
|
|
112
|
+
const result = reader.extract(document);
|
|
113
|
+
|
|
114
|
+
// Static methods for convenience
|
|
115
|
+
SmartDOMReader.extractInteractive(document);
|
|
116
|
+
SmartDOMReader.extractFull(document);
|
|
117
|
+
SmartDOMReader.extractFromElement(element, 'interactive');
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Progressive Extraction API
|
|
121
|
+
|
|
122
|
+
```typescript
|
|
123
|
+
// Step 1: Structure overview (Document or Element)
|
|
124
|
+
const overview = ProgressiveExtractor.extractStructure(document);
|
|
125
|
+
// Returns: regions, forms, summary, suggestions
|
|
126
|
+
|
|
127
|
+
// Step 2: Region extraction
|
|
128
|
+
const region = ProgressiveExtractor.extractRegion(
|
|
129
|
+
selector,
|
|
130
|
+
document,
|
|
131
|
+
options
|
|
132
|
+
);
|
|
133
|
+
// Returns: Full SmartDOMResult for that region
|
|
134
|
+
|
|
135
|
+
// Step 3: Content extraction
|
|
136
|
+
const content = ProgressiveExtractor.extractContent(
|
|
137
|
+
selector,
|
|
138
|
+
document,
|
|
139
|
+
{ includeMedia: true }
|
|
140
|
+
);
|
|
141
|
+
// Returns: Text content, headings, lists, tables, media
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## Output Structure
|
|
145
|
+
|
|
146
|
+
Both approaches return structured data optimized for AI processing:
|
|
147
|
+
|
|
148
|
+
```typescript
|
|
149
|
+
interface SmartDOMResult {
|
|
150
|
+
mode: 'interactive' | 'full';
|
|
151
|
+
timestamp: number;
|
|
152
|
+
|
|
153
|
+
page: {
|
|
154
|
+
url: string;
|
|
155
|
+
title: string;
|
|
156
|
+
hasErrors: boolean;
|
|
157
|
+
isLoading: boolean;
|
|
158
|
+
hasModals: boolean;
|
|
159
|
+
hasFocus?: string;
|
|
160
|
+
};
|
|
161
|
+
|
|
162
|
+
landmarks: {
|
|
163
|
+
navigation: string[];
|
|
164
|
+
main: string[];
|
|
165
|
+
forms: string[];
|
|
166
|
+
headers: string[];
|
|
167
|
+
footers: string[];
|
|
168
|
+
articles: string[];
|
|
169
|
+
sections: string[];
|
|
170
|
+
};
|
|
171
|
+
|
|
172
|
+
interactive: {
|
|
173
|
+
buttons: ExtractedElement[];
|
|
174
|
+
links: ExtractedElement[];
|
|
175
|
+
inputs: ExtractedElement[];
|
|
176
|
+
forms: FormInfo[];
|
|
177
|
+
clickable: ExtractedElement[];
|
|
178
|
+
};
|
|
179
|
+
|
|
180
|
+
semantic?: { // Only in full mode
|
|
181
|
+
headings: ExtractedElement[];
|
|
182
|
+
images: ExtractedElement[];
|
|
183
|
+
tables: ExtractedElement[];
|
|
184
|
+
lists: ExtractedElement[];
|
|
185
|
+
articles: ExtractedElement[];
|
|
186
|
+
};
|
|
187
|
+
|
|
188
|
+
metadata?: { // Only in full mode
|
|
189
|
+
totalElements: number;
|
|
190
|
+
extractedElements: number;
|
|
191
|
+
mainContent?: string;
|
|
192
|
+
language?: string;
|
|
193
|
+
};
|
|
194
|
+
}
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
## Element Information
|
|
198
|
+
|
|
199
|
+
Each extracted element includes comprehensive selector strategies with ranking (stable-first):
|
|
200
|
+
|
|
201
|
+
```typescript
|
|
202
|
+
interface ExtractedElement {
|
|
203
|
+
tag: string;
|
|
204
|
+
text: string;
|
|
205
|
+
|
|
206
|
+
selector: {
|
|
207
|
+
css: string; // Best CSS selector (ranked stable-first)
|
|
208
|
+
xpath: string; // XPath selector
|
|
209
|
+
textBased?: string; // Text-content based hint
|
|
210
|
+
dataTestId?: string; // data-testid if available
|
|
211
|
+
ariaLabel?: string; // ARIA label if available
|
|
212
|
+
candidates?: Array<{
|
|
213
|
+
type: 'id' | 'data-testid' | 'role-aria' | 'name' | 'class-path' | 'css-path' | 'xpath' | 'text';
|
|
214
|
+
value: string;
|
|
215
|
+
score: number; // Higher = more stable/robust
|
|
216
|
+
}>;
|
|
217
|
+
};
|
|
218
|
+
|
|
219
|
+
attributes: Record<string, string>;
|
|
220
|
+
|
|
221
|
+
context: {
|
|
222
|
+
nearestForm?: string;
|
|
223
|
+
nearestSection?: string;
|
|
224
|
+
nearestMain?: string;
|
|
225
|
+
nearestNav?: string;
|
|
226
|
+
parentChain: string[];
|
|
227
|
+
};
|
|
228
|
+
|
|
229
|
+
// Compact flags: only present when true to save tokens
|
|
230
|
+
interaction: {
|
|
231
|
+
click?: boolean;
|
|
232
|
+
change?: boolean;
|
|
233
|
+
submit?: boolean;
|
|
234
|
+
nav?: boolean;
|
|
235
|
+
disabled?: boolean;
|
|
236
|
+
hidden?: boolean;
|
|
237
|
+
role?: string; // aria role when present
|
|
238
|
+
form?: string; // associated form selector
|
|
239
|
+
};
|
|
240
|
+
}
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
## Options
|
|
244
|
+
|
|
245
|
+
| Option | Type | Default | Description |
|
|
246
|
+
|--------|------|---------|-------------|
|
|
247
|
+
| `mode` | `'interactive' \| 'full'` | `'interactive'` | Extraction mode |
|
|
248
|
+
| `maxDepth` | `number` | `5` | Maximum traversal depth |
|
|
249
|
+
| `includeHidden` | `boolean` | `false` | Include hidden elements |
|
|
250
|
+
| `includeShadowDOM` | `boolean` | `true` | Traverse shadow DOM |
|
|
251
|
+
| `includeIframes` | `boolean` | `false` | Traverse iframes |
|
|
252
|
+
| `viewportOnly` | `boolean` | `false` | Only visible viewport elements |
|
|
253
|
+
| `mainContentOnly` | `boolean` | `false` | Focus on main content area |
|
|
254
|
+
| `customSelectors` | `string[]` | `[]` | Additional selectors to extract |
|
|
255
|
+
|
|
256
|
+
## Use Cases
|
|
257
|
+
|
|
258
|
+
### AI Userscript Generation (Progressive Approach)
|
|
259
|
+
```typescript
|
|
260
|
+
// First, understand the page structure
|
|
261
|
+
const structure = ProgressiveExtractor.extractStructure(document);
|
|
262
|
+
|
|
263
|
+
// AI decides which region to focus on based on structure
|
|
264
|
+
const targetRegion = structure.regions.main?.selector || 'body';
|
|
265
|
+
|
|
266
|
+
// Extract detailed information from chosen region
|
|
267
|
+
const details = ProgressiveExtractor.extractRegion(
|
|
268
|
+
targetRegion,
|
|
269
|
+
document,
|
|
270
|
+
{ mode: 'interactive', viewportOnly: true }
|
|
271
|
+
);
|
|
272
|
+
|
|
273
|
+
// Generate userscript prompt with focused context
|
|
274
|
+
const prompt = `
|
|
275
|
+
Page: ${details.page.title}
|
|
276
|
+
Main form: ${details.interactive.forms[0]?.selector}
|
|
277
|
+
Submit button: ${details.interactive.buttons.find(b => b.text.includes('Submit'))?.selector.css}
|
|
278
|
+
|
|
279
|
+
Write a userscript to auto-fill and submit this form.
|
|
280
|
+
`;
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
### Test Automation (Full Extraction)
|
|
284
|
+
```typescript
|
|
285
|
+
// Get all interactive elements at once
|
|
286
|
+
const testData = SmartDOMReader.extractInteractive(document, {
|
|
287
|
+
customSelectors: ['[data-test]', '[data-cy]']
|
|
288
|
+
});
|
|
289
|
+
|
|
290
|
+
// Use multiple selector strategies for robust testing
|
|
291
|
+
testData.interactive.buttons.forEach(button => {
|
|
292
|
+
console.log(`Button: ${button.text}`);
|
|
293
|
+
console.log(` CSS: ${button.selector.css}`);
|
|
294
|
+
console.log(` XPath: ${button.selector.xpath}`);
|
|
295
|
+
console.log(` TestID: ${button.selector.dataTestId}`);
|
|
296
|
+
console.log(` Ranked candidates:`, button.selector.candidates?.slice(0, 3));
|
|
297
|
+
});
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
### Content Analysis (Progressive Approach)
|
|
301
|
+
```typescript
|
|
302
|
+
// Get structure first
|
|
303
|
+
const structure = ProgressiveExtractor.extractStructure(document);
|
|
304
|
+
|
|
305
|
+
// Extract readable content from main area
|
|
306
|
+
const content = ProgressiveExtractor.extractContent(
|
|
307
|
+
structure.summary.mainContentSelector || 'main',
|
|
308
|
+
document,
|
|
309
|
+
{ includeHeadings: true, includeTables: true }
|
|
310
|
+
);
|
|
311
|
+
|
|
312
|
+
console.log(`Word count: ${content.metadata.wordCount}`);
|
|
313
|
+
console.log(`Headings: ${content.text.headings?.length}`);
|
|
314
|
+
console.log(`Has interactive elements: ${content.metadata.hasInteractive}`);
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
## Stateless Architecture
|
|
318
|
+
|
|
319
|
+
All methods are stateless and accept document/element parameters explicitly:
|
|
320
|
+
|
|
321
|
+
```typescript
|
|
322
|
+
// No window or document globals required
|
|
323
|
+
function extractFromIframe(iframe: HTMLIFrameElement) {
|
|
324
|
+
const iframeDoc = iframe.contentDocument;
|
|
325
|
+
if (iframeDoc) {
|
|
326
|
+
return SmartDOMReader.extractInteractive(iframeDoc);
|
|
327
|
+
}
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
// Works with any document context
|
|
331
|
+
function extractFromShadowRoot(shadowRoot: ShadowRoot) {
|
|
332
|
+
const container = shadowRoot.querySelector('.container');
|
|
333
|
+
if (container) {
|
|
334
|
+
return SmartDOMReader.extractFromElement(container);
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
/**
|
|
339
|
+
* Stateless bundle string (for extensions / userScripts)
|
|
340
|
+
*
|
|
341
|
+
* The library also provides a self-contained IIFE bundle as a string
|
|
342
|
+
* export that can be injected and executed without touching window scope.
|
|
343
|
+
*/
|
|
344
|
+
import { SMART_DOM_READER_BUNDLE } from '@mcp-b/smart-dom-reader/bundle-string';
|
|
345
|
+
|
|
346
|
+
function execute(method, args) {
|
|
347
|
+
const code = `(() => {\n${SMART_DOM_READER_BUNDLE}\nreturn SmartDOMReaderBundle.executeExtraction(${JSON.stringify(
|
|
348
|
+
method ?? 'extractStructure'
|
|
349
|
+
)}, ${JSON.stringify(args ?? { selector: undefined, formatOptions: { detail: 'summary' } })});\n})()`;
|
|
350
|
+
// inject `code` into the page (e.g., chrome.userScripts.execute)
|
|
351
|
+
}
|
|
352
|
+
|
|
353
|
+
// Note: The bundle contains guarded fallbacks (e.g., typeof require === 'function')
|
|
354
|
+
// that are no-ops in the browser; there are no runtime imports.
|
|
355
|
+
```
|
|
356
|
+
|
|
357
|
+
## Design Philosophy
|
|
358
|
+
|
|
359
|
+
This library is designed to provide:
|
|
360
|
+
|
|
361
|
+
1. **Token Efficiency**: Progressive extraction minimizes token usage for AI applications
|
|
362
|
+
2. **Flexibility**: Choose between complete extraction or step-by-step approach
|
|
363
|
+
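3. **Statelessness**: No global state; the document/element is passed in explicitly, so it works in any JavaScript environment that provides the standard DOM APIs (e.g. `CSS.escape`, `Node`)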
|
|
364
|
+
4. **Multiple Selector Strategies**: Robust element targeting with fallbacks
|
|
365
|
+
5. **Semantic Understanding**: Preserves meaning and relationships
|
|
366
|
+
6. **Interactive Focus**: Prioritizes elements users interact with
|
|
367
|
+
7. **Context Preservation**: Maintains element relationships
|
|
368
|
+
8. **Framework Agnostic**: Works with any web application
|
|
369
|
+
|
|
370
|
+
## Credits
|
|
371
|
+
|
|
372
|
+
Inspired by:
|
|
373
|
+
- [stacking-contexts-inspector](https://github.com/andreadev-it/stacking-contexts-inspector) - DOM traversal techniques
|
|
374
|
+
- [dom-to-semantic-markdown](https://github.com/romansky/dom-to-semantic-markdown) - Content scoring algorithms
|
|
375
|
+
- [z-context](https://github.com/gwwar/z-context) - Selector generation approaches
|
|
376
|
+
|
|
377
|
+
## License
|
|
378
|
+
|
|
379
|
+
MIT
|
|
380
|
+
|
|
381
|
+
## MCP Server (Golden Path)
|
|
382
|
+
|
|
383
|
+
For AI agents, use the bundled MCP server, which returns XML-wrapped Markdown instead of JSON. This keeps responses concise and readable for LLMs while providing clear structural boundaries.
|
|
384
|
+
|
|
385
|
+
- Output format: always an XML envelope with a single section tag containing Markdown in CDATA
|
|
386
|
+
- Structure: `<page title="..." url="...">\n <outline><![CDATA[ ...markdown... ]]></outline>\n</page>`
|
|
387
|
+
- Region: `<page ...>\n <section><![CDATA[ ...markdown... ]]></section>\n</page>`
|
|
388
|
+
- Content: `<page ...>\n <content><![CDATA[ ...markdown... ]]></content>\n</page>`
|
|
389
|
+
- Golden path sequence:
|
|
390
|
+
1) `dom_extract_structure` → get page outline and pick a target
|
|
391
|
+
2) `dom_extract_region` → get actionable selectors for that area
|
|
392
|
+
3) Write a script; if unstable, re-run with higher detail or limits
|
|
393
|
+
4) Optional: `dom_extract_content` for readable text context
|
|
394
|
+
|
|
395
|
+
### Running the server
|
|
396
|
+
|
|
397
|
+
Ensure the library is built so the formatter is available:
|
|
398
|
+
|
|
399
|
+
```
|
|
400
|
+
pnpm -w --filter @mcp-b/smart-dom-reader run build
|
|
401
|
+
```
|
|
402
|
+
|
|
403
|
+
Build and update the embedded bundle, then start the MCP server (stdio):
|
|
404
|
+
|
|
405
|
+
```
|
|
406
|
+
pnpm --filter @mcp-b/smart-dom-reader bundle:mcp
|
|
407
|
+
pnpm --filter @mcp-b/smart-dom-reader-server run start
|
|
408
|
+
```
|
|
409
|
+
|
|
410
|
+
Or directly with tsx:
|
|
411
|
+
|
|
412
|
+
```
|
|
413
|
+
tsx smart-dom-reader/mcp-server/src/index.ts
|
|
414
|
+
```
|
|
415
|
+
|
|
416
|
+
### Tool overview (inputs only)
|
|
417
|
+
|
|
418
|
+
- `browser_connect` → `{ headless?: boolean, executablePath?: string }`
|
|
419
|
+
- `browser_navigate` → `{ url: string }`
|
|
420
|
+
- `dom_extract_structure` → `{ selector?: string, detail?: 'summary'|'region'|'deep', maxTextLength?: number, maxElements?: number }`
|
|
421
|
+
- `dom_extract_region` → `{ selector: string, options?: { mode?: 'interactive'|'full', includeHidden?: boolean, maxDepth?: number, detail?: 'summary'|'region'|'deep', maxTextLength?: number, maxElements?: number } }`
|
|
422
|
+
- `dom_extract_content` → `{ selector: string, options?: { includeHeadings?: boolean, includeLists?: boolean, includeMedia?: boolean, maxTextLength?: number, detail?: 'summary'|'region'|'deep', maxElements?: number } }`
|
|
423
|
+
- `dom_extract_interactive` → `{ selector?: string, options?: { viewportOnly?: boolean, maxDepth?: number, detail?: 'summary'|'region'|'deep', maxTextLength?: number, maxElements?: number } }`
|
|
424
|
+
- `browser_screenshot` → `{ path?: string, fullPage?: boolean }`
|
|
425
|
+
- `browser_close` → `{}`
|
|
426
|
+
|
|
427
|
+
All extraction tools return XML-wrapped Markdown with a short “Next:” instruction at the bottom to guide the following step.
|
|
428
|
+
|
|
429
|
+
## Local Testing (Playwright)
|
|
430
|
+
|
|
431
|
+
Run the library in a real browser against local HTML (no network):
|
|
432
|
+
|
|
433
|
+
```
|
|
434
|
+
pnpm --filter @mcp-b/smart-dom-reader bundle:mcp
|
|
435
|
+
pnpm --filter @mcp-b/smart-dom-reader test:local
|
|
436
|
+
```
|
|
437
|
+
|
|
438
|
+
What it validates:
|
|
439
|
+
- Stable selectors (ID, data-testid, role+aria, name/id)
|
|
440
|
+
- Semantic extraction (headings/images/tables/lists)
|
|
441
|
+
- Shadow DOM detection
|
package/dist/bundle-string.d.ts
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Auto-generated bundle module for smart-dom-reader
|
|
3
|
+
* DO NOT EDIT - Generated by generate-bundle-module.mjs
|
|
4
|
+
*
|
|
5
|
+
* This module exports the bundled smart-dom-reader library as a string
|
|
6
|
+
* that can be injected into web pages for stateless DOM extraction.
|
|
7
|
+
*/
|
|
8
|
+
declare const SMART_DOM_READER_BUNDLE = "var SmartDOMReaderBundle = (function(exports) {\n class ContentDetection {\n /**\n * Find the main content area of a page\n * Inspired by dom-to-semantic-markdown's approach\n */\n static findMainContent(doc) {\n const mainElement = doc.querySelector('main, [role=\"main\"]');\n if (mainElement) {\n return mainElement;\n }\n if (!doc.body) {\n return doc.documentElement;\n }\n return this.detectMainContent(doc.body);\n }\n /**\n * Detect main content using scoring algorithm\n */\n static detectMainContent(rootElement) {\n const candidates = [];\n const minScore = 15;\n this.collectCandidates(rootElement, candidates, minScore);\n if (candidates.length === 0) {\n return rootElement;\n }\n candidates.sort((a, b) => this.calculateContentScore(b) - this.calculateContentScore(a));\n let bestCandidate = candidates[0];\n for (let i = 1; i < candidates.length; i++) {\n const isIndependent = !candidates.some(\n (other, j) => j !== i && other.contains(candidates[i])\n );\n if (isIndependent && this.calculateContentScore(candidates[i]) > this.calculateContentScore(bestCandidate)) {\n bestCandidate = candidates[i];\n }\n }\n return bestCandidate;\n }\n /**\n * Collect content candidates\n */\n static collectCandidates(element, candidates, minScore) {\n const score = this.calculateContentScore(element);\n if (score >= minScore) {\n candidates.push(element);\n }\n Array.from(element.children).forEach((child) => {\n this.collectCandidates(child, candidates, minScore);\n });\n }\n /**\n * Calculate content score for an element\n */\n static calculateContentScore(element) {\n let score = 0;\n const semanticClasses = [\n \"article\",\n \"content\",\n \"main-container\",\n \"main\",\n \"main-content\",\n \"post\",\n \"entry\"\n ];\n const semanticIds = [\"content\", \"main\", \"article\", \"post\", \"entry\"];\n semanticClasses.forEach((cls) => {\n if (element.classList.contains(cls)) {\n score += 10;\n }\n });\n semanticIds.forEach((id) => {\n 
if (element.id && element.id.toLowerCase().includes(id)) {\n score += 10;\n }\n });\n const tag = element.tagName.toLowerCase();\n const highValueTags = [\"article\", \"main\", \"section\"];\n if (highValueTags.includes(tag)) {\n score += 8;\n }\n const paragraphs = element.getElementsByTagName(\"p\").length;\n score += Math.min(paragraphs * 2, 10);\n const headings = element.querySelectorAll(\"h1, h2, h3\").length;\n score += Math.min(headings * 3, 9);\n const textLength = element.textContent?.trim().length || 0;\n if (textLength > 300) {\n score += Math.min(Math.floor(textLength / 300) * 2, 10);\n }\n const linkDensity = this.calculateLinkDensity(element);\n if (linkDensity < 0.3) {\n score += 5;\n } else if (linkDensity > 0.5) {\n score -= 5;\n }\n if (element.hasAttribute(\"data-main\") || element.hasAttribute(\"data-content\") || element.hasAttribute(\"itemprop\")) {\n score += 8;\n }\n const role = element.getAttribute(\"role\");\n if (role === \"main\" || role === \"article\") {\n score += 10;\n }\n if (element.matches(\n \"aside, nav, header, footer, .sidebar, .navigation, .menu, .ad, .advertisement\"\n )) {\n score -= 10;\n }\n const forms = element.getElementsByTagName(\"form\").length;\n if (forms > 2) {\n score -= 5;\n }\n return Math.max(0, score);\n }\n /**\n * Calculate link density in an element\n */\n static calculateLinkDensity(element) {\n const links = element.getElementsByTagName(\"a\");\n let linkTextLength = 0;\n for (const link of Array.from(links)) {\n linkTextLength += link.textContent?.length || 0;\n }\n const totalTextLength = element.textContent?.length || 1;\n return linkTextLength / totalTextLength;\n }\n /**\n * Check if an element is likely navigation\n */\n static isNavigation(element) {\n const tag = element.tagName.toLowerCase();\n if (tag === \"nav\" || element.getAttribute(\"role\") === \"navigation\") {\n return true;\n }\n const navPatterns = [/nav/i, /menu/i, /sidebar/i, /toolbar/i];\n const classesAndId = (element.className 
+ \" \" + element.id).toLowerCase();\n return navPatterns.some((pattern) => pattern.test(classesAndId));\n }\n /**\n * Check if element is likely supplementary content\n */\n static isSupplementary(element) {\n const tag = element.tagName.toLowerCase();\n if (tag === \"aside\" || element.getAttribute(\"role\") === \"complementary\") {\n return true;\n }\n const supplementaryPatterns = [/sidebar/i, /widget/i, /related/i, /advertisement/i, /social/i];\n const classesAndId = (element.className + \" \" + element.id).toLowerCase();\n return supplementaryPatterns.some((pattern) => pattern.test(classesAndId));\n }\n /**\n * Detect page landmarks\n */\n static detectLandmarks(doc) {\n const landmarks = {\n navigation: [],\n main: [],\n complementary: [],\n contentinfo: [],\n banner: [],\n search: [],\n form: [],\n region: []\n };\n const landmarkSelectors = {\n navigation: 'nav, [role=\"navigation\"]',\n main: 'main, [role=\"main\"]',\n complementary: 'aside, [role=\"complementary\"]',\n contentinfo: 'footer, [role=\"contentinfo\"]',\n banner: 'header, [role=\"banner\"]',\n search: '[role=\"search\"]',\n form: 'form[aria-label], form[aria-labelledby], [role=\"form\"]',\n region: 'section[aria-label], section[aria-labelledby], [role=\"region\"]'\n };\n for (const [landmark, selector] of Object.entries(landmarkSelectors)) {\n const elements = doc.querySelectorAll(selector);\n landmarks[landmark] = Array.from(elements);\n }\n return landmarks;\n }\n }\n class SelectorGenerator {\n /**\n * Generate multiple selector strategies for an element\n */\n static generateSelectors(element) {\n const doc = element.ownerDocument || document;\n const candidates = [];\n if (element.id && this.isUniqueId(element.id, doc)) {\n candidates.push({ type: \"id\", value: `#${CSS.escape(element.id)}`, score: 100 });\n }\n const testId = this.getDataTestId(element);\n if (testId) {\n const v = `[data-testid=\"${CSS.escape(testId)}\"]`;\n candidates.push({\n type: \"data-testid\",\n value: v,\n 
score: 90 + (this.isUniqueSelectorSafe(v, doc) ? 5 : 0)\n });\n }\n const role = element.getAttribute(\"role\");\n const aria = element.getAttribute(\"aria-label\");\n if (role && aria) {\n const v = `[role=\"${CSS.escape(role)}\"][aria-label=\"${CSS.escape(aria)}\"]`;\n candidates.push({\n type: \"role-aria\",\n value: v,\n score: 85 + (this.isUniqueSelectorSafe(v, doc) ? 5 : 0)\n });\n }\n const nameAttr = element.getAttribute(\"name\");\n if (nameAttr) {\n const v = `[name=\"${CSS.escape(nameAttr)}\"]`;\n candidates.push({\n type: \"name\",\n value: v,\n score: 78 + (this.isUniqueSelectorSafe(v, doc) ? 5 : 0)\n });\n }\n const pathCss = this.generateCSSSelector(element, doc);\n const structuralPenalty = (pathCss.match(/:nth-child\\(/g) || []).length * 10;\n const classBonus = pathCss.includes(\".\") ? 8 : 0;\n const pathScore = Math.max(0, 70 + classBonus - structuralPenalty);\n candidates.push({ type: \"class-path\", value: pathCss, score: pathScore });\n const xpath = this.generateXPath(element, doc);\n candidates.push({ type: \"xpath\", value: xpath, score: 40 });\n const textBased = this.generateTextBasedSelector(element);\n if (textBased) candidates.push({ type: \"text\", value: textBased, score: 30 });\n candidates.sort((a, b) => b.score - a.score);\n const bestCss = candidates.find((c) => c.type !== \"xpath\" && c.type !== \"text\")?.value || pathCss;\n return {\n css: bestCss,\n xpath,\n textBased,\n dataTestId: testId || void 0,\n ariaLabel: aria || void 0,\n candidates\n };\n }\n /**\n * Generate a unique CSS selector for an element\n */\n static generateCSSSelector(element, doc) {\n if (element.id && this.isUniqueId(element.id, doc)) {\n return `#${CSS.escape(element.id)}`;\n }\n const testId = this.getDataTestId(element);\n if (testId) {\n return `[data-testid=\"${CSS.escape(testId)}\"]`;\n }\n const path = [];\n let current = element;\n while (current && current.nodeType === Node.ELEMENT_NODE) {\n let selector = current.nodeName.toLowerCase();\n if 
(current.id && this.isUniqueId(current.id, doc)) {\n selector = `#${CSS.escape(current.id)}`;\n path.unshift(selector);\n break;\n }\n const classes = this.getMeaningfulClasses(current);\n if (classes.length > 0) {\n selector += \".\" + classes.map((c) => CSS.escape(c)).join(\".\");\n }\n const siblings = current.parentElement?.children;\n if (siblings && siblings.length > 1) {\n const index = Array.from(siblings).indexOf(current);\n if (index > 0 || !this.isUniqueSelector(selector, current.parentElement)) {\n selector += `:nth-child(${index + 1})`;\n }\n }\n path.unshift(selector);\n current = current.parentElement;\n }\n return this.optimizePath(path, element, doc);\n }\n /**\n * Generate XPath for an element\n */\n static generateXPath(element, doc) {\n if (element.id && this.isUniqueId(element.id, doc)) {\n return `//*[@id=\"${element.id}\"]`;\n }\n const path = [];\n let current = element;\n while (current && current.nodeType === Node.ELEMENT_NODE) {\n const tagName = current.nodeName.toLowerCase();\n if (current.id && this.isUniqueId(current.id, doc)) {\n path.unshift(`//*[@id=\"${current.id}\"]`);\n break;\n }\n let xpath = tagName;\n const siblings = current.parentElement?.children;\n if (siblings) {\n const sameTagSiblings = Array.from(siblings).filter(\n (s) => s.nodeName.toLowerCase() === tagName\n );\n if (sameTagSiblings.length > 1) {\n const index = sameTagSiblings.indexOf(current) + 1;\n xpath += `[${index}]`;\n }\n }\n path.unshift(xpath);\n current = current.parentElement;\n }\n return \"//\" + path.join(\"/\");\n }\n /**\n * Generate a text-based selector for buttons and links\n */\n static generateTextBasedSelector(element) {\n const text = element.textContent?.trim();\n if (!text || text.length > 50) return void 0;\n const tag = element.nodeName.toLowerCase();\n if ([\"button\", \"a\", \"label\"].includes(tag)) {\n const escapedText = text.replace(/['\"\\\\]/g, \"\\\\$&\");\n return `${tag}:contains(\"${escapedText}\")`;\n }\n return void 0;\n 
}\n /**\n * Get data-testid or similar attributes\n */\n static getDataTestId(element) {\n return element.getAttribute(\"data-testid\") || element.getAttribute(\"data-test-id\") || element.getAttribute(\"data-test\") || element.getAttribute(\"data-cy\") || void 0;\n }\n /**\n * Check if an ID is unique in the document\n */\n static isUniqueId(id, doc) {\n return doc.querySelectorAll(`#${CSS.escape(id)}`).length === 1;\n }\n /**\n * Check if a selector is unique within a container\n */\n static isUniqueSelector(selector, container) {\n try {\n return container.querySelectorAll(selector).length === 1;\n } catch {\n return false;\n }\n }\n static isUniqueSelectorSafe(selector, doc) {\n try {\n return doc.querySelectorAll(selector).length === 1;\n } catch {\n return false;\n }\n }\n /**\n * Get meaningful classes (filtering out utility classes)\n */\n static getMeaningfulClasses(element) {\n const classes = Array.from(element.classList);\n const utilityPatterns = [\n /^(p|m|w|h|text|bg|border|flex|grid|col|row)-/,\n /^(xs|sm|md|lg|xl|2xl):/,\n /^(hover|focus|active|disabled|checked):/,\n /^js-/,\n /^is-/,\n /^has-/\n ];\n return classes.filter((cls) => {\n if (cls.length < 3) return false;\n return !utilityPatterns.some((pattern) => pattern.test(cls));\n }).slice(0, 2);\n }\n /**\n * Optimize the selector path by removing unnecessary parts\n */\n static optimizePath(path, element, doc) {\n for (let i = 0; i < path.length - 1; i++) {\n const shortPath = path.slice(i).join(\" > \");\n try {\n const matches = doc.querySelectorAll(shortPath);\n if (matches.length === 1 && matches[0] === element) {\n return shortPath;\n }\n } catch {\n }\n }\n return path.join(\" > \");\n }\n /**\n * Get a human-readable path description\n */\n static getContextPath(element) {\n const path = [];\n let current = element;\n let depth = 0;\n const maxDepth = 5;\n while (current && current !== element.ownerDocument?.body && depth < maxDepth) {\n const tag = current.nodeName.toLowerCase();\n let 
/**
 * Static helpers that walk the DOM and turn elements into compact, plain
 * object descriptions (tag, text, selectors, attributes, context and
 * interaction hints).
 *
 * Relies on `SelectorGenerator`, defined elsewhere in this file, for selector
 * generation.
 */
class DOMTraversal {
  /** Selectors for elements a user can interact with. */
  static INTERACTIVE_SELECTORS = [
    "button",
    "a[href]",
    'input:not([type="hidden"])',
    "textarea",
    "select",
    '[role="button"]',
    "[onclick]",
    '[contenteditable="true"]',
    "summary",
    '[tabindex]:not([tabindex="-1"])'
  ];

  /** Selectors for structural / semantic elements (used by getSemanticElements). */
  static SEMANTIC_SELECTORS = [
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "article",
    "section",
    "nav",
    "aside",
    "main",
    "header",
    "footer",
    "form",
    "table",
    "ul",
    "ol",
    "img[alt]",
    "figure",
    "video",
    "audio",
    '[role="navigation"]',
    '[role="main"]',
    '[role="complementary"]',
    '[role="contentinfo"]'
  ];

  /**
   * Check if element is visible.
   *
   * An element counts as visible when it has a non-empty bounding box, is not
   * hidden through `display`, `visibility` or `opacity: 0`, and is rendered.
   *
   * @param {Element} element - Element to test.
   * @param {CSSStyleDeclaration} [computedStyle] - Pre-computed style; looked
   *   up through the element's owner window when omitted.
   * @returns {boolean} `false` when no style can be obtained.
   */
  static isVisible(element, computedStyle) {
    const rect = element.getBoundingClientRect();
    const style = computedStyle || element.ownerDocument?.defaultView?.getComputedStyle(element);
    if (!style) return false;
    // `offsetParent` is also null for `position: fixed` elements, which are
    // rendered (typical for modals and sticky bars), so do not treat a null
    // offsetParent as "hidden" for them. The size check above still rejects
    // fixed elements that are not laid out.
    const isRendered = element.offsetParent !== null || style.position === "fixed";
    return !!(rect.width > 0 && rect.height > 0 && style.display !== "none" && style.visibility !== "hidden" && style.opacity !== "0" && isRendered);
  }

  /**
   * Check if element is in viewport (any overlap counts).
   *
   * @param {Element} element - Element to test.
   * @param {{width: number, height: number}} [viewport] - Viewport size;
   *   defaults to the owner window's inner size (0 when unavailable).
   * @returns {boolean}
   */
  static isInViewport(element, viewport) {
    const rect = element.getBoundingClientRect();
    const view = viewport || {
      width: element.ownerDocument?.defaultView?.innerWidth || 0,
      height: element.ownerDocument?.defaultView?.innerHeight || 0
    };
    return rect.top < view.height && rect.bottom > 0 && rect.left < view.width && rect.right > 0;
  }

  /**
   * Check if element passes filter criteria. Every criterion present on
   * `filter` must hold; a missing filter accepts everything.
   *
   * Text criteria (`textContains`, `textMatches`) are evaluated against the
   * lower-cased `textContent` of the element.
   *
   * @param {Element} element - Element to test.
   * @param {object} [filter] - Filter description (excludeSelectors,
   *   includeSelectors, tags, textContains, textMatches, hasAttributes,
   *   attributeValues, withinSelectors, interactionTypes, nearText).
   * @returns {boolean}
   */
  static passesFilter(element, filter) {
    if (!filter) return true;

    // `RegExp.prototype.test` is stateful for /g and /y patterns: it resumes
    // from `lastIndex`. Reset it so a reused pattern gives the same answer for
    // every element.
    const testPattern = (pattern, value) => {
      pattern.lastIndex = 0;
      return pattern.test(value);
    };

    if (filter.excludeSelectors?.length) {
      if (filter.excludeSelectors.some((selector) => element.matches(selector))) return false;
    }
    if (filter.includeSelectors?.length) {
      if (!filter.includeSelectors.some((selector) => element.matches(selector))) return false;
    }
    if (filter.tags?.length && !filter.tags.includes(element.tagName.toLowerCase())) {
      return false;
    }

    const textContent = element.textContent?.toLowerCase() || "";
    if (filter.textContains?.length) {
      if (!filter.textContains.some((text) => textContent.includes(text.toLowerCase()))) return false;
    }
    if (filter.textMatches?.length) {
      if (!filter.textMatches.some((pattern) => testPattern(pattern, textContent))) return false;
    }

    if (filter.hasAttributes?.length) {
      if (!filter.hasAttributes.every((attr) => element.hasAttribute(attr))) return false;
    }
    if (filter.attributeValues) {
      for (const [attr, expected] of Object.entries(filter.attributeValues)) {
        const actual = element.getAttribute(attr);
        // Only a missing attribute fails here; an attribute that is present
        // with an empty value must still be compared against `expected`.
        if (actual == null) return false;
        if (typeof expected === "string") {
          if (actual !== expected) return false;
        } else if (expected instanceof RegExp) {
          if (!testPattern(expected, actual)) return false;
        }
      }
    }

    if (filter.withinSelectors?.length) {
      if (!filter.withinSelectors.some((selector) => element.closest(selector))) return false;
    }
    if (filter.interactionTypes?.length) {
      const interaction = this.getInteractionInfo(element);
      if (!filter.interactionTypes.some((type) => interaction[type])) return false;
    }
    if (filter.nearText) {
      const parent = element.parentElement;
      if (!parent || !parent.textContent?.toLowerCase().includes(filter.nearText.toLowerCase())) {
        return false;
      }
    }
    return true;
  }

  /**
   * Extract element information.
   *
   * Returns `null` when the element is deeper than a truthy
   * `options.maxDepth`, is hidden while `includeHidden` is off, is outside the
   * viewport while `viewportOnly` is on, or fails `options.filter`.
   * Children are only collected in "full" mode for semantic containers.
   *
   * @param {Element} element - Element to describe.
   * @param {object} options - Extraction options.
   * @param {number} [depth=0] - Current nesting depth.
   * @returns {object|null} The extracted description or `null`.
   */
  static extractElement(element, options, depth = 0) {
    if (options.maxDepth && depth > options.maxDepth) {
      return null;
    }
    if (!options.includeHidden && !this.isVisible(element)) {
      return null;
    }
    if (options.viewportOnly && !this.isInViewport(element)) {
      return null;
    }
    if (!this.passesFilter(element, options.filter)) {
      return null;
    }
    const htmlElement = element;
    const extracted = {
      tag: element.tagName.toLowerCase(),
      text: this.getElementText(element, options),
      selector: SelectorGenerator.generateSelectors(element),
      attributes: this.getRelevantAttributes(element, options),
      context: this.getElementContext(element),
      interaction: this.getInteractionInfo(element)
      // bounds removed to save tokens
    };
    if (options.mode === "full" && this.isSemanticContainer(element)) {
      const children = [];
      if (options.includeShadowDOM && htmlElement.shadowRoot) {
        const shadowChildren = this.extractChildren(htmlElement.shadowRoot, options, depth + 1);
        children.push(...shadowChildren);
      }
      const regularChildren = this.extractChildren(element, options, depth + 1);
      children.push(...regularChildren);
      if (children.length > 0) {
        extracted.children = children;
      }
    }
    return extracted;
  }

  /**
   * Extract children elements.
   *
   * Every descendant whose ancestor chain contains no other descendant of
   * `container` is passed to {@link DOMTraversal.extractElement}.
   *
   * @param {Element|ShadowRoot} container - Root to search below.
   * @param {object} options - Extraction options.
   * @param {number} depth - Depth assigned to the extracted children.
   * @returns {object[]} Extracted descriptions (nulls removed).
   */
  static extractChildren(container, options, depth) {
    const children = [];
    const descendants = Array.from(container.querySelectorAll("*"));
    // Build the lookup once; the ancestor check runs for every descendant.
    const descendantSet = new Set(descendants);
    for (const child of descendants) {
      if (this.hasExtractedAncestor(child, descendantSet)) {
        continue;
      }
      const extracted = this.extractElement(child, options, depth);
      if (extracted) {
        children.push(extracted);
      }
    }
    return children;
  }

  /**
   * Check whether any ancestor of `element` is a member of
   * `extractedElements`.
   *
   * Note that this is a pure membership test: it does not know whether the
   * ancestor actually produced an extraction result.
   *
   * @param {Element} element - Element whose ancestors are inspected.
   * @param {Set<Element>|Iterable<Element>|ArrayLike<Element>} extractedElements
   *   Candidate ancestors. Pass a `Set` to avoid rebuilding the lookup on
   *   every call.
   * @returns {boolean}
   */
  static hasExtractedAncestor(element, extractedElements) {
    const pool = extractedElements instanceof Set ? extractedElements : new Set(Array.from(extractedElements));
    let parent = element.parentElement;
    while (parent) {
      if (pool.has(parent)) {
        return true;
      }
      parent = parent.parentElement;
    }
    return false;
  }

  /**
   * Get relevant attributes for an element.
   *
   * Collects a fixed allow-list of attributes plus every other `data-*`
   * attribute. Values longer than the configured limits are cut and suffixed
   * with "...". Allow-listed attributes with an empty value are omitted.
   *
   * @param {Element} element - Element to read.
   * @param {object} options - Uses `attributeTruncateLength` (default 100)
   *   and `dataAttributeTruncateLength` (default 50).
   * @returns {Object<string, string>} Attribute name to (possibly truncated) value.
   */
  static getRelevantAttributes(element, options) {
    const relevant = [
      "id",
      "class",
      "name",
      "type",
      "value",
      "placeholder",
      "href",
      "src",
      "alt",
      "title",
      "action",
      "method",
      "aria-label",
      "aria-describedby",
      "aria-controls",
      "role",
      "disabled",
      "readonly",
      "required",
      "checked",
      "min",
      "max",
      "pattern",
      "step",
      "autocomplete",
      "data-testid",
      "data-test",
      "data-cy"
    ];
    const attributes = {};
    const attrTruncate = options.attributeTruncateLength ?? 100;
    const dataAttrTruncate = options.dataAttributeTruncateLength ?? 50;
    for (const attr of relevant) {
      const value = element.getAttribute(attr);
      if (value) {
        attributes[attr] = value.length > attrTruncate ? value.substring(0, attrTruncate) + "..." : value;
      }
    }
    for (const attr of element.attributes) {
      if (attr.name.startsWith("data-") && !relevant.includes(attr.name)) {
        attributes[attr.name] = attr.value.length > dataAttrTruncate ? attr.value.substring(0, dataAttrTruncate) + "..." : attr.value;
      }
    }
    return attributes;
  }

  /**
   * Get element context information: the readable parent chain plus CSS
   * selectors of the nearest enclosing form, section, main and nav (each only
   * when such an ancestor exists).
   *
   * @param {Element} element - Element to describe.
   * @returns {object} Context record.
   */
  static getElementContext(element) {
    const context = {
      parentChain: SelectorGenerator.getContextPath(element)
    };
    const form = element.closest("form");
    if (form) {
      context.nearestForm = SelectorGenerator.generateSelectors(form).css;
    }
    const section = element.closest('section, [role="region"]');
    if (section) {
      context.nearestSection = SelectorGenerator.generateSelectors(section).css;
    }
    const main = element.closest('main, [role="main"]');
    if (main) {
      context.nearestMain = SelectorGenerator.generateSelectors(main).css;
    }
    const nav = element.closest('nav, [role="navigation"]');
    if (nav) {
      context.nearestNav = SelectorGenerator.generateSelectors(nav).css;
    }
    return context;
  }

  /**
   * Get interaction information for an element (compact format).
   *
   * Only truthy facts are written, so absent keys mean "no". Handlers are
   * detected from `on*` properties/attributes and from the element kind;
   * listeners registered through `addEventListener` are not inspected.
   *
   * @param {Element} element - Element to describe.
   * @returns {object} Any of: click, change, submit, nav, disabled, hidden,
   *   role, form.
   */
  static getInteractionInfo(element) {
    const htmlElement = element;
    const interaction = {};
    const hasClickHandler = !!(htmlElement.onclick || element.getAttribute("onclick") || element.matches('button, a[href], [role="button"], [tabindex]:not([tabindex="-1"])'));
    if (hasClickHandler) interaction.click = true;
    const hasChangeHandler = !!(htmlElement.onchange || element.getAttribute("onchange") || element.matches("input, select, textarea"));
    if (hasChangeHandler) interaction.change = true;
    const hasSubmitHandler = !!(htmlElement.onsubmit || element.getAttribute("onsubmit") || element.matches("form"));
    if (hasSubmitHandler) interaction.submit = true;
    const triggersNavigation = element.matches('a[href], button[type="submit"]');
    if (triggersNavigation) interaction.nav = true;
    const isDisabled = htmlElement.hasAttribute("disabled") || htmlElement.getAttribute("aria-disabled") === "true";
    if (isDisabled) interaction.disabled = true;
    const isHidden = !this.isVisible(element);
    if (isHidden) interaction.hidden = true;
    const ariaRole = element.getAttribute("role");
    if (ariaRole) interaction.role = ariaRole;
    if (element.matches("input, textarea, select, button")) {
      const form = element.form || element.closest("form");
      if (form) {
        interaction.form = SelectorGenerator.generateSelectors(form).css;
      }
    }
    return interaction;
  }

  /**
   * Get text content of an element (limited length).
   *
   * Inputs and textareas yield their value (or placeholder), images their alt
   * text; neither is truncated. Everything else yields trimmed `textContent`,
   * cut to `options.textTruncateLength` with a "..." suffix when set.
   *
   * @param {Element} element - Element to read.
   * @param {object} [options] - Uses `textTruncateLength`.
   * @returns {string}
   */
  static getElementText(element, options) {
    if (element.matches("input, textarea")) {
      const input = element;
      return input.value || input.placeholder || "";
    }
    if (element.matches("img")) {
      return element.alt || "";
    }
    const text = element.textContent?.trim() || "";
    const maxLength = options?.textTruncateLength;
    if (maxLength && text.length > maxLength) {
      return text.substring(0, maxLength) + "...";
    }
    return text;
  }

  /**
   * Check if element is a semantic container (the element kinds whose
   * children are expanded in "full" mode).
   *
   * @param {Element} element - Element to test.
   * @returns {boolean}
   */
  static isSemanticContainer(element) {
    return element.matches(
      'article, section, nav, aside, main, header, footer, form, table, ul, ol, dl, figure, details, dialog, [role="region"], [role="navigation"], [role="main"], [role="complementary"]'
    );
  }

  /**
   * Get interactive elements below `container`.
   *
   * Matches {@link DOMTraversal.INTERACTIVE_SELECTORS} plus any
   * `options.customSelectors`. An element is extracted at most once even when
   * several selectors match it. A custom selector that throws is reported
   * with `console.warn` and skipped.
   *
   * @param {Document|Element} [container=document] - Search root.
   * @param {object} options - Extraction options.
   * @returns {object[]} Extracted descriptions.
   */
  static getInteractiveElements(container = document, options) {
    const elements = [];
    const seen = new Set();
    const collect = (nodes) => {
      for (const node of Array.from(nodes)) {
        if (seen.has(node)) continue;
        seen.add(node);
        const extracted = this.extractElement(node, options);
        if (extracted) {
          elements.push(extracted);
        }
      }
    };
    collect(container.querySelectorAll(this.INTERACTIVE_SELECTORS.join(", ")));
    if (options.customSelectors) {
      for (const customSelector of options.customSelectors) {
        try {
          collect(container.querySelectorAll(customSelector));
        } catch {
          console.warn(`Invalid custom selector: ${customSelector}`);
        }
      }
    }
    return elements;
  }

  /**
   * Get semantic elements (for full mode).
   *
   * @param {Document|Element} [container=document] - Search root.
   * @param {object} options - Extraction options.
   * @returns {object[]} Extracted descriptions.
   */
  static getSemanticElements(container = document, options) {
    const elements = [];
    const selector = this.SEMANTIC_SELECTORS.join(", ");
    const found = container.querySelectorAll(selector);
    for (const element of Array.from(found)) {
      const extracted = this.extractElement(element, options);
      if (extracted) {
        elements.push(extracted);
      }
    }
    return elements;
  }
}
/**
 * Render one extracted element as a Markdown bullet:
 * `- TAG: label → \`selector\` (action)`.
 *
 * The label is the element text, falling back to its ARIA label. Extracted
 * attributes are keyed by their DOM names, so the ARIA label lives under
 * "aria-label"; the camel-cased `ariaLabel` key is still honoured for callers
 * that supply it. The label is shortened with `truncate`.
 *
 * The action suffix reports the first of submit, click, change that is set on
 * `el.interaction`; it is omitted when none is.
 *
 * @param {object} el - Extracted element (tag, text, attributes, selector,
 *   interaction).
 * @param {{maxTextLength?: number}} [opts] - Label length limit (default 80).
 * @returns {string} One Markdown list line.
 */
function elementLine(el, opts) {
  const label = el.text || el.attributes?.["aria-label"] || el.attributes?.ariaLabel;
  const txt = truncate(label, opts?.maxTextLength ?? 80);
  const sel = bestSelector(el);
  const action = el.interaction?.submit ? "submit" : el.interaction?.click ? "click" : el.interaction?.change ? "change" : void 0;
  const actionText = action ? ` (${action})` : "";
  return `- ${el.tag.toUpperCase()}: ${txt || "(no text)"} → \`${sel}\`${actionText}`;
}
/**
 * Wrap a Markdown body in the `<page>` envelope used by the formatters.
 *
 * The body is placed inside a CDATA section of the `<type>` element. A CDATA
 * section ends at the first "]]>", so any occurrence inside the body is split
 * across two adjacent CDATA sections; an XML parser reads the original text
 * back unchanged.
 *
 * @param {string} body - Text to embed.
 * @param {{title?: string, url?: string}} [meta] - Optional page attributes;
 *   values are XML-escaped.
 * @param {string} [type="section"] - Name of the inner element.
 * @returns {string} The XML-wrapped document.
 */
function wrapXml(body, meta, type = "section") {
  const attrs = [
    meta?.title ? `title="${escapeXml(meta.title)}"` : null,
    meta?.url ? `url="${escapeXml(meta.url)}"` : null
  ].filter(Boolean).join(" ");
  const safeBody = String(body).replaceAll("]]>", "]]]]><![CDATA[>");
  return `<page ${attrs}>\n <${type}><![CDATA[\n${safeBody}\n]]></${type}>\n</page>`;
}

/**
 * Escape a string for use inside a double-quoted XML attribute value.
 *
 * "&" is replaced first so the entities produced by the later replacements
 * are not escaped a second time.
 *
 * @param {string} s - Raw text.
 * @returns {string} Text with &, <, > and " replaced by entities.
 */
function escapeXml(s) {
  return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
}
\"\"}`.trim());\n }\n lines.push(\"\");\n const regions = overview.regions;\n const entries = [\n [\"header\", regions.header],\n [\"navigation\", regions.navigation],\n [\"main\", regions.main],\n [\"sections\", regions.sections],\n [\"sidebar\", regions.sidebar],\n [\"footer\", regions.footer],\n [\"modals\", regions.modals]\n ];\n for (const [key, value] of entries) {\n if (!value) continue;\n const icon = iconForRegion(key);\n if (Array.isArray(value)) {\n if (!value.length) continue;\n lines.push(`## ${icon} ${capitalize(key)}`);\n for (const region of value) lines.push(renderRegionInfo(region));\n } else {\n lines.push(`## ${icon} ${capitalize(key)}`);\n lines.push(renderRegionInfo(value));\n }\n lines.push(\"\");\n }\n if (overview.suggestions?.length) {\n lines.push(\"## Suggestions\");\n for (const s of overview.suggestions) lines.push(`- ${s}`);\n lines.push(\"\");\n }\n lines.push(\n \"Next: choose a region (by selector or [sectionId]) and call dom_extract_region for actionable details.\"\n );\n const body = lines.join(\"\\n\");\n return wrapXml(body, meta, \"outline\");\n }\n static region(result, opts = {}, meta) {\n const lines = [];\n lines.push(`# Region Details`);\n if (meta?.title || meta?.url) {\n lines.push(`Title: ${meta?.title ?? \"\"}`.trim());\n lines.push(`URL: ${meta?.url ?? \"\"}`.trim());\n }\n lines.push(\"\");\n const inter = result.interactive;\n if (result.page) {\n const ps = [\n result.page.hasErrors ? \"errors: yes\" : \"errors: no\",\n result.page.isLoading ? \"loading: yes\" : \"loading: no\",\n result.page.hasModals ? \"modals: yes\" : \"modals: no\"\n ];\n lines.push(`Page state: ${ps.join(\", \")}`);\n }\n const summary = [];\n const count = (arr) => arr ? 
arr.length : 0;\n summary.push(`${count(inter.buttons)} buttons`);\n summary.push(`${count(inter.links)} links`);\n summary.push(`${count(inter.inputs)} inputs`);\n if (inter.forms?.length) summary.push(`${count(inter.forms)} forms`);\n lines.push(`Summary: ${summary.join(\", \")}`);\n lines.push(selectorQualitySummary(inter));\n lines.push(\"\");\n lines.push(renderInteractive(inter, opts));\n lines.push(\"\");\n lines.push(\n \"Next: write a script using the most stable selectors above. If selectors look unstable, rerun dom_extract_region with higher detail or call dom_extract_content for text context.\"\n );\n const body = lines.join(\"\\n\");\n return wrapXml(body, meta, \"section\");\n }\n static content(content, opts = {}, meta) {\n const lines = [];\n lines.push(`# Content`);\n lines.push(`Selector: \\`${content.selector}\\``);\n lines.push(\"\");\n if (content.text.headings?.length) {\n lines.push(\"Headings:\");\n for (const h of content.text.headings)\n lines.push(`- H${h.level}: ${truncate(h.text, opts.maxTextLength ?? 120)}`);\n lines.push(\"\");\n }\n if (content.text.paragraphs?.length) {\n const limit = typeof opts.maxElements === \"number\" ? opts.maxElements : content.text.paragraphs.length;\n lines.push(\"Paragraphs:\");\n for (const p of content.text.paragraphs.slice(0, limit))\n lines.push(`- ${truncate(p, opts.maxTextLength ?? 200)}`);\n lines.push(\"\");\n }\n if (content.text.lists?.length) {\n lines.push(\"Lists:\");\n for (const list of content.text.lists) {\n lines.push(`- ${list.type.toUpperCase()}:`);\n const limit = typeof opts.maxElements === \"number\" ? opts.maxElements : list.items.length;\n for (const item of list.items.slice(0, limit))\n lines.push(` - ${truncate(item, opts.maxTextLength ?? 120)}`);\n }\n lines.push(\"\");\n }\n if (content.tables?.length) {\n lines.push(\"Tables:\");\n for (const t of content.tables) {\n lines.push(`- Headers: ${t.headers.join(\" | \")}`);\n const limit = typeof opts.maxElements === \"number\" ? 
/**
 * Locate the SmartDOMReader constructor at runtime.
 *
 * Lookup order:
 *   1. `window.SmartDOMReader`
 *   2. `window.SmartDOMReaderNamespace.SmartDOMReader`
 *   3. `require("./index")`: its `SmartDOMReader` export, then its default
 *      export, when a `require` function exists.
 *
 * A candidate is only accepted when it is a function. Failures while
 * requiring the module are deliberately ignored so that the caller just sees
 * "not available".
 *
 * @returns {Function|undefined} The constructor, or `undefined` when none of
 *   the sources provides one.
 */
function resolveSmartDomReader() {
  const isCallable = (candidate) => typeof candidate === "function";

  if (typeof window !== "undefined") {
    const host = window;
    if (isCallable(host.SmartDOMReader)) {
      return host.SmartDOMReader;
    }
    const ns = host.SmartDOMReaderNamespace;
    if (ns && isCallable(ns.SmartDOMReader)) {
      return ns.SmartDOMReader;
    }
  }

  try {
    if (typeof require === "function") {
      const loaded = require("./index");
      if (loaded) {
        if (isCallable(loaded.SmartDOMReader)) {
          return loaded.SmartDOMReader;
        }
        if (isCallable(loaded.default)) {
          return loaded.default;
        }
      }
    }
  } catch {
    // Best effort: the module may not be resolvable in this environment.
  }

  return undefined;
}
(navs.length > 0) {\n regions.navigation = Array.from(navs).map((nav) => this.analyzeRegion(nav));\n }\n if (root instanceof Document) {\n const main = ContentDetection.findMainContent(root);\n if (main) {\n regions.main = this.analyzeRegion(main);\n const sections = main.querySelectorAll('section, article, [role=\"region\"]');\n if (sections.length > 0) {\n regions.sections = Array.from(sections).filter((section) => !section.closest(\"nav, header, footer\")).map((section) => this.analyzeRegion(section));\n }\n }\n } else {\n regions.main = this.analyzeRegion(root);\n const sections = root.querySelectorAll('section, article, [role=\"region\"]');\n if (sections.length > 0) {\n regions.sections = Array.from(sections).filter((section) => !section.closest(\"nav, header, footer\")).map((section) => this.analyzeRegion(section));\n }\n }\n const sidebars = root.querySelectorAll('aside, [role=\"complementary\"], .sidebar, #sidebar');\n if (sidebars.length > 0) {\n regions.sidebar = Array.from(sidebars).map((sidebar) => this.analyzeRegion(sidebar));\n }\n const footer = root.querySelector('footer, [role=\"contentinfo\"], .footer, #footer');\n if (footer) {\n regions.footer = this.analyzeRegion(footer);\n }\n const modals = root.querySelectorAll('[role=\"dialog\"], .modal, .popup, .overlay');\n const visibleModals = Array.from(modals).filter((modal) => DOMTraversal.isVisible(modal));\n if (visibleModals.length > 0) {\n regions.modals = visibleModals.map((modal) => this.analyzeRegion(modal));\n }\n const forms = this.extractFormOverview(root);\n const summary = this.calculateSummary(root, regions, forms);\n const suggestions = this.generateSuggestions(regions, summary);\n return { regions, forms, summary, suggestions };\n }\n /**\n * Step 2: Extract detailed information from a specific region\n */\n static extractRegion(selector, doc, options = {}, smartDomReaderCtor) {\n const element = doc.querySelector(selector);\n if (!element) return null;\n const SmartDOMReaderCtor = 
smartDomReaderCtor ?? resolveSmartDomReader();\n if (!SmartDOMReaderCtor) {\n throw new Error(\n \"SmartDOMReader is unavailable. Ensure the Smart DOM Reader module is loaded before calling extractRegion.\"\n );\n }\n const reader = new SmartDOMReaderCtor(options);\n return reader.extract(element, options);\n }\n /**\n * Step 3: Extract readable content from a region\n */\n static extractContent(selector, doc, options = {}) {\n const element = doc.querySelector(selector);\n if (!element) return null;\n const result = {\n selector,\n text: {},\n metadata: {\n wordCount: 0,\n hasInteractive: false\n }\n };\n if (options.includeHeadings !== false) {\n const headings = element.querySelectorAll(\"h1, h2, h3, h4, h5, h6\");\n result.text.headings = Array.from(headings).map((h) => ({\n level: parseInt(h.tagName[1]),\n text: this.getTextContent(h, options.maxTextLength)\n }));\n }\n const paragraphs = element.querySelectorAll(\"p\");\n if (paragraphs.length > 0) {\n result.text.paragraphs = Array.from(paragraphs).map((p) => this.getTextContent(p, options.maxTextLength)).filter((text) => text.length > 0);\n }\n if (options.includeLists !== false) {\n const lists = element.querySelectorAll(\"ul, ol\");\n result.text.lists = Array.from(lists).map((list) => ({\n type: list.tagName.toLowerCase(),\n items: Array.from(list.querySelectorAll(\"li\")).map(\n (li) => this.getTextContent(li, options.maxTextLength)\n )\n }));\n }\n if (options.includeTables !== false) {\n const tables = element.querySelectorAll(\"table\");\n result.tables = Array.from(tables).map((table) => {\n const headers = Array.from(table.querySelectorAll(\"th\")).map(\n (th) => this.getTextContent(th)\n );\n const rows = Array.from(table.querySelectorAll(\"tr\")).filter((tr) => tr.querySelector(\"td\")).map((tr) => Array.from(tr.querySelectorAll(\"td\")).map((td) => this.getTextContent(td)));\n return { headers, rows };\n });\n }\n if (options.includeMedia !== false) {\n const images = 
element.querySelectorAll(\"img\");\n const videos = element.querySelectorAll(\"video\");\n const audios = element.querySelectorAll(\"audio\");\n result.media = [\n ...Array.from(images).map((img) => ({\n type: \"img\",\n alt: img.getAttribute(\"alt\") || void 0,\n src: img.getAttribute(\"src\") || void 0\n })),\n ...Array.from(videos).map((video) => ({\n type: \"video\",\n src: video.getAttribute(\"src\") || void 0\n })),\n ...Array.from(audios).map((audio) => ({\n type: \"audio\",\n src: audio.getAttribute(\"src\") || void 0\n }))\n ];\n }\n const allText = element.textContent || \"\";\n result.metadata.wordCount = allText.trim().split(/\\s+/).length;\n result.metadata.hasInteractive = element.querySelectorAll(\"button, a, input, textarea, select\").length > 0;\n return result;\n }\n /**\n * Analyze a region and extract summary information\n */\n static analyzeRegion(element) {\n const selector = SelectorGenerator.generateSelectors(element).css;\n const buttons = element.querySelectorAll('button, [role=\"button\"]');\n const links = element.querySelectorAll(\"a[href]\");\n const inputs = element.querySelectorAll(\"input, textarea, select\");\n const forms = element.querySelectorAll(\"form\");\n const lists = element.querySelectorAll(\"ul, ol\");\n const tables = element.querySelectorAll(\"table\");\n const media = element.querySelectorAll(\"img, video, audio\");\n const interactiveCount = buttons.length + links.length + inputs.length;\n let label;\n const ariaLabel = element.getAttribute(\"aria-label\");\n if (ariaLabel) {\n label = ariaLabel;\n } else if (element.getAttribute(\"aria-labelledby\")) {\n const labelId = element.getAttribute(\"aria-labelledby\");\n if (labelId) {\n const labelElement = element.ownerDocument?.getElementById(labelId);\n if (labelElement) {\n label = labelElement.textContent?.trim();\n }\n }\n } else {\n const heading = element.querySelector(\"h1, h2, h3\");\n if (heading) {\n label = heading.textContent?.trim();\n }\n }\n const 
textContent = element.textContent?.trim() || \"\";\n const textPreview = textContent.length > 50 ? textContent.substring(0, 50) + \"...\" : textContent;\n return {\n selector,\n label,\n role: element.getAttribute(\"role\") || void 0,\n interactiveCount,\n hasForm: forms.length > 0,\n hasList: lists.length > 0,\n hasTable: tables.length > 0,\n hasMedia: media.length > 0,\n buttonCount: buttons.length > 0 ? buttons.length : void 0,\n linkCount: links.length > 0 ? links.length : void 0,\n inputCount: inputs.length > 0 ? inputs.length : void 0,\n textPreview: textPreview.length > 0 ? textPreview : void 0\n };\n }\n /**\n * Extract overview of forms on the page\n */\n static extractFormOverview(root) {\n const forms = root.querySelectorAll(\"form\");\n return Array.from(forms).map((form) => {\n const inputs = form.querySelectorAll(\"input, textarea, select\");\n const selector = SelectorGenerator.generateSelectors(form).css;\n let location2 = \"unknown\";\n if (form.closest('header, [role=\"banner\"]')) {\n location2 = \"header\";\n } else if (form.closest('nav, [role=\"navigation\"]')) {\n location2 = \"navigation\";\n } else if (form.closest('main, [role=\"main\"]')) {\n location2 = \"main\";\n } else if (form.closest('aside, [role=\"complementary\"]')) {\n location2 = \"sidebar\";\n } else if (form.closest('footer, [role=\"contentinfo\"]')) {\n location2 = \"footer\";\n }\n let purpose;\n const formId = form.getAttribute(\"id\")?.toLowerCase();\n const formClass = form.getAttribute(\"class\")?.toLowerCase();\n const formAction = form.getAttribute(\"action\")?.toLowerCase();\n const hasEmail = form.querySelector('input[type=\"email\"]');\n const hasPassword = form.querySelector('input[type=\"password\"]');\n const hasSearch = form.querySelector('input[type=\"search\"]');\n if (hasSearch || formId?.includes(\"search\") || formClass?.includes(\"search\")) {\n purpose = \"search\";\n } else if (hasPassword && hasEmail) {\n purpose = \"login\";\n } else if (hasPassword) 
{\n purpose = \"authentication\";\n } else if (formId?.includes(\"contact\") || formClass?.includes(\"contact\")) {\n purpose = \"contact\";\n } else if (formId?.includes(\"subscribe\") || formClass?.includes(\"subscribe\")) {\n purpose = \"subscription\";\n } else if (formAction?.includes(\"checkout\") || formClass?.includes(\"checkout\")) {\n purpose = \"checkout\";\n }\n return {\n selector,\n location: location2,\n inputCount: inputs.length,\n purpose\n };\n });\n }\n /**\n * Calculate summary statistics\n */\n static calculateSummary(root, regions, forms) {\n const allInteractive = root.querySelectorAll(\"button, a[href], input, textarea, select\");\n const allSections = root.querySelectorAll('section, article, [role=\"region\"]');\n const hasModals = (regions.modals?.length || 0) > 0;\n const errorSelectors = [\".error\", \".alert-danger\", '[role=\"alert\"]'];\n const hasErrors = errorSelectors.some((sel) => {\n const element = root.querySelector(sel);\n return element ? DOMTraversal.isVisible(element) : false;\n });\n const loadingSelectors = [\".loading\", \".spinner\", '[aria-busy=\"true\"]'];\n const isLoading = loadingSelectors.some((sel) => {\n const element = root.querySelector(sel);\n return element ? 
DOMTraversal.isVisible(element) : false;\n });\n const mainContentSelector = regions.main?.selector;\n return {\n totalInteractive: allInteractive.length,\n totalForms: forms.length,\n totalSections: allSections.length,\n hasModals,\n hasErrors,\n isLoading,\n mainContentSelector\n };\n }\n /**\n * Generate AI-friendly suggestions\n */\n static generateSuggestions(regions, summary) {\n const suggestions = [];\n if (summary.hasErrors) {\n suggestions.push(\"Page has error indicators - check error messages before interacting\");\n }\n if (summary.isLoading) {\n suggestions.push(\"Page appears to be loading - wait or check loading state\");\n }\n if (summary.hasModals) {\n suggestions.push(\"Modal/dialog is open - may need to interact with or close it first\");\n }\n if (regions.main && regions.main.interactiveCount > 10) {\n suggestions.push(\n `Main content has ${regions.main.interactiveCount} interactive elements - consider filtering`\n );\n }\n if (summary.totalForms > 0) {\n suggestions.push(`Found ${summary.totalForms} form(s) on the page`);\n }\n if (!regions.main) {\n suggestions.push(\"No clear main content area detected - may need to explore regions\");\n }\n return suggestions;\n }\n /**\n * Get text content with optional truncation\n */\n static getTextContent(element, maxLength) {\n const text = element.textContent?.trim() || \"\";\n if (maxLength && text.length > maxLength) {\n return text.substring(0, maxLength) + \"...\";\n }\n return text;\n }\n }\n class SmartDOMReader {\n options;\n constructor(options = {}) {\n this.options = {\n mode: options.mode || \"interactive\",\n maxDepth: options.maxDepth || 5,\n includeHidden: options.includeHidden || false,\n includeShadowDOM: options.includeShadowDOM || true,\n includeIframes: options.includeIframes || false,\n viewportOnly: options.viewportOnly || false,\n mainContentOnly: options.mainContentOnly || false,\n customSelectors: options.customSelectors || [],\n attributeTruncateLength: 
options.attributeTruncateLength,\n dataAttributeTruncateLength: options.dataAttributeTruncateLength,\n textTruncateLength: options.textTruncateLength,\n filter: options.filter\n };\n }\n /**\n * Main extraction method - extracts all data in one pass\n * @param rootElement The document or element to extract from\n * @param runtimeOptions Options to override constructor options\n */\n extract(rootElement = document, runtimeOptions) {\n const startTime = Date.now();\n const doc = rootElement instanceof Document ? rootElement : rootElement.ownerDocument;\n const options = { ...this.options, ...runtimeOptions };\n let container = rootElement instanceof Document ? doc : rootElement;\n if (options.mainContentOnly && rootElement instanceof Document) {\n container = ContentDetection.findMainContent(doc);\n }\n const pageState = this.extractPageState(doc);\n const landmarks = this.extractLandmarks(doc);\n const interactive = this.extractInteractiveElements(container, options);\n const result = {\n mode: options.mode,\n timestamp: startTime,\n page: pageState,\n landmarks,\n interactive\n };\n if (options.mode === \"full\") {\n result.semantic = this.extractSemanticElements(container, options);\n result.metadata = this.extractMetadata(doc, container, options);\n }\n return result;\n }\n /**\n * Extract page state information\n */\n extractPageState(doc) {\n return {\n url: doc.location?.href || \"\",\n title: doc.title || \"\",\n hasErrors: this.detectErrors(doc),\n isLoading: this.detectLoading(doc),\n hasModals: this.detectModals(doc),\n hasFocus: this.getFocusedElement(doc)\n };\n }\n /**\n * Extract page landmarks\n */\n extractLandmarks(doc) {\n const detected = ContentDetection.detectLandmarks(doc);\n return {\n navigation: this.elementsToSelectors(detected.navigation || []),\n main: this.elementsToSelectors(detected.main || []),\n forms: this.elementsToSelectors(detected.form || []),\n headers: this.elementsToSelectors(detected.banner || []),\n footers: 
this.elementsToSelectors(detected.contentinfo || []),\n articles: this.elementsToSelectors(detected.region || []),\n sections: this.elementsToSelectors(detected.region || [])\n };\n }\n /**\n * Convert elements to selector strings\n */\n elementsToSelectors(elements) {\n return elements.map((el) => SelectorGenerator.generateSelectors(el).css);\n }\n /**\n * Extract interactive elements\n */\n extractInteractiveElements(container, options) {\n const buttons = [];\n const links = [];\n const inputs = [];\n const clickable = [];\n const buttonElements = container.querySelectorAll(\n 'button, [role=\"button\"], input[type=\"button\"], input[type=\"submit\"]'\n );\n buttonElements.forEach((el) => {\n if (this.shouldIncludeElement(el, options)) {\n const extracted = DOMTraversal.extractElement(el, options);\n if (extracted) buttons.push(extracted);\n }\n });\n const linkElements = container.querySelectorAll(\"a[href]\");\n linkElements.forEach((el) => {\n if (this.shouldIncludeElement(el, options)) {\n const extracted = DOMTraversal.extractElement(el, options);\n if (extracted) links.push(extracted);\n }\n });\n const inputElements = container.querySelectorAll(\n 'input:not([type=\"button\"]):not([type=\"submit\"]), textarea, select'\n );\n inputElements.forEach((el) => {\n if (this.shouldIncludeElement(el, options)) {\n const extracted = DOMTraversal.extractElement(el, options);\n if (extracted) inputs.push(extracted);\n }\n });\n if (options.customSelectors) {\n options.customSelectors.forEach((selector) => {\n const elements = container.querySelectorAll(selector);\n elements.forEach((el) => {\n if (this.shouldIncludeElement(el, options)) {\n const extracted = DOMTraversal.extractElement(el, options);\n if (extracted) clickable.push(extracted);\n }\n });\n });\n }\n const forms = this.extractForms(container, options);\n return {\n buttons,\n links,\n inputs,\n forms,\n clickable\n };\n }\n /**\n * Extract form information\n */\n extractForms(container, options) {\n 
const forms = [];\n const formElements = container.querySelectorAll(\"form\");\n formElements.forEach((form) => {\n if (!this.shouldIncludeElement(form, options)) return;\n const formInputs = [];\n const formButtons = [];\n const inputs = form.querySelectorAll(\n 'input:not([type=\"button\"]):not([type=\"submit\"]), textarea, select'\n );\n inputs.forEach((input) => {\n const extracted = DOMTraversal.extractElement(input, options);\n if (extracted) formInputs.push(extracted);\n });\n const buttons = form.querySelectorAll('button, input[type=\"button\"], input[type=\"submit\"]');\n buttons.forEach((button) => {\n const extracted = DOMTraversal.extractElement(button, options);\n if (extracted) formButtons.push(extracted);\n });\n forms.push({\n selector: SelectorGenerator.generateSelectors(form).css,\n action: form.getAttribute(\"action\") || void 0,\n method: form.getAttribute(\"method\") || void 0,\n inputs: formInputs,\n buttons: formButtons\n });\n });\n return forms;\n }\n /**\n * Extract semantic elements (full mode only)\n */\n extractSemanticElements(container, options) {\n const headings = [];\n const images = [];\n const tables = [];\n const lists = [];\n const articles = [];\n container.querySelectorAll(\"h1, h2, h3, h4, h5, h6\").forEach((el) => {\n if (this.shouldIncludeElement(el, options)) {\n const extracted = DOMTraversal.extractElement(el, options);\n if (extracted) headings.push(extracted);\n }\n });\n container.querySelectorAll(\"img\").forEach((el) => {\n if (this.shouldIncludeElement(el, options)) {\n const extracted = DOMTraversal.extractElement(el, options);\n if (extracted) images.push(extracted);\n }\n });\n container.querySelectorAll(\"table\").forEach((el) => {\n if (this.shouldIncludeElement(el, options)) {\n const extracted = DOMTraversal.extractElement(el, options);\n if (extracted) tables.push(extracted);\n }\n });\n container.querySelectorAll(\"ul, ol\").forEach((el) => {\n if (this.shouldIncludeElement(el, options)) {\n const 
extracted = DOMTraversal.extractElement(el, options);\n if (extracted) lists.push(extracted);\n }\n });\n container.querySelectorAll('article, [role=\"article\"]').forEach((el) => {\n if (this.shouldIncludeElement(el, options)) {\n const extracted = DOMTraversal.extractElement(el, options);\n if (extracted) articles.push(extracted);\n }\n });\n return {\n headings,\n images,\n tables,\n lists,\n articles\n };\n }\n /**\n * Extract metadata\n */\n extractMetadata(doc, container, options) {\n const allElements = container.querySelectorAll(\"*\");\n const extractedElements = container.querySelectorAll(\n \"button, a, input, textarea, select, h1, h2, h3, h4, h5, h6, img, table, ul, ol, article\"\n ).length;\n return {\n totalElements: allElements.length,\n extractedElements,\n mainContent: options.mainContentOnly && container instanceof Element ? SelectorGenerator.generateSelectors(container).css : void 0,\n language: doc.documentElement.getAttribute(\"lang\") || void 0\n };\n }\n /**\n * Check if element should be included based on options\n */\n shouldIncludeElement(element, options) {\n if (!options.includeHidden && !DOMTraversal.isVisible(element)) {\n return false;\n }\n if (options.viewportOnly && !DOMTraversal.isInViewport(element)) {\n return false;\n }\n if (options.filter && !DOMTraversal.passesFilter(element, options.filter)) {\n return false;\n }\n return true;\n }\n /**\n * Detect errors on the page\n */\n detectErrors(doc) {\n const errorSelectors = [\".error\", \".alert-danger\", '[role=\"alert\"]', \".error-message\"];\n return errorSelectors.some((sel) => {\n const element = doc.querySelector(sel);\n return element ? DOMTraversal.isVisible(element) : false;\n });\n }\n /**\n * Detect if page is loading\n */\n detectLoading(doc) {\n const loadingSelectors = [\".loading\", \".spinner\", '[aria-busy=\"true\"]', \".loader\"];\n return loadingSelectors.some((sel) => {\n const element = doc.querySelector(sel);\n return element ? 
DOMTraversal.isVisible(element) : false;\n });\n }\n /**\n * Detect modal dialogs\n */\n detectModals(doc) {\n const modalSelectors = ['[role=\"dialog\"]', \".modal\", \".popup\", \".overlay\"];\n return modalSelectors.some((sel) => {\n const element = doc.querySelector(sel);\n return element ? DOMTraversal.isVisible(element) : false;\n });\n }\n /**\n * Get currently focused element\n */\n getFocusedElement(doc) {\n const focused = doc.activeElement;\n if (focused && focused !== doc.body) {\n return SelectorGenerator.generateSelectors(focused).css;\n }\n return void 0;\n }\n // ===== Static convenience methods =====\n /**\n * Quick extraction for interactive elements only\n * @param doc The document to extract from\n * @param options Extraction options\n */\n static extractInteractive(doc, options = {}) {\n const reader = new SmartDOMReader({\n ...options,\n mode: \"interactive\"\n });\n return reader.extract(doc);\n }\n /**\n * Quick extraction for full content\n * @param doc The document to extract from\n * @param options Extraction options\n */\n static extractFull(doc, options = {}) {\n const reader = new SmartDOMReader({\n ...options,\n mode: \"full\"\n });\n return reader.extract(doc);\n }\n /**\n * Extract from a specific element\n * @param element The element to extract from\n * @param mode The extraction mode\n * @param options Additional options\n */\n static extractFromElement(element, mode = \"interactive\", options = {}) {\n const reader = new SmartDOMReader({\n ...options,\n mode\n });\n return reader.extract(element);\n }\n }\n function executeExtraction(method, args) {\n try {\n let result;\n switch (method) {\n case \"extractStructure\": {\n const structureArgs = args;\n const { selector, frameSelector, formatOptions } = structureArgs;\n let doc = document;\n if (frameSelector) {\n const iframe = document.querySelector(frameSelector);\n if (!iframe || !(iframe instanceof HTMLIFrameElement) || !iframe.contentDocument) {\n return { error: `Cannot 
access iframe: ${frameSelector}` };\n }\n doc = iframe.contentDocument;\n }\n const target = selector ? doc.querySelector(selector) ?? doc : doc;\n const overview = ProgressiveExtractor.extractStructure(target);\n const meta = { title: document.title, url: location.href };\n result = MarkdownFormatter.structure(\n overview,\n formatOptions ?? { detail: \"summary\" },\n meta\n );\n break;\n }\n case \"extractRegion\": {\n const regionArgs = args;\n const { selector, mode, frameSelector, options, formatOptions } = regionArgs;\n let doc = document;\n if (frameSelector) {\n const iframe = document.querySelector(frameSelector);\n if (!iframe || !(iframe instanceof HTMLIFrameElement) || !iframe.contentDocument) {\n return { error: `Cannot access iframe: ${frameSelector}` };\n }\n doc = iframe.contentDocument;\n }\n const extractOptions = {\n ...options || {},\n mode: mode || \"interactive\"\n };\n const extractResult = ProgressiveExtractor.extractRegion(\n selector,\n doc,\n extractOptions,\n SmartDOMReader\n );\n if (!extractResult) {\n return { error: `No element found matching selector: ${selector}` };\n }\n const meta = { title: document.title, url: location.href };\n result = MarkdownFormatter.region(\n extractResult,\n formatOptions ?? 
{ detail: \"region\" },\n meta\n );\n break;\n }\n case \"extractContent\": {\n const contentArgs = args;\n const { selector, frameSelector, options, formatOptions } = contentArgs;\n let doc = document;\n if (frameSelector) {\n const iframe = document.querySelector(frameSelector);\n if (!iframe || !(iframe instanceof HTMLIFrameElement) || !iframe.contentDocument) {\n return { error: `Cannot access iframe: ${frameSelector}` };\n }\n doc = iframe.contentDocument;\n }\n const extractOptions = options || {};\n const extractResult = ProgressiveExtractor.extractContent(selector, doc, extractOptions);\n if (!extractResult) {\n return { error: `No element found matching selector: ${selector}` };\n }\n const meta = { title: document.title, url: location.href };\n result = MarkdownFormatter.content(\n extractResult,\n formatOptions ?? { detail: \"region\" },\n meta\n );\n break;\n }\n case \"extractInteractive\": {\n const interactiveArgs = args;\n const { selector, frameSelector, options, formatOptions } = interactiveArgs;\n let doc = document;\n if (frameSelector) {\n const iframe = document.querySelector(frameSelector);\n if (!iframe || !(iframe instanceof HTMLIFrameElement) || !iframe.contentDocument) {\n return { error: `Cannot access iframe: ${frameSelector}` };\n }\n doc = iframe.contentDocument;\n }\n const extractResult = selector ? SmartDOMReader.extractFromElement(\n doc.querySelector(selector),\n \"interactive\",\n options || {}\n ) : SmartDOMReader.extractInteractive(doc, options || {});\n const meta = { title: document.title, url: location.href };\n result = MarkdownFormatter.region(\n extractResult,\n formatOptions ?? 
{ detail: \"region\" },\n meta\n );\n break;\n }\n case \"extractFull\": {\n const fullArgs = args;\n const { selector, frameSelector, options, formatOptions } = fullArgs;\n let doc = document;\n if (frameSelector) {\n const iframe = document.querySelector(frameSelector);\n if (!iframe || !(iframe instanceof HTMLIFrameElement) || !iframe.contentDocument) {\n return { error: `Cannot access iframe: ${frameSelector}` };\n }\n doc = iframe.contentDocument;\n }\n const extractResult = selector ? SmartDOMReader.extractFromElement(doc.querySelector(selector), \"full\", options || {}) : SmartDOMReader.extractFull(doc, options || {});\n const meta = { title: document.title, url: location.href };\n result = MarkdownFormatter.region(extractResult, formatOptions ?? { detail: \"deep\" }, meta);\n break;\n }\n default:\n return { error: `Unknown method: ${method}` };\n }\n return result;\n } catch (error) {\n return {\n error: error instanceof Error ? error.message : String(error)\n };\n }\n }\n const SmartDOMReaderBundle2 = { executeExtraction };\n exports.SmartDOMReaderBundle = SmartDOMReaderBundle2;\n exports.executeExtraction = executeExtraction;\n Object.defineProperty(exports, Symbol.toStringTag, { value: \"Module\" });\n return exports;\n})({});\n";
|
|
9
|
+
/**
 * Version string declared next to the bundle and exported together with it.
 * The declared literal type is exactly "1.0.0".
 *
 * NOTE(review): the package listing this file belongs to is headed 1.0.1 —
 * confirm whether this constant is meant to track the package version.
 */
declare const SMART_DOM_READER_VERSION = "1.0.0";
|
|
10
|
+
|
|
11
|
+
/**
 * Named exports of this declaration file: the bundle constant (declared
 * outside this excerpt) and the version constant.
 */
export { SMART_DOM_READER_BUNDLE, SMART_DOM_READER_VERSION };
|