@mcp-b/smart-dom-reader 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +441 -0
- package/dist/bundle-string.d.ts +11 -0
- package/dist/bundle-string.js +8 -0
- package/dist/bundle-string.js.map +1 -0
- package/dist/index.d.ts +467 -0
- package/dist/index.js +1823 -0
- package/dist/index.js.map +1 -0
- package/package.json +82 -0
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,467 @@
|
|
|
1
|
+
/**
 * Names of the extraction modes. Used as the type of `ExtractionOptions.mode`
 * and echoed back on `SmartDOMResult.mode`.
 */
type ExtractionMode =
  | 'interactive'
  | 'full'
  | 'structure'
  | 'content';
|
|
2
|
+
/**
 * Selector strategies describing how to locate one element.
 *
 * `css` and `xpath` are always present; every other member is an optional
 * alternate.
 */
interface ElementSelector {
  css: string;
  xpath: string;
  /** NOTE(review): presumably a text-matching selector — confirm against the implementation. */
  textBased?: string;
  dataTestId?: string;
  ariaLabel?: string;
  /** Scored alternatives, each described by `ElementSelectorCandidate`. */
  candidates?: ElementSelectorCandidate[];
}
|
|
10
|
+
/**
 * One scored selector alternative for an element.
 *
 * NOTE(review): the meaning of `score` (its range, and whether higher is
 * better) is not visible in this declaration file — confirm against the
 * implementation.
 */
interface ElementSelectorCandidate {
  /** Kind of selector held in `value`. */
  type:
    | 'id'
    | 'data-testid'
    | 'role-aria'
    | 'name'
    | 'class-path'
    | 'css-path'
    | 'xpath'
    | 'text';
  value: string;
  score: number;
}
|
|
15
|
+
/**
 * Ancestor context recorded for an extracted element.
 *
 * NOTE(review): the `nearest*` strings are presumably selectors for the
 * closest enclosing form / section / main / nav, and `parentChain` a
 * description of the ancestor path. This file only fixes their types, so
 * confirm against the implementation.
 */
interface ElementContext {
  nearestForm?: string;
  nearestSection?: string;
  nearestMain?: string;
  nearestNav?: string;
  /** Always present (may be empty as far as the type is concerned). */
  parentChain: string[];
}
|
|
22
|
+
/**
 * Interaction flags and related attributes of an extracted element.
 *
 * Every member is optional. The keys of this interface are also the values
 * accepted by `FilterOptions.interactionTypes`.
 */
interface ElementInteraction {
  click?: boolean;
  change?: boolean;
  submit?: boolean;
  nav?: boolean;
  disabled?: boolean;
  hidden?: boolean;
  role?: string;
  /** NOTE(review): presumably identifies the owning form — confirm. */
  form?: string;
}
|
|
32
|
+
/**
 * Description of one extracted DOM element: tag name, text, selector
 * strategies, attributes, ancestor context and interaction info.
 */
interface ExtractedElement {
  tag: string;
  text: string;
  selector: ElementSelector;
  attributes: Record<string, string>;
  context: ElementContext;
  interaction: ElementInteraction;
  /** Optional nested elements, using the same shape recursively. */
  children?: ExtractedElement[];
}
|
|
41
|
+
/**
 * Summary of one form: its selector, optional `action` / `method`, and the
 * extracted inputs and buttons associated with it.
 */
interface FormInfo {
  selector: string;
  action?: string;
  method?: string;
  inputs: ExtractedElement[];
  buttons: ExtractedElement[];
}
|
|
48
|
+
/**
 * String lists grouped by landmark kind. All seven groups are required.
 *
 * NOTE(review): entries are presumably selector strings (the reader class has
 * a private step documented as "Convert elements to selector strings") —
 * confirm against the implementation.
 */
interface PageLandmarks {
  navigation: string[];
  main: string[];
  forms: string[];
  headers: string[];
  footers: string[];
  articles: string[];
  sections: string[];
}
|
|
57
|
+
/**
 * Page-level state captured alongside an extraction result.
 */
interface PageState {
  url: string;
  title: string;
  hasErrors: boolean;
  isLoading: boolean;
  hasModals: boolean;
  /** NOTE(review): presumably a selector for the currently focused element — confirm. */
  hasFocus?: string;
}
|
|
65
|
+
/**
 * Result shape returned by `SmartDOMReader.extract` and its static helpers,
 * and by `ProgressiveExtractor.extractRegion` (which may also return `null`).
 *
 * `interactive` is always present; `semantic` and `metadata` are optional.
 */
interface SmartDOMResult {
  mode: ExtractionMode;
  /** NOTE(review): unit (e.g. epoch milliseconds) is not visible here — confirm. */
  timestamp: number;
  page: PageState;
  landmarks: PageLandmarks;
  interactive: {
    buttons: ExtractedElement[];
    links: ExtractedElement[];
    inputs: ExtractedElement[];
    forms: FormInfo[];
    clickable: ExtractedElement[];
  };
  /** The reader's semantic extraction step is documented as "full mode only". */
  semantic?: {
    headings: ExtractedElement[];
    images: ExtractedElement[];
    tables: ExtractedElement[];
    lists: ExtractedElement[];
    articles: ExtractedElement[];
  };
  metadata?: {
    totalElements: number;
    extractedElements: number;
    mainContent?: string;
    language?: string;
  };
}
|
|
91
|
+
/**
 * Optional criteria for narrowing which elements are extracted. Supplied via
 * `ExtractionOptions.filter`.
 *
 * NOTE(review): how multiple criteria combine (all-of vs any-of) is not
 * visible in this declaration file — confirm against the implementation.
 */
interface FilterOptions {
  includeSelectors?: string[];
  excludeSelectors?: string[];
  textContains?: string[];
  textMatches?: RegExp[];
  hasAttributes?: string[];
  /** Attribute name mapped to an expected string or a pattern. */
  attributeValues?: Record<string, string | RegExp>;
  tags?: string[];
  /** Any key of `ElementInteraction`. */
  interactionTypes?: Array<keyof ElementInteraction>;
  withinSelectors?: string[];
  nearText?: string;
}
|
|
103
|
+
/**
 * Options controlling an extraction. `mode` is the only required member; the
 * public APIs in this file accept `Partial<ExtractionOptions>`.
 *
 * NOTE(review): default values for the optional members are not visible in
 * this declaration file.
 */
interface ExtractionOptions {
  mode: ExtractionMode;
  maxDepth?: number;
  includeHidden?: boolean;
  includeShadowDOM?: boolean;
  includeIframes?: boolean;
  viewportOnly?: boolean;
  mainContentOnly?: boolean;
  customSelectors?: string[];
  attributeTruncateLength?: number;
  dataAttributeTruncateLength?: number;
  textTruncateLength?: number;
  filter?: FilterOptions;
}
|
|
117
|
+
/**
 * Summary of one page region as used in `StructuralOverview.regions`.
 * Only `selector` and `interactiveCount` are required.
 */
interface RegionInfo {
  selector: string;
  label?: string;
  role?: string;
  interactiveCount: number;
  hasForm?: boolean;
  hasList?: boolean;
  hasTable?: boolean;
  hasMedia?: boolean;
  buttonCount?: number;
  linkCount?: number;
  inputCount?: number;
  textPreview?: string;
}
|
|
131
|
+
/**
 * High-level page overview returned by `ProgressiveExtractor.extractStructure`
 * and consumed by `MarkdownFormatter.structure`.
 */
interface StructuralOverview {
  /** Every region slot is optional; some hold a single region, others a list. */
  regions: {
    header?: RegionInfo;
    navigation?: RegionInfo[];
    main?: RegionInfo;
    sidebar?: RegionInfo[];
    footer?: RegionInfo;
    modals?: RegionInfo[];
    sections?: RegionInfo[];
  };
  forms: Array<{
    selector: string;
    location: string;
    inputCount: number;
    purpose?: string;
  }>;
  summary: {
    totalInteractive: number;
    totalForms: number;
    totalSections: number;
    hasModals: boolean;
    hasErrors: boolean;
    isLoading: boolean;
    mainContentSelector?: string;
  };
  suggestions?: string[];
}
|
|
158
|
+
/**
 * Options accepted by `ProgressiveExtractor.extractContent` (and by the
 * `extractContent` bundle method through `ExtractContentArgs.options`).
 * All members are optional; defaults are not visible in this declaration file.
 */
interface ContentExtractionOptions {
  includeHeadings?: boolean;
  includeLists?: boolean;
  includeTables?: boolean;
  includeMedia?: boolean;
  preserveFormatting?: boolean;
  maxTextLength?: number;
}
|
|
166
|
+
/**
 * Readable content of one region, as returned by
 * `ProgressiveExtractor.extractContent` and consumed by
 * `MarkdownFormatter.content`.
 */
interface ExtractedContent {
  selector: string;
  /** Required container; each kind of text inside it is optional. */
  text: {
    headings?: Array<{
      level: number;
      text: string;
    }>;
    paragraphs?: string[];
    lists?: Array<{
      type: 'ul' | 'ol';
      items: string[];
    }>;
  };
  tables?: Array<{
    headers: string[];
    rows: string[][];
  }>;
  media?: Array<{
    type: 'img' | 'video' | 'audio';
    alt?: string;
    src?: string;
  }>;
  metadata: {
    wordCount: number;
    hasInteractive: boolean;
  };
}
|
|
193
|
+
|
|
194
|
+
/** Detail levels selectable through `MarkdownFormatOptions.detail`. */
type MarkdownDetailLevel =
  | 'summary'
  | 'region'
  | 'deep';
|
|
195
|
+
/**
 * Optional settings for the `MarkdownFormatter` methods; also carried by the
 * bundle API as `BaseExtractionArgs.formatOptions`.
 */
interface MarkdownFormatOptions {
  detail?: MarkdownDetailLevel;
  maxTextLength?: number;
  maxElements?: number;
}
|
|
200
|
+
/** Optional page title and URL passed to the `MarkdownFormatter` methods. */
type PageMeta = {
  title?: string;
  url?: string;
};
|
|
204
|
+
/**
 * Static formatters that turn extraction results into a string.
 *
 * NOTE(review): the class name indicates Markdown output; the exact layout is
 * not visible in this declaration file.
 */
declare class MarkdownFormatter {
  /** Format a `StructuralOverview`. */
  static structure(overview: StructuralOverview, _opts?: MarkdownFormatOptions, meta?: PageMeta): string;
  /** Format a `SmartDOMResult`. */
  static region(result: SmartDOMResult, opts?: MarkdownFormatOptions, meta?: PageMeta): string;
  /** Format an `ExtractedContent`. */
  static content(content: ExtractedContent, opts?: MarkdownFormatOptions, meta?: PageMeta): string;
}
|
|
209
|
+
|
|
210
|
+
/**
 * Type-safe interface for the stateless bundle extraction API
 */
|
|
213
|
+
|
|
214
|
+
/**
 * Method names accepted by `SmartDOMReaderBundle.executeExtraction`; these are
 * also the keys of the `ExtractionArgs` map.
 */
type ExtractionMethod =
  | 'extractStructure'
  | 'extractRegion'
  | 'extractContent'
  | 'extractInteractive'
  | 'extractFull';
|
|
215
|
+
/**
 * Members shared by every bundle-method argument object.
 */
interface BaseExtractionArgs {
  /** NOTE(review): presumably selects an iframe to extract from — confirm. */
  frameSelector?: string;
  formatOptions?: MarkdownFormatOptions;
}
|
|
219
|
+
/** Arguments for the `extractStructure` bundle method; `selector` is optional. */
interface ExtractStructureArgs extends BaseExtractionArgs {
  selector?: string;
}
|
|
222
|
+
/**
 * Arguments for the `extractRegion` bundle method. `selector` is required and
 * `mode` is limited to `'interactive'` or `'full'`.
 */
interface ExtractRegionArgs extends BaseExtractionArgs {
  selector: string;
  mode?: 'interactive' | 'full';
  options?: Partial<ExtractionOptions>;
}
|
|
227
|
+
/** Arguments for the `extractContent` bundle method; `selector` is required. */
interface ExtractContentArgs extends BaseExtractionArgs {
  selector: string;
  options?: ContentExtractionOptions;
}
|
|
231
|
+
/** Arguments for the `extractInteractive` bundle method; both members are optional. */
interface ExtractInteractiveArgs extends BaseExtractionArgs {
  selector?: string;
  options?: Partial<ExtractionOptions>;
}
|
|
235
|
+
/** Arguments for the `extractFull` bundle method; both members are optional. */
interface ExtractFullArgs extends BaseExtractionArgs {
  selector?: string;
  options?: Partial<ExtractionOptions>;
}
|
|
239
|
+
/**
 * Maps each `ExtractionMethod` name to its argument type. Indexed as
 * `ExtractionArgs[M]` by `SmartDOMReaderBundle.executeExtraction`.
 */
type ExtractionArgs = {
  extractStructure: ExtractStructureArgs;
  extractRegion: ExtractRegionArgs;
  extractContent: ExtractContentArgs;
  extractInteractive: ExtractInteractiveArgs;
  extractFull: ExtractFullArgs;
};
|
|
246
|
+
/** Failure variant of `ExtractionResult`: an object carrying an `error` message string. */
interface ExtractionError {
  error: string;
}
|
|
249
|
+
/**
 * Return type of `SmartDOMReaderBundle.executeExtraction`: either a string or
 * an `ExtractionError` object. Narrow with `typeof result === 'string'`.
 */
type ExtractionResult =
  | string
  | ExtractionError;
|
|
250
|
+
/**
 * Entry point of the bundle API: a single generic method whose `args` type is
 * selected by `method` through the `ExtractionArgs` map.
 */
interface SmartDOMReaderBundle {
  executeExtraction<M extends ExtractionMethod>(method: M, args: ExtractionArgs[M]): ExtractionResult;
}
|
|
253
|
+
declare global {
  /**
   * Augments the global `Window` type with the bundle entry point.
   *
   * NOTE(review): typed as always present; presumably the property only exists
   * once the bundle script has run in the page — confirm before relying on it
   * without a runtime check.
   */
  interface Window {
    SmartDOMReaderBundle: SmartDOMReaderBundle;
  }
}
|
|
258
|
+
|
|
259
|
+
/**
 * Static helpers for locating a page's main content and landmarks.
 */
declare class ContentDetection {
  /**
   * Find the main content area of a page
   * Inspired by dom-to-semantic-markdown's approach
   * @param doc The document to search
   * @returns The element chosen as main content (never `null` per this signature)
   */
  static findMainContent(doc: Document): Element;
  /**
   * Detect main content using scoring algorithm
   */
  private static detectMainContent;
  /**
   * Collect content candidates
   */
  private static collectCandidates;
  /**
   * Calculate content score for an element
   * @param element The element to score
   * @returns A numeric score. NOTE(review): scale and direction are not visible here.
   */
  static calculateContentScore(element: Element): number;
  /**
   * Calculate link density in an element
   */
  private static calculateLinkDensity;
  /**
   * Check if an element is likely navigation
   */
  static isNavigation(element: Element): boolean;
  /**
   * Check if element is likely supplementary content
   */
  static isSupplementary(element: Element): boolean;
  /**
   * Detect page landmarks
   * @param doc The document to inspect
   * @returns Elements grouped under string keys. NOTE(review): the key set is
   *   not fixed by this signature — confirm against the implementation.
   */
  static detectLandmarks(doc: Document): Record<string, Element[]>;
}
|
|
294
|
+
|
|
295
|
+
/**
 * Constructor shape of `SmartDOMReader`, accepted as the optional last
 * parameter of `ProgressiveExtractor.extractRegion`.
 */
type SmartDomReaderCtor = {
  new (options?: Partial<ExtractionOptions>): SmartDOMReader;
};
|
|
296
|
+
/**
 * Static, step-wise extraction API: structure overview first, then details or
 * readable content for a chosen region.
 */
declare class ProgressiveExtractor {
  /**
   * Step 1: Extract high-level structural overview
   * This provides a "map" of the page for the AI to understand structure
   * @param root The document or element to analyze
   */
  static extractStructure(root: Document | Element): StructuralOverview;
  /**
   * Step 2: Extract detailed information from a specific region
   * @param selector Selector identifying the region
   * @param doc The document to query
   * @param options Extraction options
   * @param smartDomReaderCtor Optional reader constructor to use
   * @returns The extraction result, or `null`. NOTE(review): presumably `null`
   *   when the selector matches nothing — confirm against the implementation.
   */
  static extractRegion(selector: string, doc: Document, options?: Partial<ExtractionOptions>, smartDomReaderCtor?: SmartDomReaderCtor): SmartDOMResult | null;
  /**
   * Step 3: Extract readable content from a region
   * @param selector Selector identifying the region
   * @param doc The document to query
   * @param options Content extraction options
   * @returns The extracted content, or `null`. NOTE(review): presumably `null`
   *   when the selector matches nothing — confirm against the implementation.
   */
  static extractContent(selector: string, doc: Document, options?: ContentExtractionOptions): ExtractedContent | null;
  /**
   * Analyze a region and extract summary information
   */
  private static analyzeRegion;
  /**
   * Extract overview of forms on the page
   */
  private static extractFormOverview;
  /**
   * Calculate summary statistics
   */
  private static calculateSummary;
  /**
   * Generate AI-friendly suggestions
   */
  private static generateSuggestions;
  /**
   * Get text content with optional truncation
   */
  private static getTextContent;
}
|
|
331
|
+
|
|
332
|
+
/**
 * Static helpers that build selector descriptions for DOM elements.
 */
declare class SelectorGenerator {
  /**
   * Generate multiple selector strategies for an element
   * @param element The element to describe
   * @returns An `ElementSelector` (always carries `css` and `xpath`)
   */
  static generateSelectors(element: Element): ElementSelector;
  /**
   * Generate a unique CSS selector for an element
   */
  private static generateCSSSelector;
  /**
   * Generate XPath for an element
   */
  private static generateXPath;
  /**
   * Generate a text-based selector for buttons and links
   */
  private static generateTextBasedSelector;
  /**
   * Get data-testid or similar attributes
   */
  private static getDataTestId;
  /**
   * Check if an ID is unique in the document
   */
  private static isUniqueId;
  /**
   * Check if a selector is unique within a container
   */
  private static isUniqueSelector;
  /**
   * NOTE(review): undocumented in the original; presumably a non-throwing
   * variant of `isUniqueSelector` — confirm against the implementation.
   */
  private static isUniqueSelectorSafe;
  /**
   * Get meaningful classes (filtering out utility classes)
   */
  private static getMeaningfulClasses;
  /**
   * Optimize the selector path by removing unnecessary parts
   */
  private static optimizePath;
  /**
   * Get a human-readable path description
   * @param element The element whose path is described
   * @returns The path as a list of strings
   */
  static getContextPath(element: Element): string[];
}
|
|
375
|
+
|
|
376
|
+
/**
 * Smart DOM Reader - Full Extraction Approach
 *
 * This class provides complete DOM extraction in a single pass.
 * Use this when you need all information upfront and have sufficient
 * token budget for processing the complete output.
 *
 * Features:
 * - Single-pass extraction of all elements
 * - Two modes: 'interactive' (UI elements) or 'full' (includes content)
 * - Efficient for automation and testing scenarios
 * - Returns complete structured data immediately
 */
declare class SmartDOMReader {
  private options;
  /**
   * @param options Options stored on the instance; `extract` can override
   *   them per call through its `runtimeOptions` parameter
   */
  constructor(options?: Partial<ExtractionOptions>);
  /**
   * Main extraction method - extracts all data in one pass
   * @param rootElement The document or element to extract from
   * @param runtimeOptions Options to override constructor options
   */
  extract(rootElement?: Document | Element, runtimeOptions?: Partial<ExtractionOptions>): SmartDOMResult;
  /**
   * Extract page state information
   */
  private extractPageState;
  /**
   * Extract page landmarks
   */
  private extractLandmarks;
  /**
   * Convert elements to selector strings
   */
  private elementsToSelectors;
  /**
   * Extract interactive elements
   */
  private extractInteractiveElements;
  /**
   * Extract form information
   */
  private extractForms;
  /**
   * Extract semantic elements (full mode only)
   */
  private extractSemanticElements;
  /**
   * Extract metadata
   */
  private extractMetadata;
  /**
   * Check if element should be included based on options
   */
  private shouldIncludeElement;
  /**
   * Detect errors on the page
   */
  private detectErrors;
  /**
   * Detect if page is loading
   */
  private detectLoading;
  /**
   * Detect modal dialogs
   */
  private detectModals;
  /**
   * Get currently focused element
   */
  private getFocusedElement;
  /**
   * Quick extraction for interactive elements only
   * @param doc The document to extract from
   * @param options Extraction options
   */
  static extractInteractive(doc: Document, options?: Partial<ExtractionOptions>): SmartDOMResult;
  /**
   * Quick extraction for full content
   * @param doc The document to extract from
   * @param options Extraction options
   */
  static extractFull(doc: Document, options?: Partial<ExtractionOptions>): SmartDOMResult;
  /**
   * Extract from a specific element
   * @param element The element to extract from
   * @param mode The extraction mode
   * @param options Additional options
   */
  static extractFromElement(element: Element, mode?: ExtractionMode, options?: Partial<ExtractionOptions>): SmartDOMResult;
}
|
|
466
|
+
|
|
467
|
+
// Public surface of the package. Every original export is kept. The extra
// type-only exports (BaseExtractionArgs, ExtractionError, MarkdownDetailLevel,
// PageMeta, SmartDOMReaderBundle, SmartDomReaderCtor) are types that appear in
// the signatures of already-exported declarations, so consumers can now name
// them directly. Type-only exports add nothing at runtime.
export {
  type BaseExtractionArgs,
  ContentDetection,
  type ContentExtractionOptions,
  type ElementContext,
  type ElementInteraction,
  type ElementSelector,
  type ElementSelectorCandidate,
  type ExtractContentArgs,
  type ExtractFullArgs,
  type ExtractInteractiveArgs,
  type ExtractRegionArgs,
  type ExtractStructureArgs,
  type ExtractedContent,
  type ExtractedElement,
  type ExtractionArgs,
  type ExtractionError,
  type ExtractionMethod,
  type ExtractionMode,
  type ExtractionOptions,
  type ExtractionResult,
  type FilterOptions,
  type FormInfo,
  type MarkdownDetailLevel,
  type MarkdownFormatOptions,
  MarkdownFormatter,
  type PageLandmarks,
  type PageMeta,
  type PageState,
  ProgressiveExtractor,
  type RegionInfo,
  SelectorGenerator,
  SmartDOMReader,
  type SmartDOMReaderBundle,
  type SmartDOMResult,
  type SmartDomReaderCtor,
  type StructuralOverview,
  SmartDOMReader as default,
};
|