@mcp-b/smart-dom-reader 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,467 @@
1
+ type ExtractionMode = 'interactive' | 'full' | 'structure' | 'content'; // Mode names; used by ExtractionOptions.mode, SmartDOMResult.mode and SmartDOMReader.extractFromElement.
2
+ interface ElementSelector { // Selector strings for one element: css and xpath are required, every other field is optional.
3
+ css: string;
4
+ xpath: string;
5
+ textBased?: string;
6
+ dataTestId?: string;
7
+ ariaLabel?: string;
8
+ candidates?: ElementSelectorCandidate[]; // Optional list of typed, scored alternatives (see ElementSelectorCandidate).
9
+ }
10
+ interface ElementSelectorCandidate { // One entry of ElementSelector.candidates: a selector value tagged with its kind and a numeric score.
11
+ type: 'id' | 'data-testid' | 'role-aria' | 'name' | 'class-path' | 'css-path' | 'xpath' | 'text'; // Closed set of candidate kinds.
12
+ value: string;
13
+ score: number; // NOTE(review): score scale and ordering (higher = better?) are not visible in these declarations; confirm.
14
+ }
15
+ interface ElementContext { // Type of ExtractedElement.context; only parentChain is required.
16
+ nearestForm?: string; // NOTE(review): the nearest* fields are plain strings, presumably selectors of enclosing landmarks; confirm.
17
+ nearestSection?: string;
18
+ nearestMain?: string;
19
+ nearestNav?: string;
20
+ parentChain: string[];
21
+ }
22
+ interface ElementInteraction { // Type of ExtractedElement.interaction; every field is optional. Its keys are also the allowed values of FilterOptions.interactionTypes.
23
+ click?: boolean;
24
+ change?: boolean;
25
+ submit?: boolean;
26
+ nav?: boolean;
27
+ disabled?: boolean;
28
+ hidden?: boolean;
29
+ role?: string; // role and form are strings, unlike the boolean flags above.
30
+ form?: string;
31
+ }
32
+ interface ExtractedElement { // One extracted element: tag, text, selectors, attributes, context and interaction data.
33
+ tag: string;
34
+ text: string;
35
+ selector: ElementSelector;
36
+ attributes: Record<string, string>; // String-keyed map with string values.
37
+ context: ElementContext;
38
+ interaction: ElementInteraction;
39
+ children?: ExtractedElement[]; // Optional nested entries; the type is recursive.
40
+ }
41
+ interface FormInfo { // Element type of SmartDOMResult.interactive.forms: one form with its inputs and buttons as ExtractedElement lists.
42
+ selector: string;
43
+ action?: string; // action and method are optional strings.
44
+ method?: string;
45
+ inputs: ExtractedElement[];
46
+ buttons: ExtractedElement[];
47
+ }
48
+ interface PageLandmarks { // Type of SmartDOMResult.landmarks; every group is a required string array. NOTE(review): entries are presumably selector strings; confirm.
49
+ navigation: string[];
50
+ main: string[];
51
+ forms: string[];
52
+ headers: string[];
53
+ footers: string[];
54
+ articles: string[];
55
+ sections: string[];
56
+ }
57
+ interface PageState { // Type of SmartDOMResult.page: url, title and three required boolean flags.
58
+ url: string;
59
+ title: string;
60
+ hasErrors: boolean;
61
+ isLoading: boolean;
62
+ hasModals: boolean;
63
+ hasFocus?: string; // Optional string, not a boolean despite the `has` prefix. NOTE(review): presumably identifies the focused element; confirm.
64
+ }
65
+ interface SmartDOMResult { // Result type of SmartDOMReader.extract and its static helpers, and (nullable) of ProgressiveExtractor.extractRegion.
66
+ mode: ExtractionMode;
67
+ timestamp: number; // NOTE(review): unit is not visible here (presumably epoch milliseconds); confirm.
68
+ page: PageState;
69
+ landmarks: PageLandmarks;
70
+ interactive: { // Always present; all five lists are required.
71
+ buttons: ExtractedElement[];
72
+ links: ExtractedElement[];
73
+ inputs: ExtractedElement[];
74
+ forms: FormInfo[];
75
+ clickable: ExtractedElement[];
76
+ };
77
+ semantic?: { // Optional; the SmartDOMReader declaration describes semantic extraction as "full mode only".
78
+ headings: ExtractedElement[];
79
+ images: ExtractedElement[];
80
+ tables: ExtractedElement[];
81
+ lists: ExtractedElement[];
82
+ articles: ExtractedElement[];
83
+ };
84
+ metadata?: { // Optional; when present the two counts are required.
85
+ totalElements: number;
86
+ extractedElements: number;
87
+ mainContent?: string;
88
+ language?: string;
89
+ };
90
+ }
91
+ interface FilterOptions { // Type of ExtractionOptions.filter; every field is optional. NOTE(review): how multiple criteria combine (AND vs OR) is not visible in these declarations.
92
+ includeSelectors?: string[];
93
+ excludeSelectors?: string[];
94
+ textContains?: string[];
95
+ textMatches?: RegExp[];
96
+ hasAttributes?: string[];
97
+ attributeValues?: Record<string, string | RegExp>; // String-keyed map; each value is either a string or a RegExp.
98
+ tags?: string[];
99
+ interactionTypes?: Array<keyof ElementInteraction>; // Restricted to the property names of ElementInteraction.
100
+ withinSelectors?: string[];
101
+ nearText?: string;
102
+ }
103
+ interface ExtractionOptions { // Extraction options; only `mode` is required. The APIs declared in this file accept Partial<ExtractionOptions>, so callers may omit it there.
104
+ mode: ExtractionMode;
105
+ maxDepth?: number;
106
+ includeHidden?: boolean;
107
+ includeShadowDOM?: boolean;
108
+ includeIframes?: boolean;
109
+ viewportOnly?: boolean;
110
+ mainContentOnly?: boolean;
111
+ customSelectors?: string[];
112
+ attributeTruncateLength?: number; // NOTE(review): defaults for the three truncate lengths are not visible here.
113
+ dataAttributeTruncateLength?: number;
114
+ textTruncateLength?: number;
115
+ filter?: FilterOptions;
116
+ }
117
+ interface RegionInfo { // Summary of one region, used for every entry of StructuralOverview.regions; selector and interactiveCount are required.
118
+ selector: string;
119
+ label?: string;
120
+ role?: string;
121
+ interactiveCount: number;
122
+ hasForm?: boolean;
123
+ hasList?: boolean;
124
+ hasTable?: boolean;
125
+ hasMedia?: boolean;
126
+ buttonCount?: number; // The per-kind counts are optional, unlike interactiveCount.
127
+ linkCount?: number;
128
+ inputCount?: number;
129
+ textPreview?: string;
130
+ }
131
+ interface StructuralOverview { // Return type of ProgressiveExtractor.extractStructure and first argument of MarkdownFormatter.structure.
132
+ regions: { // header, main and footer are single RegionInfo values; the others are RegionInfo arrays. All are optional.
133
+ header?: RegionInfo;
134
+ navigation?: RegionInfo[];
135
+ main?: RegionInfo;
136
+ sidebar?: RegionInfo[];
137
+ footer?: RegionInfo;
138
+ modals?: RegionInfo[];
139
+ sections?: RegionInfo[];
140
+ };
141
+ forms: Array<{ // Lighter shape than FormInfo: counts only, no element lists.
142
+ selector: string;
143
+ location: string;
144
+ inputCount: number;
145
+ purpose?: string;
146
+ }>;
147
+ summary: {
148
+ totalInteractive: number;
149
+ totalForms: number;
150
+ totalSections: number;
151
+ hasModals: boolean;
152
+ hasErrors: boolean;
153
+ isLoading: boolean;
154
+ mainContentSelector?: string;
155
+ };
156
+ suggestions?: string[];
157
+ }
158
+ interface ContentExtractionOptions { // Options for ProgressiveExtractor.extractContent and ExtractContentArgs.options; every field is optional.
159
+ includeHeadings?: boolean;
160
+ includeLists?: boolean;
161
+ includeTables?: boolean;
162
+ includeMedia?: boolean;
163
+ preserveFormatting?: boolean;
164
+ maxTextLength?: number; // NOTE(review): defaults are not visible in these declarations.
165
+ }
166
+ interface ExtractedContent { // Return type of ProgressiveExtractor.extractContent (nullable there) and first argument of MarkdownFormatter.content.
167
+ selector: string;
168
+ text: { // Required object whose three members are each optional.
169
+ headings?: Array<{
170
+ level: number;
171
+ text: string;
172
+ }>;
173
+ paragraphs?: string[];
174
+ lists?: Array<{
175
+ type: 'ul' | 'ol';
176
+ items: string[];
177
+ }>;
178
+ };
179
+ tables?: Array<{
180
+ headers: string[];
181
+ rows: string[][]; // Array of rows, each an array of strings.
182
+ }>;
183
+ media?: Array<{
184
+ type: 'img' | 'video' | 'audio';
185
+ alt?: string;
186
+ src?: string;
187
+ }>;
188
+ metadata: { // Required, unlike SmartDOMResult.metadata.
189
+ wordCount: number;
190
+ hasInteractive: boolean;
191
+ };
192
+ }
193
+
194
+ type MarkdownDetailLevel = 'summary' | 'region' | 'deep'; // Allowed values of MarkdownFormatOptions.detail; not part of the export list.
195
+ interface MarkdownFormatOptions { // Options for the MarkdownFormatter methods and BaseExtractionArgs.formatOptions; every field is optional.
196
+ detail?: MarkdownDetailLevel;
197
+ maxTextLength?: number;
198
+ maxElements?: number;
199
+ }
200
+ type PageMeta = { // Type of the optional `meta` argument of the MarkdownFormatter methods; both fields are optional.
201
+ title?: string;
202
+ url?: string;
203
+ };
204
+ declare class MarkdownFormatter { // Static-only formatters: each takes one result type plus optional options/meta and returns a string.
205
+ static structure(overview: StructuralOverview, _opts?: MarkdownFormatOptions, meta?: PageMeta): string; // NOTE(review): `_opts` is underscore-prefixed, presumably unused by the implementation; confirm.
206
+ static region(result: SmartDOMResult, opts?: MarkdownFormatOptions, meta?: PageMeta): string;
207
+ static content(content: ExtractedContent, opts?: MarkdownFormatOptions, meta?: PageMeta): string;
208
+ }
209
+
210
+ /**
211
+ * Type-safe interface for the stateless bundle extraction API
212
+ */
213
+
214
+ type ExtractionMethod = 'extractStructure' | 'extractRegion' | 'extractContent' | 'extractInteractive' | 'extractFull'; // Method names accepted by executeExtraction; each must also be a key of ExtractionArgs, which is indexed by this type.
215
+ interface BaseExtractionArgs { // Fields shared by every Extract*Args interface (each extends this one); both are optional.
216
+ frameSelector?: string;
217
+ formatOptions?: MarkdownFormatOptions;
218
+ }
219
+ interface ExtractStructureArgs extends BaseExtractionArgs { // Arguments for the 'extractStructure' method; selector is optional.
220
+ selector?: string;
221
+ }
222
+ interface ExtractRegionArgs extends BaseExtractionArgs { // Arguments for the 'extractRegion' method; selector is required.
223
+ selector: string;
224
+ mode?: 'interactive' | 'full'; // Narrower than ExtractionMode: 'structure' and 'content' are not accepted here.
225
+ options?: Partial<ExtractionOptions>;
226
+ }
227
+ interface ExtractContentArgs extends BaseExtractionArgs { // Arguments for the 'extractContent' method; selector is required.
228
+ selector: string;
229
+ options?: ContentExtractionOptions; // Content-specific options, not Partial<ExtractionOptions> like the sibling interfaces.
230
+ }
231
+ interface ExtractInteractiveArgs extends BaseExtractionArgs { // Arguments for the 'extractInteractive' method; both fields are optional.
232
+ selector?: string;
233
+ options?: Partial<ExtractionOptions>;
234
+ }
235
+ interface ExtractFullArgs extends BaseExtractionArgs { // Arguments for the 'extractFull' method; same shape as ExtractInteractiveArgs.
236
+ selector?: string;
237
+ options?: Partial<ExtractionOptions>;
238
+ }
239
+ type ExtractionArgs = { // Maps each ExtractionMethod name to its argument interface; executeExtraction indexes it as ExtractionArgs[M].
240
+ extractStructure: ExtractStructureArgs;
241
+ extractRegion: ExtractRegionArgs;
242
+ extractContent: ExtractContentArgs;
243
+ extractInteractive: ExtractInteractiveArgs;
244
+ extractFull: ExtractFullArgs;
245
+ };
246
+ interface ExtractionError { // Object variant of ExtractionResult carrying an `error` string; not part of the export list.
247
+ error: string;
248
+ }
249
+ type ExtractionResult = string | ExtractionError; // Either a plain string or an ExtractionError object. NOTE(review): the string is presumably formatted Markdown; confirm.
250
+ interface SmartDOMReaderBundle { // Shape of the object declared on Window.SmartDOMReaderBundle.
251
+ executeExtraction<M extends ExtractionMethod>(method: M, args: ExtractionArgs[M]): ExtractionResult; // The type of `args` is selected by `method`; the return is synchronous (not a Promise).
252
+ }
253
+ declare global { // Augments the global Window interface with a SmartDOMReaderBundle property.
254
+ interface Window {
255
+ SmartDOMReaderBundle: SmartDOMReaderBundle; // Declared as always present. NOTE(review): if the bundle is injected at runtime, callers may still need an existence check; confirm.
256
+ }
257
+ }
258
+
259
+ declare class ContentDetection { // Every member is static; the public methods take either a Document or an Element.
260
+ /**
261
+ * Find the main content area of a page
262
+ * Inspired by dom-to-semantic-markdown's approach
+ * @param doc Document to search.
+ * @returns An Element; the declared return type is not nullable.
263
+ */
264
+ static findMainContent(doc: Document): Element;
265
+ /**
266
+ * Detect main content using scoring algorithm
267
+ */
268
+ private static detectMainContent;
269
+ /**
270
+ * Collect content candidates
271
+ */
272
+ private static collectCandidates;
273
+ /**
274
+ * Calculate content score for an element
+ * @param element Element to score.
+ * @returns A number. NOTE(review): range and direction of the score are not visible in this declaration.
275
+ */
276
+ static calculateContentScore(element: Element): number;
277
+ /**
278
+ * Calculate link density in an element
279
+ */
280
+ private static calculateLinkDensity;
281
+ /**
282
+ * Check if an element is likely navigation
+ * @param element Element to test.
+ * @returns A boolean verdict.
283
+ */
284
+ static isNavigation(element: Element): boolean;
285
+ /**
286
+ * Check if element is likely supplementary content
+ * @param element Element to test.
+ * @returns A boolean verdict.
287
+ */
288
+ static isSupplementary(element: Element): boolean;
289
+ /**
290
+ * Detect page landmarks
+ * @param doc Document to scan.
+ * @returns Element arrays keyed by string; the type does not constrain which keys are present.
291
+ */
292
+ static detectLandmarks(doc: Document): Record<string, Element[]>;
293
+ }
294
+
295
+ type SmartDomReaderCtor = new (options?: Partial<ExtractionOptions>) => SmartDOMReader; // Constructor signature yielding a SmartDOMReader; type of the optional last parameter of ProgressiveExtractor.extractRegion.
296
+ declare class ProgressiveExtractor { // Every member is static; the three public methods are documented as steps 1-3.
297
+ /**
298
+ * Step 1: Extract high-level structural overview
299
+ * This provides a "map" of the page for the AI to understand structure
+ * @param root A Document or an Element to start from.
+ * @returns A StructuralOverview; the declared return type is not nullable.
300
+ */
301
+ static extractStructure(root: Document | Element): StructuralOverview;
302
+ /**
303
+ * Step 2: Extract detailed information from a specific region
+ * @param selector Selector string for the region.
+ * @param doc Document to query.
+ * @param options Optional partial extraction options.
+ * @param smartDomReaderCtor Optional constructor used to create the reader.
+ * NOTE(review): what is used when this is omitted is not visible here.
+ * @returns A SmartDOMResult, or null. NOTE(review): presumably null when the
+ * selector matches nothing; confirm.
304
+ */
305
+ static extractRegion(selector: string, doc: Document, options?: Partial<ExtractionOptions>, smartDomReaderCtor?: SmartDomReaderCtor): SmartDOMResult | null;
306
+ /**
307
+ * Step 3: Extract readable content from a region
+ * @param selector Selector string for the region.
+ * @param doc Document to query.
+ * @param options Optional content extraction options.
+ * @returns An ExtractedContent, or null. NOTE(review): the null condition is
+ * not visible in this declaration; confirm.
308
+ */
309
+ static extractContent(selector: string, doc: Document, options?: ContentExtractionOptions): ExtractedContent | null;
310
+ /**
311
+ * Analyze a region and extract summary information
312
+ */
313
+ private static analyzeRegion;
314
+ /**
315
+ * Extract overview of forms on the page
316
+ */
317
+ private static extractFormOverview;
318
+ /**
319
+ * Calculate summary statistics
320
+ */
321
+ private static calculateSummary;
322
+ /**
323
+ * Generate AI-friendly suggestions
324
+ */
325
+ private static generateSuggestions;
326
+ /**
327
+ * Get text content with optional truncation
328
+ */
329
+ private static getTextContent;
330
+ }
331
+
332
+ declare class SelectorGenerator { // Every member is static; only generateSelectors and getContextPath are public.
333
+ /**
334
+ * Generate multiple selector strategies for an element
+ * @param element Element to describe.
+ * @returns An ElementSelector (css and xpath are always present per that type).
335
+ */
336
+ static generateSelectors(element: Element): ElementSelector;
337
+ /**
338
+ * Generate a unique CSS selector for an element
339
+ */
340
+ private static generateCSSSelector;
341
+ /**
342
+ * Generate XPath for an element
343
+ */
344
+ private static generateXPath;
345
+ /**
346
+ * Generate a text-based selector for buttons and links
347
+ */
348
+ private static generateTextBasedSelector;
349
+ /**
350
+ * Get data-testid or similar attributes
351
+ */
352
+ private static getDataTestId;
353
+ /**
354
+ * Check if an ID is unique in the document
355
+ */
356
+ private static isUniqueId;
357
+ /**
358
+ * Check if a selector is unique within a container
359
+ */
360
+ private static isUniqueSelector;
361
+ private static isUniqueSelectorSafe; // NOTE(review): undocumented in the source; presumably a non-throwing variant of isUniqueSelector; confirm.
362
+ /**
363
+ * Get meaningful classes (filtering out utility classes)
364
+ */
365
+ private static getMeaningfulClasses;
366
+ /**
367
+ * Optimize the selector path by removing unnecessary parts
368
+ */
369
+ private static optimizePath;
370
+ /**
371
+ * Get a human-readable path description
+ * @param element Element whose path is described.
+ * @returns An array of strings. NOTE(review): ordering (root-first vs
+ * element-first) is not visible in this declaration.
372
+ */
373
+ static getContextPath(element: Element): string[];
374
+ }
375
+
376
+ /**
377
+ * Smart DOM Reader - Full Extraction Approach
378
+ *
379
+ * This class provides complete DOM extraction in a single pass.
380
+ * Use this when you need all information upfront and have sufficient
381
+ * token budget for processing the complete output.
382
+ *
383
+ * Features:
384
+ * - Single-pass extraction of all elements
385
+ * - Two modes: 'interactive' (UI elements) or 'full' (includes content)
+ * NOTE(review): ExtractionMode also declares 'structure' and 'content', and
+ * the signatures below accept any ExtractionMode; confirm how this class
+ * treats those two values.
386
+ * - Efficient for automation and testing scenarios
387
+ * - Returns complete structured data immediately
+ *
+ * This class is exported both by name and as the module's default export.
388
+ */
389
+ declare class SmartDOMReader {
390
+ private options;
391
+ constructor(options?: Partial<ExtractionOptions>);
392
+ /**
393
+ * Main extraction method - extracts all data in one pass
394
+ * @param rootElement The document or element to extract from
395
+ * @param runtimeOptions Options to override constructor options
+ * @returns A SmartDOMResult; the declared return type is not nullable.
+ * NOTE(review): both parameters are optional; the root used when
+ * rootElement is omitted is not visible in this declaration.
396
+ */
397
+ extract(rootElement?: Document | Element, runtimeOptions?: Partial<ExtractionOptions>): SmartDOMResult;
398
+ /**
399
+ * Extract page state information
400
+ */
401
+ private extractPageState;
402
+ /**
403
+ * Extract page landmarks
404
+ */
405
+ private extractLandmarks;
406
+ /**
407
+ * Convert elements to selector strings
408
+ */
409
+ private elementsToSelectors;
410
+ /**
411
+ * Extract interactive elements
412
+ */
413
+ private extractInteractiveElements;
414
+ /**
415
+ * Extract form information
416
+ */
417
+ private extractForms;
418
+ /**
419
+ * Extract semantic elements (full mode only)
420
+ */
421
+ private extractSemanticElements;
422
+ /**
423
+ * Extract metadata
424
+ */
425
+ private extractMetadata;
426
+ /**
427
+ * Check if element should be included based on options
428
+ */
429
+ private shouldIncludeElement;
430
+ /**
431
+ * Detect errors on the page
432
+ */
433
+ private detectErrors;
434
+ /**
435
+ * Detect if page is loading
436
+ */
437
+ private detectLoading;
438
+ /**
439
+ * Detect modal dialogs
440
+ */
441
+ private detectModals;
442
+ /**
443
+ * Get currently focused element
444
+ */
445
+ private getFocusedElement;
446
+ /**
447
+ * Quick extraction for interactive elements only
448
+ * @param doc The document to extract from
449
+ * @param options Extraction options
+ * @returns A SmartDOMResult.
450
+ */
451
+ static extractInteractive(doc: Document, options?: Partial<ExtractionOptions>): SmartDOMResult;
452
+ /**
453
+ * Quick extraction for full content
454
+ * @param doc The document to extract from
455
+ * @param options Extraction options
+ * @returns A SmartDOMResult.
456
+ */
457
+ static extractFull(doc: Document, options?: Partial<ExtractionOptions>): SmartDOMResult;
458
+ /**
459
+ * Extract from a specific element
460
+ * @param element The element to extract from
461
+ * @param mode The extraction mode
462
+ * @param options Additional options
+ * @returns A SmartDOMResult.
+ * NOTE(review): `mode` is optional; its default is not visible in this
+ * declaration.
463
+ */
464
+ static extractFromElement(element: Element, mode?: ExtractionMode, options?: Partial<ExtractionOptions>): SmartDOMResult;
465
+ }
466
+
467
+ export { ContentDetection, type ContentExtractionOptions, type ElementContext, type ElementInteraction, type ElementSelector, type ElementSelectorCandidate, type ExtractContentArgs, type ExtractFullArgs, type ExtractInteractiveArgs, type ExtractRegionArgs, type ExtractStructureArgs, type ExtractedContent, type ExtractedElement, type ExtractionArgs, type ExtractionMethod, type ExtractionMode, type ExtractionOptions, type ExtractionResult, type FilterOptions, type FormInfo, type MarkdownFormatOptions, MarkdownFormatter, type PageLandmarks, type PageState, ProgressiveExtractor, type RegionInfo, SelectorGenerator, SmartDOMReader, type SmartDOMResult, type StructuralOverview, SmartDOMReader as default }; // SmartDOMReader is exported by name and as default. Not exported: ExtractionError, PageMeta, MarkdownDetailLevel, BaseExtractionArgs, SmartDOMReaderBundle, SmartDomReaderCtor.