@mcp-b/smart-dom-reader 0.0.0-beta-20260221154800

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,1611 @@
1
+ import { createRequire } from "node:module";
2
+
3
+ //#region rolldown:runtime
4
// Bundler runtime helpers. The first four are short aliases for the
// reflection primitives used by the helpers that follow.
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Turn a single-entry record `{ "<module id>": initializer }` into a thunk
 * that runs the initializer on its first call only and afterwards keeps
 * returning the cached result.
 */
var __esm = (fn, res) => function() {
	if (fn) {
		const initializer = fn[__getOwnPropNames(fn)[0]];
		// Clear the record *before* invoking, so a re-entrant call made while
		// the initializer is still running does not start it a second time.
		fn = 0;
		res = initializer(0);
	}
	return res;
};

/**
 * Build a namespace-like object whose enumerable properties are live getters
 * taken from `all` (a map of export name -> getter function).
 */
var __export = (all) => {
	const target = {};
	for (const name in all) {
		__defProp(target, name, {
			get: all[name],
			enumerable: true
		});
	}
	return target;
};

/**
 * Define a forwarding getter on `to` for every own property of `from` that
 * `to` does not already own, skipping the key equal to `except`.
 * Enumerability mirrors the source property. Returns `to`.
 */
var __copyProps = (to, from, except, desc) => {
	const isCopyable = (from && typeof from === "object") || typeof from === "function";
	if (!isCopyable) return to;
	for (const key of __getOwnPropNames(from)) {
		if (key === except || __hasOwnProp.call(to, key)) continue;
		desc = __getOwnPropDesc(from, key);
		__defProp(to, key, {
			get: () => from[key],
			enumerable: !desc || desc.enumerable
		});
	}
	return to;
};

/** Wrap a module namespace in a fresh object flagged with `__esModule: true`. */
var __toCommonJS = (mod) => {
	const flagged = __defProp({}, "__esModule", { value: true });
	return __copyProps(flagged, mod);
};

/** CommonJS-style `require` resolved relative to this module's URL. */
var __require = /* @__PURE__ */ createRequire(import.meta.url);
31
+
32
+ //#endregion
33
+ //#region src/content-detection.ts
34
var ContentDetection;
var init_content_detection = __esm({ "src/content-detection.ts": (() => {
	/** Class names that each add 10 points when present on an element. */
	const SEMANTIC_CLASSES = [
		"article",
		"content",
		"main-container",
		"main",
		"main-content",
		"post",
		"entry"
	];
	/** Substrings that each add 10 points when found in an element's id. */
	const SEMANTIC_IDS = [
		"content",
		"main",
		"article",
		"post",
		"entry"
	];
	/** Tag names that add 8 points. */
	const SEMANTIC_TAGS = [
		"article",
		"main",
		"section"
	];
	const NAV_PATTERNS = [
		/nav/i,
		/menu/i,
		/sidebar/i,
		/toolbar/i
	];
	const SUPPLEMENTARY_PATTERNS = [
		/sidebar/i,
		/widget/i,
		/related/i,
		/advertisement/i,
		/social/i
	];
	/** Landmark name -> selector used by detectLandmarks (order is the output key order). */
	const LANDMARK_SELECTORS = {
		navigation: "nav, [role=\"navigation\"]",
		main: "main, [role=\"main\"]",
		complementary: "aside, [role=\"complementary\"]",
		contentinfo: "footer, [role=\"contentinfo\"]",
		banner: "header, [role=\"banner\"]",
		search: "[role=\"search\"]",
		form: "form[aria-label], form[aria-labelledby], [role=\"form\"]",
		region: "section[aria-label], section[aria-labelledby], [role=\"region\"]"
	};
	/**
	 * Lower-cased "class id" text of an element.
	 * Reads the `class` attribute rather than `className`, because `className`
	 * is an SVGAnimatedString object (not a string) on SVG elements.
	 */
	const classAndIdText = (element) => `${element.getAttribute("class") ?? ""} ${element.id ?? ""}`.toLowerCase();
	ContentDetection = class ContentDetection {
		/**
		 * Find the main content area of a page.
		 * Inspired by dom-to-semantic-markdown's approach.
		 *
		 * @param doc Document to inspect.
		 * @returns The first `<main>` / `[role="main"]` element if one exists,
		 *   the document element when there is no body, otherwise the best
		 *   scoring element found by {@link detectMainContent}.
		 */
		static findMainContent(doc) {
			const mainElement = doc.querySelector("main, [role=\"main\"]");
			if (mainElement) return mainElement;
			if (!doc.body) return doc.documentElement;
			return ContentDetection.detectMainContent(doc.body);
		}
		/**
		 * Detect main content using the scoring algorithm.
		 *
		 * Every element under `rootElement` is scored exactly once; the first
		 * element (in document order) holding the highest score of at least 15
		 * wins. Falls back to `rootElement` when nothing reaches that score.
		 */
		static detectMainContent(rootElement) {
			const candidates = [];
			const scores = new Map();
			ContentDetection.collectCandidates(rootElement, candidates, 15, scores);
			if (candidates.length === 0) return rootElement;
			let bestCandidate = candidates[0];
			for (const candidate of candidates) {
				// Strict ">" keeps the earliest candidate on ties, which is what a
				// stable descending sort followed by taking the head would yield.
				if (scores.get(candidate) > scores.get(bestCandidate)) bestCandidate = candidate;
			}
			return bestCandidate;
		}
		/**
		 * Collect content candidates: depth-first, parents before children.
		 *
		 * @param element Root of the subtree to walk (itself included).
		 * @param candidates Output array; qualifying elements are appended.
		 * @param minScore Minimum score an element needs to qualify.
		 * @param scores Optional Map that receives each candidate's score so
		 *   callers do not have to compute it again.
		 */
		static collectCandidates(element, candidates, minScore, scores) {
			const score = ContentDetection.calculateContentScore(element);
			if (score >= minScore) {
				candidates.push(element);
				scores?.set(element, score);
			}
			for (const child of Array.from(element.children)) {
				ContentDetection.collectCandidates(child, candidates, minScore, scores);
			}
		}
		/**
		 * Calculate the content score for an element.
		 *
		 * Points are added for semantic class names, id fragments, tag names,
		 * paragraph and heading counts, text length, low link density, data /
		 * itemprop attributes and main/article roles; points are removed for
		 * high link density, chrome-like elements and more than two forms.
		 *
		 * @returns A score that is never negative.
		 */
		static calculateContentScore(element) {
			let score = 0;
			for (const cls of SEMANTIC_CLASSES) {
				if (element.classList.contains(cls)) score += 10;
			}
			const loweredId = element.id?.toLowerCase();
			if (loweredId) {
				for (const fragment of SEMANTIC_IDS) {
					if (loweredId.includes(fragment)) score += 10;
				}
			}
			if (SEMANTIC_TAGS.includes(element.tagName.toLowerCase())) score += 8;
			const paragraphs = element.getElementsByTagName("p").length;
			score += Math.min(paragraphs * 2, 10);
			const headings = element.querySelectorAll("h1, h2, h3").length;
			score += Math.min(headings * 3, 9);
			const textLength = element.textContent?.trim().length || 0;
			if (textLength > 300) score += Math.min(Math.floor(textLength / 300) * 2, 10);
			const linkDensity = ContentDetection.calculateLinkDensity(element);
			if (linkDensity < .3) score += 5;
			else if (linkDensity > .5) score -= 5;
			if (element.hasAttribute("data-main") || element.hasAttribute("data-content") || element.hasAttribute("itemprop")) score += 8;
			const role = element.getAttribute("role");
			if (role === "main" || role === "article") score += 10;
			if (element.matches("aside, nav, header, footer, .sidebar, .navigation, .menu, .ad, .advertisement")) score -= 10;
			if (element.getElementsByTagName("form").length > 2) score -= 5;
			return Math.max(0, score);
		}
		/**
		 * Calculate link density: characters inside `<a>` descendants divided
		 * by the element's total text length (treated as 1 when empty).
		 */
		static calculateLinkDensity(element) {
			let linkTextLength = 0;
			for (const link of Array.from(element.getElementsByTagName("a"))) {
				linkTextLength += link.textContent?.length || 0;
			}
			const totalTextLength = element.textContent?.length || 1;
			return linkTextLength / totalTextLength;
		}
		/**
		 * Check if an element is likely navigation: a `<nav>`, a
		 * `role="navigation"` element, or one whose class/id text contains
		 * "nav", "menu", "sidebar" or "toolbar".
		 */
		static isNavigation(element) {
			if (element.tagName.toLowerCase() === "nav" || element.getAttribute("role") === "navigation") return true;
			const classesAndId = classAndIdText(element);
			return NAV_PATTERNS.some((pattern) => pattern.test(classesAndId));
		}
		/**
		 * Check if an element is likely supplementary content: an `<aside>`, a
		 * `role="complementary"` element, or one whose class/id text contains
		 * "sidebar", "widget", "related", "advertisement" or "social".
		 */
		static isSupplementary(element) {
			if (element.tagName.toLowerCase() === "aside" || element.getAttribute("role") === "complementary") return true;
			const classesAndId = classAndIdText(element);
			return SUPPLEMENTARY_PATTERNS.some((pattern) => pattern.test(classesAndId));
		}
		/**
		 * Detect page landmarks.
		 *
		 * @returns An object with one array of matching elements per landmark
		 *   kind (navigation, main, complementary, contentinfo, banner, search,
		 *   form, region); kinds without matches map to an empty array.
		 */
		static detectLandmarks(doc) {
			const landmarks = {};
			for (const [landmark, selector] of Object.entries(LANDMARK_SELECTORS)) {
				landmarks[landmark] = Array.from(doc.querySelectorAll(selector));
			}
			return landmarks;
		}
	};
}) });
190
+
191
+ //#endregion
192
+ //#region src/selectors.ts
193
var SelectorGenerator;
var init_selectors = __esm({ "src/selectors.ts": (() => {
	/** Test-hook attributes, in lookup priority order. */
	const TEST_ID_ATTRIBUTES = [
		"data-testid",
		"data-test-id",
		"data-test",
		"data-cy"
	];
	/** Class-name shapes treated as utility/state classes and ignored in selectors. */
	const UTILITY_CLASS_PATTERNS = [
		/^(p|m|w|h|text|bg|border|flex|grid|col|row)-/,
		/^(xs|sm|md|lg|xl|2xl):/,
		/^(hover|focus|active|disabled|checked):/,
		/^js-/,
		/^is-/,
		/^has-/
	];
	SelectorGenerator = class SelectorGenerator {
		/**
		 * Generate multiple selector strategies for an element.
		 *
		 * @returns `{ css, xpath, candidates }` plus `textBased`, `dataTestId`
		 *   and `ariaLabel` when available. `candidates` is sorted by descending
		 *   score; `css` is the best candidate that is neither XPath nor text.
		 */
		static generateSelectors(element) {
			const doc = element.ownerDocument || document;
			const candidates = [];
			if (element.id && SelectorGenerator.isUniqueId(element.id, doc)) candidates.push({
				type: "id",
				value: `#${CSS.escape(element.id)}`,
				score: 100
			});
			const testAttr = SelectorGenerator.getDataTestAttribute(element);
			const testId = testAttr?.value;
			if (testAttr) {
				// Use the attribute the value was actually read from; a
				// hard-coded data-testid would not match data-cy / data-test hooks.
				const v = `[${testAttr.name}="${CSS.escape(testAttr.value)}"]`;
				candidates.push({
					type: "data-testid",
					value: v,
					score: 90 + (SelectorGenerator.isUniqueSelectorSafe(v, doc) ? 5 : 0)
				});
			}
			const role = element.getAttribute("role");
			const aria = element.getAttribute("aria-label");
			if (role && aria) {
				const v = `[role="${CSS.escape(role)}"][aria-label="${CSS.escape(aria)}"]`;
				candidates.push({
					type: "role-aria",
					value: v,
					score: 85 + (SelectorGenerator.isUniqueSelectorSafe(v, doc) ? 5 : 0)
				});
			}
			const nameAttr = element.getAttribute("name");
			if (nameAttr) {
				const v = `[name="${CSS.escape(nameAttr)}"]`;
				candidates.push({
					type: "name",
					value: v,
					score: 78 + (SelectorGenerator.isUniqueSelectorSafe(v, doc) ? 5 : 0)
				});
			}
			const pathCss = SelectorGenerator.generateCSSSelector(element, doc);
			// Each :nth-child() step costs 10 points; any class in the path earns 8.
			const structuralPenalty = (pathCss.match(/:nth-child\(/g) || []).length * 10;
			const classBonus = pathCss.includes(".") ? 8 : 0;
			candidates.push({
				type: "class-path",
				value: pathCss,
				score: Math.max(0, 70 + classBonus - structuralPenalty)
			});
			const xpath = SelectorGenerator.generateXPath(element, doc);
			candidates.push({
				type: "xpath",
				value: xpath,
				score: 40
			});
			const textBased = SelectorGenerator.generateTextBasedSelector(element);
			if (textBased) candidates.push({
				type: "text",
				value: textBased,
				score: 30
			});
			candidates.sort((a, b) => b.score - a.score);
			const selector = {
				css: candidates.find((c) => c.type !== "xpath" && c.type !== "text")?.value || pathCss,
				xpath,
				candidates
			};
			if (textBased) selector.textBased = textBased;
			if (testId) selector.dataTestId = testId;
			if (aria) selector.ariaLabel = aria;
			return selector;
		}
		/**
		 * Generate a CSS selector for an element: its unique id, else its
		 * test-hook attribute, else a `>`-joined structural path that stops at
		 * the nearest ancestor with a unique id.
		 */
		static generateCSSSelector(element, doc) {
			if (element.id && SelectorGenerator.isUniqueId(element.id, doc)) return `#${CSS.escape(element.id)}`;
			const testAttr = SelectorGenerator.getDataTestAttribute(element);
			if (testAttr) return `[${testAttr.name}="${CSS.escape(testAttr.value)}"]`;
			const path = [];
			let current = element;
			while (current && current.nodeType === Node.ELEMENT_NODE) {
				if (current.id && SelectorGenerator.isUniqueId(current.id, doc)) {
					path.unshift(`#${CSS.escape(current.id)}`);
					break;
				}
				let selector = current.nodeName.toLowerCase();
				const classes = SelectorGenerator.getMeaningfulClasses(current);
				if (classes.length > 0) selector += `.${classes.map((c) => CSS.escape(c)).join(".")}`;
				const siblings = current.parentElement?.children;
				if (siblings && siblings.length > 1) {
					const index = Array.from(siblings).indexOf(current);
					if (index > 0 || !SelectorGenerator.isUniqueSelector(selector, current.parentElement)) selector += `:nth-child(${index + 1})`;
				}
				path.unshift(selector);
				current = current.parentElement;
			}
			return SelectorGenerator.optimizePath(path, element, doc);
		}
		/**
		 * Generate an XPath for an element.
		 *
		 * The path is anchored with a single leading `//` at the nearest
		 * element (the target or an ancestor) carrying a unique id, or at the
		 * topmost ancestor when none does. Same-tag siblings are disambiguated
		 * with a 1-based position predicate.
		 */
		static generateXPath(element, doc) {
			const steps = [];
			let current = element;
			while (current && current.nodeType === Node.ELEMENT_NODE) {
				if (current.id && SelectorGenerator.isUniqueId(current.id, doc)) {
					// Push only the step; the "//" prefix is added once on return.
					// Prefixing here as well would produce the invalid "////*[...]".
					steps.unshift(`*[@id=${SelectorGenerator.toXPathLiteral(current.id)}]`);
					break;
				}
				const tagName = current.nodeName.toLowerCase();
				let step = tagName;
				const siblings = current.parentElement?.children;
				if (siblings) {
					const sameTagSiblings = Array.from(siblings).filter((s) => s.nodeName.toLowerCase() === tagName);
					if (sameTagSiblings.length > 1) step += `[${sameTagSiblings.indexOf(current) + 1}]`;
				}
				steps.unshift(step);
				current = current.parentElement;
			}
			return `//${steps.join("/")}`;
		}
		/**
		 * Quote a string as an XPath 1.0 literal. XPath has no escape
		 * character, so a value containing both quote kinds is assembled with
		 * `concat()`.
		 */
		static toXPathLiteral(value) {
			if (!value.includes("\"")) return `"${value}"`;
			if (!value.includes("'")) return `'${value}'`;
			return `concat(${value.split("\"").map((part) => `"${part}"`).join(", '\"', ")})`;
		}
		/**
		 * Generate a text-based selector (`tag:contains("text")`) for buttons,
		 * links and labels whose trimmed text is 1-50 characters long.
		 *
		 * @returns The selector, or undefined when not applicable.
		 */
		static generateTextBasedSelector(element) {
			const text = element.textContent?.trim();
			if (!text || text.length > 50) return void 0;
			const tag = element.nodeName.toLowerCase();
			if (![
				"button",
				"a",
				"label"
			].includes(tag)) return void 0;
			return `${tag}:contains("${text.replace(/['"\\]/g, "\\$&")}")`;
		}
		/**
		 * Find the first non-empty test-hook attribute on an element.
		 *
		 * @returns `{ name, value }` for the first of data-testid,
		 *   data-test-id, data-test, data-cy with a non-empty value, else
		 *   undefined.
		 */
		static getDataTestAttribute(element) {
			for (const name of TEST_ID_ATTRIBUTES) {
				const value = element.getAttribute(name);
				if (value) return {
					name,
					value
				};
			}
			return void 0;
		}
		/**
		 * Get the value of data-testid or a similar test-hook attribute.
		 */
		static getDataTestId(element) {
			return SelectorGenerator.getDataTestAttribute(element)?.value;
		}
		/**
		 * Check if an ID is unique in the document.
		 */
		static isUniqueId(id, doc) {
			return doc.querySelectorAll(`#${CSS.escape(id)}`).length === 1;
		}
		/**
		 * Check if a selector matches exactly one descendant of a container.
		 * An invalid selector counts as not unique.
		 */
		static isUniqueSelector(selector, container) {
			try {
				return container.querySelectorAll(selector).length === 1;
			} catch {
				return false;
			}
		}
		/**
		 * Check if a selector matches exactly one element in the document.
		 * An invalid selector counts as not unique.
		 */
		static isUniqueSelectorSafe(selector, doc) {
			try {
				return doc.querySelectorAll(selector).length === 1;
			} catch {
				return false;
			}
		}
		/**
		 * Get meaningful classes (filtering out utility classes).
		 *
		 * @returns At most two classes that are at least 3 characters long and
		 *   match none of the utility/state patterns.
		 */
		static getMeaningfulClasses(element) {
			return Array.from(element.classList).filter((cls) => {
				if (cls.length < 3) return false;
				return !UTILITY_CLASS_PATTERNS.some((pattern) => pattern.test(cls));
			}).slice(0, 2);
		}
		/**
		 * Optimize the selector path by removing unnecessary leading parts.
		 *
		 * Tries suffixes of `path` from the shortest (last two segments) to the
		 * longest and returns the first that matches `element` and nothing
		 * else. Trying the longest first would always hand back the full path.
		 * Falls back to the full path.
		 */
		static optimizePath(path, element, doc) {
			for (let start = path.length - 2; start >= 1; start--) {
				const shortPath = path.slice(start).join(" > ");
				try {
					const matches = doc.querySelectorAll(shortPath);
					if (matches.length === 1 && matches[0] === element) return shortPath;
				} catch {
					// Selector rejected by the engine: fall through to a longer suffix.
				}
			}
			return path.join(" > ");
		}
		/**
		 * Get a human-readable path description.
		 *
		 * Walks up from the element (at most 5 levels, stopping before the
		 * document body) and describes each node as `tag#id`, `tag.firstClass`
		 * or `tag`, followed by `[role="..."]` when a role is set.
		 *
		 * @returns Descriptors ordered from outermost ancestor to the element.
		 */
		static getContextPath(element) {
			const path = [];
			const maxDepth = 5;
			let current = element;
			let depth = 0;
			while (current && current !== element.ownerDocument?.body && depth < maxDepth) {
				const tag = current.nodeName.toLowerCase();
				let descriptor = tag;
				if (current.id) descriptor = `${tag}#${current.id}`;
				else if (current.className && typeof current.className === "string") {
					// Trim and split on any whitespace so leading spaces, tabs or
					// newlines in the attribute do not yield an empty first class.
					const firstClass = current.className.trim().split(/\s+/)[0];
					if (firstClass) descriptor = `${tag}.${firstClass}`;
				}
				const role = current.getAttribute("role");
				if (role) descriptor += `[role="${role}"]`;
				path.unshift(descriptor);
				current = current.parentElement;
				depth++;
			}
			return path;
		}
	};
}) });
421
+
422
+ //#endregion
423
+ //#region src/traversal.ts
424
var DOMTraversal;
var init_traversal = __esm({ "src/traversal.ts": (() => {
	init_selectors();
	/** Attributes copied verbatim (subject to truncation) by getRelevantAttributes. */
	const RELEVANT_ATTRIBUTES = [
		"id",
		"class",
		"name",
		"type",
		"value",
		"placeholder",
		"href",
		"src",
		"alt",
		"title",
		"action",
		"method",
		"aria-label",
		"aria-describedby",
		"aria-controls",
		"role",
		"disabled",
		"readonly",
		"required",
		"checked",
		"min",
		"max",
		"pattern",
		"step",
		"autocomplete",
		"data-testid",
		"data-test",
		"data-cy"
	];
	/**
	 * Run `pattern.test(text)` from a clean state. Regexes with the g or y
	 * flag remember `lastIndex` between calls, so reusing a caller-supplied
	 * pattern across elements would otherwise give position-dependent results.
	 */
	const testPattern = (pattern, text) => {
		pattern.lastIndex = 0;
		const matched = pattern.test(text);
		pattern.lastIndex = 0;
		return matched;
	};
	/** Shorten `value` to `limit` characters, appending "..." when cut. */
	const clip = (value, limit) => value.length > limit ? `${value.substring(0, limit)}...` : value;
	DOMTraversal = class DOMTraversal {
		static INTERACTIVE_SELECTORS = [
			"button",
			"a[href]",
			"input:not([type=\"hidden\"])",
			"textarea",
			"select",
			"[role=\"button\"]",
			"[onclick]",
			"[contenteditable=\"true\"]",
			"summary",
			"[tabindex]:not([tabindex=\"-1\"])"
		];
		static SEMANTIC_SELECTORS = [
			"h1",
			"h2",
			"h3",
			"h4",
			"h5",
			"h6",
			"article",
			"section",
			"nav",
			"aside",
			"main",
			"header",
			"footer",
			"form",
			"table",
			"ul",
			"ol",
			"img[alt]",
			"figure",
			"video",
			"audio",
			"[role=\"navigation\"]",
			"[role=\"main\"]",
			"[role=\"complementary\"]",
			"[role=\"contentinfo\"]"
		];
		/**
		 * Check if an element is visible: non-empty layout box, not
		 * `display: none`, `visibility: hidden` or `opacity: 0`, and attached to
		 * the rendered tree.
		 *
		 * @param computedStyle Optional pre-computed style for the element.
		 * @returns false when no computed style can be obtained.
		 */
		static isVisible(element, computedStyle) {
			const rect = element.getBoundingClientRect();
			const style = computedStyle || element.ownerDocument?.defaultView?.getComputedStyle(element);
			if (!style) return false;
			if (!(rect.width > 0 && rect.height > 0)) return false;
			if (style.display === "none" || style.visibility === "hidden" || style.opacity === "0") return false;
			// offsetParent is also null for position: fixed elements, which are
			// rendered; only treat null as "detached" for other positions.
			return style.position === "fixed" || element.offsetParent !== null;
		}
		/**
		 * Check if an element's bounding box intersects the viewport.
		 *
		 * @param viewport Optional `{ width, height }`; defaults to the owning
		 *   window's inner size (0 x 0 when there is no window).
		 */
		static isInViewport(element, viewport) {
			const rect = element.getBoundingClientRect();
			const view = viewport || {
				width: element.ownerDocument?.defaultView?.innerWidth || 0,
				height: element.ownerDocument?.defaultView?.innerHeight || 0
			};
			return rect.top < view.height && rect.bottom > 0 && rect.left < view.width && rect.right > 0;
		}
		/**
		 * Check if an element passes the filter criteria. All criteria that
		 * are present must hold; list-valued criteria (`includeSelectors`,
		 * `textContains`, `textMatches`, `withinSelectors`, `interactionTypes`)
		 * hold when any entry matches, while `hasAttributes` and
		 * `attributeValues` require every entry to match.
		 *
		 * @returns true when `filter` is missing or every criterion holds.
		 */
		static passesFilter(element, filter) {
			if (!filter) return true;
			if (filter.excludeSelectors?.length) {
				if (filter.excludeSelectors.some((selector) => element.matches(selector))) return false;
			}
			if (filter.includeSelectors?.length) {
				if (!filter.includeSelectors.some((selector) => element.matches(selector))) return false;
			}
			if (filter.tags?.length && !filter.tags.includes(element.tagName.toLowerCase())) return false;
			const rawText = element.textContent || "";
			const textContent = rawText.toLowerCase();
			if (filter.textContains?.length) {
				if (!filter.textContains.some((text) => textContent.includes(text.toLowerCase()))) return false;
			}
			if (filter.textMatches?.length) {
				// Try the original text first so patterns containing capitals can
				// match, then the lower-cased text, which is what was tested before.
				const hit = filter.textMatches.some((pattern) => testPattern(pattern, rawText) || testPattern(pattern, textContent));
				if (!hit) return false;
			}
			if (filter.hasAttributes?.length) {
				if (!filter.hasAttributes.every((attr) => element.hasAttribute(attr))) return false;
			}
			if (filter.attributeValues) for (const [attr, value] of Object.entries(filter.attributeValues)) {
				const attrValue = element.getAttribute(attr);
				// Only a missing attribute fails outright; an empty value is
				// still compared against the expectation.
				if (attrValue == null) return false;
				if (typeof value === "string") {
					if (attrValue !== value) return false;
				} else if (value instanceof RegExp) {
					if (!testPattern(value, attrValue)) return false;
				}
			}
			if (filter.withinSelectors?.length) {
				if (!filter.withinSelectors.some((selector) => element.closest(selector))) return false;
			}
			if (filter.interactionTypes?.length) {
				const interaction = DOMTraversal.getInteractionInfo(element);
				if (!filter.interactionTypes.some((type) => interaction[type])) return false;
			}
			if (filter.nearText) {
				const parent = element.parentElement;
				if (!parent || !parent.textContent?.toLowerCase().includes(filter.nearText.toLowerCase())) return false;
			}
			return true;
		}
		/**
		 * Extract element information.
		 *
		 * @param options Extraction options (`maxDepth`, `includeHidden`,
		 *   `viewportOnly`, `filter`, `mode`, `includeShadowDOM`, truncation
		 *   lengths).
		 * @param depth Current nesting depth; 0 for top-level calls.
		 * @returns The extracted record, or null when the element is too deep,
		 *   hidden, outside the viewport or rejected by the filter. In "full"
		 *   mode semantic containers also get a `children` array.
		 */
		static extractElement(element, options, depth = 0) {
			if (options.maxDepth && depth > options.maxDepth) return null;
			if (!options.includeHidden && !DOMTraversal.isVisible(element)) return null;
			if (options.viewportOnly && !DOMTraversal.isInViewport(element)) return null;
			if (!DOMTraversal.passesFilter(element, options.filter)) return null;
			const extracted = {
				tag: element.tagName.toLowerCase(),
				text: DOMTraversal.getElementText(element, options),
				selector: SelectorGenerator.generateSelectors(element),
				attributes: DOMTraversal.getRelevantAttributes(element, options),
				context: DOMTraversal.getElementContext(element),
				interaction: DOMTraversal.getInteractionInfo(element)
			};
			if (options.mode === "full" && DOMTraversal.isSemanticContainer(element)) {
				const children = [];
				if (options.includeShadowDOM && element.shadowRoot) children.push(...DOMTraversal.extractChildren(element.shadowRoot, options, depth + 1));
				children.push(...DOMTraversal.extractChildren(element, options, depth + 1));
				if (children.length > 0) extracted.children = children;
			}
			return extracted;
		}
		/**
		 * Extract the children of a container.
		 *
		 * Any descendant that has another descendant of `container` as an
		 * ancestor is skipped, so only the container's top-level elements are
		 * passed to {@link extractElement}.
		 */
		static extractChildren(container, options, depth) {
			const children = [];
			// One Set for all membership checks instead of a fresh array per lookup.
			const descendants = new Set(container.querySelectorAll("*"));
			for (const child of descendants) {
				if (DOMTraversal.hasExtractedAncestor(child, descendants)) continue;
				const extracted = DOMTraversal.extractElement(child, options, depth);
				if (extracted) children.push(extracted);
			}
			return children;
		}
		/**
		 * Check if an element has an ancestor contained in `extractedElements`.
		 *
		 * @param extractedElements A Set, NodeList, array or other iterable of
		 *   elements. Pass a Set when calling repeatedly to avoid re-indexing.
		 */
		static hasExtractedAncestor(element, extractedElements) {
			const lookup = extractedElements instanceof Set ? extractedElements : new Set(extractedElements);
			for (let parent = element.parentElement; parent; parent = parent.parentElement) {
				if (lookup.has(parent)) return true;
			}
			return false;
		}
		/**
		 * Get relevant attributes for an element.
		 *
		 * Copies the non-empty well-known attributes (truncated to
		 * `options.attributeTruncateLength`, default 100) and every other
		 * `data-*` attribute (truncated to `options.dataAttributeTruncateLength`,
		 * default 50).
		 */
		static getRelevantAttributes(element, options) {
			const attributes = {};
			const attrTruncate = options.attributeTruncateLength ?? 100;
			const dataAttrTruncate = options.dataAttributeTruncateLength ?? 50;
			for (const attr of RELEVANT_ATTRIBUTES) {
				const value = element.getAttribute(attr);
				if (value) attributes[attr] = clip(value, attrTruncate);
			}
			for (const attr of element.attributes) {
				if (attr.name.startsWith("data-") && !RELEVANT_ATTRIBUTES.includes(attr.name)) attributes[attr.name] = clip(attr.value, dataAttrTruncate);
			}
			return attributes;
		}
		/**
		 * Get element context information: the readable parent chain plus CSS
		 * selectors for the nearest enclosing form, section/region, main and
		 * nav, each only when such an ancestor (or the element itself) exists.
		 */
		static getElementContext(element) {
			const context = { parentChain: SelectorGenerator.getContextPath(element) };
			const lookups = [
				["nearestForm", "form"],
				["nearestSection", "section, [role=\"region\"]"],
				["nearestMain", "main, [role=\"main\"]"],
				["nearestNav", "nav, [role=\"navigation\"]"]
			];
			for (const [key, selector] of lookups) {
				const match = element.closest(selector);
				if (match) context[key] = SelectorGenerator.generateSelectors(match).css;
			}
			return context;
		}
		/**
		 * Get interaction information for an element (compact format).
		 * Only flags that apply are set: `click`, `change`, `submit`, `nav`,
		 * `disabled`, `hidden`, plus `role` and the owning `form` selector.
		 */
		static getInteractionInfo(element) {
			const interaction = {};
			if (element.onclick || element.getAttribute("onclick") || element.matches("button, a[href], [role=\"button\"], [tabindex]:not([tabindex=\"-1\"])")) interaction.click = true;
			if (element.onchange || element.getAttribute("onchange") || element.matches("input, select, textarea")) interaction.change = true;
			if (element.onsubmit || element.getAttribute("onsubmit") || element.matches("form")) interaction.submit = true;
			if (element.matches("a[href], button[type=\"submit\"]")) interaction.nav = true;
			if (element.hasAttribute("disabled") || element.getAttribute("aria-disabled") === "true") interaction.disabled = true;
			if (!DOMTraversal.isVisible(element)) interaction.hidden = true;
			const ariaRole = element.getAttribute("role");
			if (ariaRole) interaction.role = ariaRole;
			if (element.matches("input, textarea, select, button")) {
				const form = element.form || element.closest("form");
				if (form) interaction.form = SelectorGenerator.generateSelectors(form).css;
			}
			return interaction;
		}
		/**
		 * Get the text of an element: value or placeholder for inputs and
		 * textareas, alt text for images, otherwise trimmed text content
		 * truncated to `options.textTruncateLength` when that is set.
		 */
		static getElementText(element, options) {
			if (element.matches("input, textarea")) {
				// NOTE(review): this returns the live value of every input,
				// including type="password" fields, and is not truncated —
				// confirm that exposing typed secrets to consumers is intended.
				return element.value || element.placeholder || "";
			}
			if (element.matches("img")) return element.alt || "";
			const text = element.textContent?.trim() || "";
			const maxLength = options?.textTruncateLength;
			if (maxLength && text.length > maxLength) return `${text.substring(0, maxLength)}...`;
			return text;
		}
		/**
		 * Check if an element is a semantic container (sectioning, list,
		 * form, table, figure, details, dialog or a landmark role).
		 */
		static isSemanticContainer(element) {
			return element.matches("article, section, nav, aside, main, header, footer, form, table, ul, ol, dl, figure, details, dialog, [role=\"region\"], [role=\"navigation\"], [role=\"main\"], [role=\"complementary\"]");
		}
		/**
		 * Get interactive elements: everything matching INTERACTIVE_SELECTORS
		 * followed by matches of `options.customSelectors`. An element matched
		 * by several selectors is extracted only once. A custom selector that
		 * throws is reported with console.warn and skipped.
		 */
		static getInteractiveElements(container, options) {
			const elements = [];
			const seen = new Set();
			const collect = (nodes) => {
				for (const node of Array.from(nodes)) {
					if (seen.has(node)) continue;
					seen.add(node);
					const extracted = DOMTraversal.extractElement(node, options);
					if (extracted) elements.push(extracted);
				}
			};
			collect(container.querySelectorAll(DOMTraversal.INTERACTIVE_SELECTORS.join(", ")));
			if (options.customSelectors) for (const customSelector of options.customSelectors) try {
				collect(container.querySelectorAll(customSelector));
			} catch (_e) {
				console.warn(`Invalid custom selector: ${customSelector}`);
			}
			return elements;
		}
		/**
		 * Get semantic elements (for full mode): every descendant matching
		 * SEMANTIC_SELECTORS that survives extraction.
		 */
		static getSemanticElements(container, options) {
			const elements = [];
			const found = container.querySelectorAll(DOMTraversal.SEMANTIC_SELECTORS.join(", "));
			for (const element of Array.from(found)) {
				const extracted = DOMTraversal.extractElement(element, options);
				if (extracted) elements.push(extracted);
			}
			return elements;
		}
	};
}) });
746
+
747
+ //#endregion
748
+ //#region src/markdown-formatter.ts
749
/**
 * Trim `text` and shorten it to roughly `len` characters.
 *
 * When an action keyword (e.g. "login", "submit") first occurs beyond the
 * leading two thirds of the budget, the result is the head of the text plus
 * a window around that keyword, so the keyword survives truncation.
 * Otherwise the text is cut at `len`, backing up to the last space when that
 * space lies past index 32.
 *
 * @param text Text to shorten; null/undefined is treated as "".
 * @param len Character budget; a falsy value disables truncation.
 * @returns The trimmed and possibly shortened text.
 */
function truncate(text, len) {
	const trimmed = (text ?? "").trim();
	if (!len || trimmed.length <= len) return trimmed;

	const lowered = trimmed.toLowerCase();
	// The first keyword *in list order* that occurs anywhere in the text wins.
	let keywordAt = -1;
	for (const keyword of [
		"login",
		"log in",
		"sign in",
		"sign up",
		"submit",
		"search",
		"filter",
		"add to cart",
		"next",
		"continue"
	]) {
		const position = lowered.indexOf(keyword);
		if (position > -1) {
			keywordAt = position;
			break;
		}
	}

	const head = Math.max(0, Math.floor(len * .66));
	if (keywordAt > head) {
		const tailWindow = Math.max(12, len - head - 5);
		const start = Math.max(0, keywordAt - Math.floor(tailWindow / 2));
		const end = Math.min(trimmed.length, start + tailWindow);
		const leading = trimmed.slice(0, head).trimEnd();
		const around = trimmed.slice(start, end).trim();
		return `${leading} … ${around}…`;
	}

	const clipped = trimmed.slice(0, len);
	const lastSpace = clipped.lastIndexOf(" ");
	const kept = lastSpace > 32 ? clipped.slice(0, lastSpace) : clipped;
	return `${kept}…`;
}
780
/**
 * Return the preferred CSS selector recorded on an extracted element.
 *
 * @param el Extracted element record.
 * @returns `el.selector.css`, or "" when it is missing or empty.
 */
function bestSelector(el) {
	const css = el.selector?.css;
	return css ? css : "";
}
783
/**
 * Derive a short, deterministic identifier from a string.
 *
 * Starts from 5381 and, for each UTF-16 code unit, multiplies the running
 * value by 33 and XORs in the unit. The result is read as an unsigned
 * 32-bit integer and rendered in base 36 behind a "sec-" prefix.
 *
 * @param input String to hash.
 * @returns An id such as "sec-45h".
 */
function hashId(input) {
	let hash = 5381;
	let position = 0;
	while (position < input.length) {
		hash = (hash * 33) ^ input.charCodeAt(position);
		position += 1;
	}
	const unsigned = hash >>> 0;
	return `sec-${unsigned.toString(36)}`;
}
788
/**
 * Map a page-region key to the emoji shown in front of it.
 *
 * @param key Region key such as "header", "main" or "footer".
 * @returns The emoji for the key, or "🔹" for any unrecognized key.
 */
function iconForRegion(key) {
	// A Map avoids accidental hits on inherited object properties and, like
	// the strict comparison of a switch, only matches exact string keys.
	const icons = new Map([
		["header", "🧭"],
		["navigation", "📑"],
		["main", "📄"],
		["sections", "🗂️"],
		["sidebar", "📚"],
		["footer", "🔻"],
		["modals", "💬"]
	]);
	return icons.has(key) ? icons.get(key) : "🔹";
}
800
/**
 * Render one extracted element as a markdown bullet:
 * "- TAG: text → `selector` (action)".
 *
 * The text is the element's own text, falling back to its aria-label. The
 * action suffix is the first of submit / click / change that is set on
 * `el.interaction`, and is omitted when none is.
 *
 * @param el Extracted element record (`tag`, `text`, `attributes`,
 *   `selector`, `interaction`).
 * @param opts Optional settings; `maxTextLength` (default 80) limits the text.
 * @returns The formatted line.
 */
function elementLine(el, opts) {
	// Attributes are stored under their DOM names, so the label lives at
	// "aria-label"; the camelCase key is kept only as a fallback.
	const ariaLabel = el.attributes?.["aria-label"] ?? el.attributes?.ariaLabel;
	const txt = truncate(el.text || ariaLabel, opts?.maxTextLength ?? 80);
	const sel = bestSelector(el);
	const tag = el.tag.toLowerCase();
	const action = el.interaction?.submit ? "submit" : el.interaction?.click ? "click" : el.interaction?.change ? "change" : void 0;
	const actionText = action ? ` (${action})` : "";
	return `- ${tag.toUpperCase()}: ${txt || "(no text)"} → \`${sel}\`${actionText}`;
}
808
/**
 * Summarize how robust the selectors of the interactive elements are.
 *
 * Looks at the CSS selector of every button, link, input and other
 * clickable element and reports the share that start with "#" or contain
 * `[data-testid=` ("stable") and the share that contain `:nth-child(`
 * ("structural"). With no elements both shares are 0%.
 *
 * @param inter Interactive groups: `buttons`, `links`, `inputs`, `clickable`.
 * @returns A one-line, human-readable summary.
 */
function selectorQualitySummary(inter) {
	const selectors = [
		...inter.buttons,
		...inter.links,
		...inter.inputs,
		...inter.clickable
	].map((entry) => entry.selector?.css || "");

	let stable = 0;
	let structural = 0;
	for (const css of selectors) {
		if (css.startsWith("#")) stable += 1;
		if (/\[data-testid=/.test(css)) stable += 1;
		if (/:nth-child\(/.test(css)) structural += 1;
	}

	// Avoid dividing by zero when nothing was collected.
	const total = selectors.length || 1;
	const stablePct = Math.round(stable / total * 100);
	const structuralPct = Math.round(structural / total * 100);
	return `Selector quality: ${stablePct}% stable (ID/data-testid), ${structuralPct}% structural (:nth-child)`;
}
821
/**
 * Render the interactive elements of a page as markdown lines.
 *
 * Emits a heading followed by one bullet per element for each non-empty
 * group, in the order Buttons, Links, Inputs, Other Clickable, Forms.
 *
 * @param inter Groups of extracted elements: `buttons`, `links`, `inputs`,
 *   `clickable` and `forms`.
 * @param opts Optional settings; a numeric `maxElements` caps every group
 *   independently. Also forwarded to the per-element line renderer.
 * @returns The lines joined with "\n" ("" when every group is empty).
 */
function renderInteractive(inter, opts) {
	const cap = (list) => {
		if (typeof opts?.maxElements === "number") return list.slice(0, opts.maxElements);
		return list;
	};
	const lines = [];
	const elementGroups = [
		["Buttons:", inter.buttons],
		["Links:", inter.links],
		["Inputs:", inter.inputs],
		["Other Clickable:", inter.clickable]
	];
	for (const [heading, members] of elementGroups) {
		if (!members.length) continue;
		lines.push(heading);
		cap(members).forEach((member) => {
			lines.push(elementLine(member, opts));
		});
	}
	if (inter.forms.length) {
		lines.push("Forms:");
		cap(inter.forms).forEach((form) => {
			lines.push(`- FORM: action=${form.action ?? "-"} method=${form.method ?? "-"} → \`${form.selector}\``);
		});
	}
	return lines.join("\n");
}
846
/**
 * Render a one-line summary of a page region:
 * "<icon> <label> → `selector` [id] — stats".
 *
 * The id is a hash of the region's selector, label and role. Stats list the
 * non-zero button / link / input counts and a quoted text preview limited
 * to 80 characters; the " — stats" part is omitted when there are none.
 *
 * @param region Region record (`selector`, optional `label`, `role`,
 *   `buttonCount`, `linkCount`, `inputCount`, `textPreview`).
 * @returns The formatted line.
 */
function renderRegionInfo(region) {
	const icon = iconForRegion("region");
	const id = hashId(`${region.selector}|${region.label ?? ""}|${region.role ?? ""}`);

	const stats = [
		[region.buttonCount, "buttons"],
		[region.linkCount, "links"],
		[region.inputCount, "inputs"]
	].filter(([count]) => count).map(([count, noun]) => `${count} ${noun}`);
	if (region.textPreview) stats.push(`“${truncate(region.textPreview, 80)}”`);

	let line = `${icon} `;
	if (region.label) line += ` ${region.label}`;
	line += ` → \`${region.selector}\` [${id}]`;
	if (stats.length) line += ` — ${stats.join(", ")}`;
	return line;
}
858
/**
 * Wrap markdown text in a `<page>` envelope with the body inside a CDATA section.
 *
 * @param {string} body Markdown text to embed.
 * @param {object} [meta] Optional page metadata; `title` and `url` become
 *   escaped attributes on `<page>` when present.
 * @param {string} [type="section"] Name of the inner element holding the body.
 * @returns {string} The XML-wrapped document.
 */
function wrapXml(body, meta, type = "section") {
  const attrs = [
    meta?.title ? `title="${escapeXml(meta?.title)}"` : null,
    meta?.url ? `url="${escapeXml(meta?.url)}"` : null
  ].filter(Boolean).join(" ");
  // Page text may contain the CDATA terminator; split it across two CDATA
  // sections so it cannot close the wrapper early and corrupt the XML.
  const safeBody = String(body).replaceAll("]]>", "]]]]><![CDATA[>");
  return `<page ${attrs}>\n <${type}><![CDATA[\n${safeBody}\n]]></${type}>\n</page>`;
}
861
/**
 * Escape the characters `&`, `<`, `>` and `"` as XML entities.
 *
 * @param {string} s Raw text.
 * @returns {string} Text with those four characters replaced by entities.
 */
function escapeXml(s) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    "\"": "&quot;"
  };
  // A single pass over the input; each special character maps to its entity.
  return s.replace(/[&<>"]/g, (ch) => entities[ch]);
}
864
/**
 * Upper-case the first character of a string and leave the rest unchanged.
 *
 * @param {string} s Input text.
 * @returns {string} The text with its first character upper-cased.
 */
function capitalize(s) {
  const head = s.charAt(0);
  const tail = s.slice(1);
  return `${head.toUpperCase()}${tail}`;
}
867
var MarkdownFormatter;
var init_markdown_formatter = __esm({ "src/markdown-formatter.ts": (() => {
  /**
   * Static formatters that turn extraction results into markdown wrapped in
   * the `<page>` envelope produced by `wrapXml`.
   */
  MarkdownFormatter = class {
    /**
     * Render the page outline: one heading per populated region group,
     * followed by suggestions and a hint about the next step.
     *
     * @param {object} overview Structural overview (`regions`, optional `suggestions`).
     * @param {object} [_opts] Unused.
     * @param {object} [meta] Optional page metadata (`title`, `url`).
     * @returns {string} Wrapped markdown with inner element `outline`.
     */
    static structure(overview, _opts = {}, meta) {
      const out = ["# Page Outline"];
      if (meta?.title || meta?.url) {
        out.push(`Title: ${meta?.title ?? ""}`.trim());
        out.push(`URL: ${meta?.url ?? ""}`.trim());
      }
      out.push("");
      const { regions } = overview;
      const orderedKeys = [
        "header",
        "navigation",
        "main",
        "sections",
        "sidebar",
        "footer",
        "modals"
      ];
      for (const key of orderedKeys) {
        const value = regions[key];
        if (!value) continue;
        const icon = iconForRegion(key);
        // A group is either a single region or a list of regions.
        const group = Array.isArray(value) ? value : [value];
        if (group.length === 0) continue;
        out.push(`## ${icon} ${capitalize(key)}`);
        for (const region of group) out.push(renderRegionInfo(region));
        out.push("");
      }
      if (overview.suggestions?.length) {
        out.push("## Suggestions");
        for (const suggestion of overview.suggestions) out.push(`- ${suggestion}`);
        out.push("");
      }
      out.push("Next: choose a region (by selector or [sectionId]) and call dom_extract_region for actionable details.");
      return wrapXml(out.join("\n"), meta, "outline");
    }
    /**
     * Render the details of one extracted region: page state, element counts,
     * selector quality and the interactive element lists.
     *
     * @param {object} result Extraction result (`interactive`, optional `page`).
     * @param {object} [opts] Formatting options forwarded to `renderInteractive`.
     * @param {object} [meta] Optional page metadata (`title`, `url`).
     * @returns {string} Wrapped markdown with inner element `section`.
     */
    static region(result, opts = {}, meta) {
      const { interactive: inter, page } = result;
      const out = ["# Region Details"];
      if (meta?.title || meta?.url) {
        out.push(`Title: ${meta?.title ?? ""}`.trim());
        out.push(`URL: ${meta?.url ?? ""}`.trim());
      }
      out.push("");
      if (page) {
        const flag = (name, on) => `${name}: ${on ? "yes" : "no"}`;
        const states = [
          flag("errors", page.hasErrors),
          flag("loading", page.isLoading),
          flag("modals", page.hasModals)
        ];
        out.push(`Page state: ${states.join(", ")}`);
      }
      const sizeOf = (arr) => arr ? arr.length : 0;
      const totals = [
        `${sizeOf(inter.buttons)} buttons`,
        `${sizeOf(inter.links)} links`,
        `${sizeOf(inter.inputs)} inputs`
      ];
      // Forms are only mentioned when at least one exists.
      if (inter.forms?.length) totals.push(`${sizeOf(inter.forms)} forms`);
      out.push(`Summary: ${totals.join(", ")}`);
      out.push(selectorQualitySummary(inter));
      out.push("");
      out.push(renderInteractive(inter, opts));
      out.push("");
      out.push("Next: write a script using the most stable selectors above. If selectors look unstable, rerun dom_extract_region with higher detail or call dom_extract_content for text context.");
      return wrapXml(out.join("\n"), meta, "section");
    }
    /**
     * Render readable content: headings, paragraphs, lists, tables and media.
     *
     * @param {object} content Content result (`selector`, `text`, optional
     *   `tables` and `media`).
     * @param {object} [opts] Formatting options; a numeric `maxElements` caps
     *   paragraphs, list items, table rows and media entries, and
     *   `maxTextLength` overrides the per-kind truncation length.
     * @param {object} [meta] Optional page metadata (`title`, `url`).
     * @returns {string} Wrapped markdown with inner element `content`.
     */
    static content(content, opts = {}, meta) {
      const out = ["# Content", `Selector: \`${content.selector}\``, ""];
      // Evaluated lazily so `opts` is only touched when a section is rendered.
      const capTo = (fullLength) => typeof opts.maxElements === "number" ? opts.maxElements : fullLength;
      const { text } = content;
      if (text.headings?.length) {
        out.push("Headings:");
        for (const heading of text.headings) {
          out.push(`- H${heading.level}: ${truncate(heading.text, opts.maxTextLength ?? 120)}`);
        }
        out.push("");
      }
      if (text.paragraphs?.length) {
        const cap = capTo(text.paragraphs.length);
        out.push("Paragraphs:");
        for (const paragraph of text.paragraphs.slice(0, cap)) {
          out.push(`- ${truncate(paragraph, opts.maxTextLength ?? 200)}`);
        }
        out.push("");
      }
      if (text.lists?.length) {
        out.push("Lists:");
        for (const list of text.lists) {
          out.push(`- ${list.type.toUpperCase()}:`);
          const cap = capTo(list.items.length);
          for (const item of list.items.slice(0, cap)) {
            out.push(` - ${truncate(item, opts.maxTextLength ?? 120)}`);
          }
        }
        out.push("");
      }
      if (content.tables?.length) {
        out.push("Tables:");
        for (const table of content.tables) {
          out.push(`- Headers: ${table.headers.join(" | ")}`);
          const cap = capTo(table.rows.length);
          for (const row of table.rows.slice(0, cap)) out.push(` - ${row.join(" | ")}`);
        }
        out.push("");
      }
      if (content.media?.length) {
        out.push("Media:");
        const cap = capTo(content.media.length);
        for (const item of content.media.slice(0, cap)) {
          const target = item.src ? `→ ${item.src}` : "";
          out.push(`- ${item.type.toUpperCase()}: ${item.alt ?? ""} ${target}`.trim());
        }
        out.push("");
      }
      out.push("Next: if text is insufficient for targeting, call dom_extract_region for interactive selectors.");
      return wrapXml(out.join("\n"), meta, "content");
    }
  };
}) });
985
+
986
+ //#endregion
987
+ //#region src/progressive.ts
988
/**
 * Locate the SmartDOMReader constructor at runtime.
 *
 * Lookup order: `window.SmartDOMReader`, then
 * `window.SmartDOMReaderNamespace.SmartDOMReader`, then the bundled module's
 * `SmartDOMReader` or `default` export.
 *
 * @returns {Function|undefined} The constructor, or `undefined` when none is found.
 */
function resolveSmartDomReader() {
  const isCtor = (candidate) => typeof candidate === "function";
  if (typeof window !== "undefined") {
    const host = window;
    const direct = host.SmartDOMReader;
    if (isCtor(direct)) return direct;
    const ns = host.SmartDOMReaderNamespace;
    if (ns && isCtor(ns.SmartDOMReader)) return ns.SmartDOMReader;
  }
  try {
    if (typeof __require === "function") {
      const bundled = (init_src(), __toCommonJS(src_exports));
      if (bundled) {
        if (isCtor(bundled.SmartDOMReader)) return bundled.SmartDOMReader;
        if (isCtor(bundled.default)) return bundled.default;
      }
    }
  } catch {
    // Best-effort lookup: any failure here simply means "not found".
  }
  return undefined;
}
1004
var ProgressiveExtractor;
var init_progressive = __esm({ "src/progressive.ts": (() => {
  init_content_detection();
  init_selectors();
  init_traversal();
  /**
   * Static helpers for exploring a page step by step: a structural overview
   * first, then details of one region, then the readable content of a region.
   */
  ProgressiveExtractor = class ProgressiveExtractor {
    /**
     * Step 1: Extract high-level structural overview
     * This provides a "map" of the page for the AI to understand structure
     *
     * @param root Document or element to analyze.
     * @returns Object with `regions`, `forms`, `summary` and `suggestions`.
     */
    static extractStructure(root) {
      const regions = {};
      const header = root.querySelector("header, [role=\"banner\"], .header, #header");
      if (header) regions.header = ProgressiveExtractor.analyzeRegion(header);
      const navs = root.querySelectorAll("nav, [role=\"navigation\"], .nav, .navigation");
      if (navs.length > 0) regions.navigation = Array.from(navs).map((nav) => ProgressiveExtractor.analyzeRegion(nav));
      // For a document the main area is detected; for an element the element
      // itself plays the role of the main area.
      const mainElement = root instanceof Document ? ContentDetection.findMainContent(root) : root;
      if (mainElement) {
        regions.main = ProgressiveExtractor.analyzeRegion(mainElement);
        const sections = mainElement.querySelectorAll("section, article, [role=\"region\"]");
        if (sections.length > 0) {
          regions.sections = Array.from(sections)
            .filter((section) => !section.closest("nav, header, footer"))
            .map((section) => ProgressiveExtractor.analyzeRegion(section));
        }
      }
      const sidebars = root.querySelectorAll("aside, [role=\"complementary\"], .sidebar, #sidebar");
      if (sidebars.length > 0) regions.sidebar = Array.from(sidebars).map((sidebar) => ProgressiveExtractor.analyzeRegion(sidebar));
      const footer = root.querySelector("footer, [role=\"contentinfo\"], .footer, #footer");
      if (footer) regions.footer = ProgressiveExtractor.analyzeRegion(footer);
      const modals = root.querySelectorAll("[role=\"dialog\"], .modal, .popup, .overlay");
      const visibleModals = Array.from(modals).filter((modal) => DOMTraversal.isVisible(modal));
      if (visibleModals.length > 0) regions.modals = visibleModals.map((modal) => ProgressiveExtractor.analyzeRegion(modal));
      const forms = ProgressiveExtractor.extractFormOverview(root);
      const summary = ProgressiveExtractor.calculateSummary(root, regions, forms);
      return {
        regions,
        forms,
        summary,
        suggestions: ProgressiveExtractor.generateSuggestions(regions, summary)
      };
    }
    /**
     * Step 2: Extract detailed information from a specific region
     *
     * @param selector CSS selector of the region.
     * @param doc Document (or element) to query.
     * @param options Options passed both to the reader constructor and to `extract`.
     * @param smartDomReaderCtor Optional reader constructor; resolved at runtime when omitted.
     * @returns The reader's extraction result, or `null` when the selector matches nothing.
     * @throws {Error} When no SmartDOMReader constructor can be resolved.
     */
    static extractRegion(selector, doc, options = {}, smartDomReaderCtor) {
      const element = doc.querySelector(selector);
      if (!element) return null;
      const SmartDOMReaderCtor = smartDomReaderCtor ?? resolveSmartDomReader();
      if (!SmartDOMReaderCtor) throw new Error("SmartDOMReader is unavailable. Ensure the Smart DOM Reader module is loaded before calling extractRegion.");
      return new SmartDOMReaderCtor(options).extract(element, options);
    }
    /**
     * Step 3: Extract readable content from a region
     *
     * @param selector CSS selector of the region.
     * @param doc Document (or element) to query.
     * @param options `includeHeadings`, `includeLists`, `includeTables` and
     *   `includeMedia` default to enabled (only an explicit `false` disables
     *   them); `maxTextLength` truncates heading, paragraph and list text.
     * @returns Content result, or `null` when the selector matches nothing.
     */
    static extractContent(selector, doc, options = {}) {
      const element = doc.querySelector(selector);
      if (!element) return null;
      const result = {
        selector,
        text: {},
        metadata: {
          wordCount: 0,
          hasInteractive: false
        }
      };
      if (options.includeHeadings !== false) {
        const headings = element.querySelectorAll("h1, h2, h3, h4, h5, h6");
        result.text.headings = Array.from(headings).map((h) => ({
          // tagName is "H1".."H6"; the second character is the level.
          level: Number.parseInt(h.tagName[1], 10),
          text: ProgressiveExtractor.getTextContent(h, options.maxTextLength)
        }));
      }
      const paragraphs = element.querySelectorAll("p");
      if (paragraphs.length > 0) {
        result.text.paragraphs = Array.from(paragraphs)
          .map((p) => ProgressiveExtractor.getTextContent(p, options.maxTextLength))
          .filter((text) => text.length > 0);
      }
      if (options.includeLists !== false) {
        const lists = element.querySelectorAll("ul, ol");
        result.text.lists = Array.from(lists).map((list) => ({
          type: list.tagName.toLowerCase(),
          items: Array.from(list.querySelectorAll("li")).map((li) => ProgressiveExtractor.getTextContent(li, options.maxTextLength))
        }));
      }
      if (options.includeTables !== false) {
        const tables = element.querySelectorAll("table");
        result.tables = Array.from(tables).map((table) => ({
          headers: Array.from(table.querySelectorAll("th")).map((th) => ProgressiveExtractor.getTextContent(th)),
          // Only rows that contain at least one data cell are reported.
          rows: Array.from(table.querySelectorAll("tr"))
            .filter((tr) => tr.querySelector("td"))
            .map((tr) => Array.from(tr.querySelectorAll("td")).map((td) => ProgressiveExtractor.getTextContent(td)))
        }));
      }
      if (options.includeMedia !== false) {
        const images = element.querySelectorAll("img");
        const videos = element.querySelectorAll("video");
        const audios = element.querySelectorAll("audio");
        result.media = [
          ...Array.from(images).map((img) => {
            const item = { type: "img" };
            const alt = img.getAttribute("alt");
            const src = img.getAttribute("src");
            if (alt) item.alt = alt;
            if (src) item.src = src;
            return item;
          }),
          ...Array.from(videos).map((video) => {
            const item = { type: "video" };
            const src = video.getAttribute("src");
            if (src) item.src = src;
            return item;
          }),
          ...Array.from(audios).map((audio) => {
            const item = { type: "audio" };
            const src = audio.getAttribute("src");
            if (src) item.src = src;
            return item;
          })
        ];
      }
      // Splitting an empty string still yields one (empty) token, so empty or
      // whitespace-only text must be reported as zero words explicitly.
      const trimmedText = (element.textContent || "").trim();
      result.metadata.wordCount = trimmedText ? trimmedText.split(/\s+/).length : 0;
      result.metadata.hasInteractive = element.querySelectorAll("button, a, input, textarea, select").length > 0;
      return result;
    }
    /**
     * Analyze a region and extract summary information
     *
     * The label is taken from `aria-label`, else from the elements referenced
     * by `aria-labelledby` (a space-separated list of ids), else from the
     * first `h1`/`h2`/`h3` inside the region.
     *
     * @param element Region element.
     * @returns Region summary with selector, counts, flags and optional label,
     *   role and text preview.
     */
    static analyzeRegion(element) {
      const selector = SelectorGenerator.generateSelectors(element).css;
      const buttons = element.querySelectorAll("button, [role=\"button\"]");
      const links = element.querySelectorAll("a[href]");
      const inputs = element.querySelectorAll("input, textarea, select");
      const forms = element.querySelectorAll("form");
      const lists = element.querySelectorAll("ul, ol");
      const tables = element.querySelectorAll("table");
      const media = element.querySelectorAll("img, video, audio");
      const interactiveCount = buttons.length + links.length + inputs.length;
      let label;
      const ariaLabel = element.getAttribute("aria-label");
      const labelledBy = element.getAttribute("aria-labelledby");
      if (ariaLabel) label = ariaLabel;
      else if (labelledBy) {
        const ownerDoc = element.ownerDocument;
        // aria-labelledby may reference several ids; join the text of every
        // element that resolves.
        const referencedText = labelledBy
          .trim()
          .split(/\s+/)
          .map((id) => ownerDoc?.getElementById(id)?.textContent?.trim())
          .filter(Boolean)
          .join(" ");
        if (referencedText) label = referencedText;
      }
      if (!label) {
        // Fall back to the first prominent heading, also when the
        // aria-labelledby references could not be resolved.
        const heading = element.querySelector("h1, h2, h3");
        if (heading) label = heading.textContent?.trim();
      }
      const textContent = element.textContent?.trim() || "";
      const textPreview = textContent.length > 50 ? `${textContent.substring(0, 50)}...` : textContent;
      const regionInfo = {
        selector,
        interactiveCount,
        hasForm: forms.length > 0,
        hasList: lists.length > 0,
        hasTable: tables.length > 0,
        hasMedia: media.length > 0
      };
      if (label) regionInfo.label = label;
      const role = element.getAttribute("role");
      if (role) regionInfo.role = role;
      if (buttons.length > 0) regionInfo.buttonCount = buttons.length;
      if (links.length > 0) regionInfo.linkCount = links.length;
      if (inputs.length > 0) regionInfo.inputCount = inputs.length;
      if (textPreview.length > 0) regionInfo.textPreview = textPreview;
      return regionInfo;
    }
    /**
     * Extract overview of forms on the page
     *
     * @param root Document or element to search.
     * @returns One entry per form with `selector`, `location`, `inputCount`
     *   and, when it can be guessed from markup, `purpose`.
     */
    static extractFormOverview(root) {
      const forms = root.querySelectorAll("form");
      return Array.from(forms).map((form) => {
        const inputs = form.querySelectorAll("input, textarea, select");
        const selector = SelectorGenerator.generateSelectors(form).css;
        let location = "unknown";
        if (form.closest("header, [role=\"banner\"]")) location = "header";
        else if (form.closest("nav, [role=\"navigation\"]")) location = "navigation";
        else if (form.closest("main, [role=\"main\"]")) location = "main";
        else if (form.closest("aside, [role=\"complementary\"]")) location = "sidebar";
        else if (form.closest("footer, [role=\"contentinfo\"]")) location = "footer";
        let purpose;
        const formId = form.getAttribute("id")?.toLowerCase();
        const formClass = form.getAttribute("class")?.toLowerCase();
        const formAction = form.getAttribute("action")?.toLowerCase();
        const hasEmail = form.querySelector("input[type=\"email\"]");
        const hasPassword = form.querySelector("input[type=\"password\"]");
        // Heuristics are checked in priority order; the first match wins.
        if (form.querySelector("input[type=\"search\"]") || formId?.includes("search") || formClass?.includes("search")) purpose = "search";
        else if (hasPassword && hasEmail) purpose = "login";
        else if (hasPassword) purpose = "authentication";
        else if (formId?.includes("contact") || formClass?.includes("contact")) purpose = "contact";
        else if (formId?.includes("subscribe") || formClass?.includes("subscribe")) purpose = "subscription";
        else if (formAction?.includes("checkout") || formClass?.includes("checkout")) purpose = "checkout";
        const formOverview = {
          selector,
          location,
          inputCount: inputs.length
        };
        if (purpose) formOverview.purpose = purpose;
        return formOverview;
      });
    }
    /**
     * Report whether any element matching any of the selectors is visible.
     *
     * Every match is inspected, so a hidden first match does not mask a
     * visible later one.
     *
     * @param root Document or element to search.
     * @param selectors CSS selectors to try.
     * @returns `true` when at least one visible match exists.
     */
    static hasVisibleMatch(root, selectors) {
      return selectors.some((sel) => Array.from(root.querySelectorAll(sel)).some((el) => DOMTraversal.isVisible(el)));
    }
    /**
     * Calculate summary statistics
     *
     * @param root Document or element to search.
     * @param regions Regions collected by `extractStructure`.
     * @param forms Form overview list.
     * @returns Totals plus `hasModals`, `hasErrors`, `isLoading` and, when a
     *   main region exists, `mainContentSelector`.
     */
    static calculateSummary(root, regions, forms) {
      const allInteractive = root.querySelectorAll("button, a[href], input, textarea, select");
      const allSections = root.querySelectorAll("section, article, [role=\"region\"]");
      const hasModals = (regions.modals?.length || 0) > 0;
      const hasErrors = ProgressiveExtractor.hasVisibleMatch(root, [
        ".error",
        ".alert-danger",
        "[role=\"alert\"]"
      ]);
      const isLoading = ProgressiveExtractor.hasVisibleMatch(root, [
        ".loading",
        ".spinner",
        "[aria-busy=\"true\"]"
      ]);
      const summary = {
        totalInteractive: allInteractive.length,
        totalForms: forms.length,
        totalSections: allSections.length,
        hasModals,
        hasErrors,
        isLoading
      };
      const mainContentSelector = regions.main?.selector;
      if (mainContentSelector) summary.mainContentSelector = mainContentSelector;
      return summary;
    }
    /**
     * Generate AI-friendly suggestions
     *
     * @param regions Regions collected by `extractStructure`.
     * @param summary Summary produced by `calculateSummary`.
     * @returns List of hint strings (possibly empty).
     */
    static generateSuggestions(regions, summary) {
      const suggestions = [];
      if (summary.hasErrors) suggestions.push("Page has error indicators - check error messages before interacting");
      if (summary.isLoading) suggestions.push("Page appears to be loading - wait or check loading state");
      if (summary.hasModals) suggestions.push("Modal/dialog is open - may need to interact with or close it first");
      if (regions.main && regions.main.interactiveCount > 10) suggestions.push(`Main content has ${regions.main.interactiveCount} interactive elements - consider filtering`);
      if (summary.totalForms > 0) suggestions.push(`Found ${summary.totalForms} form(s) on the page`);
      if (!regions.main) suggestions.push("No clear main content area detected - may need to explore regions");
      return suggestions;
    }
    /**
     * Get text content with optional truncation
     *
     * @param element Element whose trimmed `textContent` is read.
     * @param maxLength When set and exceeded, the text is cut to this length
     *   and "..." is appended.
     * @returns The (possibly truncated) text; empty string when there is none.
     */
    static getTextContent(element, maxLength) {
      const text = element.textContent?.trim() || "";
      if (maxLength && text.length > maxLength) return `${text.substring(0, maxLength)}...`;
      return text;
    }
  };
}) });
1267
+
1268
+ //#endregion
1269
+ //#region src/types.ts
1270
// The bundled initializer for src/types.ts has an empty body, so running it is a no-op.
var init_types = __esm({
  "src/types.ts": () => {}
});
1271
+
1272
+ //#endregion
1273
+ //#region src/index.ts
1274
// Export table for src/index.ts. Each entry is a getter function, so the
// bindings are read at access time rather than when this table is created.
var src_exports = /* @__PURE__ */ __export({
  ContentDetection() {
    return ContentDetection;
  },
  MarkdownFormatter() {
    return MarkdownFormatter;
  },
  ProgressiveExtractor() {
    return ProgressiveExtractor;
  },
  SelectorGenerator() {
    return SelectorGenerator;
  },
  SmartDOMReader() {
    return SmartDOMReader;
  },
  default() {
    return src_default;
  }
});
1282
var SmartDOMReader, src_default;
var init_src = __esm({ "src/index.ts": (() => {
  init_content_detection();
  init_selectors();
  init_traversal();
  init_markdown_formatter();
  init_progressive();
  init_types();
  /**
   * Extracts page state, landmarks and interactive elements (plus semantic
   * elements and metadata in "full" mode) from a document or element.
   */
  SmartDOMReader = class SmartDOMReader {
    options;
    /**
     * @param options Extraction options. Unset values fall back to: mode
     *   "interactive", maxDepth 5, includeShadowDOM true, every other flag
     *   false, and no custom selectors. Truncation lengths and `filter` are
     *   only stored when provided.
     */
    constructor(options = {}) {
      this.options = {
        mode: options.mode || "interactive",
        maxDepth: options.maxDepth || 5,
        includeHidden: options.includeHidden || false,
        includeShadowDOM: options.includeShadowDOM ?? true,
        includeIframes: options.includeIframes || false,
        viewportOnly: options.viewportOnly || false,
        mainContentOnly: options.mainContentOnly || false,
        customSelectors: options.customSelectors || [],
        ...options.attributeTruncateLength !== void 0 && { attributeTruncateLength: options.attributeTruncateLength },
        ...options.dataAttributeTruncateLength !== void 0 && { dataAttributeTruncateLength: options.dataAttributeTruncateLength },
        ...options.textTruncateLength !== void 0 && { textTruncateLength: options.textTruncateLength },
        ...options.filter !== void 0 && { filter: options.filter }
      };
    }
    /**
     * Main extraction method - extracts all data in one pass
     * @param rootElement The document or element to extract from
     * @param runtimeOptions Options to override constructor options
     * @returns Result with `mode`, `timestamp`, `page`, `landmarks` and
     *   `interactive`; "full" mode adds `semantic` and `metadata`.
     */
    extract(rootElement = document, runtimeOptions) {
      const startTime = Date.now();
      const isDocument = rootElement instanceof Document;
      const doc = isDocument ? rootElement : rootElement.ownerDocument;
      const options = {
        ...this.options,
        ...runtimeOptions
      };
      let container = rootElement;
      // Fall back to the whole document when no main content area is detected,
      // instead of failing on a missing container further down.
      if (options.mainContentOnly && isDocument) container = ContentDetection.findMainContent(doc) || doc;
      const pageState = this.extractPageState(doc);
      const landmarks = this.extractLandmarks(doc);
      const interactive = this.extractInteractiveElements(container, options);
      const result = {
        mode: options.mode,
        timestamp: startTime,
        page: pageState,
        landmarks,
        interactive
      };
      if (options.mode === "full") {
        const semantic = this.extractSemanticElements(container, options);
        const metadata = this.extractMetadata(doc, container, options);
        return {
          ...result,
          semantic,
          metadata
        };
      }
      return result;
    }
    /**
     * Extract page state information
     * @param doc The document to inspect
     * @returns URL, title, error/loading/modal flags and, when an element
     *   other than the body is focused, its selector as `hasFocus`.
     */
    extractPageState(doc) {
      const hasFocus = this.getFocusedElement(doc);
      return {
        url: doc.location?.href || "",
        title: doc.title || "",
        hasErrors: this.detectErrors(doc),
        isLoading: this.detectLoading(doc),
        hasModals: this.detectModals(doc),
        ...hasFocus !== void 0 && { hasFocus }
      };
    }
    /**
     * Extract page landmarks
     * @param doc The document to inspect
     * @returns Selector lists grouped by landmark kind.
     */
    extractLandmarks(doc) {
      const detected = ContentDetection.detectLandmarks(doc);
      return {
        navigation: this.elementsToSelectors(detected.navigation || []),
        main: this.elementsToSelectors(detected.main || []),
        forms: this.elementsToSelectors(detected.form || []),
        headers: this.elementsToSelectors(detected.banner || []),
        footers: this.elementsToSelectors(detected.contentinfo || []),
        // NOTE(review): articles and sections both read `detected.region`, so
        // the two lists are always identical - confirm whether articles was
        // meant to use a different landmark key.
        articles: this.elementsToSelectors(detected.region || []),
        sections: this.elementsToSelectors(detected.region || [])
      };
    }
    /**
     * Convert elements to selector strings
     * @param elements Elements to convert
     * @returns The CSS selector generated for each element.
     */
    elementsToSelectors(elements) {
      return elements.map((el) => SelectorGenerator.generateSelectors(el).css);
    }
    /**
     * Query a container and extract every match that passes the inclusion rules.
     * @param container Document or element to query
     * @param selector CSS selector to match
     * @param options Effective extraction options
     * @returns Extracted element descriptions, in document order.
     */
    collectElements(container, selector, options) {
      const collected = [];
      container.querySelectorAll(selector).forEach((el) => {
        if (!this.shouldIncludeElement(el, options)) return;
        const extracted = DOMTraversal.extractElement(el, options);
        if (extracted) collected.push(extracted);
      });
      return collected;
    }
    /**
     * Extract interactive elements
     * @param container Document or element to query
     * @param options Effective extraction options
     * @returns Buttons, links, inputs, forms and custom-selector matches (`clickable`).
     */
    extractInteractiveElements(container, options) {
      const buttons = this.collectElements(container, "button, [role=\"button\"], input[type=\"button\"], input[type=\"submit\"]", options);
      const links = this.collectElements(container, "a[href]", options);
      const inputs = this.collectElements(container, "input:not([type=\"button\"]):not([type=\"submit\"]), textarea, select", options);
      const clickable = [];
      if (options.customSelectors) options.customSelectors.forEach((selector) => {
        clickable.push(...this.collectElements(container, selector, options));
      });
      return {
        buttons,
        links,
        inputs,
        forms: this.extractForms(container, options),
        clickable
      };
    }
    /**
     * Extract form information
     * @param container Document or element to query
     * @param options Effective extraction options
     * @returns One entry per included form with its selector, inputs, buttons
     *   and, when present, `action` and `method`.
     */
    extractForms(container, options) {
      const forms = [];
      container.querySelectorAll("form").forEach((form) => {
        if (!this.shouldIncludeElement(form, options)) return;
        // Controls inside an included form are extracted without the
        // per-element inclusion check.
        const formInputs = [];
        const formButtons = [];
        form.querySelectorAll("input:not([type=\"button\"]):not([type=\"submit\"]), textarea, select").forEach((input) => {
          const extracted = DOMTraversal.extractElement(input, options);
          if (extracted) formInputs.push(extracted);
        });
        form.querySelectorAll("button, input[type=\"button\"], input[type=\"submit\"]").forEach((button) => {
          const extracted = DOMTraversal.extractElement(button, options);
          if (extracted) formButtons.push(extracted);
        });
        const action = form.getAttribute("action");
        const method = form.getAttribute("method");
        const formInfo = {
          selector: SelectorGenerator.generateSelectors(form).css,
          inputs: formInputs,
          buttons: formButtons
        };
        if (action) formInfo.action = action;
        if (method) formInfo.method = method;
        forms.push(formInfo);
      });
      return forms;
    }
    /**
     * Extract semantic elements (full mode only)
     * @param container Document or element to query
     * @param options Effective extraction options
     * @returns Headings, images, tables, lists and articles.
     */
    extractSemanticElements(container, options) {
      return {
        headings: this.collectElements(container, "h1, h2, h3, h4, h5, h6", options),
        images: this.collectElements(container, "img", options),
        tables: this.collectElements(container, "table", options),
        lists: this.collectElements(container, "ul, ol", options),
        articles: this.collectElements(container, "article, [role=\"article\"]", options)
      };
    }
    /**
     * Extract metadata
     * @param doc The owning document
     * @param container Document or element that was extracted from
     * @param options Effective extraction options
     * @returns Element totals plus optional `mainContent` selector and `language`.
     */
    extractMetadata(doc, container, options) {
      const allElements = container.querySelectorAll("*");
      // Counts every element matching the extractable tag list, regardless of
      // visibility or filter settings.
      const extractedElements = container.querySelectorAll("button, a, input, textarea, select, h1, h2, h3, h4, h5, h6, img, table, ul, ol, article").length;
      const metadata = {
        totalElements: allElements.length,
        extractedElements
      };
      if (options.mainContentOnly && container instanceof Element) metadata.mainContent = SelectorGenerator.generateSelectors(container).css;
      const language = doc.documentElement.getAttribute("lang");
      if (language) metadata.language = language;
      return metadata;
    }
    /**
     * Check if element should be included based on options
     * @param element Candidate element
     * @param options Effective extraction options
     * @returns `false` when hidden (unless `includeHidden`), outside the
     *   viewport (when `viewportOnly`) or rejected by `filter`.
     */
    shouldIncludeElement(element, options) {
      if (!options.includeHidden && !DOMTraversal.isVisible(element)) return false;
      if (options.viewportOnly && !DOMTraversal.isInViewport(element)) return false;
      if (options.filter && !DOMTraversal.passesFilter(element, options.filter)) return false;
      return true;
    }
    /**
     * Report whether any element matching any of the selectors is visible.
     * Every match is inspected, so a hidden first match does not mask a
     * visible later one.
     * @param doc Document to search
     * @param selectors CSS selectors to try
     * @returns `true` when at least one visible match exists.
     */
    hasVisibleMatch(doc, selectors) {
      return selectors.some((sel) => Array.from(doc.querySelectorAll(sel)).some((el) => DOMTraversal.isVisible(el)));
    }
    /**
     * Detect errors on the page
     * @param doc Document to search
     * @returns `true` when a visible error indicator exists.
     */
    detectErrors(doc) {
      return this.hasVisibleMatch(doc, [
        ".error",
        ".alert-danger",
        "[role=\"alert\"]",
        ".error-message"
      ]);
    }
    /**
     * Detect if page is loading
     * @param doc Document to search
     * @returns `true` when a visible loading indicator exists.
     */
    detectLoading(doc) {
      return this.hasVisibleMatch(doc, [
        ".loading",
        ".spinner",
        "[aria-busy=\"true\"]",
        ".loader"
      ]);
    }
    /**
     * Detect modal dialogs
     * @param doc Document to search
     * @returns `true` when a visible modal-like element exists.
     */
    detectModals(doc) {
      return this.hasVisibleMatch(doc, [
        "[role=\"dialog\"]",
        ".modal",
        ".popup",
        ".overlay"
      ]);
    }
    /**
     * Get currently focused element
     * @param doc Document to inspect
     * @returns Selector of the focused element, or `undefined` when nothing
     *   other than the body has focus.
     */
    getFocusedElement(doc) {
      const focused = doc.activeElement;
      if (focused && focused !== doc.body) return SelectorGenerator.generateSelectors(focused).css;
      return undefined;
    }
    /**
     * Quick extraction for interactive elements only
     * @param doc The document to extract from
     * @param options Extraction options
     */
    static extractInteractive(doc, options = {}) {
      return new SmartDOMReader({
        ...options,
        mode: "interactive"
      }).extract(doc);
    }
    /**
     * Quick extraction for full content
     * @param doc The document to extract from
     * @param options Extraction options
     */
    static extractFull(doc, options = {}) {
      return new SmartDOMReader({
        ...options,
        mode: "full"
      }).extract(doc);
    }
    /**
     * Extract from a specific element
     * @param element The element to extract from
     * @param mode The extraction mode
     * @param options Additional options
     */
    static extractFromElement(element, mode = "interactive", options = {}) {
      return new SmartDOMReader({
        ...options,
        mode
      }).extract(element);
    }
  };
  src_default = SmartDOMReader;
}) });
1607
+
1608
+ //#endregion
1609
+ init_src();
1610
+ export { ContentDetection, MarkdownFormatter, ProgressiveExtractor, SelectorGenerator, SmartDOMReader, src_default as default };
1611
+ //# sourceMappingURL=index.js.map