page-analyzer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,215 @@
1
+ /**
2
+ * Playwright-based page extractor.
3
+ * Launches headless Chromium, navigates to URL, scrolls, extracts blocks + element geometries + HTML.
4
+ */
5
+
6
+ // In-browser block extraction function (serialized into page.evaluate)
7
+ // Imported from the project's extract-blocks script
8
+ import {
9
+ extractBlocksInBrowser,
10
+ scrollToBottom,
11
+ waitForStableHeight
12
+ } from './vendor/extract-blocks.js';
13
+
14
/**
 * Playwright-based page extractor.
 *
 * Launches headless Chromium, navigates to a URL, scrolls the page, reveals
 * hidden content, and returns the page HTML, the extracted blocks, the
 * geometry of interactive elements, and the document size.
 */
export class PageExtractor {
  /**
   * Build an extractor with sanitized settings. Every option falls back to
   * its default when it is missing or of the wrong numeric kind, and is
   * clamped to a minimum (and, for `minBlockWidthRatio`, a maximum of 1).
   *
   * @param {Object} [config]
   * @param {number} [config.timeoutMs=30000] - integer, at least 1000
   * @param {number} [config.viewportWidth=1440] - integer, at least 320
   * @param {number} [config.viewportHeight=900] - integer, at least 400
   * @param {number} [config.minBlockHeight=40] - integer, at least 1
   * @param {number} [config.minBlockWidthRatio=0.25] - finite, clamped to [0.05, 1]
   * @param {number} [config.blockMaxHeightRatio=1.5] - finite, at least 0.5
   * @param {number} [config.blockMaxDepth=15] - integer, at least 1
   * @param {number} [config.textPreviewMaxChars=1200] - integer, at least 120
   */
  constructor(config = {}) {
    // A default parameter only covers `undefined`; coalesce so that an
    // explicit `null` also yields the defaults instead of a TypeError.
    const options = config ?? {};
    const intAtLeast = (value, min, fallback) => (
      Number.isInteger(value) ? Math.max(min, value) : fallback
    );

    this.config = {
      timeoutMs: intAtLeast(options.timeoutMs, 1000, 30000),
      viewportWidth: intAtLeast(options.viewportWidth, 320, 1440),
      viewportHeight: intAtLeast(options.viewportHeight, 400, 900),
      minBlockHeight: intAtLeast(options.minBlockHeight, 1, 40),
      minBlockWidthRatio: Number.isFinite(options.minBlockWidthRatio)
        ? Math.min(1, Math.max(0.05, options.minBlockWidthRatio))
        : 0.25,
      blockMaxHeightRatio: Number.isFinite(options.blockMaxHeightRatio)
        ? Math.max(0.5, options.blockMaxHeightRatio)
        : 1.5,
      blockMaxDepth: intAtLeast(options.blockMaxDepth, 1, 15),
      textPreviewMaxChars: intAtLeast(options.textPreviewMaxChars, 120, 1200),
    };
    this.playwrightModule = null;
  }

  /**
   * Import `playwright` on first use and cache the module on the instance.
   * @returns {Promise<Object>} the module's default export, or the module
   *   namespace when there is no default export
   */
  async getPlaywright() {
    if (this.playwrightModule) {
      return this.playwrightModule;
    }
    const mod = await import('playwright');
    this.playwrightModule = mod.default || mod;
    return this.playwrightModule;
  }

  /**
   * Force content-bearing hidden elements to become visible in the page.
   *
   * - Elements with computed opacity 0 and a non-zero height are set to
   *   `opacity: 1 !important` when they hold at least 20 characters of text
   *   or contain an img/video/picture descendant.
   * - Elements with computed `display: none` whose parent is not itself
   *   `display: none` are switched to `display: block !important`; the
   *   change is kept only when the element then exposes at least 20
   *   characters of text, otherwise the inline style is restored.
   *
   * @param {Object} page - Playwright page
   * @returns {Promise<{opacityCount: number, displayCount: number}>}
   */
  async revealHiddenContent(page) {
    return page.evaluate(() => {
      const CONTENT_THRESHOLD = 20;
      let opacityCount = 0;
      let displayCount = 0;

      for (const el of document.querySelectorAll('*')) {
        const style = getComputedStyle(el);
        if (parseFloat(style.opacity) === 0 && el.getBoundingClientRect().height > 0) {
          const text = (el.innerText || '').trim();
          if (text.length >= CONTENT_THRESHOLD || el.querySelectorAll('img, video, picture').length > 0) {
            el.style.setProperty('opacity', '1', 'important');
            opacityCount += 1;
          }
          continue;
        }

        if (style.display === 'none') {
          const parent = el.parentElement;
          if (parent && getComputedStyle(parent).display === 'none') {
            continue;
          }

          // Remember value AND priority so a restore does not silently drop
          // an inline `!important`.
          const originalDisplay = el.style.display;
          const originalPriority = el.style.getPropertyPriority('display');
          el.style.setProperty('display', 'block', 'important');
          const text = (el.innerText || '').trim();

          if (text.length >= CONTENT_THRESHOLD) {
            displayCount += 1;
          } else if (originalDisplay) {
            el.style.setProperty('display', originalDisplay, originalPriority);
          } else {
            el.style.removeProperty('display');
          }
        }
      }

      return { opacityCount, displayCount };
    });
  }

  /**
   * Collect one record per interactive element that has a non-empty
   * bounding box: tag, normalized text and attributes, document-relative
   * position, size, and three candidate selectors.
   *
   * The callback runs inside `page.evaluate`, so every helper it needs is
   * defined inside it.
   *
   * @param {Object} page - Playwright page
   * @returns {Promise<Array<Object>>}
   */
  async collectElementGeometries(page) {
    return page.evaluate(() => {
      const INTERACTIVE_SELECTOR = 'a, button, form, input, select, textarea, [onclick], [role="button"]';
      const records = [];
      const seenNodes = new Set();

      const normalizeTextInPage = (value, maxLength = 240) => String(value || '')
        .replace(/\s+/g, ' ')
        .trim()
        .slice(0, maxLength);

      const normalizeHref = (value) => {
        const raw = String(value || '').trim();
        if (!raw) return '';
        try {
          const resolved = new URL(raw, location.href);
          resolved.hash = '';
          const href = resolved.href;
          // Only drop a trailing slash that belongs to the path; with a
          // query present the final "/" is part of the query value.
          const slashEndsPath = href.endsWith('/') && resolved.search === '';
          return slashEndsPath && href !== `${resolved.origin}/`
            ? href.slice(0, -1)
            : href;
        } catch {
          return raw;
        }
      };

      // Backslashes must be escaped before quotes, otherwise the backslash
      // added for a quote would itself get doubled.
      const escapeAttr = (value) => String(value)
        .replace(/\\/g, '\\\\')
        .replace(/"/g, '\\"');

      const buildPath = (el, useNthOfType) => {
        if (!el || !el.tagName) return '';
        const parts = [];
        let current = el;
        while (current && current.tagName && current.tagName.toLowerCase() !== 'body') {
          const parent = current.parentElement;
          if (!parent || parent.tagName.toLowerCase() === 'html') break;
          const tag = current.tagName.toLowerCase();
          const siblings = Array.from(parent.children).filter((child) => {
            if (!(child instanceof Element)) return false;
            return useNthOfType ? child.tagName.toLowerCase() === tag : true;
          });
          const index = siblings.indexOf(current) + 1;
          const suffix = useNthOfType
            ? `:nth-of-type(${index})`
            : `:nth-child(${index})`;
          parts.unshift(`${tag}${suffix}`);
          current = parent;
        }
        if (parts.length === 0) return '';
        return `body > ${parts.join(' > ')}`;
      };

      for (const element of document.querySelectorAll(INTERACTIVE_SELECTOR)) {
        if (!(element instanceof Element) || seenNodes.has(element)) continue;
        seenNodes.add(element);

        const rect = element.getBoundingClientRect();
        if (rect.width <= 0 || rect.height <= 0) continue;

        records.push({
          tag: element.tagName.toLowerCase(),
          text: normalizeTextInPage(element.innerText || element.textContent || ''),
          href: normalizeHref(element.getAttribute('href') || ''),
          action: normalizeHref(element.getAttribute('action') || ''),
          formAction: normalizeHref(element.getAttribute('formaction') || ''),
          name: normalizeTextInPage(element.getAttribute('name') || '', 120),
          type: normalizeTextInPage(element.getAttribute('type') || '', 40),
          ariaLabel: normalizeTextInPage(element.getAttribute('aria-label') || '', 120),
          // Viewport-relative rect shifted by the scroll offset.
          top: rect.top + window.scrollY,
          left: rect.left + window.scrollX,
          width: rect.width,
          height: rect.height,
          selectorNthOfType: buildPath(element, true),
          selectorNthChild: buildPath(element, false),
          selectorById: element.id ? `[id="${escapeAttr(element.id)}"]` : '',
        });
      }

      return records;
    });
  }

  /**
   * Extract page data: html, blocks, elementGeometries, pageSize.
   *
   * The browser is always closed, including when navigation or extraction
   * fails.
   *
   * @param {string} url - URL to extract
   * @returns {Promise<{html, blocks, elementGeometries, pageSize}>}
   * @throws {Error} when `url` is empty after trimming
   */
  async extract(url) {
    const targetUrl = String(url || '').trim();
    if (!targetUrl) {
      throw new Error('PageExtractor requires a non-empty URL');
    }

    const viewport = {
      width: this.config.viewportWidth,
      height: this.config.viewportHeight,
    };

    const playwright = await this.getPlaywright();
    const browser = await playwright.chromium.launch({ headless: true });
    try {
      const page = await browser.newPage({ viewport });
      await page.goto(targetUrl, {
        waitUntil: 'domcontentloaded',
        timeout: this.config.timeoutMs,
      });
      await scrollToBottom(page);
      await waitForStableHeight(page, { maxWait: this.config.timeoutMs });
      await this.revealHiddenContent(page);

      const html = await page.content();
      const pageSize = await page.evaluate(() => ({
        width: document.documentElement.scrollWidth || 0,
        height: document.documentElement.scrollHeight || 0,
      }));

      const minWidth = Math.round(viewport.width * this.config.minBlockWidthRatio);
      const blocksResult = await page.evaluate(extractBlocksInBrowser, {
        minHeight: this.config.minBlockHeight,
        minWidth,
        maxHeight: Math.round(viewport.height * this.config.blockMaxHeightRatio),
        maxDepth: this.config.blockMaxDepth,
        textPreviewMaxChars: this.config.textPreviewMaxChars,
        debug: false,
      });
      const blocks = Array.isArray(blocksResult?.blocks) ? blocksResult.blocks : [];
      const elementGeometries = await this.collectElementGeometries(page);

      return { html, blocks, elementGeometries, pageSize };
    } finally {
      await browser.close();
    }
  }
}
@@ -0,0 +1,31 @@
1
/**
 * Escape a value for use inside a double-quoted CSS attribute selector
 * string, e.g. `[id="${escapeAttributeValue(id)}"]`.
 *
 * Follows the CSSOM "serialize a string" rules for the string contents:
 * - `\` and `"` are prefixed with a backslash;
 * - control characters U+0001–U+001F and U+007F (including newlines, which
 *   are not allowed raw inside a CSS string) become a hex escape followed by
 *   a space, e.g. `\a `;
 * - U+0000 becomes U+FFFD.
 *
 * The value is converted with `String(value)` first.
 *
 * @param {*} value
 * @returns {string}
 */
export function escapeAttributeValue(value) {
  // Single pass, so a backslash introduced by one escape is never re-escaped.
  return String(value).replace(/[\u0000-\u001f\u007f"\\]/g, (ch) => {
    if (ch === '"' || ch === '\\') {
      return `\\${ch}`;
    }
    if (ch === '\u0000') {
      return '\uFFFD';
    }
    return `\\${ch.codePointAt(0).toString(16)} `;
  });
}
12
+
13
/**
 * Decide whether a CSS class token looks like a stable identifier.
 *
 * A token is accepted when, after string conversion and trimming, it:
 * - is between 2 and 48 characters long,
 * - is not one of the listed UI-state words (case-insensitive),
 * - contains no run of four or more digits, and
 * - consists only of ASCII letters, digits, `_` and `-`.
 *
 * @param {string} token
 * @returns {boolean}
 */
export function isLikelyStableClassToken(token) {
  const candidate = String(token || '').trim();

  const { length } = candidate;
  const hasAcceptableLength = length >= 2 && length <= 48;
  if (!hasAcceptableLength) {
    return false;
  }

  const isUiStateWord = /^(active|selected|open|show|hide|visible|hidden|focus|hover|disabled|enabled)$/i.test(candidate);
  const hasLongDigitRun = /\d{4,}/.test(candidate);
  const usesSafeCharsOnly = /^[a-zA-Z0-9_-]+$/.test(candidate);

  return !isUiStateWord && !hasLongDigitRun && usesSafeCharsOnly;
}
@@ -0,0 +1,11 @@
1
/**
 * Squash text into a compact single-line string: every run of whitespace
 * becomes one space and leading/trailing whitespace is removed.
 * Falsy input yields an empty string.
 * @param {*} value
 * @returns {string}
 */
export function normalizeText(value) {
  const raw = String(value || '');
  // Splitting on whitespace runs leaves empty strings at the edges when the
  // text starts or ends with whitespace; dropping them is the trim.
  const words = raw.split(/\s+/).filter((word) => word !== '');
  return words.join(' ');
}
@@ -0,0 +1,43 @@
1
+ import { URL } from 'url';
2
+
3
/**
 * Normalize URL (convert relative to absolute, remove fragments).
 *
 * - Falsy input returns ''.
 * - The input is converted to a string and trimmed.
 * - `javascript:`, `mailto:`, `tel:` and `#...` values are returned
 *   unchanged (after trimming).
 * - Anything else is resolved against `baseUrl`; if it cannot be parsed the
 *   trimmed input is returned as-is.
 *
 * @param {string} urlStr - URL string to normalize
 * @param {string} baseUrl - Base URL for resolving relative links
 * @param {Object} options - Normalization options
 * @param {boolean} [options.skipFragments=true] - drop the `#fragment`
 * @param {boolean} [options.normalizeTrailingSlash=true] - drop a trailing
 *   `/` from the path, except for the bare origin (`https://host/`)
 * @returns {string} Normalized absolute URL
 */
export function normalizeUrl(urlStr, baseUrl, options = {}) {
  const {
    skipFragments = true,
    normalizeTrailingSlash = true
  } = options || {};

  if (!urlStr) return '';

  let candidate = urlStr;
  try {
    // Coerce first so non-string inputs (e.g. a URL object) still produce
    // the documented string result.
    candidate = String(urlStr).trim();

    const passThroughPrefixes = ['javascript:', 'mailto:', 'tel:', '#'];
    if (passThroughPrefixes.some((prefix) => candidate.startsWith(prefix))) {
      return candidate;
    }

    const absoluteUrl = new URL(candidate, baseUrl);

    if (skipFragments) {
      absoluteUrl.hash = '';
    }

    const href = absoluteUrl.href;
    // A final "/" only belongs to the path when no query or fragment follows
    // it; stripping it otherwise would corrupt e.g. "?path=/" or "#/route/".
    const slashEndsPath = href.endsWith('/')
      && absoluteUrl.search === ''
      && absoluteUrl.hash === '';

    if (normalizeTrailingSlash && slashEndsPath && href !== `${absoluteUrl.origin}/`) {
      return href.slice(0, -1);
    }

    return href;
  } catch {
    return candidate;
  }
}