page-analyzer 1.0.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/page-extractor.js CHANGED
@@ -3,6 +3,9 @@
3
3
  * Launches headless Chromium, navigates to URL, scrolls, extracts blocks + element geometries + HTML.
4
4
  */
5
5
 
6
+ import fs from 'node:fs/promises';
7
+ import path from 'node:path';
8
+
6
9
  // In-browser block extraction function (serialized into page.evaluate)
7
10
  // Imported from the project's extract-blocks script
8
11
  import {
@@ -11,6 +14,41 @@ import {
11
14
  waitForStableHeight
12
15
  } from './vendor/extract-blocks.js';
13
16
 
17
/**
 * Build a filesystem-friendly slug identifying the page a snapshot belongs to.
 *
 * Parseable URLs are reduced to hostname + pathname (query string and fragment
 * are dropped); any other input is slugified as-is.
 *
 * @param {*} url - URL or arbitrary label; falsy values are treated as ''.
 * @returns {string} Lowercase slug made of `a-z`, `0-9` and `-`, at most 80
 *   characters long, with no leading or trailing dash. Falls back to 'page'
 *   when nothing usable remains.
 */
function createSnapshotSlug(url) {
  let source = String(url || '').trim();
  try {
    const parsed = new URL(source);
    source = `${parsed.hostname}${parsed.pathname}`;
  } catch {
    // Keep the raw value for non-URL inputs.
  }

  const slug = source
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '')
    .slice(0, 80)
    // Truncating at 80 characters can cut right after a separator; strip the
    // dangling dash so the slug never ends with '-'.
    .replace(/-+$/, '');

  return slug || 'page';
}
34
+
35
/**
 * Produce a run identifier from the current UTC time.
 *
 * The ISO timestamp loses its millisecond/zone suffix and every run of
 * non-alphanumeric characters becomes a single dash, e.g.
 * `2024-01-02T03:04:05.678Z` -> `2024-01-02T03-04-05`.
 *
 * @returns {string} Timestamp-based identifier with one-second resolution.
 */
function createSnapshotRunId() {
  const isoStamp = new Date().toISOString();
  const withoutFraction = isoStamp.replace(/\.\d{3}z$/i, '');
  const dashed = withoutFraction.replace(/[^0-9a-z]+/gi, '-');
  return dashed.replace(/^-+|-+$/g, '');
}
42
+
43
/**
 * Choose the number used to label a block.
 *
 * @param {object|null|undefined} block - Block record; its `blockIdx` is used
 *   only when it is an integer.
 * @param {number} fallbackIndex - Value returned when `blockIdx` is unusable.
 * @returns {number} The block's own integer index, otherwise `fallbackIndex`.
 */
function getBlockNumber(block, fallbackIndex) {
  const declaredIdx = block?.blockIdx;
  if (Number.isInteger(declaredIdx)) {
    return declaredIdx;
  }
  return fallbackIndex;
}
46
+
47
/**
 * Read the CSS selector recorded on a block.
 *
 * @param {object|null|undefined} block - Block record that may carry a
 *   `blockCssPath` string.
 * @returns {string} The trimmed `blockCssPath`, or '' when it is missing,
 *   not a string, or blank.
 */
function getBlockSelector(block) {
  const cssPath = block?.blockCssPath;
  if (typeof cssPath !== 'string') {
    return '';
  }
  return cssPath.trim();
}
51
+
14
52
  export class PageExtractor {
15
53
  constructor(config = {}) {
16
54
  this.config = {
@@ -27,7 +65,13 @@ export class PageExtractor {
27
65
  blockMaxDepth: Number.isInteger(config.blockMaxDepth) ? Math.max(1, config.blockMaxDepth) : 15,
28
66
  textPreviewMaxChars: Number.isInteger(config.textPreviewMaxChars)
29
67
  ? Math.max(120, config.textPreviewMaxChars)
30
- : 1200
68
+ : 1200,
69
+ waitForImagesLoaded: Boolean(config.waitForImagesLoaded),
70
+ fullPageScreenshot: Boolean(config.fullPageScreenshot),
71
+ blockScreenshots: Boolean(config.blockScreenshots),
72
+ snapshotDir: typeof config.snapshotDir === 'string' && config.snapshotDir.trim()
73
+ ? path.resolve(process.cwd(), config.snapshotDir)
74
+ : path.resolve(process.cwd(), 'snapshots')
31
75
  };
32
76
  this.playwrightModule = null;
33
77
  }
@@ -44,14 +88,40 @@ export class PageExtractor {
44
88
  async revealHiddenContent(page) {
45
89
  return page.evaluate(() => {
46
90
  const CONTENT_THRESHOLD = 20;
91
+ const NON_CONTENT_TAGS = new Set([
92
+ 'SCRIPT',
93
+ 'STYLE',
94
+ 'NOSCRIPT',
95
+ 'TEMPLATE',
96
+ 'META',
97
+ 'LINK',
98
+ 'IFRAME',
99
+ 'OBJECT',
100
+ 'EMBED'
101
+ ]);
47
102
  let opacityCount = 0;
48
103
  let displayCount = 0;
49
104
 
105
+ const isLikelyScriptText = (value) => {
106
+ const text = String(value || '').trim();
107
+ if (!text) {
108
+ return false;
109
+ }
110
+ return /(_satellite|google_tag_manager|dataLayer|window\.|document\.|function\s*\(|=>|createElement\(|appendChild\(|\.push\(|var\s+\w+\s*=|const\s+\w+\s*=|let\s+\w+\s*=)/.test(text);
111
+ };
112
+
50
113
  for (const el of document.querySelectorAll('*')) {
114
+ if (NON_CONTENT_TAGS.has(el.tagName)) {
115
+ continue;
116
+ }
117
+
51
118
  const style = getComputedStyle(el);
52
119
  if (parseFloat(style.opacity) === 0 && el.getBoundingClientRect().height > 0) {
53
120
  const text = (el.innerText || '').trim();
54
- if (text.length >= CONTENT_THRESHOLD || el.querySelectorAll('img, video, picture').length > 0) {
121
+ if (
122
+ !isLikelyScriptText(text) &&
123
+ (text.length >= CONTENT_THRESHOLD || el.querySelectorAll('img, video, picture').length > 0)
124
+ ) {
55
125
  el.style.setProperty('opacity', '1', 'important');
56
126
  opacityCount += 1;
57
127
  }
@@ -68,7 +138,7 @@ export class PageExtractor {
68
138
  el.style.setProperty('display', 'block', 'important');
69
139
  const text = (el.innerText || '').trim();
70
140
 
71
- if (text.length >= CONTENT_THRESHOLD) {
141
+ if (text.length >= CONTENT_THRESHOLD && !isLikelyScriptText(text)) {
72
142
  displayCount += 1;
73
143
  } else if (originalDisplay) {
74
144
  el.style.display = originalDisplay;
@@ -82,6 +152,80 @@ export class PageExtractor {
82
152
  });
83
153
  }
84
154
 
155
+ async waitForImagesLoaded(page) {
156
+ if (!this.config.waitForImagesLoaded) {
157
+ return null;
158
+ }
159
+
160
+ const timeoutMs = this.config.timeoutMs;
161
+ const result = await page.evaluate(async ({ timeoutMs: waitTimeoutMs }) => {
162
+ const images = Array.from(document.images || []);
163
+ const total = images.length;
164
+ if (total === 0) {
165
+ return {
166
+ total,
167
+ loaded: 0,
168
+ timedOut: false
169
+ };
170
+ }
171
+
172
+ const isSettled = (img) => img.complete;
173
+ const countLoaded = () => images.filter(isSettled).length;
174
+ const pending = images.filter((img) => !isSettled(img));
175
+
176
+ if (pending.length === 0) {
177
+ return {
178
+ total,
179
+ loaded: total,
180
+ timedOut: false
181
+ };
182
+ }
183
+
184
+ let timeoutId = null;
185
+ const waitForImage = (img) => new Promise((resolve) => {
186
+ if (isSettled(img)) {
187
+ resolve();
188
+ return;
189
+ }
190
+
191
+ const done = () => {
192
+ img.removeEventListener('load', done);
193
+ img.removeEventListener('error', done);
194
+ resolve();
195
+ };
196
+ img.addEventListener('load', done, { once: true });
197
+ img.addEventListener('error', done, { once: true });
198
+ });
199
+
200
+ const allImagesDone = Promise.all(pending.map(waitForImage))
201
+ .then(() => ({ timedOut: false }));
202
+ const timeout = new Promise((resolve) => {
203
+ timeoutId = window.setTimeout(() => resolve({ timedOut: true }), waitTimeoutMs);
204
+ });
205
+
206
+ const waitResult = await Promise.race([allImagesDone, timeout]);
207
+ if (timeoutId !== null) {
208
+ window.clearTimeout(timeoutId);
209
+ }
210
+
211
+ return {
212
+ total,
213
+ loaded: countLoaded(),
214
+ timedOut: Boolean(waitResult?.timedOut)
215
+ };
216
+ }, { timeoutMs });
217
+
218
+ if (result?.timedOut) {
219
+ console.warn(
220
+ `[page-analyzer] Timed out waiting for images: ${result.loaded}/${result.total} completed`
221
+ );
222
+ } else {
223
+ console.log(`[page-analyzer] Images loaded: ${result?.loaded || 0}/${result?.total || 0}`);
224
+ }
225
+
226
+ return result;
227
+ }
228
+
85
229
  async collectElementGeometries(page) {
86
230
  return page.evaluate(() => {
87
231
  const INTERACTIVE_SELECTOR = 'a, button, form, input, select, textarea, [onclick], [role="button"]';
@@ -161,10 +305,221 @@ export class PageExtractor {
161
305
  });
162
306
  }
163
307
 
308
+ async collectPageSize(page) {
309
+ return page.evaluate(() => {
310
+ const html = document.documentElement;
311
+ const body = document.body;
312
+ return {
313
+ width: Math.max(
314
+ html?.scrollWidth || 0,
315
+ html?.offsetWidth || 0,
316
+ html?.clientWidth || 0,
317
+ body?.scrollWidth || 0,
318
+ body?.offsetWidth || 0,
319
+ body?.clientWidth || 0,
320
+ window.innerWidth || 0
321
+ ),
322
+ height: Math.max(
323
+ html?.scrollHeight || 0,
324
+ html?.offsetHeight || 0,
325
+ html?.clientHeight || 0,
326
+ body?.scrollHeight || 0,
327
+ body?.offsetHeight || 0,
328
+ body?.clientHeight || 0,
329
+ window.innerHeight || 0
330
+ )
331
+ };
332
+ });
333
+ }
334
+
335
+ async hideExternalFixedOverlays(page, selector) {
336
+ return page.evaluate((targetSelector) => {
337
+ const existing = Array.isArray(window.__pageAnalyzerHiddenOverlays)
338
+ ? window.__pageAnalyzerHiddenOverlays
339
+ : [];
340
+ for (const item of existing) {
341
+ if (!item?.element) continue;
342
+ if (item.visibilityValue) {
343
+ item.element.style.setProperty(
344
+ 'visibility',
345
+ item.visibilityValue,
346
+ item.visibilityPriority || ''
347
+ );
348
+ } else {
349
+ item.element.style.removeProperty('visibility');
350
+ }
351
+ }
352
+
353
+ const target = document.querySelector(targetSelector);
354
+ if (!(target instanceof Element)) {
355
+ window.__pageAnalyzerHiddenOverlays = [];
356
+ return 0;
357
+ }
358
+
359
+ const hidden = [];
360
+ for (const element of document.querySelectorAll('body *')) {
361
+ if (!(element instanceof HTMLElement)) continue;
362
+ if (element === target || element.contains(target) || target.contains(element)) continue;
363
+
364
+ const style = getComputedStyle(element);
365
+ if (style.position !== 'fixed' && style.position !== 'sticky') continue;
366
+
367
+ const rect = element.getBoundingClientRect();
368
+ if (rect.width <= 0 || rect.height <= 0) continue;
369
+
370
+ hidden.push({
371
+ element,
372
+ visibilityValue: element.style.getPropertyValue('visibility'),
373
+ visibilityPriority: element.style.getPropertyPriority('visibility')
374
+ });
375
+ element.style.setProperty('visibility', 'hidden', 'important');
376
+ }
377
+
378
+ window.__pageAnalyzerHiddenOverlays = hidden;
379
+ return hidden.length;
380
+ }, selector);
381
+ }
382
+
383
+ async restoreExternalFixedOverlays(page) {
384
+ await page.evaluate(() => {
385
+ const hidden = Array.isArray(window.__pageAnalyzerHiddenOverlays)
386
+ ? window.__pageAnalyzerHiddenOverlays
387
+ : [];
388
+ for (const item of hidden) {
389
+ if (!item?.element) continue;
390
+ if (item.visibilityValue) {
391
+ item.element.style.setProperty(
392
+ 'visibility',
393
+ item.visibilityValue,
394
+ item.visibilityPriority || ''
395
+ );
396
+ } else {
397
+ item.element.style.removeProperty('visibility');
398
+ }
399
+ }
400
+ window.__pageAnalyzerHiddenOverlays = [];
401
+ });
402
+ }
403
+
404
+ async captureBlockScreenshot(page, block, blockPath) {
405
+ if (block?.hidden) {
406
+ return false;
407
+ }
408
+
409
+ const selector = getBlockSelector(block);
410
+ if (!selector) {
411
+ return false;
412
+ }
413
+
414
+ try {
415
+ const locator = page.locator(selector).first();
416
+ if (await locator.count() > 0) {
417
+ await this.hideExternalFixedOverlays(page, selector);
418
+ try {
419
+ await locator.screenshot({ path: blockPath });
420
+ return true;
421
+ } finally {
422
+ await this.restoreExternalFixedOverlays(page);
423
+ }
424
+ }
425
+ } catch {
426
+ // Selector-only mode: skip blocks that cannot be captured through CSS.
427
+ }
428
+
429
+ return false;
430
+ }
431
+
432
+ async captureScreenshots(page, targetUrl, blocks, options = {}) {
433
+ const fullPageScreenshot = options.fullPageScreenshot ?? this.config.fullPageScreenshot;
434
+ const blockScreenshots = options.blockScreenshots ?? this.config.blockScreenshots;
435
+
436
+ if (!fullPageScreenshot && !blockScreenshots) {
437
+ return null;
438
+ }
439
+
440
+ await fs.mkdir(this.config.snapshotDir, { recursive: true });
441
+ const prefix = `${createSnapshotSlug(targetUrl)}-${createSnapshotRunId()}`;
442
+ const screenshots = {};
443
+
444
+ if (fullPageScreenshot) {
445
+ const fullPagePath = path.join(this.config.snapshotDir, `${prefix}-full-page.png`);
446
+ await page.screenshot({
447
+ path: fullPagePath,
448
+ fullPage: true
449
+ });
450
+ screenshots.fullPage = fullPagePath;
451
+ }
452
+
453
+ if (blockScreenshots) {
454
+ screenshots.blocks = [];
455
+
456
+ for (let index = 0; index < blocks.length; index += 1) {
457
+ const block = blocks[index];
458
+ const blockIdx = getBlockNumber(block, index);
459
+
460
+ const blockLabel = String(blockIdx).padStart(3, '0').replace(/[^0-9a-z-]+/gi, '-');
461
+ const blockPath = path.join(this.config.snapshotDir, `${prefix}-block-${blockLabel}.png`);
462
+ try {
463
+ const captured = await this.captureBlockScreenshot(page, block, blockPath);
464
+ if (captured) {
465
+ const screenshotRecord = {
466
+ blockIdx,
467
+ path: blockPath
468
+ };
469
+ if (typeof block?.blockName === 'string' && block.blockName.trim()) {
470
+ screenshotRecord.blockName = block.blockName.trim();
471
+ }
472
+ if (typeof block?.blockIdxs === 'string' && block.blockIdxs.trim()) {
473
+ screenshotRecord.blockIdxs = block.blockIdxs.trim();
474
+ }
475
+ screenshots.blocks.push(screenshotRecord);
476
+ }
477
+ } catch (error) {
478
+ console.warn(`[page-analyzer] Failed to capture block ${blockIdx}: ${error.message}`);
479
+ }
480
+ }
481
+ }
482
+
483
+ return screenshots;
484
+ }
485
+
486
+ async preparePage(page, targetUrl) {
487
+ await page.goto(targetUrl, {
488
+ waitUntil: 'domcontentloaded',
489
+ timeout: this.config.timeoutMs
490
+ });
491
+ await scrollToBottom(page);
492
+ await waitForStableHeight(page, { maxWait: this.config.timeoutMs });
493
+ await this.revealHiddenContent(page);
494
+ await this.waitForImagesLoaded(page);
495
+ }
496
+
497
+ async captureUrlScreenshots(url, blocks, options = {}) {
498
+ const targetUrl = String(url || '').trim();
499
+ if (!targetUrl) {
500
+ throw new Error('PageExtractor requires a non-empty URL');
501
+ }
502
+
503
+ const viewport = {
504
+ width: this.config.viewportWidth,
505
+ height: this.config.viewportHeight
506
+ };
507
+
508
+ const playwright = await this.getPlaywright();
509
+ const browser = await playwright.chromium.launch({ headless: true });
510
+ try {
511
+ const page = await browser.newPage({ viewport });
512
+ await this.preparePage(page, targetUrl);
513
+ return await this.captureScreenshots(page, targetUrl, blocks, options);
514
+ } finally {
515
+ await browser.close();
516
+ }
517
+ }
518
+
164
519
  /**
165
- * Extract page data: html, blocks, elementGeometries, markdown
520
+ * Extract page data: html, blocks, elementGeometries, screenshots
166
521
  * @param {string} url - URL to extract
167
- * @returns {Promise<{html, blocks, elementGeometries, pageSize}>}
522
+ * @returns {Promise<{html, blocks, elementGeometries, screenshots, pageSize}>}
168
523
  */
169
524
  async extract(url) {
170
525
  const targetUrl = String(url || '').trim();
@@ -181,19 +536,9 @@ export class PageExtractor {
181
536
  const browser = await playwright.chromium.launch({ headless: true });
182
537
  try {
183
538
  const page = await browser.newPage({ viewport });
184
- await page.goto(targetUrl, {
185
- waitUntil: 'domcontentloaded',
186
- timeout: this.config.timeoutMs
187
- });
188
- await scrollToBottom(page);
189
- await waitForStableHeight(page, { maxWait: this.config.timeoutMs });
190
- await this.revealHiddenContent(page);
539
+ await this.preparePage(page, targetUrl);
191
540
 
192
541
  const html = await page.content();
193
- const pageSize = await page.evaluate(() => ({
194
- width: document.documentElement.scrollWidth || 0,
195
- height: document.documentElement.scrollHeight || 0
196
- }));
197
542
 
198
543
  const minWidth = Math.round(viewport.width * this.config.minBlockWidthRatio);
199
544
  const blocksResult = await page.evaluate(extractBlocksInBrowser, {
@@ -206,8 +551,10 @@ export class PageExtractor {
206
551
  });
207
552
  const blocks = Array.isArray(blocksResult?.blocks) ? blocksResult.blocks : [];
208
553
  const elementGeometries = await this.collectElementGeometries(page);
554
+ const finalPageSize = await this.collectPageSize(page);
555
+ const screenshots = await this.captureScreenshots(page, targetUrl, blocks);
209
556
 
210
- return { html, blocks, elementGeometries, pageSize };
557
+ return { html, blocks, elementGeometries, screenshots, pageSize: finalPageSize };
211
558
  } finally {
212
559
  await browser.close();
213
560
  }