page-analyzer 1.0.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/page-extractor.js CHANGED
@@ -3,6 +3,10 @@
3
3
  * Launches headless Chromium, navigates to URL, scrolls, extracts blocks + element geometries + HTML.
4
4
  */
5
5
 
6
+ import fs from 'node:fs/promises';
7
+ import path from 'node:path';
8
+ import { PutObjectCommand, S3Client } from '@aws-sdk/client-s3';
9
+
6
10
  // In-browser block extraction function (serialized into page.evaluate)
7
11
  // Imported from the project's extract-blocks script
8
12
  import {
@@ -11,6 +15,110 @@ import {
11
15
  waitForStableHeight
12
16
  } from './vendor/extract-blocks.js';
13
17
 
18
/**
 * Builds a lowercase, dash-separated slug used to name snapshot files.
 *
 * Parseable URLs contribute their hostname and pathname; any other input is
 * slugified as-is. Runs of characters outside [a-z0-9] collapse into a single
 * dash, and the result is capped at 80 characters.
 *
 * @param {string} url - Page URL (or any label) to derive the slug from.
 * @returns {string} Slug without leading/trailing dashes, or 'page' when
 *   nothing usable remains.
 */
function createSnapshotSlug(url) {
  let source = String(url || '').trim();
  try {
    const parsed = new URL(source);
    source = `${parsed.hostname}${parsed.pathname}`;
  } catch {
    // Keep the raw value for non-URL inputs.
  }

  const slug = source
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '')
    .slice(0, 80)
    // Truncation can cut right after a separator; trim again so the slug
    // never ends with a dangling dash.
    .replace(/-+$/, '');

  return slug || 'page';
}
35
+
36
/**
 * Produces a run identifier from the current time.
 *
 * The ISO-8601 timestamp loses its millisecond/zone suffix and every run of
 * non-alphanumeric characters becomes a dash.
 *
 * @returns {string} Identifier derived from `new Date().toISOString()`.
 */
function createSnapshotRunId() {
  const isoTimestamp = new Date().toISOString();
  const withoutMillis = isoTimestamp.replace(/\.\d{3}z$/i, '');
  const dashed = withoutMillis.replace(/[^0-9a-z]+/gi, '-');
  return dashed.replace(/^-+|-+$/g, '');
}
43
+
44
/**
 * Resolves the number used to identify a block.
 *
 * @param {object} block - Extracted block; may be null/undefined.
 * @param {number} fallbackIndex - Value used when the block has no integer `blockIdx`.
 * @returns {number} `block.blockIdx` when it is an integer, otherwise `fallbackIndex`.
 */
function getBlockNumber(block, fallbackIndex) {
  const declaredIdx = block?.blockIdx;
  if (Number.isInteger(declaredIdx)) {
    return declaredIdx;
  }
  return fallbackIndex;
}
47
+
48
/**
 * Reads the CSS path recorded on a block.
 *
 * @param {object} block - Extracted block; may be null/undefined.
 * @returns {string} Trimmed `block.blockCssPath`, or '' when it is not a string.
 */
function getBlockSelector(block) {
  const cssPath = block?.blockCssPath;
  if (typeof cssPath !== 'string') {
    return '';
  }
  return cssPath.trim();
}
52
+
53
/**
 * Tells whether a value is a non-null, non-array object.
 *
 * Always returns a real boolean; falsy inputs such as null, 0 or '' yield
 * `false` rather than being passed through.
 *
 * @param {*} value - Value to inspect.
 * @returns {boolean} True for plain or class-instance objects, false otherwise.
 */
function isObject(value) {
  return value !== null && typeof value === 'object' && !Array.isArray(value);
}
56
+
57
/**
 * Normalizes an S3 key prefix.
 *
 * @param {*} value - Raw prefix; falsy values are treated as ''.
 * @returns {string} Prefix with surrounding whitespace and leading/trailing
 *   slashes removed.
 */
function normalizeS3Prefix(value) {
  const trimmed = String(value || '').trim();
  return trimmed.replace(/^\/+|\/+$/g, '');
}
62
+
63
/**
 * Normalizes the public base URL used when building screenshot links.
 *
 * @param {*} value - Raw base URL; falsy values are treated as ''.
 * @returns {string} Base URL with surrounding whitespace and trailing slashes
 *   removed.
 */
function normalizePublicBaseUrl(value) {
  const trimmed = String(value || '').trim();
  return trimmed.replace(/\/+$/g, '');
}
68
+
69
/**
 * Validates and normalizes the optional S3 upload configuration.
 *
 * @param {object|null|undefined} config - Raw `s3` section of the extractor config.
 * @returns {object|null} `null` when no config was given; otherwise an object
 *   with `bucket`, `region`, `prefix`, `publicBaseUrl`, `credentials`,
 *   `client` and `maxUploadAttempts` (integer >= 1, default 3).
 * @throws {Error} When `config` is not an object, or `bucket`/`region` is
 *   missing or blank.
 */
function normalizeS3Config(config) {
  if (config == null) {
    return null;
  }
  if (!isObject(config)) {
    throw new Error('extractorConfig.s3 must be an object');
  }

  const bucket = String(config.bucket || '').trim();
  const region = String(config.region || '').trim();
  if (!bucket) {
    throw new Error('extractorConfig.s3.bucket is required');
  }
  if (!region) {
    throw new Error('extractorConfig.s3.region is required');
  }

  // The AWS SDK accepts either a static credentials object or a provider
  // function; keep both instead of silently dropping providers.
  const { credentials } = config;
  const hasUsableCredentials = isObject(credentials) || typeof credentials === 'function';

  return {
    bucket,
    region,
    prefix: normalizeS3Prefix(config.prefix),
    publicBaseUrl: normalizePublicBaseUrl(config.publicBaseUrl),
    credentials: hasUsableCredentials ? credentials : undefined,
    client: config.client,
    maxUploadAttempts: Number.isInteger(config.maxUploadAttempts)
      ? Math.max(1, config.maxUploadAttempts)
      : 3
  };
}
98
+
99
/**
 * Joins a prefix and a filename into an S3 object key.
 *
 * @param {string} prefix - Key prefix; skipped when falsy.
 * @param {string} filename - Object filename; skipped when falsy.
 * @returns {string} The truthy parts joined with '/'.
 */
function joinS3Key(prefix, filename) {
  const segments = [];
  for (const segment of [prefix, filename]) {
    if (segment) {
      segments.push(segment);
    }
  }
  return segments.join('/');
}
102
+
103
/**
 * Percent-encodes an S3 key for use in a URL path.
 *
 * Each '/'-separated segment is passed through `encodeURIComponent`, so the
 * separators themselves stay literal.
 *
 * @param {*} key - Object key; falsy values are treated as ''.
 * @returns {string} Encoded key.
 */
function encodeS3Key(key) {
  const segments = String(key || '').split('/');
  const encoded = [];
  for (const segment of segments) {
    encoded.push(encodeURIComponent(segment));
  }
  return encoded.join('/');
}
109
+
110
/**
 * Builds the URL reported for an uploaded screenshot.
 *
 * @param {object} s3Config - Normalized S3 config (`publicBaseUrl`, `bucket`, `region`).
 * @param {string} key - Full object key; used only for the default S3 URL.
 * @param {string} filename - Object filename; used only with `publicBaseUrl`.
 * @returns {string} `<publicBaseUrl>/<encoded filename>` when a public base
 *   URL is configured, otherwise the virtual-hosted S3 URL for `key`.
 */
function buildS3Url(s3Config, key, filename) {
  const { publicBaseUrl, bucket, region } = s3Config;

  if (!publicBaseUrl) {
    return `https://${bucket}.s3.${region}.amazonaws.com/${encodeS3Key(key)}`;
  }

  // NOTE(review): the key prefix is not part of this URL; presumably
  // publicBaseUrl is expected to already point at the prefix — confirm.
  return `${publicBaseUrl}/${encodeURIComponent(filename)}`;
}
117
+
118
/**
 * Extracts a printable message from a thrown value.
 *
 * @param {*} error - Anything that was thrown.
 * @returns {string} `error.message` for Error instances, `String(error)` otherwise.
 */
function getErrorMessage(error) {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}
121
+
14
122
  export class PageExtractor {
15
123
  constructor(config = {}) {
16
124
  this.config = {
@@ -27,9 +135,17 @@ export class PageExtractor {
27
135
  blockMaxDepth: Number.isInteger(config.blockMaxDepth) ? Math.max(1, config.blockMaxDepth) : 15,
28
136
  textPreviewMaxChars: Number.isInteger(config.textPreviewMaxChars)
29
137
  ? Math.max(120, config.textPreviewMaxChars)
30
- : 1200
138
+ : 1200,
139
+ waitForImagesLoaded: Boolean(config.waitForImagesLoaded),
140
+ fullPageScreenshot: Boolean(config.fullPageScreenshot),
141
+ blockScreenshots: Boolean(config.blockScreenshots),
142
+ snapshotDir: typeof config.snapshotDir === 'string' && config.snapshotDir.trim()
143
+ ? path.resolve(process.cwd(), config.snapshotDir)
144
+ : path.resolve(process.cwd(), 'snapshots'),
145
+ s3: normalizeS3Config(config.s3)
31
146
  };
32
147
  this.playwrightModule = null;
148
+ this.s3Client = null;
33
149
  }
34
150
 
35
151
  async getPlaywright() {
@@ -41,17 +157,97 @@ export class PageExtractor {
41
157
  return this.playwrightModule;
42
158
  }
43
159
 
160
+ getS3Client() {
161
+ if (!this.config.s3) {
162
+ return null;
163
+ }
164
+ if (this.config.s3.client) {
165
+ return this.config.s3.client;
166
+ }
167
+ if (this.s3Client) {
168
+ return this.s3Client;
169
+ }
170
+
171
+ this.s3Client = new S3Client({
172
+ region: this.config.s3.region,
173
+ credentials: this.config.s3.credentials,
174
+ maxAttempts: 1
175
+ });
176
+ return this.s3Client;
177
+ }
178
+
179
+ async uploadScreenshotToS3(filename, body) {
180
+ const s3Config = this.config.s3;
181
+ if (!s3Config) {
182
+ throw new Error('S3 is not configured');
183
+ }
184
+
185
+ const key = joinS3Key(s3Config.prefix, filename);
186
+ const client = this.getS3Client();
187
+ const commandInput = {
188
+ Bucket: s3Config.bucket,
189
+ Key: key,
190
+ Body: body,
191
+ ContentType: 'image/png'
192
+ };
193
+
194
+ let lastError = null;
195
+ for (let attempt = 1; attempt <= s3Config.maxUploadAttempts; attempt += 1) {
196
+ try {
197
+ const command = new PutObjectCommand(commandInput);
198
+ await client.send(command);
199
+ return buildS3Url(s3Config, key, filename);
200
+ } catch (error) {
201
+ lastError = error;
202
+ if (attempt < s3Config.maxUploadAttempts) {
203
+ console.warn(
204
+ `[page-analyzer] Failed to upload ${key} to S3, retrying ` +
205
+ `(${attempt}/${s3Config.maxUploadAttempts}): ${getErrorMessage(error)}`
206
+ );
207
+ }
208
+ }
209
+ }
210
+
211
+ throw lastError;
212
+ }
213
+
44
214
  async revealHiddenContent(page) {
45
215
  return page.evaluate(() => {
46
216
  const CONTENT_THRESHOLD = 20;
217
+ const NON_CONTENT_TAGS = new Set([
218
+ 'SCRIPT',
219
+ 'STYLE',
220
+ 'NOSCRIPT',
221
+ 'TEMPLATE',
222
+ 'META',
223
+ 'LINK',
224
+ 'IFRAME',
225
+ 'OBJECT',
226
+ 'EMBED'
227
+ ]);
47
228
  let opacityCount = 0;
48
229
  let displayCount = 0;
49
230
 
231
+ const isLikelyScriptText = (value) => {
232
+ const text = String(value || '').trim();
233
+ if (!text) {
234
+ return false;
235
+ }
236
+ return /(_satellite|google_tag_manager|dataLayer|window\.|document\.|function\s*\(|=>|createElement\(|appendChild\(|\.push\(|var\s+\w+\s*=|const\s+\w+\s*=|let\s+\w+\s*=)/.test(text);
237
+ };
238
+
50
239
  for (const el of document.querySelectorAll('*')) {
240
+ if (NON_CONTENT_TAGS.has(el.tagName)) {
241
+ continue;
242
+ }
243
+
51
244
  const style = getComputedStyle(el);
52
245
  if (parseFloat(style.opacity) === 0 && el.getBoundingClientRect().height > 0) {
53
246
  const text = (el.innerText || '').trim();
54
- if (text.length >= CONTENT_THRESHOLD || el.querySelectorAll('img, video, picture').length > 0) {
247
+ if (
248
+ !isLikelyScriptText(text) &&
249
+ (text.length >= CONTENT_THRESHOLD || el.querySelectorAll('img, video, picture').length > 0)
250
+ ) {
55
251
  el.style.setProperty('opacity', '1', 'important');
56
252
  opacityCount += 1;
57
253
  }
@@ -68,7 +264,7 @@ export class PageExtractor {
68
264
  el.style.setProperty('display', 'block', 'important');
69
265
  const text = (el.innerText || '').trim();
70
266
 
71
- if (text.length >= CONTENT_THRESHOLD) {
267
+ if (text.length >= CONTENT_THRESHOLD && !isLikelyScriptText(text)) {
72
268
  displayCount += 1;
73
269
  } else if (originalDisplay) {
74
270
  el.style.display = originalDisplay;
@@ -82,6 +278,80 @@ export class PageExtractor {
82
278
  });
83
279
  }
84
280
 
281
+ async waitForImagesLoaded(page) {
282
+ if (!this.config.waitForImagesLoaded) {
283
+ return null;
284
+ }
285
+
286
+ const timeoutMs = this.config.timeoutMs;
287
+ const result = await page.evaluate(async ({ timeoutMs: waitTimeoutMs }) => {
288
+ const images = Array.from(document.images || []);
289
+ const total = images.length;
290
+ if (total === 0) {
291
+ return {
292
+ total,
293
+ loaded: 0,
294
+ timedOut: false
295
+ };
296
+ }
297
+
298
+ const isSettled = (img) => img.complete;
299
+ const countLoaded = () => images.filter(isSettled).length;
300
+ const pending = images.filter((img) => !isSettled(img));
301
+
302
+ if (pending.length === 0) {
303
+ return {
304
+ total,
305
+ loaded: total,
306
+ timedOut: false
307
+ };
308
+ }
309
+
310
+ let timeoutId = null;
311
+ const waitForImage = (img) => new Promise((resolve) => {
312
+ if (isSettled(img)) {
313
+ resolve();
314
+ return;
315
+ }
316
+
317
+ const done = () => {
318
+ img.removeEventListener('load', done);
319
+ img.removeEventListener('error', done);
320
+ resolve();
321
+ };
322
+ img.addEventListener('load', done, { once: true });
323
+ img.addEventListener('error', done, { once: true });
324
+ });
325
+
326
+ const allImagesDone = Promise.all(pending.map(waitForImage))
327
+ .then(() => ({ timedOut: false }));
328
+ const timeout = new Promise((resolve) => {
329
+ timeoutId = window.setTimeout(() => resolve({ timedOut: true }), waitTimeoutMs);
330
+ });
331
+
332
+ const waitResult = await Promise.race([allImagesDone, timeout]);
333
+ if (timeoutId !== null) {
334
+ window.clearTimeout(timeoutId);
335
+ }
336
+
337
+ return {
338
+ total,
339
+ loaded: countLoaded(),
340
+ timedOut: Boolean(waitResult?.timedOut)
341
+ };
342
+ }, { timeoutMs });
343
+
344
+ if (result?.timedOut) {
345
+ console.warn(
346
+ `[page-analyzer] Timed out waiting for images: ${result.loaded}/${result.total} completed`
347
+ );
348
+ } else {
349
+ console.log(`[page-analyzer] Images loaded: ${result?.loaded || 0}/${result?.total || 0}`);
350
+ }
351
+
352
+ return result;
353
+ }
354
+
85
355
  async collectElementGeometries(page) {
86
356
  return page.evaluate(() => {
87
357
  const INTERACTIVE_SELECTOR = 'a, button, form, input, select, textarea, [onclick], [role="button"]';
@@ -161,16 +431,246 @@ export class PageExtractor {
161
431
  });
162
432
  }
163
433
 
164
- /**
165
- * Extract page data: html, blocks, elementGeometries, markdown
166
- * @param {string} url - URL to extract
167
- * @returns {Promise<{html, blocks, elementGeometries, pageSize}>}
168
- */
169
- async extract(url) {
434
+ async collectPageSize(page) {
435
+ return page.evaluate(() => {
436
+ const html = document.documentElement;
437
+ const body = document.body;
438
+ return {
439
+ width: Math.max(
440
+ html?.scrollWidth || 0,
441
+ html?.offsetWidth || 0,
442
+ html?.clientWidth || 0,
443
+ body?.scrollWidth || 0,
444
+ body?.offsetWidth || 0,
445
+ body?.clientWidth || 0,
446
+ window.innerWidth || 0
447
+ ),
448
+ height: Math.max(
449
+ html?.scrollHeight || 0,
450
+ html?.offsetHeight || 0,
451
+ html?.clientHeight || 0,
452
+ body?.scrollHeight || 0,
453
+ body?.offsetHeight || 0,
454
+ body?.clientHeight || 0,
455
+ window.innerHeight || 0
456
+ )
457
+ };
458
+ });
459
+ }
460
+
461
+ async hideExternalFixedOverlays(page, selector) {
462
+ return page.evaluate((targetSelector) => {
463
+ const existing = Array.isArray(window.__pageAnalyzerHiddenOverlays)
464
+ ? window.__pageAnalyzerHiddenOverlays
465
+ : [];
466
+ for (const item of existing) {
467
+ if (!item?.element) continue;
468
+ if (item.visibilityValue) {
469
+ item.element.style.setProperty(
470
+ 'visibility',
471
+ item.visibilityValue,
472
+ item.visibilityPriority || ''
473
+ );
474
+ } else {
475
+ item.element.style.removeProperty('visibility');
476
+ }
477
+ }
478
+
479
+ const target = document.querySelector(targetSelector);
480
+ if (!(target instanceof Element)) {
481
+ window.__pageAnalyzerHiddenOverlays = [];
482
+ return 0;
483
+ }
484
+
485
+ const hidden = [];
486
+ for (const element of document.querySelectorAll('body *')) {
487
+ if (!(element instanceof HTMLElement)) continue;
488
+ if (element === target || element.contains(target) || target.contains(element)) continue;
489
+
490
+ const style = getComputedStyle(element);
491
+ if (style.position !== 'fixed' && style.position !== 'sticky') continue;
492
+
493
+ const rect = element.getBoundingClientRect();
494
+ if (rect.width <= 0 || rect.height <= 0) continue;
495
+
496
+ hidden.push({
497
+ element,
498
+ visibilityValue: element.style.getPropertyValue('visibility'),
499
+ visibilityPriority: element.style.getPropertyPriority('visibility')
500
+ });
501
+ element.style.setProperty('visibility', 'hidden', 'important');
502
+ }
503
+
504
+ window.__pageAnalyzerHiddenOverlays = hidden;
505
+ return hidden.length;
506
+ }, selector);
507
+ }
508
+
509
+ async restoreExternalFixedOverlays(page) {
510
+ await page.evaluate(() => {
511
+ const hidden = Array.isArray(window.__pageAnalyzerHiddenOverlays)
512
+ ? window.__pageAnalyzerHiddenOverlays
513
+ : [];
514
+ for (const item of hidden) {
515
+ if (!item?.element) continue;
516
+ if (item.visibilityValue) {
517
+ item.element.style.setProperty(
518
+ 'visibility',
519
+ item.visibilityValue,
520
+ item.visibilityPriority || ''
521
+ );
522
+ } else {
523
+ item.element.style.removeProperty('visibility');
524
+ }
525
+ }
526
+ window.__pageAnalyzerHiddenOverlays = [];
527
+ });
528
+ }
529
+
530
+ async captureBlockScreenshotData(page, block, screenshotOptions = {}) {
531
+ if (block?.hidden) {
532
+ return null;
533
+ }
534
+
535
+ const selector = getBlockSelector(block);
536
+ if (!selector) {
537
+ return null;
538
+ }
539
+
540
+ try {
541
+ const locator = page.locator(selector).first();
542
+ if (await locator.count() > 0) {
543
+ await this.hideExternalFixedOverlays(page, selector);
544
+ try {
545
+ return await locator.screenshot(screenshotOptions);
546
+ } finally {
547
+ await this.restoreExternalFixedOverlays(page);
548
+ }
549
+ }
550
+ } catch {
551
+ // Selector-only mode: skip blocks that cannot be captured through CSS.
552
+ }
553
+
554
+ return null;
555
+ }
556
+
557
+ async captureBlockScreenshot(page, block, blockPath) {
558
+ const body = await this.captureBlockScreenshotData(page, block, { path: blockPath });
559
+ return Boolean(body) || body === undefined;
560
+ }
561
+
562
+ async captureScreenshots(page, targetUrl, blocks, options = {}) {
563
+ const fullPageScreenshot = options.fullPageScreenshot ?? this.config.fullPageScreenshot;
564
+ const blockScreenshots = options.blockScreenshots ?? this.config.blockScreenshots;
565
+ const useS3 = Boolean(this.config.s3);
566
+
567
+ if (!fullPageScreenshot && !blockScreenshots) {
568
+ return null;
569
+ }
570
+
571
+ if (!useS3) {
572
+ await fs.mkdir(this.config.snapshotDir, { recursive: true });
573
+ }
574
+
575
+ const prefix = `${createSnapshotSlug(targetUrl)}-${createSnapshotRunId()}`;
576
+ const screenshots = {};
577
+
578
+ if (fullPageScreenshot) {
579
+ const fullPageFilename = `${prefix}-full-page.png`;
580
+ try {
581
+ if (useS3) {
582
+ const body = await page.screenshot({ fullPage: true });
583
+ screenshots.fullPage = await this.uploadScreenshotToS3(fullPageFilename, body);
584
+ } else {
585
+ const fullPagePath = path.join(this.config.snapshotDir, fullPageFilename);
586
+ await page.screenshot({
587
+ path: fullPagePath,
588
+ fullPage: true
589
+ });
590
+ screenshots.fullPage = fullPagePath;
591
+ }
592
+ } catch (error) {
593
+ console.warn(
594
+ `[page-analyzer] Failed to capture/upload full-page screenshot: ${getErrorMessage(error)}`
595
+ );
596
+ }
597
+ }
598
+
599
+ if (blockScreenshots) {
600
+ screenshots.blocks = [];
601
+
602
+ for (let index = 0; index < blocks.length; index += 1) {
603
+ const block = blocks[index];
604
+ const blockIdx = getBlockNumber(block, index);
605
+
606
+ const blockLabel = String(blockIdx).padStart(3, '0').replace(/[^0-9a-z-]+/gi, '-');
607
+ const blockFilename = `${prefix}-block-${blockLabel}.png`;
608
+ try {
609
+ if (useS3) {
610
+ const body = await this.captureBlockScreenshotData(page, block);
611
+ if (!body) {
612
+ continue;
613
+ }
614
+ const url = await this.uploadScreenshotToS3(blockFilename, body);
615
+ const screenshotRecord = {
616
+ blockIdx,
617
+ path: url
618
+ };
619
+ if (typeof block?.blockName === 'string' && block.blockName.trim()) {
620
+ screenshotRecord.blockName = block.blockName.trim();
621
+ }
622
+ if (typeof block?.blockIdxs === 'string' && block.blockIdxs.trim()) {
623
+ screenshotRecord.blockIdxs = block.blockIdxs.trim();
624
+ }
625
+ screenshots.blocks.push(screenshotRecord);
626
+ continue;
627
+ }
628
+
629
+ const blockPath = path.join(this.config.snapshotDir, blockFilename);
630
+ const captured = await this.captureBlockScreenshot(page, block, blockPath);
631
+ if (captured) {
632
+ const screenshotRecord = {
633
+ blockIdx,
634
+ path: blockPath
635
+ };
636
+ if (typeof block?.blockName === 'string' && block.blockName.trim()) {
637
+ screenshotRecord.blockName = block.blockName.trim();
638
+ }
639
+ if (typeof block?.blockIdxs === 'string' && block.blockIdxs.trim()) {
640
+ screenshotRecord.blockIdxs = block.blockIdxs.trim();
641
+ }
642
+ screenshots.blocks.push(screenshotRecord);
643
+ }
644
+ } catch (error) {
645
+ console.warn(
646
+ `[page-analyzer] Failed to capture/upload block ${blockIdx}: ${getErrorMessage(error)}`
647
+ );
648
+ }
649
+ }
650
+ }
651
+
652
+ return screenshots;
653
+ }
654
+
655
+ async preparePage(page, targetUrl) {
656
+ await page.goto(targetUrl, {
657
+ waitUntil: 'domcontentloaded',
658
+ timeout: this.config.timeoutMs
659
+ });
660
+ await scrollToBottom(page);
661
+ await waitForStableHeight(page, { maxWait: this.config.timeoutMs });
662
+ await this.revealHiddenContent(page);
663
+ await this.waitForImagesLoaded(page);
664
+ }
665
+
666
+ async withPreparedPage(url, callback) {
170
667
  const targetUrl = String(url || '').trim();
171
668
  if (!targetUrl) {
172
669
  throw new Error('PageExtractor requires a non-empty URL');
173
670
  }
671
+ if (typeof callback !== 'function') {
672
+ throw new Error('PageExtractor.withPreparedPage requires a callback');
673
+ }
174
674
 
175
675
  const viewport = {
176
676
  width: this.config.viewportWidth,
@@ -181,35 +681,61 @@ export class PageExtractor {
181
681
  const browser = await playwright.chromium.launch({ headless: true });
182
682
  try {
183
683
  const page = await browser.newPage({ viewport });
184
- await page.goto(targetUrl, {
185
- waitUntil: 'domcontentloaded',
186
- timeout: this.config.timeoutMs
187
- });
188
- await scrollToBottom(page);
189
- await waitForStableHeight(page, { maxWait: this.config.timeoutMs });
190
- await this.revealHiddenContent(page);
191
-
192
- const html = await page.content();
193
- const pageSize = await page.evaluate(() => ({
194
- width: document.documentElement.scrollWidth || 0,
195
- height: document.documentElement.scrollHeight || 0
196
- }));
197
-
198
- const minWidth = Math.round(viewport.width * this.config.minBlockWidthRatio);
199
- const blocksResult = await page.evaluate(extractBlocksInBrowser, {
200
- minHeight: this.config.minBlockHeight,
201
- minWidth,
202
- maxHeight: Math.round(viewport.height * this.config.blockMaxHeightRatio),
203
- maxDepth: this.config.blockMaxDepth,
204
- textPreviewMaxChars: this.config.textPreviewMaxChars,
205
- debug: false
206
- });
207
- const blocks = Array.isArray(blocksResult?.blocks) ? blocksResult.blocks : [];
208
- const elementGeometries = await this.collectElementGeometries(page);
209
-
210
- return { html, blocks, elementGeometries, pageSize };
684
+ await this.preparePage(page, targetUrl);
685
+ return await callback(page, targetUrl);
211
686
  } finally {
212
687
  await browser.close();
213
688
  }
214
689
  }
690
+
691
+ async captureUrlScreenshots(url, blocks, options = {}) {
692
+ return this.withPreparedPage(url, async (page, targetUrl) => {
693
+ return await this.captureScreenshots(page, targetUrl, blocks, options);
694
+ });
695
+ }
696
+
697
+ /**
698
+ * Extract page data from an already prepared Playwright page:
699
+ * html, blocks, elementGeometries, screenshots.
700
+ * When config.s3 is provided, screenshots are uploaded to S3 and returned as URLs.
701
+ * @param {import('playwright').Page} page - Prepared Playwright page
702
+ * @param {string} targetUrl - URL loaded in the page
703
+ * @returns {Promise<{html, blocks, elementGeometries, screenshots, pageSize}>}
704
+ */
705
+ async extractPreparedPage(page, targetUrl) {
706
+ const viewport = {
707
+ width: this.config.viewportWidth,
708
+ height: this.config.viewportHeight
709
+ };
710
+
711
+ const html = await page.content();
712
+
713
+ const minWidth = Math.round(viewport.width * this.config.minBlockWidthRatio);
714
+ const blocksResult = await page.evaluate(extractBlocksInBrowser, {
715
+ minHeight: this.config.minBlockHeight,
716
+ minWidth,
717
+ maxHeight: Math.round(viewport.height * this.config.blockMaxHeightRatio),
718
+ maxDepth: this.config.blockMaxDepth,
719
+ textPreviewMaxChars: this.config.textPreviewMaxChars,
720
+ debug: false
721
+ });
722
+ const blocks = Array.isArray(blocksResult?.blocks) ? blocksResult.blocks : [];
723
+ const elementGeometries = await this.collectElementGeometries(page);
724
+ const finalPageSize = await this.collectPageSize(page);
725
+ const screenshots = await this.captureScreenshots(page, targetUrl, blocks);
726
+
727
+ return { html, blocks, elementGeometries, screenshots, pageSize: finalPageSize };
728
+ }
729
+
730
+ /**
731
+ * Extract page data: html, blocks, elementGeometries, screenshots.
732
+ * When config.s3 is provided, screenshots are uploaded to S3 and returned as URLs.
733
+ * @param {string} url - URL to extract
734
+ * @returns {Promise<{html, blocks, elementGeometries, screenshots, pageSize}>}
735
+ */
736
+ async extract(url) {
737
+ return this.withPreparedPage(url, async (page, targetUrl) => {
738
+ return await this.extractPreparedPage(page, targetUrl);
739
+ });
740
+ }
215
741
  }