page-analyzer 1.1.1 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -151,6 +151,7 @@ const result = await analyzeUrl('https://example.com', {
151
151
  | `options.fullPageScreenshot` | `boolean` | 否 | 是否保存整页截图到当前运行目录的 `snapshots/` 并返回文件路径 |
152
152
  | `options.blockScreenshots` | `boolean` | 否 | 是否在 LLM 合并区块后,保存每个逻辑区块截图到当前运行目录的 `snapshots/` 并返回文件路径 |
153
153
  | `options.waitForImagesLoaded` | `boolean` | 否 | 是否在提取区块、分析和截图前等待页面图片加载完成,默认 `false` |
154
+ | `options.extractorConfig.s3` | `object` | 否 | 截图 S3 上传配置。配置后截图上传到 S3,返回 HTTPS URL;未配置时仍保存到本地 `snapshots/` |
154
155
 
155
156
  ### analyzePageEvents(input)
156
157
 
@@ -249,6 +250,8 @@ const result = await analyzePageEvents({
249
250
 
250
251
  启用 `blockScreenshots: true` 后,模块会在 LLM 合并区块后再截图。返回结果会包含 `screenshots.blocks`,每项包含逻辑区块序号 `blockIdx` 和对应截图 `path`;区块分析结果中的每个 block 也会额外带上 `blockScreenshotPaths`,每个逻辑区块最多对应一张截图。无法通过 `blockCssPath` 截图的隐藏或空区块会被跳过。
251
252
 
253
+ 如果配置 `extractorConfig.s3`,截图不会写入本地 `snapshots/`,而是直接上传到 S3;`screenshots.fullPage`、`screenshots.blocks[].path` 和 `blockScreenshotPaths` 会返回 HTTPS URL。上传不会设置 ACL,访问权限沿用 bucket 策略。单张截图默认最多尝试上传 3 次(首次上传加 2 次重试,可通过 `extractorConfig.s3.maxUploadAttempts` 调整),仍失败则跳过该截图。
254
+
252
255
  启用 `waitForImagesLoaded: true` 后,模块会先滚动页面触发懒加载,再等待当前 DOM 中的 `<img>` 完成加载或失败,之后再提取区块、分析和截图;等待时间受 `extractorConfig.timeoutMs` 控制。
253
256
 
254
257
  截图参数启用后的新增输出示例:
@@ -284,6 +287,36 @@ const result = await analyzePageEvents({
284
287
  | `blockMaxDepth` | `15` | 区块提取最大 DOM 深度 |
285
288
  | `textPreviewMaxChars` | `1200` | 区块文本预览最大长度 |
286
289
  | `waitForImagesLoaded` | `false` | 是否在提取区块、分析和截图前等待页面图片加载完成 |
290
+ | `s3` | 无 | 截图 S3 上传配置。配置后截图直接上传到 S3,未配置时保存到本地 |
291
+
292
+ S3 截图上传示例:
293
+
294
+ ```js
295
+ const result = await analyzeUrl('https://example.com', {
296
+ fullPageScreenshot: true,
297
+ blockScreenshots: true,
298
+ llm: {
299
+ apiKey: process.env.LLM_API_KEY,
300
+ apiEndpoint: process.env.LLM_API_ENDPOINT,
301
+ model: process.env.LLM_MODEL
302
+ },
303
+ extractorConfig: {
304
+ s3: {
305
+ bucket: 'my-bucket',
306
+ region: 'ap-northeast-1',
307
+ prefix: 'page-analyzer/snapshots',
308
+ publicBaseUrl: 'https://cdn.example.com',
309
+ credentials: {
310
+ accessKeyId: process.env.AWS_ACCESS_KEY_ID,
311
+ secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY,
312
+ sessionToken: process.env.AWS_SESSION_TOKEN
313
+ }
314
+ }
315
+ }
316
+ });
317
+ ```
318
+
319
+ `extractorConfig.s3.bucket` 和 `extractorConfig.s3.region` 必填。`credentials` 可省略,省略时使用 AWS SDK 默认凭证链。`publicBaseUrl` 可省略,省略时返回 `https://<bucket>.s3.<region>.amazonaws.com/<key>`;配置后返回 `${publicBaseUrl}/<key>`。
287
320
 
288
321
  ### parserConfig
289
322
 
package/index.js CHANGED
@@ -292,6 +292,7 @@ function buildPageAnalysisResult({
292
292
  * @param {boolean} [options.fullPageScreenshot=false] - Save a full-page screenshot to snapshots/ and return its path.
293
293
  * @param {boolean} [options.blockScreenshots=false] - Save one screenshot per merged logical block to snapshots/ and return their paths.
294
294
  * @param {boolean} [options.waitForImagesLoaded=false] - Wait for page images before extracting and screenshotting.
295
+ * @param {Object} [options.extractorConfig.s3] - Optional S3 config for uploading screenshots instead of saving locally.
295
296
  * @returns {Promise<Object>} Analysis result. Event and idx fields are omitted unless requested.
296
297
  */
297
298
  export async function analyzeUrl(url, options = {}) {
@@ -323,47 +324,50 @@ export async function analyzeUrl(url, options = {}) {
323
324
  blockScreenshots: false,
324
325
  waitForImagesLoaded: waitForImagesLoaded ?? extractorConfig?.waitForImagesLoaded
325
326
  });
326
- const bundle = await extractor.extract(url);
327
- console.log(`[page-analyzer] Extracted: ${bundle.blocks.length} blocks, ${bundle.elementGeometries.length} geometries`);
328
327
 
329
- // Derive domain from URL
330
- let domain = '';
331
- try { domain = new URL(url).hostname.replace(/^www\./, ''); } catch { /* ignore */ }
332
-
333
- let result = await analyzePageEvents({
334
- html: bundle.html,
335
- url,
336
- blocks: bundle.blocks,
337
- elementGeometries: bundle.elementGeometries,
338
- llm: llmConfig,
339
- knownEventTypes,
340
- parserConfig,
341
- showEvents,
342
- showBlockIdx,
343
- screenshots: bundle.screenshots,
344
- domain,
345
- nodeId: `${domain}-root`
346
- });
347
-
348
- if (shouldCaptureBlocks) {
349
- const logicalBlocks = Array.isArray(result?.analysis?.block_analysis?.blocks)
350
- ? result.analysis.block_analysis.blocks
351
- : [];
352
- const blockScreenshotsBundle = await extractor.captureUrlScreenshots(url, logicalBlocks, {
353
- fullPageScreenshot: false,
354
- blockScreenshots: true
328
+ return await extractor.withPreparedPage(url, async (page, targetUrl) => {
329
+ const bundle = await extractor.extractPreparedPage(page, targetUrl);
330
+ console.log(`[page-analyzer] Extracted: ${bundle.blocks.length} blocks, ${bundle.elementGeometries.length} geometries`);
331
+
332
+ // Derive domain from URL
333
+ let domain = '';
334
+ try { domain = new URL(targetUrl).hostname.replace(/^www\./, ''); } catch { /* ignore */ }
335
+
336
+ let result = await analyzePageEvents({
337
+ html: bundle.html,
338
+ url: targetUrl,
339
+ blocks: bundle.blocks,
340
+ elementGeometries: bundle.elementGeometries,
341
+ llm: llmConfig,
342
+ knownEventTypes,
343
+ parserConfig,
344
+ showEvents,
345
+ showBlockIdx,
346
+ screenshots: bundle.screenshots,
347
+ domain,
348
+ nodeId: `${domain}-root`
355
349
  });
356
- const screenshots = mergeScreenshots(result.screenshots, blockScreenshotsBundle);
357
- result = attachLogicalBlockScreenshotPaths(
358
- {
359
- ...result,
360
- ...(screenshots ? { screenshots } : {})
361
- },
362
- screenshots
363
- );
364
- }
365
350
 
366
- return result;
351
+ if (shouldCaptureBlocks) {
352
+ const logicalBlocks = Array.isArray(result?.analysis?.block_analysis?.blocks)
353
+ ? result.analysis.block_analysis.blocks
354
+ : [];
355
+ const blockScreenshotsBundle = await extractor.captureScreenshots(page, targetUrl, logicalBlocks, {
356
+ fullPageScreenshot: false,
357
+ blockScreenshots: true
358
+ });
359
+ const screenshots = mergeScreenshots(result.screenshots, blockScreenshotsBundle);
360
+ result = attachLogicalBlockScreenshotPaths(
361
+ {
362
+ ...result,
363
+ ...(screenshots ? { screenshots } : {})
364
+ },
365
+ screenshots
366
+ );
367
+ }
368
+
369
+ return result;
370
+ });
367
371
  }
368
372
 
369
373
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "page-analyzer",
3
- "version": "1.1.1",
3
+ "version": "1.2.1",
4
4
  "type": "module",
5
5
  "description": "Standalone page analysis module.",
6
6
  "license": "MIT",
@@ -11,6 +11,7 @@
11
11
  "viewer": "node scripts/serve-result-viewer.js"
12
12
  },
13
13
  "dependencies": {
14
+ "@aws-sdk/client-s3": "^3.1045.0",
14
15
  "cheerio": "^1.2.0",
15
16
  "csv-parse": "^5.6.0",
16
17
  "playwright": "^1.58.2"
package/page-extractor.js CHANGED
@@ -5,6 +5,7 @@
5
5
 
6
6
  import fs from 'node:fs/promises';
7
7
  import path from 'node:path';
8
+ import { PutObjectCommand, S3Client } from '@aws-sdk/client-s3';
8
9
 
9
10
  // In-browser block extraction function (serialized into page.evaluate)
10
11
  // Imported from the project's extract-blocks script
@@ -49,6 +50,75 @@ function getBlockSelector(block) {
49
50
  return selector || '';
50
51
  }
51
52
 
53
+ function isObject(value) {
54
+ return value && typeof value === 'object' && !Array.isArray(value);
55
+ }
56
+
57
+ function normalizeS3Prefix(value) {
58
+ return String(value || '')
59
+ .trim()
60
+ .replace(/^\/+|\/+$/g, '');
61
+ }
62
+
63
+ function normalizePublicBaseUrl(value) {
64
+ return String(value || '')
65
+ .trim()
66
+ .replace(/\/+$/g, '');
67
+ }
68
+
69
+ function normalizeS3Config(config) {
70
+ if (config == null) {
71
+ return null;
72
+ }
73
+ if (!isObject(config)) {
74
+ throw new Error('extractorConfig.s3 must be an object');
75
+ }
76
+
77
+ const bucket = String(config.bucket || '').trim();
78
+ const region = String(config.region || '').trim();
79
+ if (!bucket) {
80
+ throw new Error('extractorConfig.s3.bucket is required');
81
+ }
82
+ if (!region) {
83
+ throw new Error('extractorConfig.s3.region is required');
84
+ }
85
+
86
+ return {
87
+ bucket,
88
+ region,
89
+ prefix: normalizeS3Prefix(config.prefix),
90
+ publicBaseUrl: normalizePublicBaseUrl(config.publicBaseUrl),
91
+ credentials: isObject(config.credentials) ? config.credentials : undefined,
92
+ client: config.client,
93
+ maxUploadAttempts: Number.isInteger(config.maxUploadAttempts)
94
+ ? Math.max(1, config.maxUploadAttempts)
95
+ : 3
96
+ };
97
+ }
98
+
99
+ function joinS3Key(prefix, filename) {
100
+ return [prefix, filename].filter(Boolean).join('/');
101
+ }
102
+
103
+ function encodeS3Key(key) {
104
+ return String(key || '')
105
+ .split('/')
106
+ .map((part) => encodeURIComponent(part))
107
+ .join('/');
108
+ }
109
+
110
+ function buildS3Url(s3Config, key) {
111
+ if (s3Config.publicBaseUrl) {
112
+ return `${s3Config.publicBaseUrl}/${encodeS3Key(key)}`;
113
+ }
114
+
115
+ return `https://${s3Config.bucket}.s3.${s3Config.region}.amazonaws.com/${encodeS3Key(key)}`;
116
+ }
117
+
118
+ function getErrorMessage(error) {
119
+ return error instanceof Error ? error.message : String(error);
120
+ }
121
+
52
122
  export class PageExtractor {
53
123
  constructor(config = {}) {
54
124
  this.config = {
@@ -71,9 +141,11 @@ export class PageExtractor {
71
141
  blockScreenshots: Boolean(config.blockScreenshots),
72
142
  snapshotDir: typeof config.snapshotDir === 'string' && config.snapshotDir.trim()
73
143
  ? path.resolve(process.cwd(), config.snapshotDir)
74
- : path.resolve(process.cwd(), 'snapshots')
144
+ : path.resolve(process.cwd(), 'snapshots'),
145
+ s3: normalizeS3Config(config.s3)
75
146
  };
76
147
  this.playwrightModule = null;
148
+ this.s3Client = null;
77
149
  }
78
150
 
79
151
  async getPlaywright() {
@@ -85,6 +157,60 @@ export class PageExtractor {
85
157
  return this.playwrightModule;
86
158
  }
87
159
 
160
+ getS3Client() {
161
+ if (!this.config.s3) {
162
+ return null;
163
+ }
164
+ if (this.config.s3.client) {
165
+ return this.config.s3.client;
166
+ }
167
+ if (this.s3Client) {
168
+ return this.s3Client;
169
+ }
170
+
171
+ this.s3Client = new S3Client({
172
+ region: this.config.s3.region,
173
+ credentials: this.config.s3.credentials,
174
+ maxAttempts: 1
175
+ });
176
+ return this.s3Client;
177
+ }
178
+
179
+ async uploadScreenshotToS3(filename, body) {
180
+ const s3Config = this.config.s3;
181
+ if (!s3Config) {
182
+ throw new Error('S3 is not configured');
183
+ }
184
+
185
+ const key = joinS3Key(s3Config.prefix, filename);
186
+ const client = this.getS3Client();
187
+ const commandInput = {
188
+ Bucket: s3Config.bucket,
189
+ Key: key,
190
+ Body: body,
191
+ ContentType: 'image/png'
192
+ };
193
+
194
+ let lastError = null;
195
+ for (let attempt = 1; attempt <= s3Config.maxUploadAttempts; attempt += 1) {
196
+ try {
197
+ const command = new PutObjectCommand(commandInput);
198
+ await client.send(command);
199
+ return buildS3Url(s3Config, key);
200
+ } catch (error) {
201
+ lastError = error;
202
+ if (attempt < s3Config.maxUploadAttempts) {
203
+ console.warn(
204
+ `[page-analyzer] Failed to upload ${key} to S3, retrying ` +
205
+ `(${attempt}/${s3Config.maxUploadAttempts}): ${getErrorMessage(error)}`
206
+ );
207
+ }
208
+ }
209
+ }
210
+
211
+ throw lastError;
212
+ }
213
+
88
214
  async revealHiddenContent(page) {
89
215
  return page.evaluate(() => {
90
216
  const CONTENT_THRESHOLD = 20;
@@ -401,14 +527,14 @@ export class PageExtractor {
401
527
  });
402
528
  }
403
529
 
404
- async captureBlockScreenshot(page, block, blockPath) {
530
+ async captureBlockScreenshotData(page, block, screenshotOptions = {}) {
405
531
  if (block?.hidden) {
406
- return false;
532
+ return null;
407
533
  }
408
534
 
409
535
  const selector = getBlockSelector(block);
410
536
  if (!selector) {
411
- return false;
537
+ return null;
412
538
  }
413
539
 
414
540
  try {
@@ -416,8 +542,7 @@ export class PageExtractor {
416
542
  if (await locator.count() > 0) {
417
543
  await this.hideExternalFixedOverlays(page, selector);
418
544
  try {
419
- await locator.screenshot({ path: blockPath });
420
- return true;
545
+ return await locator.screenshot(screenshotOptions);
421
546
  } finally {
422
547
  await this.restoreExternalFixedOverlays(page);
423
548
  }
@@ -426,28 +551,49 @@ export class PageExtractor {
426
551
  // Selector-only mode: skip blocks that cannot be captured through CSS.
427
552
  }
428
553
 
429
- return false;
554
+ return null;
555
+ }
556
+
557
+ async captureBlockScreenshot(page, block, blockPath) {
558
+ const body = await this.captureBlockScreenshotData(page, block, { path: blockPath });
559
+ return Boolean(body) || body === undefined;
430
560
  }
431
561
 
432
562
  async captureScreenshots(page, targetUrl, blocks, options = {}) {
433
563
  const fullPageScreenshot = options.fullPageScreenshot ?? this.config.fullPageScreenshot;
434
564
  const blockScreenshots = options.blockScreenshots ?? this.config.blockScreenshots;
565
+ const useS3 = Boolean(this.config.s3);
435
566
 
436
567
  if (!fullPageScreenshot && !blockScreenshots) {
437
568
  return null;
438
569
  }
439
570
 
440
- await fs.mkdir(this.config.snapshotDir, { recursive: true });
571
+ if (!useS3) {
572
+ await fs.mkdir(this.config.snapshotDir, { recursive: true });
573
+ }
574
+
441
575
  const prefix = `${createSnapshotSlug(targetUrl)}-${createSnapshotRunId()}`;
442
576
  const screenshots = {};
443
577
 
444
578
  if (fullPageScreenshot) {
445
- const fullPagePath = path.join(this.config.snapshotDir, `${prefix}-full-page.png`);
446
- await page.screenshot({
447
- path: fullPagePath,
448
- fullPage: true
449
- });
450
- screenshots.fullPage = fullPagePath;
579
+ const fullPageFilename = `${prefix}-full-page.png`;
580
+ try {
581
+ if (useS3) {
582
+ const body = await page.screenshot({ fullPage: true });
583
+ screenshots.fullPage = await this.uploadScreenshotToS3(fullPageFilename, body);
584
+ } else {
585
+ const fullPagePath = path.join(this.config.snapshotDir, fullPageFilename);
586
+ await page.screenshot({
587
+ path: fullPagePath,
588
+ fullPage: true
589
+ });
590
+ screenshots.fullPage = fullPagePath;
591
+ }
592
+ } catch (error) {
593
+ console.warn(
594
+ `[page-analyzer] Failed to capture/upload full-page screenshot: ${getErrorMessage(error)}`
595
+ );
596
+ }
451
597
  }
452
598
 
453
599
  if (blockScreenshots) {
@@ -458,8 +604,29 @@ export class PageExtractor {
458
604
  const blockIdx = getBlockNumber(block, index);
459
605
 
460
606
  const blockLabel = String(blockIdx).padStart(3, '0').replace(/[^0-9a-z-]+/gi, '-');
461
- const blockPath = path.join(this.config.snapshotDir, `${prefix}-block-${blockLabel}.png`);
607
+ const blockFilename = `${prefix}-block-${blockLabel}.png`;
462
608
  try {
609
+ if (useS3) {
610
+ const body = await this.captureBlockScreenshotData(page, block);
611
+ if (!body) {
612
+ continue;
613
+ }
614
+ const url = await this.uploadScreenshotToS3(blockFilename, body);
615
+ const screenshotRecord = {
616
+ blockIdx,
617
+ path: url
618
+ };
619
+ if (typeof block?.blockName === 'string' && block.blockName.trim()) {
620
+ screenshotRecord.blockName = block.blockName.trim();
621
+ }
622
+ if (typeof block?.blockIdxs === 'string' && block.blockIdxs.trim()) {
623
+ screenshotRecord.blockIdxs = block.blockIdxs.trim();
624
+ }
625
+ screenshots.blocks.push(screenshotRecord);
626
+ continue;
627
+ }
628
+
629
+ const blockPath = path.join(this.config.snapshotDir, blockFilename);
463
630
  const captured = await this.captureBlockScreenshot(page, block, blockPath);
464
631
  if (captured) {
465
632
  const screenshotRecord = {
@@ -475,7 +642,9 @@ export class PageExtractor {
475
642
  screenshots.blocks.push(screenshotRecord);
476
643
  }
477
644
  } catch (error) {
478
- console.warn(`[page-analyzer] Failed to capture block ${blockIdx}: ${error.message}`);
645
+ console.warn(
646
+ `[page-analyzer] Failed to capture/upload block ${blockIdx}: ${getErrorMessage(error)}`
647
+ );
479
648
  }
480
649
  }
481
650
  }
@@ -494,11 +663,14 @@ export class PageExtractor {
494
663
  await this.waitForImagesLoaded(page);
495
664
  }
496
665
 
497
- async captureUrlScreenshots(url, blocks, options = {}) {
666
+ async withPreparedPage(url, callback) {
498
667
  const targetUrl = String(url || '').trim();
499
668
  if (!targetUrl) {
500
669
  throw new Error('PageExtractor requires a non-empty URL');
501
670
  }
671
+ if (typeof callback !== 'function') {
672
+ throw new Error('PageExtractor.withPreparedPage requires a callback');
673
+ }
502
674
 
503
675
  const viewport = {
504
676
  width: this.config.viewportWidth,
@@ -510,53 +682,60 @@ export class PageExtractor {
510
682
  try {
511
683
  const page = await browser.newPage({ viewport });
512
684
  await this.preparePage(page, targetUrl);
513
- return await this.captureScreenshots(page, targetUrl, blocks, options);
685
+ return await callback(page, targetUrl);
514
686
  } finally {
515
687
  await browser.close();
516
688
  }
517
689
  }
518
690
 
691
+ async captureUrlScreenshots(url, blocks, options = {}) {
692
+ return this.withPreparedPage(url, async (page, targetUrl) => {
693
+ return await this.captureScreenshots(page, targetUrl, blocks, options);
694
+ });
695
+ }
696
+
519
697
  /**
520
- * Extract page data: html, blocks, elementGeometries, screenshots
521
- * @param {string} url - URL to extract
698
+ * Extract page data from an already prepared Playwright page:
699
+ * html, blocks, elementGeometries, screenshots.
700
+ * When config.s3 is provided, screenshots are uploaded to S3 and returned as URLs.
701
+ * @param {import('playwright').Page} page - Prepared Playwright page
702
+ * @param {string} targetUrl - URL loaded in the page
522
703
  * @returns {Promise<{html, blocks, elementGeometries, screenshots, pageSize}>}
523
704
  */
524
- async extract(url) {
525
- const targetUrl = String(url || '').trim();
526
- if (!targetUrl) {
527
- throw new Error('PageExtractor requires a non-empty URL');
528
- }
529
-
705
+ async extractPreparedPage(page, targetUrl) {
530
706
  const viewport = {
531
707
  width: this.config.viewportWidth,
532
708
  height: this.config.viewportHeight
533
709
  };
534
710
 
535
- const playwright = await this.getPlaywright();
536
- const browser = await playwright.chromium.launch({ headless: true });
537
- try {
538
- const page = await browser.newPage({ viewport });
539
- await this.preparePage(page, targetUrl);
711
+ const html = await page.content();
540
712
 
541
- const html = await page.content();
713
+ const minWidth = Math.round(viewport.width * this.config.minBlockWidthRatio);
714
+ const blocksResult = await page.evaluate(extractBlocksInBrowser, {
715
+ minHeight: this.config.minBlockHeight,
716
+ minWidth,
717
+ maxHeight: Math.round(viewport.height * this.config.blockMaxHeightRatio),
718
+ maxDepth: this.config.blockMaxDepth,
719
+ textPreviewMaxChars: this.config.textPreviewMaxChars,
720
+ debug: false
721
+ });
722
+ const blocks = Array.isArray(blocksResult?.blocks) ? blocksResult.blocks : [];
723
+ const elementGeometries = await this.collectElementGeometries(page);
724
+ const finalPageSize = await this.collectPageSize(page);
725
+ const screenshots = await this.captureScreenshots(page, targetUrl, blocks);
542
726
 
543
- const minWidth = Math.round(viewport.width * this.config.minBlockWidthRatio);
544
- const blocksResult = await page.evaluate(extractBlocksInBrowser, {
545
- minHeight: this.config.minBlockHeight,
546
- minWidth,
547
- maxHeight: Math.round(viewport.height * this.config.blockMaxHeightRatio),
548
- maxDepth: this.config.blockMaxDepth,
549
- textPreviewMaxChars: this.config.textPreviewMaxChars,
550
- debug: false
551
- });
552
- const blocks = Array.isArray(blocksResult?.blocks) ? blocksResult.blocks : [];
553
- const elementGeometries = await this.collectElementGeometries(page);
554
- const finalPageSize = await this.collectPageSize(page);
555
- const screenshots = await this.captureScreenshots(page, targetUrl, blocks);
727
+ return { html, blocks, elementGeometries, screenshots, pageSize: finalPageSize };
728
+ }
556
729
 
557
- return { html, blocks, elementGeometries, screenshots, pageSize: finalPageSize };
558
- } finally {
559
- await browser.close();
560
- }
730
+ /**
731
+ * Extract page data: html, blocks, elementGeometries, screenshots.
732
+ * When config.s3 is provided, screenshots are uploaded to S3 and returned as URLs.
733
+ * @param {string} url - URL to extract
734
+ * @returns {Promise<{html, blocks, elementGeometries, screenshots, pageSize}>}
735
+ */
736
+ async extract(url) {
737
+ return this.withPreparedPage(url, async (page, targetUrl) => {
738
+ return await this.extractPreparedPage(page, targetUrl);
739
+ });
561
740
  }
562
741
  }