page-analyzer 1.1.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +33 -0
- package/index.js +42 -38
- package/package.json +2 -1
- package/page-extractor.js +228 -49
- package/result-viewer.html +214 -29
- package/scripts/build-result-viewer.js +214 -29
- package/test/smoke.test.js +242 -1
package/README.md
CHANGED
|
@@ -151,6 +151,7 @@ const result = await analyzeUrl('https://example.com', {
|
|
|
151
151
|
| `options.fullPageScreenshot` | `boolean` | 否 | 是否保存整页截图到当前运行目录的 `snapshots/` 并返回文件路径 |
|
|
152
152
|
| `options.blockScreenshots` | `boolean` | 否 | 是否在 LLM 合并区块后,保存每个逻辑区块截图到当前运行目录的 `snapshots/` 并返回文件路径 |
|
|
153
153
|
| `options.waitForImagesLoaded` | `boolean` | 否 | 是否在提取区块、分析和截图前等待页面图片加载完成,默认 `false` |
|
|
154
|
+
| `options.extractorConfig.s3` | `object` | 否 | 截图 S3 上传配置。配置后截图上传到 S3,返回 HTTPS URL;未配置时仍保存到本地 `snapshots/` |
|
|
154
155
|
|
|
155
156
|
### analyzePageEvents(input)
|
|
156
157
|
|
|
@@ -249,6 +250,8 @@ const result = await analyzePageEvents({
|
|
|
249
250
|
|
|
250
251
|
启用 `blockScreenshots: true` 后,模块会在 LLM 合并区块后再截图。返回结果会包含 `screenshots.blocks`,每项包含逻辑区块序号 `blockIdx` 和对应截图 `path`;区块分析结果中的每个 block 也会额外带上 `blockScreenshotPaths`,每个逻辑区块最多对应一张截图。无法通过 `blockCssPath` 截图的隐藏或空区块会被跳过。
|
|
251
252
|
|
|
253
|
+
如果配置 `extractorConfig.s3`,截图不会写入本地 `snapshots/`,而是直接上传到 S3;`screenshots.fullPage`、`screenshots.blocks[].path` 和 `blockScreenshotPaths` 会返回 HTTPS URL。上传不会设置 ACL,访问权限沿用 bucket 策略。单张截图上传默认最多尝试 3 次(含首次上传,可通过 `extractorConfig.s3.maxUploadAttempts` 调整),仍失败则跳过该截图。
|
|
254
|
+
|
|
252
255
|
启用 `waitForImagesLoaded: true` 后,模块会先滚动页面触发懒加载,再等待当前 DOM 中的 `<img>` 完成加载或失败,之后再提取区块、分析和截图;等待时间受 `extractorConfig.timeoutMs` 控制。
|
|
253
256
|
|
|
254
257
|
截图参数启用后的新增输出示例:
|
|
@@ -284,6 +287,36 @@ const result = await analyzePageEvents({
|
|
|
284
287
|
| `blockMaxDepth` | `15` | 区块提取最大 DOM 深度 |
|
|
285
288
|
| `textPreviewMaxChars` | `1200` | 区块文本预览最大长度 |
|
|
286
289
|
| `waitForImagesLoaded` | `false` | 是否在提取区块、分析和截图前等待页面图片加载完成 |
|
|
290
|
+
| `s3` | 无 | 截图 S3 上传配置。配置后截图直接上传到 S3,未配置时保存到本地 |
|
|
291
|
+
|
|
292
|
+
S3 截图上传示例:
|
|
293
|
+
|
|
294
|
+
```js
|
|
295
|
+
const result = await analyzeUrl('https://example.com', {
|
|
296
|
+
fullPageScreenshot: true,
|
|
297
|
+
blockScreenshots: true,
|
|
298
|
+
llm: {
|
|
299
|
+
apiKey: process.env.LLM_API_KEY,
|
|
300
|
+
apiEndpoint: process.env.LLM_API_ENDPOINT,
|
|
301
|
+
model: process.env.LLM_MODEL
|
|
302
|
+
},
|
|
303
|
+
extractorConfig: {
|
|
304
|
+
s3: {
|
|
305
|
+
bucket: 'my-bucket',
|
|
306
|
+
region: 'ap-northeast-1',
|
|
307
|
+
prefix: 'page-analyzer/snapshots',
|
|
308
|
+
publicBaseUrl: 'https://cdn.example.com/page-analyzer/snapshots',
|
|
309
|
+
credentials: {
|
|
310
|
+
accessKeyId: process.env.AWS_ACCESS_KEY_ID,
|
|
311
|
+
secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY,
|
|
312
|
+
sessionToken: process.env.AWS_SESSION_TOKEN
|
|
313
|
+
}
|
|
314
|
+
}
|
|
315
|
+
}
|
|
316
|
+
});
|
|
317
|
+
```
|
|
318
|
+
|
|
319
|
+
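`extractorConfig.s3.bucket` 和 `extractorConfig.s3.region` 必填。`credentials` 可省略,省略时使用 AWS SDK 默认凭证链。`publicBaseUrl` 可省略,省略时返回 `https://<bucket>.s3.<region>.amazonaws.com/<key>`。配置 `publicBaseUrl` 时,返回的 URL 为 `<publicBaseUrl>/<文件名>`,不会再拼接 `prefix`,因此 `publicBaseUrl` 本身需要指向 `prefix` 对应的目录(如上例所示)。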
|
|
287
320
|
|
|
288
321
|
### parserConfig
|
|
289
322
|
|
package/index.js
CHANGED
|
@@ -292,6 +292,7 @@ function buildPageAnalysisResult({
|
|
|
292
292
|
* @param {boolean} [options.fullPageScreenshot=false] - Save a full-page screenshot to snapshots/ and return its path.
|
|
293
293
|
* @param {boolean} [options.blockScreenshots=false] - Save one screenshot per merged logical block to snapshots/ and return their paths.
|
|
294
294
|
* @param {boolean} [options.waitForImagesLoaded=false] - Wait for page images before extracting and screenshotting.
|
|
295
|
+
* @param {Object} [options.extractorConfig.s3] - Optional S3 config for uploading screenshots instead of saving locally.
|
|
295
296
|
* @returns {Promise<Object>} Analysis result. Event and idx fields are omitted unless requested.
|
|
296
297
|
*/
|
|
297
298
|
export async function analyzeUrl(url, options = {}) {
|
|
@@ -323,47 +324,50 @@ export async function analyzeUrl(url, options = {}) {
|
|
|
323
324
|
blockScreenshots: false,
|
|
324
325
|
waitForImagesLoaded: waitForImagesLoaded ?? extractorConfig?.waitForImagesLoaded
|
|
325
326
|
});
|
|
326
|
-
const bundle = await extractor.extract(url);
|
|
327
|
-
console.log(`[page-analyzer] Extracted: ${bundle.blocks.length} blocks, ${bundle.elementGeometries.length} geometries`);
|
|
328
327
|
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
? result.analysis.block_analysis.blocks
|
|
351
|
-
: [];
|
|
352
|
-
const blockScreenshotsBundle = await extractor.captureUrlScreenshots(url, logicalBlocks, {
|
|
353
|
-
fullPageScreenshot: false,
|
|
354
|
-
blockScreenshots: true
|
|
328
|
+
return await extractor.withPreparedPage(url, async (page, targetUrl) => {
|
|
329
|
+
const bundle = await extractor.extractPreparedPage(page, targetUrl);
|
|
330
|
+
console.log(`[page-analyzer] Extracted: ${bundle.blocks.length} blocks, ${bundle.elementGeometries.length} geometries`);
|
|
331
|
+
|
|
332
|
+
// Derive domain from URL
|
|
333
|
+
let domain = '';
|
|
334
|
+
try { domain = new URL(targetUrl).hostname.replace(/^www\./, ''); } catch { /* ignore */ }
|
|
335
|
+
|
|
336
|
+
let result = await analyzePageEvents({
|
|
337
|
+
html: bundle.html,
|
|
338
|
+
url: targetUrl,
|
|
339
|
+
blocks: bundle.blocks,
|
|
340
|
+
elementGeometries: bundle.elementGeometries,
|
|
341
|
+
llm: llmConfig,
|
|
342
|
+
knownEventTypes,
|
|
343
|
+
parserConfig,
|
|
344
|
+
showEvents,
|
|
345
|
+
showBlockIdx,
|
|
346
|
+
screenshots: bundle.screenshots,
|
|
347
|
+
domain,
|
|
348
|
+
nodeId: `${domain}-root`
|
|
355
349
|
});
|
|
356
|
-
const screenshots = mergeScreenshots(result.screenshots, blockScreenshotsBundle);
|
|
357
|
-
result = attachLogicalBlockScreenshotPaths(
|
|
358
|
-
{
|
|
359
|
-
...result,
|
|
360
|
-
...(screenshots ? { screenshots } : {})
|
|
361
|
-
},
|
|
362
|
-
screenshots
|
|
363
|
-
);
|
|
364
|
-
}
|
|
365
350
|
|
|
366
|
-
|
|
351
|
+
if (shouldCaptureBlocks) {
|
|
352
|
+
const logicalBlocks = Array.isArray(result?.analysis?.block_analysis?.blocks)
|
|
353
|
+
? result.analysis.block_analysis.blocks
|
|
354
|
+
: [];
|
|
355
|
+
const blockScreenshotsBundle = await extractor.captureScreenshots(page, targetUrl, logicalBlocks, {
|
|
356
|
+
fullPageScreenshot: false,
|
|
357
|
+
blockScreenshots: true
|
|
358
|
+
});
|
|
359
|
+
const screenshots = mergeScreenshots(result.screenshots, blockScreenshotsBundle);
|
|
360
|
+
result = attachLogicalBlockScreenshotPaths(
|
|
361
|
+
{
|
|
362
|
+
...result,
|
|
363
|
+
...(screenshots ? { screenshots } : {})
|
|
364
|
+
},
|
|
365
|
+
screenshots
|
|
366
|
+
);
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
return result;
|
|
370
|
+
});
|
|
367
371
|
}
|
|
368
372
|
|
|
369
373
|
/**
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "page-analyzer",
|
|
3
|
-
"version": "1.1.1",
|
|
3
|
+
"version": "1.2.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Standalone page analysis module.",
|
|
6
6
|
"license": "MIT",
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
"viewer": "node scripts/serve-result-viewer.js"
|
|
12
12
|
},
|
|
13
13
|
"dependencies": {
|
|
14
|
+
"@aws-sdk/client-s3": "^3.1045.0",
|
|
14
15
|
"cheerio": "^1.2.0",
|
|
15
16
|
"csv-parse": "^5.6.0",
|
|
16
17
|
"playwright": "^1.58.2"
|
package/page-extractor.js
CHANGED
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
|
|
6
6
|
import fs from 'node:fs/promises';
|
|
7
7
|
import path from 'node:path';
|
|
8
|
+
import { PutObjectCommand, S3Client } from '@aws-sdk/client-s3';
|
|
8
9
|
|
|
9
10
|
// In-browser block extraction function (serialized into page.evaluate)
|
|
10
11
|
// Imported from the project's extract-blocks script
|
|
@@ -49,6 +50,75 @@ function getBlockSelector(block) {
|
|
|
49
50
|
return selector || '';
|
|
50
51
|
}
|
|
51
52
|
|
|
53
|
+
/**
 * Report whether `value` is a non-null, non-array object.
 *
 * Always returns a strict boolean, so falsy inputs such as `null`, `0` or `''`
 * yield `false` instead of being passed through to the caller.
 *
 * @param {*} value - Value to inspect.
 * @returns {boolean} `true` only for objects that are not arrays.
 */
function isObject(value) {
  return value !== null && typeof value === 'object' && !Array.isArray(value);
}
|
|
56
|
+
|
|
57
|
+
/**
 * Normalize an S3 key prefix: coerce to string, trim surrounding whitespace,
 * then strip any leading and trailing slashes.
 *
 * @param {*} value - Raw prefix; any falsy value is treated as an empty prefix.
 * @returns {string} Prefix without leading/trailing slashes (possibly empty).
 */
function normalizeS3Prefix(value) {
  const text = value ? String(value) : '';
  const trimmed = text.trim();
  return trimmed.replace(/^\/+|\/+$/g, '');
}
|
|
62
|
+
|
|
63
|
+
/**
 * Normalize a public base URL: coerce to string, trim surrounding whitespace,
 * then drop any trailing slashes so a file name can be appended with a single `/`.
 *
 * @param {*} value - Raw base URL; any falsy value is treated as "not configured".
 * @returns {string} Base URL without trailing slashes (possibly empty).
 */
function normalizePublicBaseUrl(value) {
  const text = value ? String(value) : '';
  const trimmed = text.trim();
  return trimmed.replace(/\/+$/g, '');
}
|
|
68
|
+
|
|
69
|
+
/**
 * Validate and normalize the optional `extractorConfig.s3` settings.
 *
 * @param {*} config - Raw S3 config; `null`/`undefined` means S3 upload is not configured.
 * @returns {?{bucket: string, region: string, prefix: string, publicBaseUrl: string,
 *   credentials: (Object|undefined), client: *, maxUploadAttempts: number}}
 *   Normalized config, or `null` when no config was supplied.
 * @throws {Error} When `config` is not an object, or `bucket` / `region` is missing or blank.
 */
function normalizeS3Config(config) {
  // Both null and undefined mean "no S3 config".
  if (config === null || config === undefined) {
    return null;
  }

  if (!isObject(config)) {
    throw new Error('extractorConfig.s3 must be an object');
  }

  const bucket = String(config.bucket || '').trim();
  const region = String(config.region || '').trim();

  if (!bucket) {
    throw new Error('extractorConfig.s3.bucket is required');
  }
  if (!region) {
    throw new Error('extractorConfig.s3.region is required');
  }

  const prefix = normalizeS3Prefix(config.prefix);
  const publicBaseUrl = normalizePublicBaseUrl(config.publicBaseUrl);

  // Non-object credentials are dropped (left undefined).
  const credentials = isObject(config.credentials) ? config.credentials : undefined;
  const client = config.client;

  // Only integer values are honored, clamped to at least one attempt; anything else falls back to 3.
  let maxUploadAttempts = 3;
  if (Number.isInteger(config.maxUploadAttempts)) {
    maxUploadAttempts = Math.max(1, config.maxUploadAttempts);
  }

  return { bucket, region, prefix, publicBaseUrl, credentials, client, maxUploadAttempts };
}
|
|
98
|
+
|
|
99
|
+
/**
 * Build an S3 object key from a prefix and a file name, joined with `/`.
 * Falsy parts are left out, so an empty prefix yields just the file name.
 *
 * @param {string} prefix - Key prefix (may be empty).
 * @param {string} filename - Object file name.
 * @returns {string} The joined key.
 */
function joinS3Key(prefix, filename) {
  const parts = [];
  for (const part of [prefix, filename]) {
    if (part) {
      parts.push(part);
    }
  }
  return parts.join('/');
}
|
|
102
|
+
|
|
103
|
+
/**
 * Percent-encode an S3 key for use in a URL path, encoding each `/`-separated
 * segment on its own so the slashes between segments are preserved.
 *
 * @param {*} key - Object key; any falsy value is treated as an empty key.
 * @returns {string} URL-safe key with `/` separators intact.
 */
function encodeS3Key(key) {
  const segments = (key ? String(key) : '').split('/');
  const encoded = [];
  for (const segment of segments) {
    encoded.push(encodeURIComponent(segment));
  }
  return encoded.join('/');
}
|
|
109
|
+
|
|
110
|
+
/**
 * Build the URL returned for an uploaded screenshot.
 *
 * With `publicBaseUrl` configured, the URL is that base plus the encoded file
 * name only (the key prefix is not appended). Otherwise the URL is built from
 * the bucket, the region and the encoded full key.
 *
 * @param {{publicBaseUrl: string, bucket: string, region: string}} s3Config - Normalized S3 config.
 * @param {string} key - Full object key (prefix + file name).
 * @param {string} filename - File name without prefix.
 * @returns {string} URL for the uploaded object.
 */
function buildS3Url(s3Config, key, filename) {
  const customBase = s3Config.publicBaseUrl;
  if (!customBase) {
    const host = `${s3Config.bucket}.s3.${s3Config.region}.amazonaws.com`;
    return `https://${host}/${encodeS3Key(key)}`;
  }

  return `${customBase}/${encodeURIComponent(filename)}`;
}
|
|
117
|
+
|
|
118
|
+
/**
 * Produce a loggable message from a caught value.
 *
 * @param {*} error - Caught value; may or may not be an `Error`.
 * @returns {string} `error.message` for `Error` instances, otherwise `String(error)`.
 */
function getErrorMessage(error) {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}
|
|
121
|
+
|
|
52
122
|
export class PageExtractor {
|
|
53
123
|
constructor(config = {}) {
|
|
54
124
|
this.config = {
|
|
@@ -71,9 +141,11 @@ export class PageExtractor {
|
|
|
71
141
|
blockScreenshots: Boolean(config.blockScreenshots),
|
|
72
142
|
snapshotDir: typeof config.snapshotDir === 'string' && config.snapshotDir.trim()
|
|
73
143
|
? path.resolve(process.cwd(), config.snapshotDir)
|
|
74
|
-
: path.resolve(process.cwd(), 'snapshots')
|
|
144
|
+
: path.resolve(process.cwd(), 'snapshots'),
|
|
145
|
+
s3: normalizeS3Config(config.s3)
|
|
75
146
|
};
|
|
76
147
|
this.playwrightModule = null;
|
|
148
|
+
this.s3Client = null;
|
|
77
149
|
}
|
|
78
150
|
|
|
79
151
|
async getPlaywright() {
|
|
@@ -85,6 +157,60 @@ export class PageExtractor {
|
|
|
85
157
|
return this.playwrightModule;
|
|
86
158
|
}
|
|
87
159
|
|
|
160
|
+
getS3Client() {
|
|
161
|
+
if (!this.config.s3) {
|
|
162
|
+
return null;
|
|
163
|
+
}
|
|
164
|
+
if (this.config.s3.client) {
|
|
165
|
+
return this.config.s3.client;
|
|
166
|
+
}
|
|
167
|
+
if (this.s3Client) {
|
|
168
|
+
return this.s3Client;
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
this.s3Client = new S3Client({
|
|
172
|
+
region: this.config.s3.region,
|
|
173
|
+
credentials: this.config.s3.credentials,
|
|
174
|
+
maxAttempts: 1
|
|
175
|
+
});
|
|
176
|
+
return this.s3Client;
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
async uploadScreenshotToS3(filename, body) {
|
|
180
|
+
const s3Config = this.config.s3;
|
|
181
|
+
if (!s3Config) {
|
|
182
|
+
throw new Error('S3 is not configured');
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
const key = joinS3Key(s3Config.prefix, filename);
|
|
186
|
+
const client = this.getS3Client();
|
|
187
|
+
const commandInput = {
|
|
188
|
+
Bucket: s3Config.bucket,
|
|
189
|
+
Key: key,
|
|
190
|
+
Body: body,
|
|
191
|
+
ContentType: 'image/png'
|
|
192
|
+
};
|
|
193
|
+
|
|
194
|
+
let lastError = null;
|
|
195
|
+
for (let attempt = 1; attempt <= s3Config.maxUploadAttempts; attempt += 1) {
|
|
196
|
+
try {
|
|
197
|
+
const command = new PutObjectCommand(commandInput);
|
|
198
|
+
await client.send(command);
|
|
199
|
+
return buildS3Url(s3Config, key, filename);
|
|
200
|
+
} catch (error) {
|
|
201
|
+
lastError = error;
|
|
202
|
+
if (attempt < s3Config.maxUploadAttempts) {
|
|
203
|
+
console.warn(
|
|
204
|
+
`[page-analyzer] Failed to upload ${key} to S3, retrying ` +
|
|
205
|
+
`(${attempt}/${s3Config.maxUploadAttempts}): ${getErrorMessage(error)}`
|
|
206
|
+
);
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
throw lastError;
|
|
212
|
+
}
|
|
213
|
+
|
|
88
214
|
async revealHiddenContent(page) {
|
|
89
215
|
return page.evaluate(() => {
|
|
90
216
|
const CONTENT_THRESHOLD = 20;
|
|
@@ -401,14 +527,14 @@ export class PageExtractor {
|
|
|
401
527
|
});
|
|
402
528
|
}
|
|
403
529
|
|
|
404
|
-
async
|
|
530
|
+
async captureBlockScreenshotData(page, block, screenshotOptions = {}) {
|
|
405
531
|
if (block?.hidden) {
|
|
406
|
-
return
|
|
532
|
+
return null;
|
|
407
533
|
}
|
|
408
534
|
|
|
409
535
|
const selector = getBlockSelector(block);
|
|
410
536
|
if (!selector) {
|
|
411
|
-
return
|
|
537
|
+
return null;
|
|
412
538
|
}
|
|
413
539
|
|
|
414
540
|
try {
|
|
@@ -416,8 +542,7 @@ export class PageExtractor {
|
|
|
416
542
|
if (await locator.count() > 0) {
|
|
417
543
|
await this.hideExternalFixedOverlays(page, selector);
|
|
418
544
|
try {
|
|
419
|
-
await locator.screenshot(
|
|
420
|
-
return true;
|
|
545
|
+
return await locator.screenshot(screenshotOptions);
|
|
421
546
|
} finally {
|
|
422
547
|
await this.restoreExternalFixedOverlays(page);
|
|
423
548
|
}
|
|
@@ -426,28 +551,49 @@ export class PageExtractor {
|
|
|
426
551
|
// Selector-only mode: skip blocks that cannot be captured through CSS.
|
|
427
552
|
}
|
|
428
553
|
|
|
429
|
-
return
|
|
554
|
+
return null;
|
|
555
|
+
}
|
|
556
|
+
|
|
557
|
+
async captureBlockScreenshot(page, block, blockPath) {
|
|
558
|
+
const body = await this.captureBlockScreenshotData(page, block, { path: blockPath });
|
|
559
|
+
return Boolean(body) || body === undefined;
|
|
430
560
|
}
|
|
431
561
|
|
|
432
562
|
async captureScreenshots(page, targetUrl, blocks, options = {}) {
|
|
433
563
|
const fullPageScreenshot = options.fullPageScreenshot ?? this.config.fullPageScreenshot;
|
|
434
564
|
const blockScreenshots = options.blockScreenshots ?? this.config.blockScreenshots;
|
|
565
|
+
const useS3 = Boolean(this.config.s3);
|
|
435
566
|
|
|
436
567
|
if (!fullPageScreenshot && !blockScreenshots) {
|
|
437
568
|
return null;
|
|
438
569
|
}
|
|
439
570
|
|
|
440
|
-
|
|
571
|
+
if (!useS3) {
|
|
572
|
+
await fs.mkdir(this.config.snapshotDir, { recursive: true });
|
|
573
|
+
}
|
|
574
|
+
|
|
441
575
|
const prefix = `${createSnapshotSlug(targetUrl)}-${createSnapshotRunId()}`;
|
|
442
576
|
const screenshots = {};
|
|
443
577
|
|
|
444
578
|
if (fullPageScreenshot) {
|
|
445
|
-
const
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
579
|
+
const fullPageFilename = `${prefix}-full-page.png`;
|
|
580
|
+
try {
|
|
581
|
+
if (useS3) {
|
|
582
|
+
const body = await page.screenshot({ fullPage: true });
|
|
583
|
+
screenshots.fullPage = await this.uploadScreenshotToS3(fullPageFilename, body);
|
|
584
|
+
} else {
|
|
585
|
+
const fullPagePath = path.join(this.config.snapshotDir, fullPageFilename);
|
|
586
|
+
await page.screenshot({
|
|
587
|
+
path: fullPagePath,
|
|
588
|
+
fullPage: true
|
|
589
|
+
});
|
|
590
|
+
screenshots.fullPage = fullPagePath;
|
|
591
|
+
}
|
|
592
|
+
} catch (error) {
|
|
593
|
+
console.warn(
|
|
594
|
+
`[page-analyzer] Failed to capture/upload full-page screenshot: ${getErrorMessage(error)}`
|
|
595
|
+
);
|
|
596
|
+
}
|
|
451
597
|
}
|
|
452
598
|
|
|
453
599
|
if (blockScreenshots) {
|
|
@@ -458,8 +604,29 @@ export class PageExtractor {
|
|
|
458
604
|
const blockIdx = getBlockNumber(block, index);
|
|
459
605
|
|
|
460
606
|
const blockLabel = String(blockIdx).padStart(3, '0').replace(/[^0-9a-z-]+/gi, '-');
|
|
461
|
-
const
|
|
607
|
+
const blockFilename = `${prefix}-block-${blockLabel}.png`;
|
|
462
608
|
try {
|
|
609
|
+
if (useS3) {
|
|
610
|
+
const body = await this.captureBlockScreenshotData(page, block);
|
|
611
|
+
if (!body) {
|
|
612
|
+
continue;
|
|
613
|
+
}
|
|
614
|
+
const url = await this.uploadScreenshotToS3(blockFilename, body);
|
|
615
|
+
const screenshotRecord = {
|
|
616
|
+
blockIdx,
|
|
617
|
+
path: url
|
|
618
|
+
};
|
|
619
|
+
if (typeof block?.blockName === 'string' && block.blockName.trim()) {
|
|
620
|
+
screenshotRecord.blockName = block.blockName.trim();
|
|
621
|
+
}
|
|
622
|
+
if (typeof block?.blockIdxs === 'string' && block.blockIdxs.trim()) {
|
|
623
|
+
screenshotRecord.blockIdxs = block.blockIdxs.trim();
|
|
624
|
+
}
|
|
625
|
+
screenshots.blocks.push(screenshotRecord);
|
|
626
|
+
continue;
|
|
627
|
+
}
|
|
628
|
+
|
|
629
|
+
const blockPath = path.join(this.config.snapshotDir, blockFilename);
|
|
463
630
|
const captured = await this.captureBlockScreenshot(page, block, blockPath);
|
|
464
631
|
if (captured) {
|
|
465
632
|
const screenshotRecord = {
|
|
@@ -475,7 +642,9 @@ export class PageExtractor {
|
|
|
475
642
|
screenshots.blocks.push(screenshotRecord);
|
|
476
643
|
}
|
|
477
644
|
} catch (error) {
|
|
478
|
-
console.warn(
|
|
645
|
+
console.warn(
|
|
646
|
+
`[page-analyzer] Failed to capture/upload block ${blockIdx}: ${getErrorMessage(error)}`
|
|
647
|
+
);
|
|
479
648
|
}
|
|
480
649
|
}
|
|
481
650
|
}
|
|
@@ -494,11 +663,14 @@ export class PageExtractor {
|
|
|
494
663
|
await this.waitForImagesLoaded(page);
|
|
495
664
|
}
|
|
496
665
|
|
|
497
|
-
async
|
|
666
|
+
async withPreparedPage(url, callback) {
|
|
498
667
|
const targetUrl = String(url || '').trim();
|
|
499
668
|
if (!targetUrl) {
|
|
500
669
|
throw new Error('PageExtractor requires a non-empty URL');
|
|
501
670
|
}
|
|
671
|
+
if (typeof callback !== 'function') {
|
|
672
|
+
throw new Error('PageExtractor.withPreparedPage requires a callback');
|
|
673
|
+
}
|
|
502
674
|
|
|
503
675
|
const viewport = {
|
|
504
676
|
width: this.config.viewportWidth,
|
|
@@ -510,53 +682,60 @@ export class PageExtractor {
|
|
|
510
682
|
try {
|
|
511
683
|
const page = await browser.newPage({ viewport });
|
|
512
684
|
await this.preparePage(page, targetUrl);
|
|
513
|
-
return await
|
|
685
|
+
return await callback(page, targetUrl);
|
|
514
686
|
} finally {
|
|
515
687
|
await browser.close();
|
|
516
688
|
}
|
|
517
689
|
}
|
|
518
690
|
|
|
691
|
+
async captureUrlScreenshots(url, blocks, options = {}) {
|
|
692
|
+
return this.withPreparedPage(url, async (page, targetUrl) => {
|
|
693
|
+
return await this.captureScreenshots(page, targetUrl, blocks, options);
|
|
694
|
+
});
|
|
695
|
+
}
|
|
696
|
+
|
|
519
697
|
/**
|
|
520
|
-
* Extract page data
|
|
521
|
-
*
|
|
698
|
+
* Extract page data from an already prepared Playwright page:
|
|
699
|
+
* html, blocks, elementGeometries, screenshots.
|
|
700
|
+
* When config.s3 is provided, screenshots are uploaded to S3 and returned as URLs.
|
|
701
|
+
* @param {import('playwright').Page} page - Prepared Playwright page
|
|
702
|
+
* @param {string} targetUrl - URL loaded in the page
|
|
522
703
|
* @returns {Promise<{html, blocks, elementGeometries, screenshots, pageSize}>}
|
|
523
704
|
*/
|
|
524
|
-
async
|
|
525
|
-
const targetUrl = String(url || '').trim();
|
|
526
|
-
if (!targetUrl) {
|
|
527
|
-
throw new Error('PageExtractor requires a non-empty URL');
|
|
528
|
-
}
|
|
529
|
-
|
|
705
|
+
async extractPreparedPage(page, targetUrl) {
|
|
530
706
|
const viewport = {
|
|
531
707
|
width: this.config.viewportWidth,
|
|
532
708
|
height: this.config.viewportHeight
|
|
533
709
|
};
|
|
534
710
|
|
|
535
|
-
const
|
|
536
|
-
const browser = await playwright.chromium.launch({ headless: true });
|
|
537
|
-
try {
|
|
538
|
-
const page = await browser.newPage({ viewport });
|
|
539
|
-
await this.preparePage(page, targetUrl);
|
|
711
|
+
const html = await page.content();
|
|
540
712
|
|
|
541
|
-
|
|
713
|
+
const minWidth = Math.round(viewport.width * this.config.minBlockWidthRatio);
|
|
714
|
+
const blocksResult = await page.evaluate(extractBlocksInBrowser, {
|
|
715
|
+
minHeight: this.config.minBlockHeight,
|
|
716
|
+
minWidth,
|
|
717
|
+
maxHeight: Math.round(viewport.height * this.config.blockMaxHeightRatio),
|
|
718
|
+
maxDepth: this.config.blockMaxDepth,
|
|
719
|
+
textPreviewMaxChars: this.config.textPreviewMaxChars,
|
|
720
|
+
debug: false
|
|
721
|
+
});
|
|
722
|
+
const blocks = Array.isArray(blocksResult?.blocks) ? blocksResult.blocks : [];
|
|
723
|
+
const elementGeometries = await this.collectElementGeometries(page);
|
|
724
|
+
const finalPageSize = await this.collectPageSize(page);
|
|
725
|
+
const screenshots = await this.captureScreenshots(page, targetUrl, blocks);
|
|
542
726
|
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
minHeight: this.config.minBlockHeight,
|
|
546
|
-
minWidth,
|
|
547
|
-
maxHeight: Math.round(viewport.height * this.config.blockMaxHeightRatio),
|
|
548
|
-
maxDepth: this.config.blockMaxDepth,
|
|
549
|
-
textPreviewMaxChars: this.config.textPreviewMaxChars,
|
|
550
|
-
debug: false
|
|
551
|
-
});
|
|
552
|
-
const blocks = Array.isArray(blocksResult?.blocks) ? blocksResult.blocks : [];
|
|
553
|
-
const elementGeometries = await this.collectElementGeometries(page);
|
|
554
|
-
const finalPageSize = await this.collectPageSize(page);
|
|
555
|
-
const screenshots = await this.captureScreenshots(page, targetUrl, blocks);
|
|
727
|
+
return { html, blocks, elementGeometries, screenshots, pageSize: finalPageSize };
|
|
728
|
+
}
|
|
556
729
|
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
730
|
+
/**
|
|
731
|
+
* Extract page data: html, blocks, elementGeometries, screenshots.
|
|
732
|
+
* When config.s3 is provided, screenshots are uploaded to S3 and returned as URLs.
|
|
733
|
+
* @param {string} url - URL to extract
|
|
734
|
+
* @returns {Promise<{html, blocks, elementGeometries, screenshots, pageSize}>}
|
|
735
|
+
*/
|
|
736
|
+
async extract(url) {
|
|
737
|
+
return this.withPreparedPage(url, async (page, targetUrl) => {
|
|
738
|
+
return await this.extractPreparedPage(page, targetUrl);
|
|
739
|
+
});
|
|
561
740
|
}
|
|
562
741
|
}
|