page-analyzer 1.0.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +72 -9
- package/index.js +206 -22
- package/llm/analyzers/event-analyzer/event-analyzer-blocks.js +23 -2
- package/llm/analyzers/event-analyzer/event-analyzer-constants.js +1 -1
- package/llm/analyzers/event-analyzer/event-analyzer.js +1 -1
- package/package.json +6 -3
- package/page-extractor.js +562 -36
- package/result-viewer.html +1064 -0
- package/scripts/analyze.js +51 -0
- package/scripts/build-result-viewer.js +1076 -0
- package/scripts/serve-result-viewer.js +68 -0
- package/test/smoke.test.js +454 -0
package/page-extractor.js
CHANGED
|
@@ -3,6 +3,10 @@
|
|
|
3
3
|
* Launches headless Chromium, navigates to URL, scrolls, extracts blocks + element geometries + HTML.
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
|
+
import fs from 'node:fs/promises';
|
|
7
|
+
import path from 'node:path';
|
|
8
|
+
import { PutObjectCommand, S3Client } from '@aws-sdk/client-s3';
|
|
9
|
+
|
|
6
10
|
// In-browser block extraction function (serialized into page.evaluate)
|
|
7
11
|
// Imported from the project's extract-blocks script
|
|
8
12
|
import {
|
|
@@ -11,6 +15,110 @@ import {
|
|
|
11
15
|
waitForStableHeight
|
|
12
16
|
} from './vendor/extract-blocks.js';
|
|
13
17
|
|
|
18
|
+
/**
 * Build a filename-safe slug identifying the page a snapshot belongs to.
 *
 * Parseable URLs are reduced to hostname + pathname (query string and hash are
 * not part of the slug); any other input is slugified as-is.
 *
 * @param {string} url - Page URL or arbitrary label.
 * @returns {string} Lowercase slug of at most 80 characters drawn from
 *   [a-z0-9-] that never starts or ends with '-'; 'page' when nothing usable
 *   remains.
 */
function createSnapshotSlug(url) {
  let source = String(url || '').trim();
  try {
    const parsed = new URL(source);
    source = `${parsed.hostname}${parsed.pathname}`;
  } catch {
    // Keep the raw value for non-URL inputs.
  }

  const slug = source
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+/, '')
    .slice(0, 80)
    // Trim the tail only after truncating: the 80-char cut can land right
    // after a separator and would otherwise leave a dangling '-'.
    .replace(/-+$/, '');

  return slug || 'page';
}
|
|
35
|
+
|
|
36
|
+
/**
 * Produce a run identifier from the current UTC time.
 *
 * The ISO timestamp loses its milliseconds and trailing "Z", and every
 * remaining non-alphanumeric run becomes a dash, e.g. "2024-01-02T03-04-05".
 * Resolution is therefore one second.
 *
 * @returns {string} Timestamp-based identifier.
 */
function createSnapshotRunId() {
  const isoNow = new Date().toISOString();
  const withoutMillis = isoNow.replace(/\.\d{3}z$/i, '');
  const dashed = withoutMillis.replace(/[^0-9a-z]+/gi, '-');
  return dashed.replace(/^-+|-+$/g, '');
}
|
|
43
|
+
|
|
44
|
+
/**
 * Resolve the number used to label a block.
 *
 * @param {object} block - Extracted block; may be null/undefined.
 * @param {number} fallbackIndex - Value used when the block has no integer `blockIdx`.
 * @returns {number} `block.blockIdx` when it is an integer, otherwise `fallbackIndex`.
 */
function getBlockNumber(block, fallbackIndex) {
  const candidate = block?.blockIdx;
  if (Number.isInteger(candidate)) {
    return candidate;
  }
  return fallbackIndex;
}
|
|
47
|
+
|
|
48
|
+
/**
 * Read the CSS selector stored on a block.
 *
 * @param {object} block - Extracted block; may be null/undefined.
 * @returns {string} Trimmed `block.blockCssPath`, or '' when it is missing or
 *   not a string.
 */
function getBlockSelector(block) {
  const rawPath = block?.blockCssPath;
  if (typeof rawPath !== 'string') {
    return '';
  }
  return rawPath.trim();
}
|
|
52
|
+
|
|
53
|
+
/**
 * Tell whether a value is a non-null, non-array object.
 *
 * Always returns a strict boolean, so the result is safe to store or compare
 * (a bare `value && ...` chain would hand back falsy operands such as `null`
 * or `0` unchanged).
 *
 * @param {*} value - Value to test.
 * @returns {boolean} True for objects other than arrays; false otherwise,
 *   including for functions.
 */
function isObject(value) {
  return value !== null && typeof value === 'object' && !Array.isArray(value);
}
|
|
56
|
+
|
|
57
|
+
/**
 * Normalize an S3 key prefix.
 *
 * @param {*} value - Raw prefix; falsy values become ''.
 * @returns {string} Prefix with surrounding whitespace and any leading or
 *   trailing slashes removed.
 */
function normalizeS3Prefix(value) {
  const trimmed = String(value || '').trim();
  return trimmed.replace(/^\/+|\/+$/g, '');
}
|
|
62
|
+
|
|
63
|
+
/**
 * Normalize the public base URL used to build screenshot links.
 *
 * @param {*} value - Raw base URL; falsy values become ''.
 * @returns {string} Base URL with surrounding whitespace and trailing slashes
 *   removed (leading characters are left untouched).
 */
function normalizePublicBaseUrl(value) {
  const trimmed = String(value || '').trim();
  return trimmed.replace(/\/+$/g, '');
}
|
|
68
|
+
|
|
69
|
+
/**
 * Validate and normalize the optional `s3` section of the extractor config.
 *
 * @param {object|null|undefined} config - Raw S3 settings.
 * @param {string} config.bucket - Target bucket (required).
 * @param {string} config.region - Bucket region (required).
 * @param {string} [config.prefix] - Key prefix; surrounding slashes are stripped.
 * @param {string} [config.publicBaseUrl] - Base URL for returned links; trailing
 *   slashes are stripped.
 * @param {object|Function} [config.credentials] - Static credentials object or
 *   a credential provider function; both are handed to the S3 client as-is.
 *   Any other type is ignored.
 * @param {object} [config.client] - Pre-built client to use instead of
 *   constructing one.
 * @param {number} [config.maxUploadAttempts=3] - Upload attempts per file;
 *   integers are clamped to at least 1, non-integers fall back to 3.
 * @returns {object|null} Normalized settings, or null when `config` is
 *   null/undefined (S3 disabled).
 * @throws {Error} When `config` is not an object, or bucket/region is empty.
 */
function normalizeS3Config(config) {
  if (config == null) {
    return null;
  }
  if (!isObject(config)) {
    throw new Error('extractorConfig.s3 must be an object');
  }

  const bucket = String(config.bucket || '').trim();
  const region = String(config.region || '').trim();
  if (!bucket) {
    throw new Error('extractorConfig.s3.bucket is required');
  }
  if (!region) {
    throw new Error('extractorConfig.s3.region is required');
  }

  // The AWS SDK accepts either a credentials object or a provider function;
  // keep both instead of silently dropping providers.
  const { credentials } = config;
  const hasUsableCredentials = isObject(credentials) || typeof credentials === 'function';

  return {
    bucket,
    region,
    prefix: normalizeS3Prefix(config.prefix),
    publicBaseUrl: normalizePublicBaseUrl(config.publicBaseUrl),
    credentials: hasUsableCredentials ? credentials : undefined,
    client: config.client,
    maxUploadAttempts: Number.isInteger(config.maxUploadAttempts)
      ? Math.max(1, config.maxUploadAttempts)
      : 3
  };
}
|
|
98
|
+
|
|
99
|
+
/**
 * Join a prefix and a filename into an S3 object key.
 *
 * @param {string} prefix - Key prefix; skipped when falsy.
 * @param {string} filename - Object filename; skipped when falsy.
 * @returns {string} The truthy parts joined with '/', or '' when both are falsy.
 */
function joinS3Key(prefix, filename) {
  const segments = [];
  for (const segment of [prefix, filename]) {
    if (segment) {
      segments.push(segment);
    }
  }
  return segments.join('/');
}
|
|
102
|
+
|
|
103
|
+
/**
 * Percent-encode an S3 key for use in a URL path.
 *
 * Each '/'-separated segment is encoded with `encodeURIComponent`, so the
 * slashes themselves stay literal.
 *
 * @param {*} key - Object key; falsy values are treated as ''.
 * @returns {string} Encoded key.
 */
function encodeS3Key(key) {
  const segments = String(key || '').split('/');
  const encodedSegments = [];
  for (const segment of segments) {
    encodedSegments.push(encodeURIComponent(segment));
  }
  return encodedSegments.join('/');
}
|
|
109
|
+
|
|
110
|
+
/**
 * Build the URL reported for an uploaded screenshot.
 *
 * With `publicBaseUrl` set, the URL is that base plus the encoded filename;
 * the object key (and therefore any key prefix) is not used in that branch.
 * Otherwise a virtual-hosted-style amazonaws.com URL is built from bucket,
 * region and the encoded key.
 *
 * @param {object} s3Config - Normalized S3 settings (bucket, region, publicBaseUrl).
 * @param {string} key - Full object key of the upload.
 * @param {string} filename - Filename portion of the upload.
 * @returns {string} URL for the object.
 */
function buildS3Url(s3Config, key, filename) {
  const baseUrl = s3Config.publicBaseUrl;
  if (!baseUrl) {
    return `https://${s3Config.bucket}.s3.${s3Config.region}.amazonaws.com/${encodeS3Key(key)}`;
  }
  return `${baseUrl}/${encodeURIComponent(filename)}`;
}
|
|
117
|
+
|
|
118
|
+
/**
 * Extract a printable message from a thrown value.
 *
 * @param {*} error - Caught value.
 * @returns {string} `error.message` for Error instances, otherwise `String(error)`.
 */
function getErrorMessage(error) {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}
|
|
121
|
+
|
|
14
122
|
export class PageExtractor {
|
|
15
123
|
constructor(config = {}) {
|
|
16
124
|
this.config = {
|
|
@@ -27,9 +135,17 @@ export class PageExtractor {
|
|
|
27
135
|
blockMaxDepth: Number.isInteger(config.blockMaxDepth) ? Math.max(1, config.blockMaxDepth) : 15,
|
|
28
136
|
textPreviewMaxChars: Number.isInteger(config.textPreviewMaxChars)
|
|
29
137
|
? Math.max(120, config.textPreviewMaxChars)
|
|
30
|
-
: 1200
|
|
138
|
+
: 1200,
|
|
139
|
+
waitForImagesLoaded: Boolean(config.waitForImagesLoaded),
|
|
140
|
+
fullPageScreenshot: Boolean(config.fullPageScreenshot),
|
|
141
|
+
blockScreenshots: Boolean(config.blockScreenshots),
|
|
142
|
+
snapshotDir: typeof config.snapshotDir === 'string' && config.snapshotDir.trim()
|
|
143
|
+
? path.resolve(process.cwd(), config.snapshotDir)
|
|
144
|
+
: path.resolve(process.cwd(), 'snapshots'),
|
|
145
|
+
s3: normalizeS3Config(config.s3)
|
|
31
146
|
};
|
|
32
147
|
this.playwrightModule = null;
|
|
148
|
+
this.s3Client = null;
|
|
33
149
|
}
|
|
34
150
|
|
|
35
151
|
async getPlaywright() {
|
|
@@ -41,17 +157,97 @@ export class PageExtractor {
|
|
|
41
157
|
return this.playwrightModule;
|
|
42
158
|
}
|
|
43
159
|
|
|
160
|
+
getS3Client() {
|
|
161
|
+
if (!this.config.s3) {
|
|
162
|
+
return null;
|
|
163
|
+
}
|
|
164
|
+
if (this.config.s3.client) {
|
|
165
|
+
return this.config.s3.client;
|
|
166
|
+
}
|
|
167
|
+
if (this.s3Client) {
|
|
168
|
+
return this.s3Client;
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
this.s3Client = new S3Client({
|
|
172
|
+
region: this.config.s3.region,
|
|
173
|
+
credentials: this.config.s3.credentials,
|
|
174
|
+
maxAttempts: 1
|
|
175
|
+
});
|
|
176
|
+
return this.s3Client;
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
async uploadScreenshotToS3(filename, body) {
|
|
180
|
+
const s3Config = this.config.s3;
|
|
181
|
+
if (!s3Config) {
|
|
182
|
+
throw new Error('S3 is not configured');
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
const key = joinS3Key(s3Config.prefix, filename);
|
|
186
|
+
const client = this.getS3Client();
|
|
187
|
+
const commandInput = {
|
|
188
|
+
Bucket: s3Config.bucket,
|
|
189
|
+
Key: key,
|
|
190
|
+
Body: body,
|
|
191
|
+
ContentType: 'image/png'
|
|
192
|
+
};
|
|
193
|
+
|
|
194
|
+
let lastError = null;
|
|
195
|
+
for (let attempt = 1; attempt <= s3Config.maxUploadAttempts; attempt += 1) {
|
|
196
|
+
try {
|
|
197
|
+
const command = new PutObjectCommand(commandInput);
|
|
198
|
+
await client.send(command);
|
|
199
|
+
return buildS3Url(s3Config, key, filename);
|
|
200
|
+
} catch (error) {
|
|
201
|
+
lastError = error;
|
|
202
|
+
if (attempt < s3Config.maxUploadAttempts) {
|
|
203
|
+
console.warn(
|
|
204
|
+
`[page-analyzer] Failed to upload ${key} to S3, retrying ` +
|
|
205
|
+
`(${attempt}/${s3Config.maxUploadAttempts}): ${getErrorMessage(error)}`
|
|
206
|
+
);
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
throw lastError;
|
|
212
|
+
}
|
|
213
|
+
|
|
44
214
|
async revealHiddenContent(page) {
|
|
45
215
|
return page.evaluate(() => {
|
|
46
216
|
const CONTENT_THRESHOLD = 20;
|
|
217
|
+
const NON_CONTENT_TAGS = new Set([
|
|
218
|
+
'SCRIPT',
|
|
219
|
+
'STYLE',
|
|
220
|
+
'NOSCRIPT',
|
|
221
|
+
'TEMPLATE',
|
|
222
|
+
'META',
|
|
223
|
+
'LINK',
|
|
224
|
+
'IFRAME',
|
|
225
|
+
'OBJECT',
|
|
226
|
+
'EMBED'
|
|
227
|
+
]);
|
|
47
228
|
let opacityCount = 0;
|
|
48
229
|
let displayCount = 0;
|
|
49
230
|
|
|
231
|
+
const isLikelyScriptText = (value) => {
|
|
232
|
+
const text = String(value || '').trim();
|
|
233
|
+
if (!text) {
|
|
234
|
+
return false;
|
|
235
|
+
}
|
|
236
|
+
return /(_satellite|google_tag_manager|dataLayer|window\.|document\.|function\s*\(|=>|createElement\(|appendChild\(|\.push\(|var\s+\w+\s*=|const\s+\w+\s*=|let\s+\w+\s*=)/.test(text);
|
|
237
|
+
};
|
|
238
|
+
|
|
50
239
|
for (const el of document.querySelectorAll('*')) {
|
|
240
|
+
if (NON_CONTENT_TAGS.has(el.tagName)) {
|
|
241
|
+
continue;
|
|
242
|
+
}
|
|
243
|
+
|
|
51
244
|
const style = getComputedStyle(el);
|
|
52
245
|
if (parseFloat(style.opacity) === 0 && el.getBoundingClientRect().height > 0) {
|
|
53
246
|
const text = (el.innerText || '').trim();
|
|
54
|
-
if (
|
|
247
|
+
if (
|
|
248
|
+
!isLikelyScriptText(text) &&
|
|
249
|
+
(text.length >= CONTENT_THRESHOLD || el.querySelectorAll('img, video, picture').length > 0)
|
|
250
|
+
) {
|
|
55
251
|
el.style.setProperty('opacity', '1', 'important');
|
|
56
252
|
opacityCount += 1;
|
|
57
253
|
}
|
|
@@ -68,7 +264,7 @@ export class PageExtractor {
|
|
|
68
264
|
el.style.setProperty('display', 'block', 'important');
|
|
69
265
|
const text = (el.innerText || '').trim();
|
|
70
266
|
|
|
71
|
-
if (text.length >= CONTENT_THRESHOLD) {
|
|
267
|
+
if (text.length >= CONTENT_THRESHOLD && !isLikelyScriptText(text)) {
|
|
72
268
|
displayCount += 1;
|
|
73
269
|
} else if (originalDisplay) {
|
|
74
270
|
el.style.display = originalDisplay;
|
|
@@ -82,6 +278,80 @@ export class PageExtractor {
|
|
|
82
278
|
});
|
|
83
279
|
}
|
|
84
280
|
|
|
281
|
+
async waitForImagesLoaded(page) {
|
|
282
|
+
if (!this.config.waitForImagesLoaded) {
|
|
283
|
+
return null;
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
const timeoutMs = this.config.timeoutMs;
|
|
287
|
+
const result = await page.evaluate(async ({ timeoutMs: waitTimeoutMs }) => {
|
|
288
|
+
const images = Array.from(document.images || []);
|
|
289
|
+
const total = images.length;
|
|
290
|
+
if (total === 0) {
|
|
291
|
+
return {
|
|
292
|
+
total,
|
|
293
|
+
loaded: 0,
|
|
294
|
+
timedOut: false
|
|
295
|
+
};
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
const isSettled = (img) => img.complete;
|
|
299
|
+
const countLoaded = () => images.filter(isSettled).length;
|
|
300
|
+
const pending = images.filter((img) => !isSettled(img));
|
|
301
|
+
|
|
302
|
+
if (pending.length === 0) {
|
|
303
|
+
return {
|
|
304
|
+
total,
|
|
305
|
+
loaded: total,
|
|
306
|
+
timedOut: false
|
|
307
|
+
};
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
let timeoutId = null;
|
|
311
|
+
const waitForImage = (img) => new Promise((resolve) => {
|
|
312
|
+
if (isSettled(img)) {
|
|
313
|
+
resolve();
|
|
314
|
+
return;
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
const done = () => {
|
|
318
|
+
img.removeEventListener('load', done);
|
|
319
|
+
img.removeEventListener('error', done);
|
|
320
|
+
resolve();
|
|
321
|
+
};
|
|
322
|
+
img.addEventListener('load', done, { once: true });
|
|
323
|
+
img.addEventListener('error', done, { once: true });
|
|
324
|
+
});
|
|
325
|
+
|
|
326
|
+
const allImagesDone = Promise.all(pending.map(waitForImage))
|
|
327
|
+
.then(() => ({ timedOut: false }));
|
|
328
|
+
const timeout = new Promise((resolve) => {
|
|
329
|
+
timeoutId = window.setTimeout(() => resolve({ timedOut: true }), waitTimeoutMs);
|
|
330
|
+
});
|
|
331
|
+
|
|
332
|
+
const waitResult = await Promise.race([allImagesDone, timeout]);
|
|
333
|
+
if (timeoutId !== null) {
|
|
334
|
+
window.clearTimeout(timeoutId);
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
return {
|
|
338
|
+
total,
|
|
339
|
+
loaded: countLoaded(),
|
|
340
|
+
timedOut: Boolean(waitResult?.timedOut)
|
|
341
|
+
};
|
|
342
|
+
}, { timeoutMs });
|
|
343
|
+
|
|
344
|
+
if (result?.timedOut) {
|
|
345
|
+
console.warn(
|
|
346
|
+
`[page-analyzer] Timed out waiting for images: ${result.loaded}/${result.total} completed`
|
|
347
|
+
);
|
|
348
|
+
} else {
|
|
349
|
+
console.log(`[page-analyzer] Images loaded: ${result?.loaded || 0}/${result?.total || 0}`);
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
return result;
|
|
353
|
+
}
|
|
354
|
+
|
|
85
355
|
async collectElementGeometries(page) {
|
|
86
356
|
return page.evaluate(() => {
|
|
87
357
|
const INTERACTIVE_SELECTOR = 'a, button, form, input, select, textarea, [onclick], [role="button"]';
|
|
@@ -161,16 +431,246 @@ export class PageExtractor {
|
|
|
161
431
|
});
|
|
162
432
|
}
|
|
163
433
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
434
|
+
async collectPageSize(page) {
|
|
435
|
+
return page.evaluate(() => {
|
|
436
|
+
const html = document.documentElement;
|
|
437
|
+
const body = document.body;
|
|
438
|
+
return {
|
|
439
|
+
width: Math.max(
|
|
440
|
+
html?.scrollWidth || 0,
|
|
441
|
+
html?.offsetWidth || 0,
|
|
442
|
+
html?.clientWidth || 0,
|
|
443
|
+
body?.scrollWidth || 0,
|
|
444
|
+
body?.offsetWidth || 0,
|
|
445
|
+
body?.clientWidth || 0,
|
|
446
|
+
window.innerWidth || 0
|
|
447
|
+
),
|
|
448
|
+
height: Math.max(
|
|
449
|
+
html?.scrollHeight || 0,
|
|
450
|
+
html?.offsetHeight || 0,
|
|
451
|
+
html?.clientHeight || 0,
|
|
452
|
+
body?.scrollHeight || 0,
|
|
453
|
+
body?.offsetHeight || 0,
|
|
454
|
+
body?.clientHeight || 0,
|
|
455
|
+
window.innerHeight || 0
|
|
456
|
+
)
|
|
457
|
+
};
|
|
458
|
+
});
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
async hideExternalFixedOverlays(page, selector) {
|
|
462
|
+
return page.evaluate((targetSelector) => {
|
|
463
|
+
const existing = Array.isArray(window.__pageAnalyzerHiddenOverlays)
|
|
464
|
+
? window.__pageAnalyzerHiddenOverlays
|
|
465
|
+
: [];
|
|
466
|
+
for (const item of existing) {
|
|
467
|
+
if (!item?.element) continue;
|
|
468
|
+
if (item.visibilityValue) {
|
|
469
|
+
item.element.style.setProperty(
|
|
470
|
+
'visibility',
|
|
471
|
+
item.visibilityValue,
|
|
472
|
+
item.visibilityPriority || ''
|
|
473
|
+
);
|
|
474
|
+
} else {
|
|
475
|
+
item.element.style.removeProperty('visibility');
|
|
476
|
+
}
|
|
477
|
+
}
|
|
478
|
+
|
|
479
|
+
const target = document.querySelector(targetSelector);
|
|
480
|
+
if (!(target instanceof Element)) {
|
|
481
|
+
window.__pageAnalyzerHiddenOverlays = [];
|
|
482
|
+
return 0;
|
|
483
|
+
}
|
|
484
|
+
|
|
485
|
+
const hidden = [];
|
|
486
|
+
for (const element of document.querySelectorAll('body *')) {
|
|
487
|
+
if (!(element instanceof HTMLElement)) continue;
|
|
488
|
+
if (element === target || element.contains(target) || target.contains(element)) continue;
|
|
489
|
+
|
|
490
|
+
const style = getComputedStyle(element);
|
|
491
|
+
if (style.position !== 'fixed' && style.position !== 'sticky') continue;
|
|
492
|
+
|
|
493
|
+
const rect = element.getBoundingClientRect();
|
|
494
|
+
if (rect.width <= 0 || rect.height <= 0) continue;
|
|
495
|
+
|
|
496
|
+
hidden.push({
|
|
497
|
+
element,
|
|
498
|
+
visibilityValue: element.style.getPropertyValue('visibility'),
|
|
499
|
+
visibilityPriority: element.style.getPropertyPriority('visibility')
|
|
500
|
+
});
|
|
501
|
+
element.style.setProperty('visibility', 'hidden', 'important');
|
|
502
|
+
}
|
|
503
|
+
|
|
504
|
+
window.__pageAnalyzerHiddenOverlays = hidden;
|
|
505
|
+
return hidden.length;
|
|
506
|
+
}, selector);
|
|
507
|
+
}
|
|
508
|
+
|
|
509
|
+
async restoreExternalFixedOverlays(page) {
|
|
510
|
+
await page.evaluate(() => {
|
|
511
|
+
const hidden = Array.isArray(window.__pageAnalyzerHiddenOverlays)
|
|
512
|
+
? window.__pageAnalyzerHiddenOverlays
|
|
513
|
+
: [];
|
|
514
|
+
for (const item of hidden) {
|
|
515
|
+
if (!item?.element) continue;
|
|
516
|
+
if (item.visibilityValue) {
|
|
517
|
+
item.element.style.setProperty(
|
|
518
|
+
'visibility',
|
|
519
|
+
item.visibilityValue,
|
|
520
|
+
item.visibilityPriority || ''
|
|
521
|
+
);
|
|
522
|
+
} else {
|
|
523
|
+
item.element.style.removeProperty('visibility');
|
|
524
|
+
}
|
|
525
|
+
}
|
|
526
|
+
window.__pageAnalyzerHiddenOverlays = [];
|
|
527
|
+
});
|
|
528
|
+
}
|
|
529
|
+
|
|
530
|
+
async captureBlockScreenshotData(page, block, screenshotOptions = {}) {
|
|
531
|
+
if (block?.hidden) {
|
|
532
|
+
return null;
|
|
533
|
+
}
|
|
534
|
+
|
|
535
|
+
const selector = getBlockSelector(block);
|
|
536
|
+
if (!selector) {
|
|
537
|
+
return null;
|
|
538
|
+
}
|
|
539
|
+
|
|
540
|
+
try {
|
|
541
|
+
const locator = page.locator(selector).first();
|
|
542
|
+
if (await locator.count() > 0) {
|
|
543
|
+
await this.hideExternalFixedOverlays(page, selector);
|
|
544
|
+
try {
|
|
545
|
+
return await locator.screenshot(screenshotOptions);
|
|
546
|
+
} finally {
|
|
547
|
+
await this.restoreExternalFixedOverlays(page);
|
|
548
|
+
}
|
|
549
|
+
}
|
|
550
|
+
} catch {
|
|
551
|
+
// Selector-only mode: skip blocks that cannot be captured through CSS.
|
|
552
|
+
}
|
|
553
|
+
|
|
554
|
+
return null;
|
|
555
|
+
}
|
|
556
|
+
|
|
557
|
+
async captureBlockScreenshot(page, block, blockPath) {
|
|
558
|
+
const body = await this.captureBlockScreenshotData(page, block, { path: blockPath });
|
|
559
|
+
return Boolean(body) || body === undefined;
|
|
560
|
+
}
|
|
561
|
+
|
|
562
|
+
async captureScreenshots(page, targetUrl, blocks, options = {}) {
|
|
563
|
+
const fullPageScreenshot = options.fullPageScreenshot ?? this.config.fullPageScreenshot;
|
|
564
|
+
const blockScreenshots = options.blockScreenshots ?? this.config.blockScreenshots;
|
|
565
|
+
const useS3 = Boolean(this.config.s3);
|
|
566
|
+
|
|
567
|
+
if (!fullPageScreenshot && !blockScreenshots) {
|
|
568
|
+
return null;
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
if (!useS3) {
|
|
572
|
+
await fs.mkdir(this.config.snapshotDir, { recursive: true });
|
|
573
|
+
}
|
|
574
|
+
|
|
575
|
+
const prefix = `${createSnapshotSlug(targetUrl)}-${createSnapshotRunId()}`;
|
|
576
|
+
const screenshots = {};
|
|
577
|
+
|
|
578
|
+
if (fullPageScreenshot) {
|
|
579
|
+
const fullPageFilename = `${prefix}-full-page.png`;
|
|
580
|
+
try {
|
|
581
|
+
if (useS3) {
|
|
582
|
+
const body = await page.screenshot({ fullPage: true });
|
|
583
|
+
screenshots.fullPage = await this.uploadScreenshotToS3(fullPageFilename, body);
|
|
584
|
+
} else {
|
|
585
|
+
const fullPagePath = path.join(this.config.snapshotDir, fullPageFilename);
|
|
586
|
+
await page.screenshot({
|
|
587
|
+
path: fullPagePath,
|
|
588
|
+
fullPage: true
|
|
589
|
+
});
|
|
590
|
+
screenshots.fullPage = fullPagePath;
|
|
591
|
+
}
|
|
592
|
+
} catch (error) {
|
|
593
|
+
console.warn(
|
|
594
|
+
`[page-analyzer] Failed to capture/upload full-page screenshot: ${getErrorMessage(error)}`
|
|
595
|
+
);
|
|
596
|
+
}
|
|
597
|
+
}
|
|
598
|
+
|
|
599
|
+
if (blockScreenshots) {
|
|
600
|
+
screenshots.blocks = [];
|
|
601
|
+
|
|
602
|
+
for (let index = 0; index < blocks.length; index += 1) {
|
|
603
|
+
const block = blocks[index];
|
|
604
|
+
const blockIdx = getBlockNumber(block, index);
|
|
605
|
+
|
|
606
|
+
const blockLabel = String(blockIdx).padStart(3, '0').replace(/[^0-9a-z-]+/gi, '-');
|
|
607
|
+
const blockFilename = `${prefix}-block-${blockLabel}.png`;
|
|
608
|
+
try {
|
|
609
|
+
if (useS3) {
|
|
610
|
+
const body = await this.captureBlockScreenshotData(page, block);
|
|
611
|
+
if (!body) {
|
|
612
|
+
continue;
|
|
613
|
+
}
|
|
614
|
+
const url = await this.uploadScreenshotToS3(blockFilename, body);
|
|
615
|
+
const screenshotRecord = {
|
|
616
|
+
blockIdx,
|
|
617
|
+
path: url
|
|
618
|
+
};
|
|
619
|
+
if (typeof block?.blockName === 'string' && block.blockName.trim()) {
|
|
620
|
+
screenshotRecord.blockName = block.blockName.trim();
|
|
621
|
+
}
|
|
622
|
+
if (typeof block?.blockIdxs === 'string' && block.blockIdxs.trim()) {
|
|
623
|
+
screenshotRecord.blockIdxs = block.blockIdxs.trim();
|
|
624
|
+
}
|
|
625
|
+
screenshots.blocks.push(screenshotRecord);
|
|
626
|
+
continue;
|
|
627
|
+
}
|
|
628
|
+
|
|
629
|
+
const blockPath = path.join(this.config.snapshotDir, blockFilename);
|
|
630
|
+
const captured = await this.captureBlockScreenshot(page, block, blockPath);
|
|
631
|
+
if (captured) {
|
|
632
|
+
const screenshotRecord = {
|
|
633
|
+
blockIdx,
|
|
634
|
+
path: blockPath
|
|
635
|
+
};
|
|
636
|
+
if (typeof block?.blockName === 'string' && block.blockName.trim()) {
|
|
637
|
+
screenshotRecord.blockName = block.blockName.trim();
|
|
638
|
+
}
|
|
639
|
+
if (typeof block?.blockIdxs === 'string' && block.blockIdxs.trim()) {
|
|
640
|
+
screenshotRecord.blockIdxs = block.blockIdxs.trim();
|
|
641
|
+
}
|
|
642
|
+
screenshots.blocks.push(screenshotRecord);
|
|
643
|
+
}
|
|
644
|
+
} catch (error) {
|
|
645
|
+
console.warn(
|
|
646
|
+
`[page-analyzer] Failed to capture/upload block ${blockIdx}: ${getErrorMessage(error)}`
|
|
647
|
+
);
|
|
648
|
+
}
|
|
649
|
+
}
|
|
650
|
+
}
|
|
651
|
+
|
|
652
|
+
return screenshots;
|
|
653
|
+
}
|
|
654
|
+
|
|
655
|
+
async preparePage(page, targetUrl) {
|
|
656
|
+
await page.goto(targetUrl, {
|
|
657
|
+
waitUntil: 'domcontentloaded',
|
|
658
|
+
timeout: this.config.timeoutMs
|
|
659
|
+
});
|
|
660
|
+
await scrollToBottom(page);
|
|
661
|
+
await waitForStableHeight(page, { maxWait: this.config.timeoutMs });
|
|
662
|
+
await this.revealHiddenContent(page);
|
|
663
|
+
await this.waitForImagesLoaded(page);
|
|
664
|
+
}
|
|
665
|
+
|
|
666
|
+
async withPreparedPage(url, callback) {
|
|
170
667
|
const targetUrl = String(url || '').trim();
|
|
171
668
|
if (!targetUrl) {
|
|
172
669
|
throw new Error('PageExtractor requires a non-empty URL');
|
|
173
670
|
}
|
|
671
|
+
if (typeof callback !== 'function') {
|
|
672
|
+
throw new Error('PageExtractor.withPreparedPage requires a callback');
|
|
673
|
+
}
|
|
174
674
|
|
|
175
675
|
const viewport = {
|
|
176
676
|
width: this.config.viewportWidth,
|
|
@@ -181,35 +681,61 @@ export class PageExtractor {
|
|
|
181
681
|
const browser = await playwright.chromium.launch({ headless: true });
|
|
182
682
|
try {
|
|
183
683
|
const page = await browser.newPage({ viewport });
|
|
184
|
-
await
|
|
185
|
-
|
|
186
|
-
timeout: this.config.timeoutMs
|
|
187
|
-
});
|
|
188
|
-
await scrollToBottom(page);
|
|
189
|
-
await waitForStableHeight(page, { maxWait: this.config.timeoutMs });
|
|
190
|
-
await this.revealHiddenContent(page);
|
|
191
|
-
|
|
192
|
-
const html = await page.content();
|
|
193
|
-
const pageSize = await page.evaluate(() => ({
|
|
194
|
-
width: document.documentElement.scrollWidth || 0,
|
|
195
|
-
height: document.documentElement.scrollHeight || 0
|
|
196
|
-
}));
|
|
197
|
-
|
|
198
|
-
const minWidth = Math.round(viewport.width * this.config.minBlockWidthRatio);
|
|
199
|
-
const blocksResult = await page.evaluate(extractBlocksInBrowser, {
|
|
200
|
-
minHeight: this.config.minBlockHeight,
|
|
201
|
-
minWidth,
|
|
202
|
-
maxHeight: Math.round(viewport.height * this.config.blockMaxHeightRatio),
|
|
203
|
-
maxDepth: this.config.blockMaxDepth,
|
|
204
|
-
textPreviewMaxChars: this.config.textPreviewMaxChars,
|
|
205
|
-
debug: false
|
|
206
|
-
});
|
|
207
|
-
const blocks = Array.isArray(blocksResult?.blocks) ? blocksResult.blocks : [];
|
|
208
|
-
const elementGeometries = await this.collectElementGeometries(page);
|
|
209
|
-
|
|
210
|
-
return { html, blocks, elementGeometries, pageSize };
|
|
684
|
+
await this.preparePage(page, targetUrl);
|
|
685
|
+
return await callback(page, targetUrl);
|
|
211
686
|
} finally {
|
|
212
687
|
await browser.close();
|
|
213
688
|
}
|
|
214
689
|
}
|
|
690
|
+
|
|
691
|
+
async captureUrlScreenshots(url, blocks, options = {}) {
|
|
692
|
+
return this.withPreparedPage(url, async (page, targetUrl) => {
|
|
693
|
+
return await this.captureScreenshots(page, targetUrl, blocks, options);
|
|
694
|
+
});
|
|
695
|
+
}
|
|
696
|
+
|
|
697
|
+
/**
|
|
698
|
+
* Extract page data from an already prepared Playwright page:
|
|
699
|
+
* html, blocks, elementGeometries, screenshots.
|
|
700
|
+
* When config.s3 is provided, screenshots are uploaded to S3 and returned as URLs.
|
|
701
|
+
* @param {import('playwright').Page} page - Prepared Playwright page
|
|
702
|
+
* @param {string} targetUrl - URL loaded in the page
|
|
703
|
+
* @returns {Promise<{html, blocks, elementGeometries, screenshots, pageSize}>}
|
|
704
|
+
*/
|
|
705
|
+
async extractPreparedPage(page, targetUrl) {
|
|
706
|
+
const viewport = {
|
|
707
|
+
width: this.config.viewportWidth,
|
|
708
|
+
height: this.config.viewportHeight
|
|
709
|
+
};
|
|
710
|
+
|
|
711
|
+
const html = await page.content();
|
|
712
|
+
|
|
713
|
+
const minWidth = Math.round(viewport.width * this.config.minBlockWidthRatio);
|
|
714
|
+
const blocksResult = await page.evaluate(extractBlocksInBrowser, {
|
|
715
|
+
minHeight: this.config.minBlockHeight,
|
|
716
|
+
minWidth,
|
|
717
|
+
maxHeight: Math.round(viewport.height * this.config.blockMaxHeightRatio),
|
|
718
|
+
maxDepth: this.config.blockMaxDepth,
|
|
719
|
+
textPreviewMaxChars: this.config.textPreviewMaxChars,
|
|
720
|
+
debug: false
|
|
721
|
+
});
|
|
722
|
+
const blocks = Array.isArray(blocksResult?.blocks) ? blocksResult.blocks : [];
|
|
723
|
+
const elementGeometries = await this.collectElementGeometries(page);
|
|
724
|
+
const finalPageSize = await this.collectPageSize(page);
|
|
725
|
+
const screenshots = await this.captureScreenshots(page, targetUrl, blocks);
|
|
726
|
+
|
|
727
|
+
return { html, blocks, elementGeometries, screenshots, pageSize: finalPageSize };
|
|
728
|
+
}
|
|
729
|
+
|
|
730
|
+
/**
|
|
731
|
+
* Extract page data: html, blocks, elementGeometries, screenshots.
|
|
732
|
+
* When config.s3 is provided, screenshots are uploaded to S3 and returned as URLs.
|
|
733
|
+
* @param {string} url - URL to extract
|
|
734
|
+
* @returns {Promise<{html, blocks, elementGeometries, screenshots, pageSize}>}
|
|
735
|
+
*/
|
|
736
|
+
async extract(url) {
|
|
737
|
+
return this.withPreparedPage(url, async (page, targetUrl) => {
|
|
738
|
+
return await this.extractPreparedPage(page, targetUrl);
|
|
739
|
+
});
|
|
740
|
+
}
|
|
215
741
|
}
|