page-analyzer 1.2.1 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/package.json +1 -1
- package/page-extractor.js +57 -12
- package/test/smoke.test.js +69 -11
package/README.md
CHANGED
|
@@ -250,7 +250,7 @@ const result = await analyzePageEvents({
|
|
|
250
250
|
|
|
251
251
|
启用 `blockScreenshots: true` 后,模块会在 LLM 合并区块后再截图。返回结果会包含 `screenshots.blocks`,每项包含逻辑区块序号 `blockIdx` 和对应截图 `path`;区块分析结果中的每个 block 也会额外带上 `blockScreenshotPaths`,每个逻辑区块最多对应一张截图。无法通过 `blockCssPath` 截图的隐藏或空区块会被跳过。
|
|
252
252
|
|
|
253
|
-
如果配置 `extractorConfig.s3`,截图不会写入本地 `snapshots/`,而是直接上传到 S3;`screenshots.fullPage`、`screenshots.blocks[].path` 和 `blockScreenshotPaths` 会返回 HTTPS URL
|
|
253
|
+
如果配置 `extractorConfig.s3`,截图不会写入本地 `snapshots/`,而是直接上传到 S3;`screenshots.fullPage`、`screenshots.blocks[].path` 和 `blockScreenshotPaths` 会返回 HTTPS URL。S3 对象 key 使用 `<prefix>/<domain>/<file-md5>.png`,上传前会先检查对象是否已存在,已存在时直接返回对应 URL,避免重复上传和冗余对象。上传不会设置 ACL,访问权限沿用 bucket 策略。单张截图的检查或上传失败时会自动重试,总共最多尝试 3 次(含首次尝试);仍失败则跳过该截图。
|
|
254
254
|
|
|
255
255
|
启用 `waitForImagesLoaded: true` 后,模块会先滚动页面触发懒加载,再等待当前 DOM 中的 `<img>` 完成加载或失败,之后再提取区块、分析和截图;等待时间受 `extractorConfig.timeoutMs` 控制。
|
|
256
256
|
|
|
@@ -316,7 +316,7 @@ const result = await analyzeUrl('https://example.com', {
|
|
|
316
316
|
});
|
|
317
317
|
```
|
|
318
318
|
|
|
319
|
-
`extractorConfig.s3.bucket` 和 `extractorConfig.s3.region` 必填。`credentials` 可省略,省略时使用 AWS SDK 默认凭证链。`publicBaseUrl` 可省略,省略时返回 `https://<bucket>.s3.<region>.amazonaws.com/<key>`;配置后返回 `${publicBaseUrl}/<key>`。
|
|
319
|
+
`extractorConfig.s3.bucket` 和 `extractorConfig.s3.region` 必填。`credentials` 可省略,省略时使用 AWS SDK 默认凭证链。`publicBaseUrl` 可省略,省略时返回 `https://<bucket>.s3.<region>.amazonaws.com/<key>`;配置后返回 `${publicBaseUrl}/<key>`。启用 S3 上传时,需要凭证具备 `s3:GetObject` 和 `s3:PutObject` 权限;如果希望不存在的对象能被稳定识别为 404,还需要对应 bucket/prefix 的 `s3:ListBucket` 权限。
|
|
320
320
|
|
|
321
321
|
### parserConfig
|
|
322
322
|
|
package/package.json
CHANGED
package/page-extractor.js
CHANGED
|
@@ -5,7 +5,8 @@
|
|
|
5
5
|
|
|
6
6
|
import fs from 'node:fs/promises';
|
|
7
7
|
import path from 'node:path';
|
|
8
|
-
import {
|
|
8
|
+
import { createHash } from 'node:crypto';
|
|
9
|
+
import { HeadObjectCommand, PutObjectCommand, S3Client } from '@aws-sdk/client-s3';
|
|
9
10
|
|
|
10
11
|
// In-browser block extraction function (serialized into page.evaluate)
|
|
11
12
|
// Imported from the project's extract-blocks script
|
|
@@ -41,6 +42,24 @@ function createSnapshotRunId() {
|
|
|
41
42
|
.replace(/^-+|-+$/g, '');
|
|
42
43
|
}
|
|
43
44
|
|
|
45
|
+
function createS3DomainSegment(url) {
|
|
46
|
+
const source = String(url || '').trim();
|
|
47
|
+
try {
|
|
48
|
+
const parsed = new URL(source);
|
|
49
|
+
const hostname = parsed.hostname
|
|
50
|
+
.toLowerCase()
|
|
51
|
+
.replace(/[^a-z0-9.-]+/g, '-')
|
|
52
|
+
.replace(/^-+|-+$/g, '');
|
|
53
|
+
return hostname || 'page';
|
|
54
|
+
} catch {
|
|
55
|
+
return createSnapshotSlug(source);
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
function createFileMd5(body) {
|
|
60
|
+
return createHash('md5').update(body).digest('hex');
|
|
61
|
+
}
|
|
62
|
+
|
|
44
63
|
function getBlockNumber(block, fallbackIndex) {
|
|
45
64
|
return Number.isInteger(block?.blockIdx) ? block.blockIdx : fallbackIndex;
|
|
46
65
|
}
|
|
@@ -96,8 +115,8 @@ function normalizeS3Config(config) {
|
|
|
96
115
|
};
|
|
97
116
|
}
|
|
98
117
|
|
|
99
|
-
function joinS3Key(
|
|
100
|
-
return
|
|
118
|
+
function joinS3Key(...parts) {
|
|
119
|
+
return parts.filter(Boolean).join('/');
|
|
101
120
|
}
|
|
102
121
|
|
|
103
122
|
function encodeS3Key(key) {
|
|
@@ -119,6 +138,12 @@ function getErrorMessage(error) {
|
|
|
119
138
|
return error instanceof Error ? error.message : String(error);
|
|
120
139
|
}
|
|
121
140
|
|
|
141
|
+
function isS3NotFoundError(error) {
|
|
142
|
+
const statusCode = error?.$metadata?.httpStatusCode;
|
|
143
|
+
const errorName = String(error?.name || error?.Code || error?.code || '');
|
|
144
|
+
return statusCode === 404 || errorName === 'NotFound' || errorName === 'NoSuchKey';
|
|
145
|
+
}
|
|
146
|
+
|
|
122
147
|
export class PageExtractor {
|
|
123
148
|
constructor(config = {}) {
|
|
124
149
|
this.config = {
|
|
@@ -176,13 +201,30 @@ export class PageExtractor {
|
|
|
176
201
|
return this.s3Client;
|
|
177
202
|
}
|
|
178
203
|
|
|
179
|
-
async
|
|
204
|
+
async s3ObjectExists(client, key) {
|
|
205
|
+
const s3Config = this.config.s3;
|
|
206
|
+
try {
|
|
207
|
+
await client.send(new HeadObjectCommand({
|
|
208
|
+
Bucket: s3Config.bucket,
|
|
209
|
+
Key: key
|
|
210
|
+
}));
|
|
211
|
+
return true;
|
|
212
|
+
} catch (error) {
|
|
213
|
+
if (isS3NotFoundError(error)) {
|
|
214
|
+
return false;
|
|
215
|
+
}
|
|
216
|
+
throw error;
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
async uploadScreenshotToS3(targetUrl, body) {
|
|
180
221
|
const s3Config = this.config.s3;
|
|
181
222
|
if (!s3Config) {
|
|
182
223
|
throw new Error('S3 is not configured');
|
|
183
224
|
}
|
|
184
225
|
|
|
185
|
-
const
|
|
226
|
+
const domain = createS3DomainSegment(targetUrl);
|
|
227
|
+
const key = joinS3Key(s3Config.prefix, domain, `${createFileMd5(body)}.png`);
|
|
186
228
|
const client = this.getS3Client();
|
|
187
229
|
const commandInput = {
|
|
188
230
|
Bucket: s3Config.bucket,
|
|
@@ -194,6 +236,9 @@ export class PageExtractor {
|
|
|
194
236
|
let lastError = null;
|
|
195
237
|
for (let attempt = 1; attempt <= s3Config.maxUploadAttempts; attempt += 1) {
|
|
196
238
|
try {
|
|
239
|
+
if (await this.s3ObjectExists(client, key)) {
|
|
240
|
+
return buildS3Url(s3Config, key);
|
|
241
|
+
}
|
|
197
242
|
const command = new PutObjectCommand(commandInput);
|
|
198
243
|
await client.send(command);
|
|
199
244
|
return buildS3Url(s3Config, key);
|
|
@@ -201,7 +246,7 @@ export class PageExtractor {
|
|
|
201
246
|
lastError = error;
|
|
202
247
|
if (attempt < s3Config.maxUploadAttempts) {
|
|
203
248
|
console.warn(
|
|
204
|
-
`[page-analyzer] Failed to upload ${key} to S3, retrying ` +
|
|
249
|
+
`[page-analyzer] Failed to check/upload ${key} to S3, retrying ` +
|
|
205
250
|
`(${attempt}/${s3Config.maxUploadAttempts}): ${getErrorMessage(error)}`
|
|
206
251
|
);
|
|
207
252
|
}
|
|
@@ -572,16 +617,16 @@ export class PageExtractor {
|
|
|
572
617
|
await fs.mkdir(this.config.snapshotDir, { recursive: true });
|
|
573
618
|
}
|
|
574
619
|
|
|
575
|
-
const
|
|
620
|
+
const localPrefix = `${createSnapshotSlug(targetUrl)}-${createSnapshotRunId()}`;
|
|
576
621
|
const screenshots = {};
|
|
577
622
|
|
|
578
623
|
if (fullPageScreenshot) {
|
|
579
|
-
const fullPageFilename = `${prefix}-full-page.png`;
|
|
580
624
|
try {
|
|
581
625
|
if (useS3) {
|
|
582
626
|
const body = await page.screenshot({ fullPage: true });
|
|
583
|
-
screenshots.fullPage = await this.uploadScreenshotToS3(
|
|
627
|
+
screenshots.fullPage = await this.uploadScreenshotToS3(targetUrl, body);
|
|
584
628
|
} else {
|
|
629
|
+
const fullPageFilename = `${localPrefix}-full-page.png`;
|
|
585
630
|
const fullPagePath = path.join(this.config.snapshotDir, fullPageFilename);
|
|
586
631
|
await page.screenshot({
|
|
587
632
|
path: fullPagePath,
|
|
@@ -603,15 +648,13 @@ export class PageExtractor {
|
|
|
603
648
|
const block = blocks[index];
|
|
604
649
|
const blockIdx = getBlockNumber(block, index);
|
|
605
650
|
|
|
606
|
-
const blockLabel = String(blockIdx).padStart(3, '0').replace(/[^0-9a-z-]+/gi, '-');
|
|
607
|
-
const blockFilename = `${prefix}-block-${blockLabel}.png`;
|
|
608
651
|
try {
|
|
609
652
|
if (useS3) {
|
|
610
653
|
const body = await this.captureBlockScreenshotData(page, block);
|
|
611
654
|
if (!body) {
|
|
612
655
|
continue;
|
|
613
656
|
}
|
|
614
|
-
const url = await this.uploadScreenshotToS3(
|
|
657
|
+
const url = await this.uploadScreenshotToS3(targetUrl, body);
|
|
615
658
|
const screenshotRecord = {
|
|
616
659
|
blockIdx,
|
|
617
660
|
path: url
|
|
@@ -626,6 +669,8 @@ export class PageExtractor {
|
|
|
626
669
|
continue;
|
|
627
670
|
}
|
|
628
671
|
|
|
672
|
+
const blockLabel = String(blockIdx).padStart(3, '0').replace(/[^0-9a-z-]+/gi, '-');
|
|
673
|
+
const blockFilename = `${localPrefix}-block-${blockLabel}.png`;
|
|
629
674
|
const blockPath = path.join(this.config.snapshotDir, blockFilename);
|
|
630
675
|
const captured = await this.captureBlockScreenshot(page, block, blockPath);
|
|
631
676
|
if (captured) {
|
package/test/smoke.test.js
CHANGED
|
@@ -1,10 +1,15 @@
|
|
|
1
1
|
import assert from 'node:assert/strict';
|
|
2
|
+
import { createHash } from 'node:crypto';
|
|
2
3
|
import { EventAnalyzer } from '../llm/analyzers/event-analyzer/event-analyzer.js';
|
|
3
4
|
import { buildBlockAnalysisArtifact } from '../llm/analyzers/event-analyzer/event-analyzer-blocks.js';
|
|
4
5
|
import { OpenAiProvider } from '../llm/providers/openai-provider.js';
|
|
5
6
|
import { PageExtractor } from '../page-extractor.js';
|
|
6
7
|
import { analyzeUrl } from '../index.js';
|
|
7
8
|
|
|
9
|
+
function md5(value) {
|
|
10
|
+
return createHash('md5').update(value).digest('hex');
|
|
11
|
+
}
|
|
12
|
+
|
|
8
13
|
class FakeProvider {
|
|
9
14
|
constructor() {
|
|
10
15
|
this.calls = [];
|
|
@@ -26,9 +31,10 @@ class FakeProvider {
|
|
|
26
31
|
}
|
|
27
32
|
|
|
28
33
|
class FakeLocator {
|
|
29
|
-
constructor({ count = 1, throwOnScreenshot = false } = {}) {
|
|
34
|
+
constructor({ count = 1, throwOnScreenshot = false, screenshotBodies = null } = {}) {
|
|
30
35
|
this.countValue = count;
|
|
31
36
|
this.throwOnScreenshot = throwOnScreenshot;
|
|
37
|
+
this.screenshotBodies = Array.isArray(screenshotBodies) ? screenshotBodies : null;
|
|
32
38
|
this.screenshots = [];
|
|
33
39
|
}
|
|
34
40
|
|
|
@@ -41,11 +47,14 @@ class FakeLocator {
|
|
|
41
47
|
}
|
|
42
48
|
|
|
43
49
|
async screenshot(options) {
|
|
50
|
+
const screenshotIndex = this.screenshots.length;
|
|
44
51
|
this.screenshots.push(options);
|
|
45
52
|
if (this.throwOnScreenshot) {
|
|
46
53
|
throw new Error('selector screenshot failed');
|
|
47
54
|
}
|
|
48
|
-
return Buffer.from(
|
|
55
|
+
return Buffer.from(
|
|
56
|
+
this.screenshotBodies?.[screenshotIndex] || `locator screenshot:${options?.path || 'buffer'}`
|
|
57
|
+
);
|
|
49
58
|
}
|
|
50
59
|
}
|
|
51
60
|
|
|
@@ -74,22 +83,36 @@ class FakePage {
|
|
|
74
83
|
}
|
|
75
84
|
|
|
76
85
|
class FakeS3Client {
|
|
77
|
-
constructor({ failPredicate = null } = {}) {
|
|
86
|
+
constructor({ failPredicate = null, existingKeys = [] } = {}) {
|
|
78
87
|
this.failPredicate = failPredicate;
|
|
88
|
+
this.existingKeys = new Set(existingKeys);
|
|
89
|
+
this.headCommands = [];
|
|
79
90
|
this.commands = [];
|
|
80
91
|
this.attemptsByKey = new Map();
|
|
81
92
|
}
|
|
82
93
|
|
|
83
94
|
async send(command) {
|
|
84
95
|
const input = command.input;
|
|
85
|
-
|
|
96
|
+
if (command.constructor.name === 'HeadObjectCommand') {
|
|
97
|
+
this.headCommands.push(input);
|
|
98
|
+
if (this.existingKeys.has(input.Key)) {
|
|
99
|
+
return {};
|
|
100
|
+
}
|
|
101
|
+
const error = new Error(`s3 object not found for ${input.Key}`);
|
|
102
|
+
error.name = 'NotFound';
|
|
103
|
+
error.$metadata = { httpStatusCode: 404 };
|
|
104
|
+
throw error;
|
|
105
|
+
}
|
|
106
|
+
|
|
86
107
|
const attempts = (this.attemptsByKey.get(input.Key) || 0) + 1;
|
|
87
108
|
this.attemptsByKey.set(input.Key, attempts);
|
|
109
|
+
this.commands.push(input);
|
|
88
110
|
|
|
89
111
|
if (this.failPredicate?.(input, attempts)) {
|
|
90
112
|
throw new Error(`s3 upload failed for ${input.Key}`);
|
|
91
113
|
}
|
|
92
114
|
|
|
115
|
+
this.existingKeys.add(input.Key);
|
|
93
116
|
return {};
|
|
94
117
|
}
|
|
95
118
|
}
|
|
@@ -348,14 +371,16 @@ async function analyzeWith(options = {}) {
|
|
|
348
371
|
assert.deepEqual(page.pageScreenshots[0], { fullPage: true });
|
|
349
372
|
assert.equal(locator.screenshots.length, 1);
|
|
350
373
|
assert.deepEqual(locator.screenshots[0], {});
|
|
374
|
+
assert.equal(s3Client.headCommands.length, 2);
|
|
351
375
|
assert.equal(s3Client.commands.length, 2);
|
|
352
376
|
|
|
353
377
|
const [fullPageUpload, blockUpload] = s3Client.commands;
|
|
354
378
|
assert.equal(fullPageUpload.Bucket, 'page-analyzer-test');
|
|
355
379
|
assert.equal(fullPageUpload.ContentType, 'image/png');
|
|
356
380
|
assert.equal(Buffer.isBuffer(fullPageUpload.Body), true);
|
|
357
|
-
assert.match(fullPageUpload.Key, /^page-analyzer\/snapshots\/example
|
|
358
|
-
assert.match(blockUpload.Key, /^page-analyzer\/snapshots\/example
|
|
381
|
+
assert.match(fullPageUpload.Key, /^page-analyzer\/snapshots\/example\.com\/[a-f0-9]{32}\.png$/);
|
|
382
|
+
assert.match(blockUpload.Key, /^page-analyzer\/snapshots\/example\.com\/[a-f0-9]{32}\.png$/);
|
|
383
|
+
assert.notEqual(fullPageUpload.Key, blockUpload.Key);
|
|
359
384
|
|
|
360
385
|
assert.equal(
|
|
361
386
|
screenshots.fullPage,
|
|
@@ -384,21 +409,49 @@ async function analyzeWith(options = {}) {
|
|
|
384
409
|
});
|
|
385
410
|
|
|
386
411
|
const uploadedKey = s3Client.commands[0].Key;
|
|
387
|
-
assert.match(uploadedKey, /^nested\/prefix\/example
|
|
412
|
+
assert.match(uploadedKey, /^nested\/prefix\/example\.com\/[a-f0-9]{32}\.png$/);
|
|
388
413
|
assert.equal(
|
|
389
414
|
screenshots.blocks[0].path,
|
|
390
415
|
`https://page-analyzer-test.s3.ap-northeast-1.amazonaws.com/${uploadedKey}`
|
|
391
416
|
);
|
|
392
417
|
}
|
|
393
418
|
|
|
419
|
+
{
|
|
420
|
+
const body = Buffer.from('already uploaded screenshot');
|
|
421
|
+
const existingKey = `page-analyzer/snapshots/example.com/${md5(body)}.png`;
|
|
422
|
+
const s3Client = new FakeS3Client({
|
|
423
|
+
existingKeys: [existingKey]
|
|
424
|
+
});
|
|
425
|
+
const extractor = new PageExtractor({
|
|
426
|
+
s3: {
|
|
427
|
+
bucket: 'page-analyzer-test',
|
|
428
|
+
region: 'ap-northeast-1',
|
|
429
|
+
prefix: 'page-analyzer/snapshots',
|
|
430
|
+
publicBaseUrl: 'https://cdn.example.com',
|
|
431
|
+
client: s3Client
|
|
432
|
+
}
|
|
433
|
+
});
|
|
434
|
+
|
|
435
|
+
const url = await extractor.uploadScreenshotToS3('https://example.com/demo', body);
|
|
436
|
+
|
|
437
|
+
assert.equal(s3Client.headCommands.length, 1);
|
|
438
|
+
assert.equal(s3Client.headCommands[0].Key, existingKey);
|
|
439
|
+
assert.equal(s3Client.commands.length, 0);
|
|
440
|
+
assert.equal(url, `https://cdn.example.com/${existingKey}`);
|
|
441
|
+
}
|
|
442
|
+
|
|
394
443
|
{
|
|
395
444
|
const originalWarn = console.warn;
|
|
396
445
|
const warnings = [];
|
|
397
446
|
console.warn = (message) => warnings.push(message);
|
|
398
447
|
|
|
399
448
|
try {
|
|
449
|
+
const failingBlockKey =
|
|
450
|
+
`page-analyzer/snapshots/example.com/${md5('locator screenshot:block-0')}.png`;
|
|
451
|
+
const successfulBlockKey =
|
|
452
|
+
`page-analyzer/snapshots/example.com/${md5('locator screenshot:block-1')}.png`;
|
|
400
453
|
const s3Client = new FakeS3Client({
|
|
401
|
-
failPredicate: (input) => input.Key
|
|
454
|
+
failPredicate: (input) => input.Key === failingBlockKey
|
|
402
455
|
});
|
|
403
456
|
const extractor = new PageExtractor({
|
|
404
457
|
s3: {
|
|
@@ -409,7 +462,12 @@ async function analyzeWith(options = {}) {
|
|
|
409
462
|
client: s3Client
|
|
410
463
|
}
|
|
411
464
|
});
|
|
412
|
-
const locator = new FakeLocator(
|
|
465
|
+
const locator = new FakeLocator({
|
|
466
|
+
screenshotBodies: [
|
|
467
|
+
'locator screenshot:block-0',
|
|
468
|
+
'locator screenshot:block-1'
|
|
469
|
+
]
|
|
470
|
+
});
|
|
413
471
|
const page = new FakePage(locator);
|
|
414
472
|
const screenshots = await extractor.captureScreenshots(page, 'https://example.com/demo', [
|
|
415
473
|
{ blockName: 'Hero', blockCssPath: '#hero' },
|
|
@@ -421,8 +479,8 @@ async function analyzeWith(options = {}) {
|
|
|
421
479
|
|
|
422
480
|
assert.equal(screenshots.blocks.length, 1);
|
|
423
481
|
assert.equal(screenshots.blocks[0].blockIdx, 1);
|
|
424
|
-
assert.equal(s3Client.commands.filter((input) => input.Key
|
|
425
|
-
assert.equal(s3Client.commands.filter((input) => input.Key
|
|
482
|
+
assert.equal(s3Client.commands.filter((input) => input.Key === failingBlockKey).length, 3);
|
|
483
|
+
assert.equal(s3Client.commands.filter((input) => input.Key === successfulBlockKey).length, 1);
|
|
426
484
|
assert.equal(warnings.some((message) => message.includes('retrying')), true);
|
|
427
485
|
assert.equal(warnings.some((message) => message.includes('Failed to capture/upload block 0')), true);
|
|
428
486
|
} finally {
|