page-analyzer 1.2.1 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -250,7 +250,7 @@ const result = await analyzePageEvents({
250
250
 
251
251
  启用 `blockScreenshots: true` 后,模块会在 LLM 合并区块后再截图。返回结果会包含 `screenshots.blocks`,每项包含逻辑区块序号 `blockIdx` 和对应截图 `path`;区块分析结果中的每个 block 也会额外带上 `blockScreenshotPaths`,每个逻辑区块最多对应一张截图。无法通过 `blockCssPath` 截图的隐藏或空区块会被跳过。
252
252
 
253
- 如果配置 `extractorConfig.s3`,截图不会写入本地 `snapshots/`,而是直接上传到 S3;`screenshots.fullPage`、`screenshots.blocks[].path` 和 `blockScreenshotPaths` 会返回 HTTPS URL。上传不会设置 ACL,访问权限沿用 bucket 策略。单张截图上传失败会重试 3 次,仍失败则跳过该截图。
253
+ 如果配置 `extractorConfig.s3`,截图不会写入本地 `snapshots/`,而是直接上传到 S3;`screenshots.fullPage`、`screenshots.blocks[].path` 和 `blockScreenshotPaths` 会返回 HTTPS URL。S3 对象 key 使用 `<prefix>/<domain>/<file-md5>.png`,上传前会先检查对象是否已存在,已存在时直接返回对应 URL,避免重复上传和冗余对象。上传不会设置 ACL,访问权限沿用 bucket 策略。单张截图检查或上传失败会重试 3 次,仍失败则跳过该截图。
254
254
 
255
255
  启用 `waitForImagesLoaded: true` 后,模块会先滚动页面触发懒加载,再等待当前 DOM 中的 `<img>` 完成加载或失败,之后再提取区块、分析和截图;等待时间受 `extractorConfig.timeoutMs` 控制。
256
256
 
@@ -316,7 +316,7 @@ const result = await analyzeUrl('https://example.com', {
316
316
  });
317
317
  ```
318
318
 
319
- `extractorConfig.s3.bucket` 和 `extractorConfig.s3.region` 必填。`credentials` 可省略,省略时使用 AWS SDK 默认凭证链。`publicBaseUrl` 可省略,省略时返回 `https://<bucket>.s3.<region>.amazonaws.com/<key>`;配置后返回 `${publicBaseUrl}/<key>`。
319
+ `extractorConfig.s3.bucket` 和 `extractorConfig.s3.region` 必填。`credentials` 可省略,省略时使用 AWS SDK 默认凭证链。`publicBaseUrl` 可省略,省略时返回 `https://<bucket>.s3.<region>.amazonaws.com/<key>`;配置后返回 `${publicBaseUrl}/<key>`。启用 S3 上传时,需要凭证具备 `s3:GetObject` 和 `s3:PutObject` 权限;如果希望不存在的对象能被稳定识别为 404,还需要对应 bucket/prefix 的 `s3:ListBucket` 权限。
320
320
 
321
321
  ### parserConfig
322
322
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "page-analyzer",
3
- "version": "1.2.1",
3
+ "version": "1.2.2",
4
4
  "type": "module",
5
5
  "description": "Standalone page analysis module.",
6
6
  "license": "MIT",
package/page-extractor.js CHANGED
@@ -5,7 +5,8 @@
5
5
 
6
6
  import fs from 'node:fs/promises';
7
7
  import path from 'node:path';
8
- import { PutObjectCommand, S3Client } from '@aws-sdk/client-s3';
8
+ import { createHash } from 'node:crypto';
9
+ import { HeadObjectCommand, PutObjectCommand, S3Client } from '@aws-sdk/client-s3';
9
10
 
10
11
  // In-browser block extraction function (serialized into page.evaluate)
11
12
  // Imported from the project's extract-blocks script
@@ -41,6 +42,24 @@ function createSnapshotRunId() {
41
42
  .replace(/^-+|-+$/g, '');
42
43
  }
43
44
 
45
+ function createS3DomainSegment(url) {
46
+ const source = String(url || '').trim();
47
+ try {
48
+ const parsed = new URL(source);
49
+ const hostname = parsed.hostname
50
+ .toLowerCase()
51
+ .replace(/[^a-z0-9.-]+/g, '-')
52
+ .replace(/^-+|-+$/g, '');
53
+ return hostname || 'page';
54
+ } catch {
55
+ return createSnapshotSlug(source);
56
+ }
57
+ }
58
+
59
+ function createFileMd5(body) {
60
+ return createHash('md5').update(body).digest('hex');
61
+ }
62
+
44
63
  function getBlockNumber(block, fallbackIndex) {
45
64
  return Number.isInteger(block?.blockIdx) ? block.blockIdx : fallbackIndex;
46
65
  }
@@ -96,8 +115,8 @@ function normalizeS3Config(config) {
96
115
  };
97
116
  }
98
117
 
99
- function joinS3Key(prefix, filename) {
100
- return [prefix, filename].filter(Boolean).join('/');
118
+ function joinS3Key(...parts) {
119
+ return parts.filter(Boolean).join('/');
101
120
  }
102
121
 
103
122
  function encodeS3Key(key) {
@@ -119,6 +138,12 @@ function getErrorMessage(error) {
119
138
  return error instanceof Error ? error.message : String(error);
120
139
  }
121
140
 
141
+ function isS3NotFoundError(error) {
142
+ const statusCode = error?.$metadata?.httpStatusCode;
143
+ const errorName = String(error?.name || error?.Code || error?.code || '');
144
+ return statusCode === 404 || errorName === 'NotFound' || errorName === 'NoSuchKey';
145
+ }
146
+
122
147
  export class PageExtractor {
123
148
  constructor(config = {}) {
124
149
  this.config = {
@@ -176,13 +201,30 @@ export class PageExtractor {
176
201
  return this.s3Client;
177
202
  }
178
203
 
179
- async uploadScreenshotToS3(filename, body) {
204
+ async s3ObjectExists(client, key) {
205
+ const s3Config = this.config.s3;
206
+ try {
207
+ await client.send(new HeadObjectCommand({
208
+ Bucket: s3Config.bucket,
209
+ Key: key
210
+ }));
211
+ return true;
212
+ } catch (error) {
213
+ if (isS3NotFoundError(error)) {
214
+ return false;
215
+ }
216
+ throw error;
217
+ }
218
+ }
219
+
220
+ async uploadScreenshotToS3(targetUrl, body) {
180
221
  const s3Config = this.config.s3;
181
222
  if (!s3Config) {
182
223
  throw new Error('S3 is not configured');
183
224
  }
184
225
 
185
- const key = joinS3Key(s3Config.prefix, filename);
226
+ const domain = createS3DomainSegment(targetUrl);
227
+ const key = joinS3Key(s3Config.prefix, domain, `${createFileMd5(body)}.png`);
186
228
  const client = this.getS3Client();
187
229
  const commandInput = {
188
230
  Bucket: s3Config.bucket,
@@ -194,6 +236,9 @@ export class PageExtractor {
194
236
  let lastError = null;
195
237
  for (let attempt = 1; attempt <= s3Config.maxUploadAttempts; attempt += 1) {
196
238
  try {
239
+ if (await this.s3ObjectExists(client, key)) {
240
+ return buildS3Url(s3Config, key);
241
+ }
197
242
  const command = new PutObjectCommand(commandInput);
198
243
  await client.send(command);
199
244
  return buildS3Url(s3Config, key);
@@ -201,7 +246,7 @@ export class PageExtractor {
201
246
  lastError = error;
202
247
  if (attempt < s3Config.maxUploadAttempts) {
203
248
  console.warn(
204
- `[page-analyzer] Failed to upload ${key} to S3, retrying ` +
249
+ `[page-analyzer] Failed to check/upload ${key} to S3, retrying ` +
205
250
  `(${attempt}/${s3Config.maxUploadAttempts}): ${getErrorMessage(error)}`
206
251
  );
207
252
  }
@@ -572,16 +617,16 @@ export class PageExtractor {
572
617
  await fs.mkdir(this.config.snapshotDir, { recursive: true });
573
618
  }
574
619
 
575
- const prefix = `${createSnapshotSlug(targetUrl)}-${createSnapshotRunId()}`;
620
+ const localPrefix = `${createSnapshotSlug(targetUrl)}-${createSnapshotRunId()}`;
576
621
  const screenshots = {};
577
622
 
578
623
  if (fullPageScreenshot) {
579
- const fullPageFilename = `${prefix}-full-page.png`;
580
624
  try {
581
625
  if (useS3) {
582
626
  const body = await page.screenshot({ fullPage: true });
583
- screenshots.fullPage = await this.uploadScreenshotToS3(fullPageFilename, body);
627
+ screenshots.fullPage = await this.uploadScreenshotToS3(targetUrl, body);
584
628
  } else {
629
+ const fullPageFilename = `${localPrefix}-full-page.png`;
585
630
  const fullPagePath = path.join(this.config.snapshotDir, fullPageFilename);
586
631
  await page.screenshot({
587
632
  path: fullPagePath,
@@ -603,15 +648,13 @@ export class PageExtractor {
603
648
  const block = blocks[index];
604
649
  const blockIdx = getBlockNumber(block, index);
605
650
 
606
- const blockLabel = String(blockIdx).padStart(3, '0').replace(/[^0-9a-z-]+/gi, '-');
607
- const blockFilename = `${prefix}-block-${blockLabel}.png`;
608
651
  try {
609
652
  if (useS3) {
610
653
  const body = await this.captureBlockScreenshotData(page, block);
611
654
  if (!body) {
612
655
  continue;
613
656
  }
614
- const url = await this.uploadScreenshotToS3(blockFilename, body);
657
+ const url = await this.uploadScreenshotToS3(targetUrl, body);
615
658
  const screenshotRecord = {
616
659
  blockIdx,
617
660
  path: url
@@ -626,6 +669,8 @@ export class PageExtractor {
626
669
  continue;
627
670
  }
628
671
 
672
+ const blockLabel = String(blockIdx).padStart(3, '0').replace(/[^0-9a-z-]+/gi, '-');
673
+ const blockFilename = `${localPrefix}-block-${blockLabel}.png`;
629
674
  const blockPath = path.join(this.config.snapshotDir, blockFilename);
630
675
  const captured = await this.captureBlockScreenshot(page, block, blockPath);
631
676
  if (captured) {
@@ -1,10 +1,15 @@
1
1
  import assert from 'node:assert/strict';
2
+ import { createHash } from 'node:crypto';
2
3
  import { EventAnalyzer } from '../llm/analyzers/event-analyzer/event-analyzer.js';
3
4
  import { buildBlockAnalysisArtifact } from '../llm/analyzers/event-analyzer/event-analyzer-blocks.js';
4
5
  import { OpenAiProvider } from '../llm/providers/openai-provider.js';
5
6
  import { PageExtractor } from '../page-extractor.js';
6
7
  import { analyzeUrl } from '../index.js';
7
8
 
9
+ function md5(value) {
10
+ return createHash('md5').update(value).digest('hex');
11
+ }
12
+
8
13
  class FakeProvider {
9
14
  constructor() {
10
15
  this.calls = [];
@@ -26,9 +31,10 @@ class FakeProvider {
26
31
  }
27
32
 
28
33
  class FakeLocator {
29
- constructor({ count = 1, throwOnScreenshot = false } = {}) {
34
+ constructor({ count = 1, throwOnScreenshot = false, screenshotBodies = null } = {}) {
30
35
  this.countValue = count;
31
36
  this.throwOnScreenshot = throwOnScreenshot;
37
+ this.screenshotBodies = Array.isArray(screenshotBodies) ? screenshotBodies : null;
32
38
  this.screenshots = [];
33
39
  }
34
40
 
@@ -41,11 +47,14 @@ class FakeLocator {
41
47
  }
42
48
 
43
49
  async screenshot(options) {
50
+ const screenshotIndex = this.screenshots.length;
44
51
  this.screenshots.push(options);
45
52
  if (this.throwOnScreenshot) {
46
53
  throw new Error('selector screenshot failed');
47
54
  }
48
- return Buffer.from(`locator screenshot:${options?.path || 'buffer'}`);
55
+ return Buffer.from(
56
+ this.screenshotBodies?.[screenshotIndex] || `locator screenshot:${options?.path || 'buffer'}`
57
+ );
49
58
  }
50
59
  }
51
60
 
@@ -74,22 +83,36 @@ class FakePage {
74
83
  }
75
84
 
76
85
  class FakeS3Client {
77
- constructor({ failPredicate = null } = {}) {
86
+ constructor({ failPredicate = null, existingKeys = [] } = {}) {
78
87
  this.failPredicate = failPredicate;
88
+ this.existingKeys = new Set(existingKeys);
89
+ this.headCommands = [];
79
90
  this.commands = [];
80
91
  this.attemptsByKey = new Map();
81
92
  }
82
93
 
83
94
  async send(command) {
84
95
  const input = command.input;
85
- this.commands.push(input);
96
+ if (command.constructor.name === 'HeadObjectCommand') {
97
+ this.headCommands.push(input);
98
+ if (this.existingKeys.has(input.Key)) {
99
+ return {};
100
+ }
101
+ const error = new Error(`s3 object not found for ${input.Key}`);
102
+ error.name = 'NotFound';
103
+ error.$metadata = { httpStatusCode: 404 };
104
+ throw error;
105
+ }
106
+
86
107
  const attempts = (this.attemptsByKey.get(input.Key) || 0) + 1;
87
108
  this.attemptsByKey.set(input.Key, attempts);
109
+ this.commands.push(input);
88
110
 
89
111
  if (this.failPredicate?.(input, attempts)) {
90
112
  throw new Error(`s3 upload failed for ${input.Key}`);
91
113
  }
92
114
 
115
+ this.existingKeys.add(input.Key);
93
116
  return {};
94
117
  }
95
118
  }
@@ -348,14 +371,16 @@ async function analyzeWith(options = {}) {
348
371
  assert.deepEqual(page.pageScreenshots[0], { fullPage: true });
349
372
  assert.equal(locator.screenshots.length, 1);
350
373
  assert.deepEqual(locator.screenshots[0], {});
374
+ assert.equal(s3Client.headCommands.length, 2);
351
375
  assert.equal(s3Client.commands.length, 2);
352
376
 
353
377
  const [fullPageUpload, blockUpload] = s3Client.commands;
354
378
  assert.equal(fullPageUpload.Bucket, 'page-analyzer-test');
355
379
  assert.equal(fullPageUpload.ContentType, 'image/png');
356
380
  assert.equal(Buffer.isBuffer(fullPageUpload.Body), true);
357
- assert.match(fullPageUpload.Key, /^page-analyzer\/snapshots\/example-com-demo-.*-full-page\.png$/);
358
- assert.match(blockUpload.Key, /^page-analyzer\/snapshots\/example-com-demo-.*-block-000\.png$/);
381
+ assert.match(fullPageUpload.Key, /^page-analyzer\/snapshots\/example\.com\/[a-f0-9]{32}\.png$/);
382
+ assert.match(blockUpload.Key, /^page-analyzer\/snapshots\/example\.com\/[a-f0-9]{32}\.png$/);
383
+ assert.notEqual(fullPageUpload.Key, blockUpload.Key);
359
384
 
360
385
  assert.equal(
361
386
  screenshots.fullPage,
@@ -384,21 +409,49 @@ async function analyzeWith(options = {}) {
384
409
  });
385
410
 
386
411
  const uploadedKey = s3Client.commands[0].Key;
387
- assert.match(uploadedKey, /^nested\/prefix\/example-com-demo-.*-block-000\.png$/);
412
+ assert.match(uploadedKey, /^nested\/prefix\/example\.com\/[a-f0-9]{32}\.png$/);
388
413
  assert.equal(
389
414
  screenshots.blocks[0].path,
390
415
  `https://page-analyzer-test.s3.ap-northeast-1.amazonaws.com/${uploadedKey}`
391
416
  );
392
417
  }
393
418
 
419
+ {
420
+ const body = Buffer.from('already uploaded screenshot');
421
+ const existingKey = `page-analyzer/snapshots/example.com/${md5(body)}.png`;
422
+ const s3Client = new FakeS3Client({
423
+ existingKeys: [existingKey]
424
+ });
425
+ const extractor = new PageExtractor({
426
+ s3: {
427
+ bucket: 'page-analyzer-test',
428
+ region: 'ap-northeast-1',
429
+ prefix: 'page-analyzer/snapshots',
430
+ publicBaseUrl: 'https://cdn.example.com',
431
+ client: s3Client
432
+ }
433
+ });
434
+
435
+ const url = await extractor.uploadScreenshotToS3('https://example.com/demo', body);
436
+
437
+ assert.equal(s3Client.headCommands.length, 1);
438
+ assert.equal(s3Client.headCommands[0].Key, existingKey);
439
+ assert.equal(s3Client.commands.length, 0);
440
+ assert.equal(url, `https://cdn.example.com/${existingKey}`);
441
+ }
442
+
394
443
  {
395
444
  const originalWarn = console.warn;
396
445
  const warnings = [];
397
446
  console.warn = (message) => warnings.push(message);
398
447
 
399
448
  try {
449
+ const failingBlockKey =
450
+ `page-analyzer/snapshots/example.com/${md5('locator screenshot:block-0')}.png`;
451
+ const successfulBlockKey =
452
+ `page-analyzer/snapshots/example.com/${md5('locator screenshot:block-1')}.png`;
400
453
  const s3Client = new FakeS3Client({
401
- failPredicate: (input) => input.Key.endsWith('-block-000.png')
454
+ failPredicate: (input) => input.Key === failingBlockKey
402
455
  });
403
456
  const extractor = new PageExtractor({
404
457
  s3: {
@@ -409,7 +462,12 @@ async function analyzeWith(options = {}) {
409
462
  client: s3Client
410
463
  }
411
464
  });
412
- const locator = new FakeLocator();
465
+ const locator = new FakeLocator({
466
+ screenshotBodies: [
467
+ 'locator screenshot:block-0',
468
+ 'locator screenshot:block-1'
469
+ ]
470
+ });
413
471
  const page = new FakePage(locator);
414
472
  const screenshots = await extractor.captureScreenshots(page, 'https://example.com/demo', [
415
473
  { blockName: 'Hero', blockCssPath: '#hero' },
@@ -421,8 +479,8 @@ async function analyzeWith(options = {}) {
421
479
 
422
480
  assert.equal(screenshots.blocks.length, 1);
423
481
  assert.equal(screenshots.blocks[0].blockIdx, 1);
424
- assert.equal(s3Client.commands.filter((input) => input.Key.endsWith('-block-000.png')).length, 3);
425
- assert.equal(s3Client.commands.filter((input) => input.Key.endsWith('-block-001.png')).length, 1);
482
+ assert.equal(s3Client.commands.filter((input) => input.Key === failingBlockKey).length, 3);
483
+ assert.equal(s3Client.commands.filter((input) => input.Key === successfulBlockKey).length, 1);
426
484
  assert.equal(warnings.some((message) => message.includes('retrying')), true);
427
485
  assert.equal(warnings.some((message) => message.includes('Failed to capture/upload block 0')), true);
428
486
  } finally {