page-analyzer 1.0.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +72 -9
- package/index.js +206 -22
- package/llm/analyzers/event-analyzer/event-analyzer-blocks.js +23 -2
- package/llm/analyzers/event-analyzer/event-analyzer-constants.js +1 -1
- package/llm/analyzers/event-analyzer/event-analyzer.js +1 -1
- package/package.json +6 -3
- package/page-extractor.js +562 -36
- package/result-viewer.html +1064 -0
- package/scripts/analyze.js +51 -0
- package/scripts/build-result-viewer.js +1076 -0
- package/scripts/serve-result-viewer.js +68 -0
- package/test/smoke.test.js +454 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 page-analyzer contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
CHANGED
|
@@ -71,21 +71,21 @@ LLM_API_ENDPOINT=https://api.openai.com/v1/chat/completions
|
|
|
71
71
|
LLM_MODEL=gpt-4o-mini
|
|
72
72
|
```
|
|
73
73
|
|
|
74
|
-
##
|
|
74
|
+
## 运行测试和示例
|
|
75
75
|
|
|
76
|
-
|
|
76
|
+
本地测试不会调用真实网页或 LLM 接口:
|
|
77
77
|
|
|
78
78
|
```bash
|
|
79
79
|
npm test
|
|
80
80
|
```
|
|
81
81
|
|
|
82
|
-
|
|
82
|
+
如需手动分析真实页面,可以运行示例脚本。它会读取项目根目录下的 `.env`,分析指定 URL,并把结果写入 `result.json`。
|
|
83
83
|
|
|
84
84
|
```bash
|
|
85
|
-
|
|
85
|
+
npm run analyze -- https://example.com
|
|
86
86
|
```
|
|
87
87
|
|
|
88
|
-
注意:`
|
|
88
|
+
注意:`npm run analyze` 依赖以下环境变量:
|
|
89
89
|
|
|
90
90
|
- `LLM_API_KEY`
|
|
91
91
|
- `LLM_API_ENDPOINT`
|
|
@@ -114,6 +114,9 @@ const result = await analyzeUrl('https://example.com', {
|
|
|
114
114
|
},
|
|
115
115
|
showEvents: true,
|
|
116
116
|
showBlockIdx: true,
|
|
117
|
+
fullPageScreenshot: true,
|
|
118
|
+
blockScreenshots: true,
|
|
119
|
+
waitForImagesLoaded: true,
|
|
117
120
|
knownEventTypes: ['click_link', 'submit_form'],
|
|
118
121
|
extractorConfig: {
|
|
119
122
|
viewportWidth: 1440,
|
|
@@ -145,6 +148,10 @@ const result = await analyzeUrl('https://example.com', {
|
|
|
145
148
|
| `options.extractorConfig` | `object` | 否 | Playwright 页面抓取配置 |
|
|
146
149
|
| `options.showEvents` | `boolean` | 否 | 是否返回完整事件数组和元素明细 |
|
|
147
150
|
| `options.showBlockIdx` | `boolean` | 否 | 是否返回 CSV 与区块索引相关字段 |
|
|
151
|
+
| `options.fullPageScreenshot` | `boolean` | 否 | 是否保存整页截图到当前运行目录的 `snapshots/` 并返回文件路径 |
|
|
152
|
+
| `options.blockScreenshots` | `boolean` | 否 | 是否在 LLM 合并区块后,保存每个逻辑区块截图到当前运行目录的 `snapshots/` 并返回文件路径 |
|
|
153
|
+
| `options.waitForImagesLoaded` | `boolean` | 否 | 是否在提取区块、分析和截图前等待页面图片加载完成,默认 `false` |
|
|
154
|
+
| `options.extractorConfig.s3` | `object` | 否 | 截图 S3 上传配置。配置后截图上传到 S3,返回 HTTPS URL;未配置时仍保存到本地 `snapshots/` |
|
|
148
155
|
|
|
149
156
|
### analyzePageEvents(input)
|
|
150
157
|
|
|
@@ -239,6 +246,30 @@ const result = await analyzePageEvents({
|
|
|
239
246
|
|
|
240
247
|
启用 `showBlockIdx: true` 后,区块结果中会额外包含 `blockIdxs`、`blockSemanticGroups`、`rowCount` 等字段,并返回 `csvContent`。
|
|
241
248
|
|
|
249
|
+
启用 `fullPageScreenshot: true` 后,返回结果会包含 `screenshots.fullPage`,值为整页截图文件路径。
|
|
250
|
+
|
|
251
|
+
启用 `blockScreenshots: true` 后,模块会在 LLM 合并区块后再截图。返回结果会包含 `screenshots.blocks`,每项包含逻辑区块序号 `blockIdx` 和对应截图 `path`;区块分析结果中的每个 block 也会额外带上 `blockScreenshotPaths`,每个逻辑区块最多对应一张截图。无法通过 `blockCssPath` 截图的隐藏或空区块会被跳过。
|
|
252
|
+
|
|
253
|
+
如果配置 `extractorConfig.s3`,截图不会写入本地 `snapshots/`,而是直接上传到 S3;`screenshots.fullPage`、`screenshots.blocks[].path` 和 `blockScreenshotPaths` 会返回 HTTPS URL。上传不会设置 ACL,访问权限沿用 bucket 策略。单张截图上传失败会重试 3 次,仍失败则跳过该截图。
|
|
254
|
+
|
|
255
|
+
启用 `waitForImagesLoaded: true` 后,模块会先滚动页面触发懒加载,再等待当前 DOM 中的 `<img>` 完成加载或失败,之后再提取区块、分析和截图;等待时间受 `extractorConfig.timeoutMs` 控制。
|
|
256
|
+
|
|
257
|
+
截图参数启用后的新增输出示例:
|
|
258
|
+
|
|
259
|
+
```js
|
|
260
|
+
{
|
|
261
|
+
screenshots: {
|
|
262
|
+
fullPage: '/path/to/page-analyzer/snapshots/example-com-20260507-095500-full-page.png',
|
|
263
|
+
blocks: [
|
|
264
|
+
{
|
|
265
|
+
blockIdx: 0,
|
|
266
|
+
path: '/path/to/page-analyzer/snapshots/example-com-20260507-095500-block-000.png'
|
|
267
|
+
}
|
|
268
|
+
]
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
```
|
|
272
|
+
|
|
242
273
|
## 配置项
|
|
243
274
|
|
|
244
275
|
### extractorConfig
|
|
@@ -255,6 +286,37 @@ const result = await analyzePageEvents({
|
|
|
255
286
|
| `blockMaxHeightRatio` | `1.5` | 最大区块高度占视口高度比例 |
|
|
256
287
|
| `blockMaxDepth` | `15` | 区块提取最大 DOM 深度 |
|
|
257
288
|
| `textPreviewMaxChars` | `1200` | 区块文本预览最大长度 |
|
|
289
|
+
| `waitForImagesLoaded` | `false` | 是否在提取区块、分析和截图前等待页面图片加载完成 |
|
|
290
|
+
| `s3` | 无 | 截图 S3 上传配置。配置后截图直接上传到 S3,未配置时保存到本地 |
|
|
291
|
+
|
|
292
|
+
S3 截图上传示例:
|
|
293
|
+
|
|
294
|
+
```js
|
|
295
|
+
const result = await analyzeUrl('https://example.com', {
|
|
296
|
+
fullPageScreenshot: true,
|
|
297
|
+
blockScreenshots: true,
|
|
298
|
+
llm: {
|
|
299
|
+
apiKey: process.env.LLM_API_KEY,
|
|
300
|
+
apiEndpoint: process.env.LLM_API_ENDPOINT,
|
|
301
|
+
model: process.env.LLM_MODEL
|
|
302
|
+
},
|
|
303
|
+
extractorConfig: {
|
|
304
|
+
s3: {
|
|
305
|
+
bucket: 'my-bucket',
|
|
306
|
+
region: 'ap-northeast-1',
|
|
307
|
+
prefix: 'page-analyzer/snapshots',
|
|
308
|
+
publicBaseUrl: 'https://cdn.example.com/page-analyzer/snapshots',
|
|
309
|
+
credentials: {
|
|
310
|
+
accessKeyId: process.env.AWS_ACCESS_KEY_ID,
|
|
311
|
+
secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY,
|
|
312
|
+
sessionToken: process.env.AWS_SESSION_TOKEN
|
|
313
|
+
}
|
|
314
|
+
}
|
|
315
|
+
}
|
|
316
|
+
});
|
|
317
|
+
```
|
|
318
|
+
|
|
319
|
+
`extractorConfig.s3.bucket` 和 `extractorConfig.s3.region` 必填。`credentials` 可省略,省略时使用 AWS SDK 默认凭证链。`publicBaseUrl` 可省略,省略时返回 `https://<bucket>.s3.<region>.amazonaws.com/<key>`。
|
|
258
320
|
|
|
259
321
|
### parserConfig
|
|
260
322
|
|
|
@@ -320,7 +382,7 @@ data.choices[0].message.content
|
|
|
320
382
|
npm install
|
|
321
383
|
```
|
|
322
384
|
|
|
323
|
-
|
|
385
|
+
运行本地测试:
|
|
324
386
|
|
|
325
387
|
```bash
|
|
326
388
|
npm test
|
|
@@ -355,12 +417,13 @@ page-analyzer/
|
|
|
355
417
|
models/ # 上下文数据模型
|
|
356
418
|
utils/ # 文本、URL、选择器工具
|
|
357
419
|
vendor/ # 浏览器内区块提取脚本
|
|
358
|
-
|
|
420
|
+
scripts/analyze.js # 手动真实页面分析脚本
|
|
421
|
+
test/smoke.test.js # 本地 smoke test
|
|
359
422
|
```
|
|
360
423
|
|
|
361
424
|
## 常见问题
|
|
362
425
|
|
|
363
|
-
### npm
|
|
426
|
+
### npm run analyze 报 LLM 配置缺失
|
|
364
427
|
|
|
365
428
|
确认项目根目录存在 `.env`,并且包含:
|
|
366
429
|
|
|
@@ -397,4 +460,4 @@ extractorConfig: {
|
|
|
397
460
|
|
|
398
461
|
## License
|
|
399
462
|
|
|
400
|
-
|
|
463
|
+
MIT License. See [LICENSE](./LICENSE).
|
package/index.js
CHANGED
|
@@ -38,6 +38,131 @@ function normalizeDisplayOptions(options = {}) {
|
|
|
38
38
|
};
|
|
39
39
|
}
|
|
40
40
|
|
|
41
|
+
/**
 * Normalize a raw block-index value into an array of integers.
 *
 * Accepted shapes:
 * - an array of index-like items (each item is stringified and parsed base-10),
 * - a single number,
 * - a string of indices delimited by dots, commas, or whitespace.
 *
 * Entries that do not parse to an integer are dropped. Order is preserved and
 * duplicates are kept.
 *
 * @param {unknown} value - Raw `blockIdxs` / `blockIdx` value.
 * @returns {number[]} Parsed integer indices (possibly empty).
 */
function parseBlockIdxs(value) {
  if (Array.isArray(value)) {
    return value
      .map((item) => Number.parseInt(String(item), 10))
      .filter(Number.isInteger);
  }

  if (typeof value === 'number') {
    // A fractional number must not reach the string path below: "1.5" would be
    // split on "." and reported as the two unrelated indices 1 and 5.
    return Number.isInteger(value) ? [value] : [];
  }

  // Strings (and any other scalar) are tokenized on ".", "," and whitespace.
  return String(value || '')
    .split(/[.,\s]+/)
    .map((item) => Number.parseInt(item, 10))
    .filter(Number.isInteger);
}
|
|
57
|
+
|
|
58
|
+
/**
 * Index block screenshot entries by block index.
 *
 * Reads `screenshots.blocks` (ignored unless it is an array). An entry is kept
 * only when its `blockIdx` is an integer, or parses to one base-10, and its
 * `path` is a non-empty string. A later entry with the same index overwrites
 * an earlier one.
 *
 * @param {Object|null|undefined} screenshots - Screenshot bundle.
 * @returns {Map<number, string>} Block index -> screenshot path.
 */
function buildBlockScreenshotMap(screenshots) {
  const entries = Array.isArray(screenshots?.blocks) ? screenshots.blocks : [];
  const pathByIdx = new Map();

  entries.forEach((entry) => {
    const rawIdx = entry?.blockIdx;
    const idx = Number.isInteger(rawIdx)
      ? rawIdx
      : Number.parseInt(String(rawIdx), 10);
    const path = typeof entry?.path === 'string' ? entry.path : '';

    // Skip entries without a usable index or without a path.
    if (!Number.isInteger(idx) || !path) {
      return;
    }
    pathByIdx.set(idx, path);
  });

  return pathByIdx;
}
|
|
71
|
+
|
|
72
|
+
/**
 * Return a copy of `analysis` whose blocks carry `blockScreenshotPaths`.
 *
 * For every block in `analysis.block_analysis.blocks`, the block's
 * `blockIdxs` (falling back to `blockIdx`) are parsed and looked up in the
 * screenshot map; the paths found are attached in index order. Blocks with no
 * matching screenshot are passed through unchanged. The input `analysis` is
 * returned as-is when there are no block screenshots, when `block_analysis`
 * is not an object, or when its `blocks` is not an array.
 *
 * @param {Object} analysis - Analysis object containing `block_analysis`.
 * @param {Object|null|undefined} screenshots - Screenshot bundle.
 * @returns {Object} The original analysis or a shallow copy with updated blocks.
 */
function attachBlockScreenshotPaths(analysis, screenshots) {
  const pathByIdx = buildBlockScreenshotMap(screenshots);
  if (pathByIdx.size === 0) {
    return analysis;
  }
  if (!isObject(analysis?.block_analysis)) {
    return analysis;
  }

  const blockAnalysis = analysis.block_analysis;
  const originalBlocks = blockAnalysis.blocks;
  if (!Array.isArray(originalBlocks)) {
    return analysis;
  }

  const withPaths = (block) => {
    const paths = [];
    for (const idx of parseBlockIdxs(block?.blockIdxs ?? block?.blockIdx)) {
      const path = pathByIdx.get(idx);
      if (path) {
        paths.push(path);
      }
    }
    return paths.length > 0 ? { ...block, blockScreenshotPaths: paths } : block;
  };

  return {
    ...analysis,
    block_analysis: {
      ...blockAnalysis,
      blocks: originalBlocks.map((block) => withPaths(block))
    }
  };
}
|
|
107
|
+
|
|
108
|
+
/**
 * Tell whether a screenshot bundle contains anything worth returning.
 *
 * @param {Object|null|undefined} screenshots - Screenshot bundle.
 * @returns {boolean} True when `fullPage` is truthy or `blocks` is a non-empty array.
 */
function hasScreenshots(screenshots) {
  if (screenshots?.fullPage) {
    return true;
  }
  const blocks = screenshots?.blocks;
  return Array.isArray(blocks) && blocks.length > 0;
}
|
|
114
|
+
|
|
115
|
+
/**
 * Combine two screenshot bundles, letting `secondary` win where it has data.
 *
 * - `fullPage`: taken from `secondary` when truthy, otherwise from `primary`.
 * - `blocks`: `secondary.blocks` when it is a non-empty array, otherwise
 *   `primary.blocks` (non-array values count as empty).
 *
 * @param {Object|null|undefined} primary - Base bundle.
 * @param {Object|null|undefined} secondary - Overriding bundle.
 * @returns {Object|null} Merged bundle, or null when it would contain nothing.
 */
function mergeScreenshots(primary, secondary) {
  const blockListOf = (bundle) => (Array.isArray(bundle?.blocks) ? bundle.blocks : []);

  const fullPage = secondary?.fullPage || primary?.fullPage;
  const overridingBlocks = blockListOf(secondary);
  const blocks = overridingBlocks.length > 0 ? overridingBlocks : blockListOf(primary);

  const merged = {
    ...(fullPage ? { fullPage } : {}),
    ...(blocks.length > 0 ? { blocks } : {})
  };

  return hasScreenshots(merged) ? merged : null;
}
|
|
133
|
+
|
|
134
|
+
/**
 * Attach one screenshot path to each logical block, matched by array position.
 *
 * The screenshot entry whose `blockIdx` equals a block's position in
 * `result.analysis.block_analysis.blocks` is attached as a single-element
 * `blockScreenshotPaths` array. Blocks without a matching entry are left
 * untouched. `result` is returned unchanged when it has no blocks or when
 * there are no block screenshots.
 *
 * @param {Object} result - Analysis result holding `analysis.block_analysis.blocks`.
 * @param {Object|null|undefined} screenshots - Screenshot bundle.
 * @returns {Object} The original result or a shallow copy with updated blocks.
 */
function attachLogicalBlockScreenshotPaths(result, screenshots) {
  const logicalBlocks = result?.analysis?.block_analysis?.blocks;
  const hasBlocks = Array.isArray(logicalBlocks) && logicalBlocks.length > 0;
  if (!hasBlocks) {
    return result;
  }

  const pathByPosition = buildBlockScreenshotMap(screenshots);
  if (pathByPosition.size === 0) {
    return result;
  }

  const decoratedBlocks = logicalBlocks.map((block, position) => {
    const path = pathByPosition.get(position);
    return path ? { ...block, blockScreenshotPaths: [path] } : block;
  });

  const blockAnalysis = { ...result.analysis.block_analysis, blocks: decoratedBlocks };
  const analysis = { ...result.analysis, block_analysis: blockAnalysis };
  return { ...result, analysis };
}
|
|
165
|
+
|
|
41
166
|
function compactBlockAnalysisBlock(block, displayOptions) {
|
|
42
167
|
const source = isObject(block) ? block : {};
|
|
43
168
|
const out = {
|
|
@@ -63,6 +188,10 @@ function compactBlockAnalysisBlock(block, displayOptions) {
|
|
|
63
188
|
out.mode = source.mode;
|
|
64
189
|
}
|
|
65
190
|
|
|
191
|
+
if (Array.isArray(source.blockScreenshotPaths) && source.blockScreenshotPaths.length > 0) {
|
|
192
|
+
out.blockScreenshotPaths = source.blockScreenshotPaths;
|
|
193
|
+
}
|
|
194
|
+
|
|
66
195
|
return out;
|
|
67
196
|
}
|
|
68
197
|
|
|
@@ -121,14 +250,20 @@ function buildPageAnalysisResult({
|
|
|
121
250
|
csvContent,
|
|
122
251
|
pageData,
|
|
123
252
|
analysis,
|
|
124
|
-
displayOptions
|
|
253
|
+
displayOptions,
|
|
254
|
+
screenshots
|
|
125
255
|
}) {
|
|
256
|
+
const analysisWithScreenshots = attachBlockScreenshotPaths(analysis, screenshots);
|
|
126
257
|
const result = {
|
|
127
258
|
title: pageData.title,
|
|
128
259
|
parseMetrics: pageData.metrics,
|
|
129
|
-
analysis: buildAnalysisResult(
|
|
260
|
+
analysis: buildAnalysisResult(analysisWithScreenshots, displayOptions)
|
|
130
261
|
};
|
|
131
262
|
|
|
263
|
+
if (hasScreenshots(screenshots)) {
|
|
264
|
+
result.screenshots = screenshots;
|
|
265
|
+
}
|
|
266
|
+
|
|
132
267
|
if (displayOptions.showEvents) {
|
|
133
268
|
result.elements = elements;
|
|
134
269
|
result.csvContent = csvContent;
|
|
@@ -154,38 +289,84 @@ function buildPageAnalysisResult({
|
|
|
154
289
|
* @param {boolean} [options.showEvents=false] - Include event arrays and full event-related metadata.
|
|
155
290
|
* Also enables node-level event classification.
|
|
156
291
|
* @param {boolean} [options.showBlockIdx=false] - Include CSV/block index alignment fields.
|
|
292
|
+
* @param {boolean} [options.fullPageScreenshot=false] - Save a full-page screenshot to snapshots/ and return its path.
|
|
293
|
+
* @param {boolean} [options.blockScreenshots=false] - Save one screenshot per merged logical block to snapshots/ and return their paths.
|
|
294
|
+
* @param {boolean} [options.waitForImagesLoaded=false] - Wait for page images before extracting and screenshotting.
|
|
295
|
+
* @param {Object} [options.extractorConfig.s3] - Optional S3 config for uploading screenshots instead of saving locally.
|
|
157
296
|
* @returns {Promise<Object>} Analysis result. Event and idx fields are omitted unless requested.
|
|
158
297
|
*/
|
|
159
298
|
export async function analyzeUrl(url, options = {}) {
  const {
    llm: llmConfig,
    knownEventTypes,
    parserConfig,
    extractorConfig,
    showEvents,
    showBlockIdx,
    fullPageScreenshot,
    blockScreenshots,
    waitForImagesLoaded
  } = options;

  if (!url) {
    throw new Error('url is required');
  }
  const llmConfigured = Boolean(llmConfig?.apiKey && llmConfig?.apiEndpoint && llmConfig?.model);
  if (!llmConfigured) {
    throw new Error('options.llm.apiKey, apiEndpoint, and model are required');
  }

  // Top-level flags take precedence; extractorConfig values are the fallback.
  const wantsFullPage = fullPageScreenshot ?? extractorConfig?.fullPageScreenshot;
  const wantsBlocks = blockScreenshots ?? extractorConfig?.blockScreenshots;

  // Step 0: Playwright extraction
  console.log(`[page-analyzer] Extracting ${url} ...`);
  const extractorOptions = {
    ...extractorConfig,
    fullPageScreenshot: wantsFullPage,
    // Block screenshots are deferred until the analysis has produced the
    // logical blocks, so the extractor itself never captures them.
    blockScreenshots: false,
    waitForImagesLoaded: waitForImagesLoaded ?? extractorConfig?.waitForImagesLoaded
  };
  const extractor = new PageExtractor(extractorOptions);

  const analyzePreparedPage = async (page, targetUrl) => {
    const bundle = await extractor.extractPreparedPage(page, targetUrl);
    console.log(`[page-analyzer] Extracted: ${bundle.blocks.length} blocks, ${bundle.elementGeometries.length} geometries`);

    // Derive domain from URL; an unparsable URL leaves it empty.
    let domain = '';
    try {
      domain = new URL(targetUrl).hostname.replace(/^www\./, '');
    } catch { /* ignore */ }

    const analyzed = await analyzePageEvents({
      html: bundle.html,
      url: targetUrl,
      blocks: bundle.blocks,
      elementGeometries: bundle.elementGeometries,
      llm: llmConfig,
      knownEventTypes,
      parserConfig,
      showEvents,
      showBlockIdx,
      screenshots: bundle.screenshots,
      domain,
      nodeId: `${domain}-root`
    });

    if (!wantsBlocks) {
      return analyzed;
    }

    // Capture one screenshot per logical block of the finished analysis, on
    // the same prepared page that was extracted.
    const analyzedBlocks = analyzed?.analysis?.block_analysis?.blocks;
    const logicalBlocks = Array.isArray(analyzedBlocks) ? analyzedBlocks : [];
    const capturedBlocks = await extractor.captureScreenshots(page, targetUrl, logicalBlocks, {
      fullPageScreenshot: false,
      blockScreenshots: true
    });

    const screenshots = mergeScreenshots(analyzed.screenshots, capturedBlocks);
    const withScreenshots = screenshots ? { ...analyzed, screenshots } : { ...analyzed };
    return attachLogicalBlockScreenshotPaths(withScreenshots, screenshots);
  };

  return await extractor.withPreparedPage(url, analyzePreparedPage);
}
|
|
191
372
|
|
|
@@ -213,6 +394,7 @@ export async function analyzeUrl(url, options = {}) {
|
|
|
213
394
|
* @param {boolean} [input.showEvents=false] - Include event arrays and full event-related metadata.
|
|
214
395
|
* Also enables node-level event classification.
|
|
215
396
|
* @param {boolean} [input.showBlockIdx=false] - Include CSV/block index alignment fields.
|
|
397
|
+
* @param {Object} [input.screenshots] - Screenshot paths captured during extraction.
|
|
216
398
|
* @param {string} [input.nodeId] - Node ID for logging context
|
|
217
399
|
* @param {string} [input.domain] - Domain for logging context
|
|
218
400
|
* @returns {Promise<Object>} Analysis result. Event and idx fields are omitted unless requested.
|
|
@@ -229,6 +411,7 @@ export async function analyzePageEvents(input) {
|
|
|
229
411
|
parserConfig = {},
|
|
230
412
|
showEvents = false,
|
|
231
413
|
showBlockIdx = false,
|
|
414
|
+
screenshots = null,
|
|
232
415
|
nodeId = '',
|
|
233
416
|
domain = ''
|
|
234
417
|
} = input;
|
|
@@ -289,7 +472,8 @@ export async function analyzePageEvents(input) {
|
|
|
289
472
|
csvContent,
|
|
290
473
|
pageData,
|
|
291
474
|
analysis,
|
|
292
|
-
displayOptions
|
|
475
|
+
displayOptions,
|
|
476
|
+
screenshots
|
|
293
477
|
});
|
|
294
478
|
}
|
|
295
479
|
|
|
package/llm/analyzers/event-analyzer/event-analyzer-blocks.js
CHANGED
|
@@ -115,13 +115,34 @@ function buildLogicalBlockPosition(sourceBlocks = []) {
|
|
|
115
115
|
}
|
|
116
116
|
|
|
117
117
|
/**
 * Pick a single CSS path to represent a logical block made of source blocks.
 *
 * Each source block contributes its `blockCssPath` (or `cssPath`) after
 * `cleanText(..., 500)`; empty results are ignored.
 * - No paths: returns ''.
 * - One path: returns it.
 * - Several paths: each is split on '>' into trimmed, non-empty segments and
 *   the leading segments shared by all of them are joined with ' > '. When
 *   fewer than two segments are shared, the first path is returned instead.
 *
 * @param {Array<Object>} [sourceBlocks=[]] - Source blocks merged into the logical block.
 * @returns {string} Representative CSS path, or '' when none is available.
 */
function resolveLogicalBlockCssPath(sourceBlocks = []) {
  const candidates = Array.isArray(sourceBlocks) ? sourceBlocks : [];
  const cssPaths = Array.from(
    candidates,
    (block) => cleanText(block?.blockCssPath || block?.cssPath || '', 500)
  ).filter(Boolean);

  if (cssPaths.length === 0) {
    return '';
  }
  if (cssPaths.length === 1) {
    return cssPaths[0];
  }

  const toSegments = (cssPath) => cssPath
    .split('>')
    .map((segment) => segment.trim())
    .filter(Boolean);
  const [headSegments, ...otherSegmentLists] = cssPaths.map(toSegments);

  // Count how many leading segments every path has in common.
  let sharedCount = 0;
  while (
    sharedCount < headSegments.length &&
    otherSegmentLists.every((segments) => segments[sharedCount] === headSegments[sharedCount])
  ) {
    sharedCount += 1;
  }

  return sharedCount > 1
    ? headSegments.slice(0, sharedCount).join(' > ')
    : cssPaths[0];
}
|
|
126
147
|
|
|
127
148
|
function normalizePossibleEvents(responseHelper, value) {
|
|
package/llm/analyzers/event-analyzer/event-analyzer.js
CHANGED
|
@@ -307,7 +307,7 @@ class EventAnalyzer {
|
|
|
307
307
|
}
|
|
308
308
|
|
|
309
309
|
async analyzeEvents(csvData, _mdData, knownEventTypes = [], options = {}) {
|
|
310
|
-
const analyzeNodeEvents =
|
|
310
|
+
const analyzeNodeEvents = options?.analyzeNodeEvents === true;
|
|
311
311
|
const configuredKnownEventTypes = this.response.normalizeStringList(
|
|
312
312
|
this.config?.knownEventTypes,
|
|
313
313
|
{ eventType: true }
|
package/package.json
CHANGED
|
@@ -1,14 +1,17 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "page-analyzer",
|
|
3
|
-
"version": "1.0.1",
|
|
3
|
+
"version": "1.2.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Standalone page analysis module.",
|
|
6
|
+
"license": "MIT",
|
|
6
7
|
"main": "index.js",
|
|
7
8
|
"scripts": {
|
|
8
|
-
"test": "node test.js",
|
|
9
|
-
"analyze": "node
|
|
9
|
+
"test": "node test/smoke.test.js",
|
|
10
|
+
"analyze": "node scripts/analyze.js",
|
|
11
|
+
"viewer": "node scripts/serve-result-viewer.js"
|
|
10
12
|
},
|
|
11
13
|
"dependencies": {
|
|
14
|
+
"@aws-sdk/client-s3": "^3.1045.0",
|
|
12
15
|
"cheerio": "^1.2.0",
|
|
13
16
|
"csv-parse": "^5.6.0",
|
|
14
17
|
"playwright": "^1.58.2"
|