page-analyzer 1.0.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +430 -0
- package/index.js +186 -6
- package/llm/analyzers/event-analyzer/event-analyzer-blocks.js +23 -2
- package/llm/analyzers/event-analyzer/event-analyzer-constants.js +1 -1
- package/llm/analyzers/event-analyzer/event-analyzer.js +1 -1
- package/package.json +5 -3
- package/page-extractor.js +364 -17
- package/result-viewer.html +879 -0
- package/scripts/analyze.js +51 -0
- package/scripts/build-result-viewer.js +891 -0
- package/scripts/serve-result-viewer.js +68 -0
- package/test/smoke.test.js +213 -0
package/index.js
CHANGED
|
@@ -38,6 +38,131 @@ function normalizeDisplayOptions(options = {}) {
|
|
|
38
38
|
};
|
|
39
39
|
}
|
|
40
40
|
|
|
41
|
+
/**
 * Normalizes a block-index value into an array of integers.
 *
 * Accepted shapes:
 *  - an array: every element is stringified and parsed with base-10 parseInt;
 *    elements that do not parse to an integer are dropped;
 *  - a number: integers are returned as-is in a one-element array, other
 *    numbers follow the same rule as array elements (2.5 -> [2], NaN -> []);
 *  - anything else: stringified (falsy values become ''), split on dots,
 *    commas and whitespace, and each piece parsed with base-10 parseInt.
 *
 * @param {*} value - Raw block index value (array, number, string, or nullish).
 * @returns {number[]} Parsed integer indexes, in input order; duplicates are kept.
 */
function parseBlockIdxs(value) {
  const toIndex = (item) => Number.parseInt(String(item), 10);

  if (Array.isArray(value)) {
    return value.map(toIndex).filter(Number.isInteger);
  }

  if (Number.isInteger(value)) {
    return [value];
  }

  if (typeof value === 'number') {
    // A scalar like 2.5 must not reach the string branch below: it would be
    // stringified to "2.5" and split on "." into the unrelated indexes 2 and 5.
    // Treat it exactly like an array element instead.
    return [toIndex(value)].filter(Number.isInteger);
  }

  return String(value || '')
    .split(/[.,\s]+/)
    .map((item) => Number.parseInt(item, 10))
    .filter(Number.isInteger);
}
|
|
57
|
+
|
|
58
|
+
/**
 * Indexes block screenshot paths by block index.
 *
 * Reads `screenshots.blocks` (ignored unless it is an array). An entry is kept
 * only when its `blockIdx` is an integer, or parses to one with base-10
 * parseInt, and its `path` is a non-empty string. Later entries with the same
 * index overwrite earlier ones.
 *
 * @param {Object} [screenshots] - Screenshot bundle with an optional `blocks` array.
 * @returns {Map<number, string>} Map from block index to screenshot path.
 */
function buildBlockScreenshotMap(screenshots) {
  const entries = Array.isArray(screenshots?.blocks) ? screenshots.blocks : [];
  const pathByIdx = new Map();

  entries.forEach((entry) => {
    const rawIdx = entry?.blockIdx;
    const idx = Number.isInteger(rawIdx) ? rawIdx : Number.parseInt(String(rawIdx), 10);
    const rawPath = entry?.path;

    if (!Number.isInteger(idx) || typeof rawPath !== 'string' || rawPath === '') {
      return;
    }
    pathByIdx.set(idx, rawPath);
  });

  return pathByIdx;
}
|
|
71
|
+
|
|
72
|
+
/**
 * Returns a copy of `analysis` whose block_analysis blocks carry the
 * screenshot paths matching their block indexes.
 *
 * Each block's indexes come from `blockIdxs` (falling back to `blockIdx`) via
 * parseBlockIdxs. Blocks with no matching screenshot are passed through
 * untouched. The input is returned as-is when there are no block screenshots,
 * when `analysis.block_analysis` is not an object, or when its `blocks` is not
 * an array. The input is never mutated.
 *
 * @param {Object} analysis - Analysis object containing `block_analysis.blocks`.
 * @param {Object} [screenshots] - Screenshot bundle with an optional `blocks` array.
 * @returns {Object} The original analysis or a shallow copy with decorated blocks.
 */
function attachBlockScreenshotPaths(analysis, screenshots) {
  const pathsByIdx = buildBlockScreenshotMap(screenshots);
  if (pathsByIdx.size === 0) {
    return analysis;
  }

  const blockAnalysis = analysis?.block_analysis;
  if (!isObject(blockAnalysis)) {
    return analysis;
  }

  const originalBlocks = blockAnalysis.blocks;
  if (!Array.isArray(originalBlocks)) {
    return analysis;
  }

  const decorate = (block) => {
    const matched = [];
    for (const idx of parseBlockIdxs(block?.blockIdxs ?? block?.blockIdx)) {
      const screenshotPath = pathsByIdx.get(idx);
      if (screenshotPath) {
        matched.push(screenshotPath);
      }
    }
    return matched.length > 0 ? { ...block, blockScreenshotPaths: matched } : block;
  };

  return {
    ...analysis,
    block_analysis: {
      ...blockAnalysis,
      blocks: originalBlocks.map((block) => decorate(block))
    }
  };
}
|
|
107
|
+
|
|
108
|
+
/**
 * Tells whether a screenshot bundle contains anything.
 *
 * @param {Object} [screenshots] - Bundle with optional `fullPage` and `blocks`.
 * @returns {boolean} True when `fullPage` is truthy or `blocks` is a non-empty array.
 */
function hasScreenshots(screenshots) {
  if (screenshots?.fullPage) {
    return true;
  }
  const blocks = screenshots?.blocks;
  return Array.isArray(blocks) && blocks.length > 0;
}
|
|
114
|
+
|
|
115
|
+
/**
 * Combines two screenshot bundles into one.
 *
 * For both `fullPage` and `blocks`, the value from `secondary` takes
 * precedence when it is present (truthy `fullPage`, non-empty `blocks` array);
 * otherwise the value from `primary` is used. The `blocks` array is reused by
 * reference, not copied.
 *
 * @param {Object} [primary] - Base bundle.
 * @param {Object} [secondary] - Bundle whose values override the base.
 * @returns {Object|null} Merged bundle, or null when it would be empty.
 */
function mergeScreenshots(primary, secondary) {
  const blocksOf = (source) => (Array.isArray(source?.blocks) ? source.blocks : []);
  const merged = {};

  const fullPage = secondary?.fullPage || primary?.fullPage;
  if (fullPage) {
    merged.fullPage = fullPage;
  }

  const overridingBlocks = blocksOf(secondary);
  const chosenBlocks = overridingBlocks.length > 0 ? overridingBlocks : blocksOf(primary);
  if (chosenBlocks.length > 0) {
    merged.blocks = chosenBlocks;
  }

  if (!hasScreenshots(merged)) {
    return null;
  }
  return merged;
}
|
|
133
|
+
|
|
134
|
+
/**
 * Returns a copy of `result` in which each block of
 * `result.analysis.block_analysis.blocks` gets the screenshot whose
 * `blockIdx` equals the block's position in that array.
 *
 * A matched block receives `blockScreenshotPaths` set to a one-element array,
 * replacing any existing value. Unmatched blocks are passed through. The input
 * is returned unchanged when there are no blocks or no block screenshots, and
 * it is never mutated.
 *
 * @param {Object} result - Analysis result containing `analysis.block_analysis.blocks`.
 * @param {Object} [screenshots] - Screenshot bundle with an optional `blocks` array.
 * @returns {Object} The original result or a shallow copy with decorated blocks.
 */
function attachLogicalBlockScreenshotPaths(result, screenshots) {
  const logicalBlocks = result?.analysis?.block_analysis?.blocks;
  const hasBlocks = Array.isArray(logicalBlocks) && logicalBlocks.length > 0;
  if (!hasBlocks) {
    return result;
  }

  const pathByPosition = buildBlockScreenshotMap(screenshots);
  if (pathByPosition.size === 0) {
    return result;
  }

  const decoratedBlocks = logicalBlocks.map((block, position) => {
    const screenshotPath = pathByPosition.get(position);
    return screenshotPath
      ? { ...block, blockScreenshotPaths: [screenshotPath] }
      : block;
  });

  const blockAnalysis = { ...result.analysis.block_analysis, blocks: decoratedBlocks };
  const analysis = { ...result.analysis, block_analysis: blockAnalysis };
  return { ...result, analysis };
}
|
|
165
|
+
|
|
41
166
|
function compactBlockAnalysisBlock(block, displayOptions) {
|
|
42
167
|
const source = isObject(block) ? block : {};
|
|
43
168
|
const out = {
|
|
@@ -63,6 +188,10 @@ function compactBlockAnalysisBlock(block, displayOptions) {
|
|
|
63
188
|
out.mode = source.mode;
|
|
64
189
|
}
|
|
65
190
|
|
|
191
|
+
if (Array.isArray(source.blockScreenshotPaths) && source.blockScreenshotPaths.length > 0) {
|
|
192
|
+
out.blockScreenshotPaths = source.blockScreenshotPaths;
|
|
193
|
+
}
|
|
194
|
+
|
|
66
195
|
return out;
|
|
67
196
|
}
|
|
68
197
|
|
|
@@ -121,14 +250,20 @@ function buildPageAnalysisResult({
|
|
|
121
250
|
csvContent,
|
|
122
251
|
pageData,
|
|
123
252
|
analysis,
|
|
124
|
-
displayOptions
|
|
253
|
+
displayOptions,
|
|
254
|
+
screenshots
|
|
125
255
|
}) {
|
|
256
|
+
const analysisWithScreenshots = attachBlockScreenshotPaths(analysis, screenshots);
|
|
126
257
|
const result = {
|
|
127
258
|
title: pageData.title,
|
|
128
259
|
parseMetrics: pageData.metrics,
|
|
129
|
-
analysis: buildAnalysisResult(analysis, displayOptions)
|
|
260
|
+
analysis: buildAnalysisResult(analysisWithScreenshots, displayOptions)
|
|
130
261
|
};
|
|
131
262
|
|
|
263
|
+
if (hasScreenshots(screenshots)) {
|
|
264
|
+
result.screenshots = screenshots;
|
|
265
|
+
}
|
|
266
|
+
|
|
132
267
|
if (displayOptions.showEvents) {
|
|
133
268
|
result.elements = elements;
|
|
134
269
|
result.csvContent = csvContent;
|
|
@@ -154,19 +289,40 @@ function buildPageAnalysisResult({
|
|
|
154
289
|
* @param {boolean} [options.showEvents=false] - Include event arrays and full event-related metadata.
|
|
155
290
|
* Also enables node-level event classification.
|
|
156
291
|
* @param {boolean} [options.showBlockIdx=false] - Include CSV/block index alignment fields.
|
|
292
|
+
* @param {boolean} [options.fullPageScreenshot=false] - Save a full-page screenshot to snapshots/ and return its path.
|
|
293
|
+
* @param {boolean} [options.blockScreenshots=false] - Save one screenshot per merged logical block to snapshots/ and return their paths.
|
|
294
|
+
* @param {boolean} [options.waitForImagesLoaded=false] - Wait for page images before extracting and screenshotting.
|
|
157
295
|
* @returns {Promise<Object>} Analysis result. Event and idx fields are omitted unless requested.
|
|
158
296
|
*/
|
|
159
297
|
export async function analyzeUrl(url, options = {}) {
|
|
160
|
-
const {
|
|
298
|
+
const {
|
|
299
|
+
llm: llmConfig,
|
|
300
|
+
knownEventTypes,
|
|
301
|
+
parserConfig,
|
|
302
|
+
extractorConfig,
|
|
303
|
+
showEvents,
|
|
304
|
+
showBlockIdx,
|
|
305
|
+
fullPageScreenshot,
|
|
306
|
+
blockScreenshots,
|
|
307
|
+
waitForImagesLoaded
|
|
308
|
+
} = options;
|
|
161
309
|
|
|
162
310
|
if (!url) throw new Error('url is required');
|
|
163
311
|
if (!llmConfig?.apiKey || !llmConfig?.apiEndpoint || !llmConfig?.model) {
|
|
164
312
|
throw new Error('options.llm.apiKey, apiEndpoint, and model are required');
|
|
165
313
|
}
|
|
166
314
|
|
|
315
|
+
const shouldCaptureFullPage = fullPageScreenshot ?? extractorConfig?.fullPageScreenshot;
|
|
316
|
+
const shouldCaptureBlocks = blockScreenshots ?? extractorConfig?.blockScreenshots;
|
|
317
|
+
|
|
167
318
|
// Step 0: Playwright extraction
|
|
168
319
|
console.log(`[page-analyzer] Extracting ${url} ...`);
|
|
169
|
-
const extractor = new PageExtractor(
|
|
320
|
+
const extractor = new PageExtractor({
|
|
321
|
+
...extractorConfig,
|
|
322
|
+
fullPageScreenshot: shouldCaptureFullPage,
|
|
323
|
+
blockScreenshots: false,
|
|
324
|
+
waitForImagesLoaded: waitForImagesLoaded ?? extractorConfig?.waitForImagesLoaded
|
|
325
|
+
});
|
|
170
326
|
const bundle = await extractor.extract(url);
|
|
171
327
|
console.log(`[page-analyzer] Extracted: ${bundle.blocks.length} blocks, ${bundle.elementGeometries.length} geometries`);
|
|
172
328
|
|
|
@@ -174,7 +330,7 @@ export async function analyzeUrl(url, options = {}) {
|
|
|
174
330
|
let domain = '';
|
|
175
331
|
try { domain = new URL(url).hostname.replace(/^www\./, ''); } catch { /* ignore */ }
|
|
176
332
|
|
|
177
|
-
|
|
333
|
+
let result = await analyzePageEvents({
|
|
178
334
|
html: bundle.html,
|
|
179
335
|
url,
|
|
180
336
|
blocks: bundle.blocks,
|
|
@@ -184,9 +340,30 @@ export async function analyzeUrl(url, options = {}) {
|
|
|
184
340
|
parserConfig,
|
|
185
341
|
showEvents,
|
|
186
342
|
showBlockIdx,
|
|
343
|
+
screenshots: bundle.screenshots,
|
|
187
344
|
domain,
|
|
188
345
|
nodeId: `${domain}-root`
|
|
189
346
|
});
|
|
347
|
+
|
|
348
|
+
if (shouldCaptureBlocks) {
|
|
349
|
+
const logicalBlocks = Array.isArray(result?.analysis?.block_analysis?.blocks)
|
|
350
|
+
? result.analysis.block_analysis.blocks
|
|
351
|
+
: [];
|
|
352
|
+
const blockScreenshotsBundle = await extractor.captureUrlScreenshots(url, logicalBlocks, {
|
|
353
|
+
fullPageScreenshot: false,
|
|
354
|
+
blockScreenshots: true
|
|
355
|
+
});
|
|
356
|
+
const screenshots = mergeScreenshots(result.screenshots, blockScreenshotsBundle);
|
|
357
|
+
result = attachLogicalBlockScreenshotPaths(
|
|
358
|
+
{
|
|
359
|
+
...result,
|
|
360
|
+
...(screenshots ? { screenshots } : {})
|
|
361
|
+
},
|
|
362
|
+
screenshots
|
|
363
|
+
);
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
return result;
|
|
190
367
|
}
|
|
191
368
|
|
|
192
369
|
/**
|
|
@@ -213,6 +390,7 @@ export async function analyzeUrl(url, options = {}) {
|
|
|
213
390
|
* @param {boolean} [input.showEvents=false] - Include event arrays and full event-related metadata.
|
|
214
391
|
* Also enables node-level event classification.
|
|
215
392
|
* @param {boolean} [input.showBlockIdx=false] - Include CSV/block index alignment fields.
|
|
393
|
+
* @param {Object} [input.screenshots] - Screenshot paths captured during extraction.
|
|
216
394
|
* @param {string} [input.nodeId] - Node ID for logging context
|
|
217
395
|
* @param {string} [input.domain] - Domain for logging context
|
|
218
396
|
* @returns {Promise<Object>} Analysis result. Event and idx fields are omitted unless requested.
|
|
@@ -229,6 +407,7 @@ export async function analyzePageEvents(input) {
|
|
|
229
407
|
parserConfig = {},
|
|
230
408
|
showEvents = false,
|
|
231
409
|
showBlockIdx = false,
|
|
410
|
+
screenshots = null,
|
|
232
411
|
nodeId = '',
|
|
233
412
|
domain = ''
|
|
234
413
|
} = input;
|
|
@@ -289,7 +468,8 @@ export async function analyzePageEvents(input) {
|
|
|
289
468
|
csvContent,
|
|
290
469
|
pageData,
|
|
291
470
|
analysis,
|
|
292
|
-
displayOptions
|
|
471
|
+
displayOptions,
|
|
472
|
+
screenshots
|
|
293
473
|
});
|
|
294
474
|
}
|
|
295
475
|
|
|
package/llm/analyzers/event-analyzer/event-analyzer-blocks.js
CHANGED
|
@@ -115,13 +115,34 @@ function buildLogicalBlockPosition(sourceBlocks = []) {
|
|
|
115
115
|
}
|
|
116
116
|
|
|
117
117
|
/**
 * Picks a CSS path for a logical block built from several source blocks.
 *
 * Each source block contributes `blockCssPath` (falling back to `cssPath`),
 * passed through cleanText with a limit argument of 500; empty results are
 * ignored. With no usable path the result is ''. With one path it is returned
 * directly. With several, each path is split on '>' and the leading segments
 * shared by all of them are joined with ' > '. If fewer than two segments are
 * shared, the first path is returned instead.
 *
 * @param {Array<Object>} [sourceBlocks=[]] - Source blocks merged into the logical block.
 * @returns {string} Resolved CSS path, or '' when none is available.
 */
function resolveLogicalBlockCssPath(sourceBlocks = []) {
  const candidates = Array.from(
    Array.isArray(sourceBlocks) ? sourceBlocks : [],
    (block) => cleanText(block?.blockCssPath || block?.cssPath || '', 500)
  ).filter(Boolean);

  if (candidates.length <= 1) {
    return candidates[0] ?? '';
  }

  const segmentLists = candidates.map((candidate) =>
    candidate
      .split('>')
      .map((segment) => segment.trim())
      .filter((segment) => segment !== '')
  );

  // Count how many leading segments of the first path are shared by every other path.
  const [head, ...others] = segmentLists;
  let sharedCount = 0;
  while (
    sharedCount < head.length &&
    others.every((segments) => segments[sharedCount] === head[sharedCount])
  ) {
    sharedCount += 1;
  }

  // A single shared segment is not used; the first path is returned instead.
  return sharedCount > 1
    ? head.slice(0, sharedCount).join(' > ')
    : candidates[0];
}
|
|
126
147
|
|
|
127
148
|
function normalizePossibleEvents(responseHelper, value) {
|
|
package/llm/analyzers/event-analyzer/event-analyzer.js
CHANGED
|
@@ -307,7 +307,7 @@ class EventAnalyzer {
|
|
|
307
307
|
}
|
|
308
308
|
|
|
309
309
|
async analyzeEvents(csvData, _mdData, knownEventTypes = [], options = {}) {
|
|
310
|
-
const analyzeNodeEvents =
|
|
310
|
+
const analyzeNodeEvents = options?.analyzeNodeEvents === true;
|
|
311
311
|
const configuredKnownEventTypes = this.response.normalizeStringList(
|
|
312
312
|
this.config?.knownEventTypes,
|
|
313
313
|
{ eventType: true }
|
package/package.json
CHANGED
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "page-analyzer",
|
|
3
|
-
"version": "1.0.0",
|
|
3
|
+
"version": "1.1.1",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Standalone page analysis module.",
|
|
6
|
+
"license": "MIT",
|
|
6
7
|
"main": "index.js",
|
|
7
8
|
"scripts": {
|
|
8
|
-
"test": "node test.js",
|
|
9
|
-
"analyze": "node
|
|
9
|
+
"test": "node test/smoke.test.js",
|
|
10
|
+
"analyze": "node scripts/analyze.js",
|
|
11
|
+
"viewer": "node scripts/serve-result-viewer.js"
|
|
10
12
|
},
|
|
11
13
|
"dependencies": {
|
|
12
14
|
"cheerio": "^1.2.0",
|