page-analyzer 1.0.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.js CHANGED
@@ -38,6 +38,131 @@ function normalizeDisplayOptions(options = {}) {
38
38
  };
39
39
  }
40
40
 
41
/**
 * Normalizes a block-index value into an array of integers.
 *
 * - An array is parsed element by element (each element is stringified and
 *   run through `Number.parseInt` with radix 10).
 * - An integer is returned wrapped in a single-element array.
 * - Anything else is stringified (falsy values become the empty string) and
 *   split on runs of dots, commas and whitespace before parsing.
 *
 * Tokens that do not parse to an integer are dropped.
 *
 * @param {*} value - Raw index value (array, integer, string, or other).
 * @returns {number[]} Parsed integer indexes, in input order.
 */
function parseBlockIdxs(value) {
  // An array can never be an integer, so checking this first is safe.
  if (Number.isInteger(value)) {
    return [value];
  }

  const tokens = Array.isArray(value)
    ? value.map((entry) => String(entry))
    : String(value || '').split(/[.,\s]+/);

  const parsedIdxs = [];
  for (const token of tokens) {
    const parsed = Number.parseInt(token, 10);
    if (Number.isInteger(parsed)) {
      parsedIdxs.push(parsed);
    }
  }
  return parsedIdxs;
}
57
+
58
/**
 * Builds a lookup from block index to screenshot path.
 *
 * Reads `screenshots.blocks` (ignored unless it is an array). Each entry
 * contributes one mapping when its `blockIdx` is an integer (or parses to one
 * with radix 10) and its `path` is a non-empty string. A later entry with the
 * same index overwrites an earlier one.
 *
 * @param {Object|null|undefined} screenshots - Screenshot bundle with an optional `blocks` array.
 * @returns {Map<number, string>} Map keyed by block index.
 */
function buildBlockScreenshotMap(screenshots) {
  const entries = Array.isArray(screenshots?.blocks) ? screenshots.blocks : [];
  const pathByIdx = new Map();

  for (const entry of entries) {
    const rawIdx = entry?.blockIdx;
    const idx = Number.isInteger(rawIdx) ? rawIdx : Number.parseInt(String(rawIdx), 10);
    if (!Number.isInteger(idx)) {
      continue;
    }

    const rawPath = entry?.path;
    if (typeof rawPath !== 'string' || rawPath === '') {
      continue;
    }

    pathByIdx.set(idx, rawPath);
  }

  return pathByIdx;
}
71
+
72
/**
 * Returns a copy of `analysis` whose `block_analysis.blocks` entries carry a
 * `blockScreenshotPaths` array wherever at least one of the entry's block
 * indexes (`blockIdxs`, falling back to `blockIdx`) has a screenshot path.
 *
 * The input is returned unchanged when no screenshot paths are available,
 * when `analysis.block_analysis` is not an object, or when its `blocks` is
 * not an array. Entries without any matching screenshot are passed through
 * as-is; the input objects are not mutated.
 *
 * @param {Object} analysis - Analysis object, expected to hold `block_analysis.blocks`.
 * @param {Object|null|undefined} screenshots - Screenshot bundle with an optional `blocks` array.
 * @returns {Object} The original analysis or a shallow copy with decorated blocks.
 */
function attachBlockScreenshotPaths(analysis, screenshots) {
  const pathByIdx = buildBlockScreenshotMap(screenshots);
  const blockAnalysis = analysis?.block_analysis;
  if (pathByIdx.size === 0 || !isObject(blockAnalysis)) {
    return analysis;
  }

  const originalBlocks = blockAnalysis.blocks;
  if (!Array.isArray(originalBlocks)) {
    return analysis;
  }

  const withScreenshotPaths = (entry) => {
    const matchedPaths = [];
    for (const idx of parseBlockIdxs(entry?.blockIdxs ?? entry?.blockIdx)) {
      const matched = pathByIdx.get(idx);
      if (matched) {
        matchedPaths.push(matched);
      }
    }
    return matchedPaths.length > 0
      ? { ...entry, blockScreenshotPaths: matchedPaths }
      : entry;
  };

  return {
    ...analysis,
    block_analysis: {
      ...blockAnalysis,
      blocks: originalBlocks.map((entry) => withScreenshotPaths(entry))
    }
  };
}
107
+
108
/**
 * Reports whether a screenshot bundle contains anything.
 *
 * @param {Object|null|undefined} screenshots - Bundle with optional `fullPage` and `blocks`.
 * @returns {boolean} `true` when `fullPage` is truthy or `blocks` is a non-empty array.
 */
function hasScreenshots(screenshots) {
  if (screenshots?.fullPage) {
    return true;
  }

  const blockShots = screenshots?.blocks;
  return Array.isArray(blockShots) && blockShots.length > 0;
}
114
+
115
/**
 * Combines two screenshot bundles, with `secondary` taking precedence.
 *
 * - `fullPage`: the secondary value when truthy, otherwise the primary value
 *   when truthy, otherwise omitted.
 * - `blocks`: the secondary array when it is non-empty, otherwise the primary
 *   array when it is non-empty, otherwise omitted. The arrays are not
 *   concatenated.
 *
 * @param {Object|null|undefined} primary - Base bundle.
 * @param {Object|null|undefined} secondary - Overriding bundle.
 * @returns {Object|null} The merged bundle, or `null` when it would hold no screenshots.
 */
function mergeScreenshots(primary, secondary) {
  const blocksOf = (bundle) => (Array.isArray(bundle?.blocks) ? bundle.blocks : []);

  const merged = {};

  const fullPage = secondary?.fullPage || primary?.fullPage;
  if (fullPage) {
    merged.fullPage = fullPage;
  }

  const fallbackBlocks = blocksOf(primary);
  const preferredBlocks = blocksOf(secondary);
  const chosenBlocks = preferredBlocks.length > 0 ? preferredBlocks : fallbackBlocks;
  if (chosenBlocks.length > 0) {
    merged.blocks = chosenBlocks;
  }

  if (!hasScreenshots(merged)) {
    return null;
  }
  return merged;
}
133
+
134
/**
 * Returns a copy of `result` in which each entry of
 * `result.analysis.block_analysis.blocks` gets `blockScreenshotPaths` set to a
 * single-element array when a screenshot exists whose `blockIdx` equals the
 * entry's position in that array.
 *
 * The input is returned unchanged when there are no blocks or no usable
 * screenshot paths. Entries without a matching screenshot are passed through
 * as-is; the input objects are not mutated.
 *
 * @param {Object} result - Analysis result holding `analysis.block_analysis.blocks`.
 * @param {Object|null|undefined} screenshots - Screenshot bundle with an optional `blocks` array.
 * @returns {Object} The original result or a shallow copy with decorated blocks.
 */
function attachLogicalBlockScreenshotPaths(result, screenshots) {
  const logicalBlocks = result?.analysis?.block_analysis?.blocks;
  const hasLogicalBlocks = Array.isArray(logicalBlocks) && logicalBlocks.length > 0;
  if (!hasLogicalBlocks) {
    return result;
  }

  const pathByPosition = buildBlockScreenshotMap(screenshots);
  if (pathByPosition.size === 0) {
    return result;
  }

  const decoratedBlocks = logicalBlocks.map((entry, position) => {
    const matchedPath = pathByPosition.get(position);
    return matchedPath
      ? { ...entry, blockScreenshotPaths: [matchedPath] }
      : entry;
  });

  const { analysis } = result;
  return {
    ...result,
    analysis: {
      ...analysis,
      block_analysis: {
        ...analysis.block_analysis,
        blocks: decoratedBlocks
      }
    }
  };
}
165
+
41
166
  function compactBlockAnalysisBlock(block, displayOptions) {
42
167
  const source = isObject(block) ? block : {};
43
168
  const out = {
@@ -63,6 +188,10 @@ function compactBlockAnalysisBlock(block, displayOptions) {
63
188
  out.mode = source.mode;
64
189
  }
65
190
 
191
+ if (Array.isArray(source.blockScreenshotPaths) && source.blockScreenshotPaths.length > 0) {
192
+ out.blockScreenshotPaths = source.blockScreenshotPaths;
193
+ }
194
+
66
195
  return out;
67
196
  }
68
197
 
@@ -121,14 +250,20 @@ function buildPageAnalysisResult({
121
250
  csvContent,
122
251
  pageData,
123
252
  analysis,
124
- displayOptions
253
+ displayOptions,
254
+ screenshots
125
255
  }) {
256
+ const analysisWithScreenshots = attachBlockScreenshotPaths(analysis, screenshots);
126
257
  const result = {
127
258
  title: pageData.title,
128
259
  parseMetrics: pageData.metrics,
129
- analysis: buildAnalysisResult(analysis, displayOptions)
260
+ analysis: buildAnalysisResult(analysisWithScreenshots, displayOptions)
130
261
  };
131
262
 
263
+ if (hasScreenshots(screenshots)) {
264
+ result.screenshots = screenshots;
265
+ }
266
+
132
267
  if (displayOptions.showEvents) {
133
268
  result.elements = elements;
134
269
  result.csvContent = csvContent;
@@ -154,19 +289,40 @@ function buildPageAnalysisResult({
154
289
  * @param {boolean} [options.showEvents=false] - Include event arrays and full event-related metadata.
155
290
  * Also enables node-level event classification.
156
291
  * @param {boolean} [options.showBlockIdx=false] - Include CSV/block index alignment fields.
292
+ * @param {boolean} [options.fullPageScreenshot=false] - Save a full-page screenshot to snapshots/ and return its path.
293
+ * @param {boolean} [options.blockScreenshots=false] - Save one screenshot per merged logical block to snapshots/ and return their paths.
294
+ * @param {boolean} [options.waitForImagesLoaded=false] - Wait for page images before extracting and screenshotting.
157
295
  * @returns {Promise<Object>} Analysis result. Event and idx fields are omitted unless requested.
158
296
  */
159
297
  export async function analyzeUrl(url, options = {}) {
160
- const { llm: llmConfig, knownEventTypes, parserConfig, extractorConfig, showEvents, showBlockIdx } = options;
298
+ const {
299
+ llm: llmConfig,
300
+ knownEventTypes,
301
+ parserConfig,
302
+ extractorConfig,
303
+ showEvents,
304
+ showBlockIdx,
305
+ fullPageScreenshot,
306
+ blockScreenshots,
307
+ waitForImagesLoaded
308
+ } = options;
161
309
 
162
310
  if (!url) throw new Error('url is required');
163
311
  if (!llmConfig?.apiKey || !llmConfig?.apiEndpoint || !llmConfig?.model) {
164
312
  throw new Error('options.llm.apiKey, apiEndpoint, and model are required');
165
313
  }
166
314
 
315
+ const shouldCaptureFullPage = fullPageScreenshot ?? extractorConfig?.fullPageScreenshot;
316
+ const shouldCaptureBlocks = blockScreenshots ?? extractorConfig?.blockScreenshots;
317
+
167
318
  // Step 0: Playwright extraction
168
319
  console.log(`[page-analyzer] Extracting ${url} ...`);
169
- const extractor = new PageExtractor(extractorConfig);
320
+ const extractor = new PageExtractor({
321
+ ...extractorConfig,
322
+ fullPageScreenshot: shouldCaptureFullPage,
323
+ blockScreenshots: false,
324
+ waitForImagesLoaded: waitForImagesLoaded ?? extractorConfig?.waitForImagesLoaded
325
+ });
170
326
  const bundle = await extractor.extract(url);
171
327
  console.log(`[page-analyzer] Extracted: ${bundle.blocks.length} blocks, ${bundle.elementGeometries.length} geometries`);
172
328
 
@@ -174,7 +330,7 @@ export async function analyzeUrl(url, options = {}) {
174
330
  let domain = '';
175
331
  try { domain = new URL(url).hostname.replace(/^www\./, ''); } catch { /* ignore */ }
176
332
 
177
- return analyzePageEvents({
333
+ let result = await analyzePageEvents({
178
334
  html: bundle.html,
179
335
  url,
180
336
  blocks: bundle.blocks,
@@ -184,9 +340,30 @@ export async function analyzeUrl(url, options = {}) {
184
340
  parserConfig,
185
341
  showEvents,
186
342
  showBlockIdx,
343
+ screenshots: bundle.screenshots,
187
344
  domain,
188
345
  nodeId: `${domain}-root`
189
346
  });
347
+
348
+ if (shouldCaptureBlocks) {
349
+ const logicalBlocks = Array.isArray(result?.analysis?.block_analysis?.blocks)
350
+ ? result.analysis.block_analysis.blocks
351
+ : [];
352
+ const blockScreenshotsBundle = await extractor.captureUrlScreenshots(url, logicalBlocks, {
353
+ fullPageScreenshot: false,
354
+ blockScreenshots: true
355
+ });
356
+ const screenshots = mergeScreenshots(result.screenshots, blockScreenshotsBundle);
357
+ result = attachLogicalBlockScreenshotPaths(
358
+ {
359
+ ...result,
360
+ ...(screenshots ? { screenshots } : {})
361
+ },
362
+ screenshots
363
+ );
364
+ }
365
+
366
+ return result;
190
367
  }
191
368
 
192
369
  /**
@@ -213,6 +390,7 @@ export async function analyzeUrl(url, options = {}) {
213
390
  * @param {boolean} [input.showEvents=false] - Include event arrays and full event-related metadata.
214
391
  * Also enables node-level event classification.
215
392
  * @param {boolean} [input.showBlockIdx=false] - Include CSV/block index alignment fields.
393
+ * @param {Object} [input.screenshots] - Screenshot paths captured during extraction.
216
394
  * @param {string} [input.nodeId] - Node ID for logging context
217
395
  * @param {string} [input.domain] - Domain for logging context
218
396
  * @returns {Promise<Object>} Analysis result. Event and idx fields are omitted unless requested.
@@ -229,6 +407,7 @@ export async function analyzePageEvents(input) {
229
407
  parserConfig = {},
230
408
  showEvents = false,
231
409
  showBlockIdx = false,
410
+ screenshots = null,
232
411
  nodeId = '',
233
412
  domain = ''
234
413
  } = input;
@@ -289,7 +468,8 @@ export async function analyzePageEvents(input) {
289
468
  csvContent,
290
469
  pageData,
291
470
  analysis,
292
- displayOptions
471
+ displayOptions,
472
+ screenshots
293
473
  });
294
474
  }
295
475
 
@@ -115,13 +115,34 @@ function buildLogicalBlockPosition(sourceBlocks = []) {
115
115
  }
116
116
 
117
117
  function resolveLogicalBlockCssPath(sourceBlocks = []) {
118
+ const paths = [];
118
119
  for (const block of Array.isArray(sourceBlocks) ? sourceBlocks : []) {
119
120
  const path = cleanText(block?.blockCssPath || block?.cssPath || '', 500);
120
121
  if (path) {
121
- return path;
122
+ paths.push(path);
122
123
  }
123
124
  }
124
- return '';
125
+
126
+ if (paths.length === 0) {
127
+ return '';
128
+ }
129
+ if (paths.length === 1) {
130
+ return paths[0];
131
+ }
132
+
133
+ const partsList = paths.map((path) => path.split('>').map((part) => part.trim()).filter(Boolean));
134
+ const commonParts = [];
135
+ const firstParts = partsList[0];
136
+ for (let index = 0; index < firstParts.length; index += 1) {
137
+ const part = firstParts[index];
138
+ if (partsList.every((parts) => parts[index] === part)) {
139
+ commonParts.push(part);
140
+ continue;
141
+ }
142
+ break;
143
+ }
144
+
145
+ return commonParts.length > 1 ? commonParts.join(' > ') : paths[0];
125
146
  }
126
147
 
127
148
  function normalizePossibleEvents(responseHelper, value) {
@@ -1,7 +1,7 @@
1
1
  const DEFAULT_ATTRIBUTE_KEYS = [
2
2
  'text',
3
3
  'page_area',
4
- 'content_category(producdt/support/company/legal)',
4
+ 'content_category(product/support/company/legal)',
5
5
  'is_external'
6
6
  ];
7
7
 
@@ -307,7 +307,7 @@ class EventAnalyzer {
307
307
  }
308
308
 
309
309
  async analyzeEvents(csvData, _mdData, knownEventTypes = [], options = {}) {
310
- const analyzeNodeEvents = !options?.analyzeNodeEvents;
310
+ const analyzeNodeEvents = options?.analyzeNodeEvents === true;
311
311
  const configuredKnownEventTypes = this.response.normalizeStringList(
312
312
  this.config?.knownEventTypes,
313
313
  { eventType: true }
package/package.json CHANGED
@@ -1,12 +1,14 @@
1
1
  {
2
2
  "name": "page-analyzer",
3
- "version": "1.0.0",
3
+ "version": "1.1.1",
4
4
  "type": "module",
5
5
  "description": "Standalone page analysis module.",
6
+ "license": "MIT",
6
7
  "main": "index.js",
7
8
  "scripts": {
8
- "test": "node test.js",
9
- "analyze": "node test.js"
9
+ "test": "node test/smoke.test.js",
10
+ "analyze": "node scripts/analyze.js",
11
+ "viewer": "node scripts/serve-result-viewer.js"
10
12
  },
11
13
  "dependencies": {
12
14
  "cheerio": "^1.2.0",