page-analyzer 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/csv-exporter.js +192 -0
- package/extractors/block-assigner.js +281 -0
- package/extractors/context-extractor.js +275 -0
- package/extractors/css-selector-builder.js +202 -0
- package/extractors/pt-selector-builder.js +344 -0
- package/html-parser.js +206 -0
- package/index.js +303 -0
- package/llm/analyzers/event-analyzer/event-analyzer-blocks.js +553 -0
- package/llm/analyzers/event-analyzer/event-analyzer-constants.js +22 -0
- package/llm/analyzers/event-analyzer/event-analyzer-events.js +97 -0
- package/llm/analyzers/event-analyzer/event-analyzer-input.js +168 -0
- package/llm/analyzers/event-analyzer/event-analyzer-metadata.js +15 -0
- package/llm/analyzers/event-analyzer/event-analyzer-prompt.js +71 -0
- package/llm/analyzers/event-analyzer/event-analyzer-response.js +290 -0
- package/llm/analyzers/event-analyzer/event-analyzer-utils.js +96 -0
- package/llm/analyzers/event-analyzer/event-analyzer.js +546 -0
- package/llm/analyzers/prompts/event-analysis.txt +52 -0
- package/llm/analyzers/prompts/special-block-confirmation.txt +127 -0
- package/llm/providers/base-provider.js +64 -0
- package/llm/providers/openai-provider.js +168 -0
- package/llm/utils/event-csv.js +276 -0
- package/models/context.js +44 -0
- package/package.json +16 -0
- package/page-extractor.js +215 -0
- package/utils/selector-utils.js +31 -0
- package/utils/text-utils.js +11 -0
- package/utils/url-utils.js +43 -0
- package/vendor/extract-blocks.js +903 -0
|
@@ -0,0 +1,903 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Standalone script: extract visual content blocks from a web page.
|
|
5
|
+
*
|
|
6
|
+
* Core logic runs inside Playwright (headless Chromium), identifying blocks
|
|
7
|
+
* by their rendered layout (height + width) rather than DOM structure.
|
|
8
|
+
*
|
|
9
|
+
* Usage:
|
|
10
|
+
* node scripts/extract-blocks.js <url-or-html-file> [options]
|
|
11
|
+
*
|
|
12
|
+
* Options:
|
|
13
|
+
 *   --min-height <px>       Minimum block height in pixels (default: 40)
|
|
14
|
+
* --min-width-ratio <0-1> Minimum block width as ratio of viewport (default: 0.25)
|
|
15
|
+
* --max-height <px> Blocks taller than this get subdivided (default: viewport * 1.5)
|
|
16
|
+
* --viewport <WxH> Viewport size (default: 1440x900)
|
|
17
|
+
* --json Output JSON instead of CSV
|
|
18
|
+
* --out <file> Write output to file instead of stdout
|
|
19
|
+
* --save-html <file> Save rendered HTML to file
|
|
20
|
+
*
|
|
21
|
+
* Examples:
|
|
22
|
+
* node scripts/extract-blocks.js https://example.com
|
|
23
|
+
* node scripts/extract-blocks.js https://example.com --viewport 1920x1080 --out blocks.csv
|
|
24
|
+
* node scripts/extract-blocks.js page.html --min-height 100 --json
|
|
25
|
+
*/
|
|
26
|
+
|
|
27
|
+
import fs from 'node:fs';
|
|
28
|
+
import path from 'node:path';
|
|
29
|
+
import { pathToFileURL } from 'node:url';
|
|
30
|
+
|
|
31
|
+
// ---------------------------------------------------------------------------
|
|
32
|
+
// CLI
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
|
|
35
|
+
/**
 * Parse the command line (process.argv) into an options object.
 *
 * The first positional argument is the input (URL or HTML file path); the
 * remaining arguments are flags. Prints usage and exits with status 1 when no
 * arguments are given, and prints an error and exits with status 1 on an
 * unknown flag, a flag that is missing its value, or a numeric flag whose
 * value does not parse to a finite number.
 *
 * @returns {{
 *   input: string, minHeight: number, minWidthRatio: number, maxHeight: number,
 *   viewportWidth: number, viewportHeight: number, json: boolean,
 *   out: (string|null), saveHtml: (string|null), debug: boolean
 * }} Parsed options. `maxHeight` defaults to 1.5x the viewport height.
 */
function parseArgs() {
  const args = process.argv.slice(2);

  if (args.length === 0) {
    console.error('Usage: node scripts/extract-blocks.js <url-or-html-file> [options]');
    console.error('');
    console.error('Options:');
    // The advertised default matches the value actually used below (40).
    console.error(' --min-height <px> Min block height (default: 40)');
    console.error(' --min-width-ratio <0-1> Min width as viewport ratio (default: 0.25)');
    console.error(' --max-height <px> Subdivide blocks taller than this (default: viewport*1.5)');
    console.error(' --viewport <WxH> Viewport size (default: 1440x900)');
    console.error(' --json Output JSON instead of CSV');
    console.error(' --out <file> Write to file instead of stdout');
    console.error(' --save-html <file> Save rendered HTML');
    process.exit(1);
  }

  const fail = (message) => {
    console.error(message);
    process.exit(1);
  };

  const opts = {
    input: args[0],
    minHeight: 40,
    minWidthRatio: 0.25,
    maxHeight: null, // computed from viewport if not set
    viewportWidth: 1440,
    viewportHeight: 900,
    json: false,
    out: null,
    saveHtml: null,
    debug: false,
  };

  let cursor = 1;

  // Consume the argument following `flag`; a flag given last has no value.
  const nextValue = (flag) => {
    cursor += 1;
    if (cursor >= args.length) {
      fail(`Missing value for ${flag}`);
    }
    return args[cursor];
  };

  // Consume and parse a numeric value, rejecting NaN instead of storing it.
  const nextNumber = (flag, parse) => {
    const raw = nextValue(flag);
    const parsed = parse(raw);
    if (!Number.isFinite(parsed)) {
      fail(`Invalid value for ${flag}: ${raw}`);
    }
    return parsed;
  };

  const toInt = (raw) => Number.parseInt(raw, 10);

  for (; cursor < args.length; cursor++) {
    const flag = args[cursor];
    switch (flag) {
      case '--min-height':
        opts.minHeight = nextNumber(flag, toInt);
        break;
      case '--min-width-ratio':
        opts.minWidthRatio = nextNumber(flag, Number.parseFloat);
        break;
      case '--max-height':
        opts.maxHeight = nextNumber(flag, toInt);
        break;
      case '--viewport': {
        // Accept "1920x1080" as well as "1920X1080"; an unparsable or zero
        // dimension falls back to the default for that dimension.
        const [w, h] = nextValue(flag).split(/x/i).map(Number);
        opts.viewportWidth = w || 1440;
        opts.viewportHeight = h || 900;
        break;
      }
      case '--json':
        opts.json = true;
        break;
      case '--out':
        opts.out = nextValue(flag);
        break;
      case '--save-html':
        opts.saveHtml = nextValue(flag);
        break;
      case '--debug':
        opts.debug = true;
        break;
      default:
        fail(`Unknown option: ${flag}`);
    }
  }

  if (opts.maxHeight === null) {
    opts.maxHeight = Math.round(opts.viewportHeight * 1.5);
  }

  return opts;
}
|
|
106
|
+
|
|
107
|
+
// ---------------------------------------------------------------------------
|
|
108
|
+
// Helpers
|
|
109
|
+
// ---------------------------------------------------------------------------
|
|
110
|
+
|
|
111
|
+
/**
 * Tell whether the CLI input looks like a web address.
 *
 * @param {string} input - Raw input argument.
 * @returns {boolean} True when the input starts with "http://" or "https://"
 *   (case-insensitive).
 */
function isUrl(input) {
  const httpScheme = /^https?:\/\//i;
  return httpScheme.test(input);
}
|
|
114
|
+
|
|
115
|
+
// ---------------------------------------------------------------------------
|
|
116
|
+
// CSV formatter
|
|
117
|
+
// ---------------------------------------------------------------------------
|
|
118
|
+
|
|
119
|
+
/**
 * Render a value as a single CSV field.
 *
 * null/undefined become an empty field. A field containing a comma, a double
 * quote, or a line break is wrapped in double quotes with embedded quotes
 * doubled; anything else is returned as its plain string form.
 *
 * @param {*} value - Value to render.
 * @returns {string} CSV-safe field text.
 */
function escapeCsv(value) {
  const text = String(value ?? '');
  const needsQuoting = /[",\n\r]/.test(text);
  if (!needsQuoting) {
    return text;
  }
  const doubled = text.replace(/"/g, '""');
  return `"${doubled}"`;
}
|
|
126
|
+
|
|
127
|
+
/**
 * Serialise extracted blocks as CSV text: one header line followed by one
 * line per block, joined with "\n" (no trailing newline).
 *
 * Geometry columns are rounded to whole pixels. The blockPosition column is
 * taken from `block.blockPosition`, falling back to the block's own
 * left/top/width/height when that property is null or undefined.
 *
 * @param {object[]} blocks - Block records to serialise.
 * @returns {string} CSV document.
 */
export function blocksToCsv(blocks) {
  const header = 'blockIdx,branchPath,depth,domOrder,tag,fixed,top,left,width,height,blockCssPath,blockPosition,textPreview,childInteractiveCount';

  const toRow = (block) => {
    const position = block.blockPosition ?? {
      left: block.left,
      top: block.top,
      width: block.width,
      height: block.height
    };
    const cells = [
      block.blockIdx,
      escapeCsv(block.branchPath ?? ''),
      block.depth ?? 0,
      block.domOrder ?? 0,
      block.tag,
      block.fixed ? 1 : 0,
      Math.round(block.top),
      Math.round(block.left),
      Math.round(block.width),
      Math.round(block.height),
      escapeCsv(block.blockCssPath ?? ''),
      escapeCsv(formatBlockPositionForCsv(position)),
      escapeCsv(block.textPreview),
      block.childInteractiveCount
    ];
    return cells.join(',');
  };

  const lines = [header].concat(blocks.map((block) => toRow(block)));
  return lines.join('\n');
}
|
|
152
|
+
|
|
153
|
+
/**
 * Format a block rectangle as "left=…;top=…;height=…;width=…" with every
 * component rounded to an integer. Components that are missing or do not
 * convert to a non-zero number are written as 0.
 *
 * @param {{left?: *, top?: *, width?: *, height?: *}} [position={}]
 * @returns {string} Semicolon-separated key=value text.
 */
function formatBlockPositionForCsv(position = {}) {
  const { left, top, width, height } = position;
  const rounded = (raw) => Math.round(Number(raw) || 0);

  return [
    `left=${rounded(left)}`,
    `top=${rounded(top)}`,
    `height=${rounded(height)}`,
    `width=${rounded(width)}`
  ].join(';');
}
|
|
160
|
+
|
|
161
|
+
// ---------------------------------------------------------------------------
|
|
162
|
+
// Core: in-browser block extraction (runs inside page.evaluate)
|
|
163
|
+
// ---------------------------------------------------------------------------
|
|
164
|
+
|
|
165
|
+
/**
|
|
166
|
+
* This function is serialised and executed inside Chromium via page.evaluate().
|
|
167
|
+
* It has NO access to Node APIs — pure browser JS only.
|
|
168
|
+
*/
|
|
169
|
+
/**
 * Identify visual content blocks in the currently loaded document.
 *
 * Meant to be serialised and run inside the page (page.evaluate()), so it
 * relies only on browser globals (document, window, location,
 * getComputedStyle) and keeps every helper inside its own body.
 *
 * @param {object} config
 * @param {number} config.minHeight - Visible elements shorter than this (px) are skipped.
 * @param {number} config.minWidth - Visible elements narrower than this (px) are skipped.
 * @param {number} config.maxHeight - Visible elements taller than this (px) are subdivided into their children.
 * @param {number} config.maxDepth - Elements deeper than this are ignored (body children are depth 0).
 * @param {boolean} [config.debug] - When truthy, a trace of the decisions is collected in `debugLog`.
 * @param {number} [config.textPreviewMaxChars] - Integer cap on textPreview length; values below 120 are raised to 120. Defaults to 1200.
 * @returns {{ blocks: object[], debugLog: string[] }} Accepted blocks (re-indexed via `blockIdx`,
 *   without their DOM element reference) and the debug trace.
 */
export function extractBlocksInBrowser(config) {
  // A nullish config must not throw here: the textPreviewMaxChars lookup below
  // already guards against it, so the destructuring does the same.
  const { minHeight, minWidth, maxHeight, maxDepth, debug } = config ?? {};
  const debugLog = [];
  const TEXT_PREVIEW_MAX_CHARS = Number.isInteger(config?.textPreviewMaxChars)
    ? Math.max(120, config.textPreviewMaxChars)
    : 1200;

  const SKIP_TAGS = new Set([
    'script', 'style', 'link', 'meta', 'noscript', 'svg', 'br', 'hr',
    'img', 'video', 'audio', 'canvas', 'iframe', 'object', 'embed'
  ]);

  const INTERACTIVE_SELECTOR = 'a, button, input, select, textarea, [onclick], [role="button"], [role="link"]';

  /** Minimum text length for a hidden element to qualify as a content block */
  const HIDDEN_MIN_TEXT = 20;

  /** Hidden elements with text longer than this get subdivided instead of treated as one block */
  const HIDDEN_SUBDIVIDE_TEXT = 500;

  const blocks = [];
  let domOrderCounter = 0;

  /**
   * Check computed visibility of an element (display, visibility, opacity).
   */
  function isVisible(el) {
    const style = getComputedStyle(el);
    if (style.display === 'none') return false;
    if (style.visibility === 'hidden') return false;
    if (parseFloat(style.opacity) === 0) return false;
    return true;
  }

  /**
   * Extract image info from an element (for blocks with no/little text).
   * Returns a string like `img[alt="Photo of house"](hero.jpg)` (at most three
   * images, joined with "; ") or an empty string when there are no images.
   */
  function extractImageInfo(el) {
    const imgs = el.querySelectorAll('img');
    if (imgs.length === 0) return '';
    const parts = [];
    for (const img of imgs) {
      const alt = (img.getAttribute('alt') || '').trim();
      const src = (img.getAttribute('src') || '').trim();
      // Shorten src to last path segment + query params stripped.
      // An empty src is left empty: resolving '' against location.href would
      // yield the page's own file name and mislabel the image.
      let shortSrc = '';
      if (src) {
        try {
          const url = new URL(src, location.href);
          shortSrc = url.pathname.split('/').filter(Boolean).pop() || '';
        } catch {
          shortSrc = src.split('/').pop()?.split('?')[0] || '';
        }
      }
      if (shortSrc.length > 60) shortSrc = shortSrc.slice(0, 57) + '...';
      const altStr = alt ? `alt="${alt}"` : 'no-alt';
      parts.push(`img[${altStr}](${shortSrc})`);
      if (parts.length >= 3) break; // limit to 3 images
    }
    return parts.join('; ');
  }

  /** Reduce an action target to "path + query", or return it unchanged when it cannot be parsed. */
  function normalizeActionValue(value) {
    const raw = String(value || '').trim();
    if (!raw) return '';
    try {
      const parsed = new URL(raw, location.href);
      return `${parsed.pathname || '/'}${parsed.search || ''}`;
    } catch {
      return raw;
    }
  }

  /**
   * Pick a short label for an interactive node: the first non-empty candidate
   * among its text and aria-label/name/value/alt attributes, with whitespace
   * collapsed, the characters < > [ ] removed, and capped at 24 characters.
   */
  function normalizeActionLabel(node) {
    const candidates = [
      node?.innerText,
      node?.textContent,
      node?.getAttribute?.('aria-label'),
      node?.getAttribute?.('name'),
      node?.getAttribute?.('value'),
      node?.getAttribute?.('alt')
    ];
    for (const value of candidates) {
      const text = String(value || '')
        .replace(/\s+/g, ' ')
        .replace(/[<>\[\]]/g, '')
        .trim();
      if (!text) {
        continue;
      }
      if (text.length > 24) {
        return `${text.slice(0, 21)}...`;
      }
      return text;
    }
    return '';
  }

  /**
   * Summarise the interactive elements in (and including) `el` as a
   * space-separated list of `<action[label]=target>` tokens: at most ten,
   * de-duplicated by target.
   */
  function extractActionInfo(el) {
    const parts = [];
    const seenActionValues = new Set();
    const interactiveNodes = [];

    if (typeof el.matches === 'function' && el.matches(INTERACTIVE_SELECTOR)) {
      interactiveNodes.push(el);
    }
    for (const node of el.querySelectorAll(INTERACTIVE_SELECTOR)) {
      interactiveNodes.push(node);
    }

    for (const node of interactiveNodes) {
      const href = node.getAttribute('href') || '';
      const action = node.getAttribute('action') || '';
      const formAction = node.getAttribute('formaction') || '';
      const onClick = node.getAttribute('onclick') || '';

      let actionValue = normalizeActionValue(action || formAction || href);
      if (!actionValue && onClick) {
        actionValue = 'inline_onclick';
      }
      if (!actionValue) {
        continue;
      }
      if (actionValue.length > 120) {
        actionValue = `${actionValue.slice(0, 117)}...`;
      }
      if (seenActionValues.has(actionValue)) {
        continue;
      }
      seenActionValues.add(actionValue);

      const label = normalizeActionLabel(node);
      const token = label
        ? `<action[${label}]=${actionValue}>`
        : `<action=${actionValue}>`;
      parts.push(token);
      if (parts.length >= 10) break;
    }

    return parts.join(' ');
  }

  /**
   * Cap a preview at TEXT_PREVIEW_MAX_CHARS. A token left unterminated by the
   * cut (a '<' after the last '>') is dropped, then trailing separators are
   * stripped.
   */
  function truncateTextPreview(value) {
    const text = String(value || '');
    if (text.length <= TEXT_PREVIEW_MAX_CHARS) {
      return text;
    }

    let out = text.slice(0, TEXT_PREVIEW_MAX_CHARS);
    const lastOpen = out.lastIndexOf('<');
    const lastClose = out.lastIndexOf('>');
    if (lastOpen > lastClose) {
      out = out.slice(0, lastOpen);
    }
    return out.replace(/[|;, ]+$/g, '').trim();
  }

  /**
   * Build the textPreview for a block, including image info if text is sparse.
   */
  function buildTextPreview(el) {
    const text = (el.innerText || '').replace(/\s+/g, ' ').trim();
    const actionInfo = extractActionInfo(el);
    if (text.length >= 10) {
      return actionInfo
        ? truncateTextPreview(`[interactive-actions] ${actionInfo} | ${text}`)
        : truncateTextPreview(text);
    }

    // Text is empty/sparse — try to provide image and action context
    const imgInfo = extractImageInfo(el);
    const segments = [];
    if (text) segments.push(text);
    if (imgInfo) segments.push(imgInfo);
    if (actionInfo) segments.push(`[interactive-actions] ${actionInfo}`);
    if (segments.length > 0) {
      return truncateTextPreview(segments.join(' | '));
    }

    return truncateTextPreview(text);
  }

  /** Debug-trace indentation for a given depth. */
  const indent = d => ' '.repeat(d);

  /** First `len` characters of the element's whitespace-collapsed text. */
  function textSnippet(el, len = 60) {
    return (el.innerText || '').replace(/\s+/g, ' ').trim().slice(0, len);
  }

  function normalizePositionNumber(value) {
    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : 0;
  }

  function buildBlockPosition({ left, top, width, height } = {}) {
    return {
      left: normalizePositionNumber(left),
      top: normalizePositionNumber(top),
      height: Math.max(0, normalizePositionNumber(height)),
      width: Math.max(0, normalizePositionNumber(width))
    };
  }

  /**
   * Build a "body > tag:nth-of-type(n) > …" path from <body> down to `el`.
   */
  function buildBlockCssPath(el) {
    if (!(el instanceof Element) || !el.tagName) {
      return '';
    }

    const parts = [];
    let current = el;
    while (current && current instanceof Element && current.tagName) {
      const tag = current.tagName.toLowerCase();
      if (tag === 'body') {
        break;
      }

      const parent = current.parentElement;
      if (!parent || parent.tagName.toLowerCase() === 'html') {
        break;
      }

      const siblings = Array.from(parent.children).filter((child) => {
        return child instanceof Element && child.tagName.toLowerCase() === tag;
      });
      const index = siblings.indexOf(current) + 1;
      parts.unshift(`${tag}:nth-of-type(${Math.max(1, index)})`);
      current = parent;
    }

    return parts.length > 0 ? `body > ${parts.join(' > ')}` : 'body';
  }

  function createBlockRecord(el, fields = {}) {
    return {
      ...fields,
      blockCssPath: buildBlockCssPath(el),
      blockPosition: buildBlockPosition(fields)
    };
  }

  /** Record a hidden element as a block; hidden blocks carry no geometry. */
  function pushHiddenBlock(el, tag, branchPath, depth, domOrder) {
    blocks.push(createBlockRecord(el, {
      el,
      tag,
      top: 0,
      left: 0,
      width: 0,
      height: 0,
      hidden: true,
      branchPath,
      depth,
      domOrder,
      textPreview: buildTextPreview(el),
      childInteractiveCount: el.querySelectorAll(INTERACTIVE_SELECTOR).length,
    }));
  }

  /** Record a rendered element as a block, converting its rect to page coordinates. */
  function pushVisibleBlock(el, { tag, rect, fixed, branchPath, depth, domOrder, preview }) {
    blocks.push(createBlockRecord(el, {
      el,
      tag,
      top: rect.top + window.scrollY,
      left: rect.left + window.scrollX,
      width: rect.width,
      height: rect.height,
      hidden: false,
      fixed,
      branchPath,
      depth,
      domOrder,
      textPreview: preview,
      childInteractiveCount: el.querySelectorAll(INTERACTIVE_SELECTOR).length,
    }));
  }

  /**
   * Visit every child of `el` with `visit` one level deeper.
   * Returns true when at least one block was recorded as a result.
   */
  function subdivide(visit, el, depth, branchPath) {
    const countBefore = blocks.length;
    const children = Array.from(el.children);
    for (let i = 0; i < children.length; i++) {
      visit(children[i], depth + 1, `${branchPath}.${i}`);
    }
    return blocks.length > countBefore;
  }

  /**
   * Walk inside hidden containers — since we can't use visual dimensions,
   * we subdivide by text content length, recursing into children that are
   * large enough to split further.
   */
  function walkHidden(el, depth, branchPath) {
    if (depth > maxDepth) return;
    const tag = el.tagName?.toLowerCase();
    if (!tag || SKIP_TAGS.has(tag)) return;

    const myDomOrder = ++domOrderCounter;
    const textLen = (el.innerText || '').trim().length;

    // Too little content → skip
    if (textLen < HIDDEN_MIN_TEXT) return;

    // Large child with multiple children → keep subdividing
    if (textLen >= HIDDEN_SUBDIVIDE_TEXT && el.children.length > 1) {
      if (debug) {
        debugLog.push(`${indent(depth)}HIDDEN-sub ↓ ${el.children.length} children (${textLen} chars)`);
      }
      if (subdivide(walkHidden, el, depth, branchPath)) return; // subdivision succeeded
    }

    // Leaf-ish hidden block — accept
    if (debug) {
      const snippet = textSnippet(el);
      debugLog.push(`${indent(depth)}HIDDEN-sub ✅ | ${snippet}`);
    }
    pushHiddenBlock(el, tag, branchPath, depth, myDomOrder);
  }

  /**
   * Recursive top-down block identification based on visual dimensions.
   *
   * Logic:
   * - Fixed/sticky → accept as ONE block (no recursion)
   * - Hidden + sparse content → skip
   * - Hidden + substantial content → accept as one block, or subdivide by
   *   text length when the content is long and there are several children
   * - Visible + too small → skip
   * - Visible + good size (≤ maxHeight) → accept as block
   * - Visible + too tall (> maxHeight) → subdivide into children, accepting
   *   the element itself only when no child qualifies
   */
  function walk(el, depth, branchPath) {
    if (depth > maxDepth) return;

    const tag = el.tagName.toLowerCase();
    if (SKIP_TAGS.has(tag)) return;

    const myDomOrder = ++domOrderCounter;

    // Detect fixed/sticky overlays (floating headers, chat widgets, modals).
    // Fixed/sticky → treat as a single block (no subdivision), so each overlay
    // maps to exactly one block row regardless of its height.
    // NOTE(review): this runs before the visibility check, so a fixed element
    // that is display:none is recorded with hidden=false — confirm intended.
    const position = getComputedStyle(el).position;
    if (position === 'fixed' || position === 'sticky') {
      const rect = el.getBoundingClientRect();
      const preview = buildTextPreview(el);
      if (debug) {
        debugLog.push(`${indent(depth)}FIXED(${position}) ${Math.round(rect.width)}x${Math.round(rect.height)} ✅ | ${preview || '(empty)'}`);
      }
      pushVisibleBlock(el, { tag, rect, fixed: true, branchPath, depth, domOrder: myDomOrder, preview });
      return;
    }

    // Hidden element handling
    if (!isVisible(el)) {
      const textLen = (el.innerText || '').trim().length;

      if (textLen < HIDDEN_MIN_TEXT) {
        if (debug && depth <= 4) {
          const snippet = textSnippet(el);
          if (snippet) debugLog.push(`${indent(depth)}HIDDEN → skip | ${snippet}`);
        }
        return;
      }

      // Large hidden container → subdivide into children
      if (textLen >= HIDDEN_SUBDIVIDE_TEXT && el.children.length > 1) {
        if (debug) {
          debugLog.push(`${indent(depth)}HIDDEN → subdivide (${textLen} chars, ${el.children.length} children)`);
        }
        // If subdivision produced nothing, fall through to treat as one block
        if (subdivide(walkHidden, el, depth, branchPath)) return;
        if (debug) {
          debugLog.push(`${indent(depth)}HIDDEN → no children qualified, treating as one block`);
        }
      }

      // Small hidden element or subdivision failed → treat as one block
      if (debug && depth <= 4) {
        const snippet = textSnippet(el);
        debugLog.push(`${indent(depth)}HIDDEN → block | ${snippet}`);
      }
      pushHiddenBlock(el, tag, branchPath, depth, myDomOrder);
      return;
    }

    const rect = el.getBoundingClientRect();
    const w = rect.width;
    const h = rect.height;

    // Too small → skip entirely
    if (w < minWidth || h < minHeight) {
      if (debug && depth <= 3) {
        const snippet = textSnippet(el);
        debugLog.push(`${indent(depth)}${Math.round(w)}x${Math.round(h)} skip | ${snippet || '(empty)'}`);
      }
      return;
    }

    // Good size → accept as block
    if (h <= maxHeight) {
      const preview = buildTextPreview(el);
      if (debug) {
        debugLog.push(`${indent(depth)}${Math.round(w)}x${Math.round(h)} ✅ | ${preview || '(empty)'}`);
      }
      pushVisibleBlock(el, { tag, rect, fixed: false, branchPath, depth, domOrder: myDomOrder, preview });
      return; // don't recurse further — this element is the block
    }

    // Too tall → try to subdivide into children
    if (debug) {
      debugLog.push(`${indent(depth)}${Math.round(w)}x${Math.round(h)} ↓ ${el.children.length} children`);
    }
    if (subdivide(walk, el, depth, branchPath)) return;

    // Subdivision produced nothing → accept the large element itself
    const preview = buildTextPreview(el);
    if (debug) {
      debugLog.push(`${indent(depth)}${Math.round(w)}x${Math.round(h)} ✅ (no children qualified) | ${preview || '(empty)'}`);
    }
    pushVisibleBlock(el, { tag, rect, fixed: false, branchPath, depth, domOrder: myDomOrder, preview });
  }

  // Start from body's direct children (skip body itself — it's always full-page)
  const bodyChildren = Array.from(document.body.children);
  for (let i = 0; i < bodyChildren.length; i++) {
    walk(bodyChildren[i], 0, String(i));
  }

  // ── Deduplication: remove blocks that contain another block ───────────────
  // Keep the inner (more specific) block, remove the outer container.
  const toRemove = new Set();
  for (let i = 0; i < blocks.length; i++) {
    for (let j = 0; j < blocks.length; j++) {
      if (i === j) continue;
      if (blocks[i].el.contains(blocks[j].el)) {
        // i contains j → remove i (the outer one)
        toRemove.add(i);
      }
    }
  }

  const deduped = blocks
    .filter((_, idx) => !toRemove.has(idx))
    // Drop empty blocks: no preview text and no interactive children
    .filter(b => b.textPreview || b.childInteractiveCount > 0)
    // The element reference is dropped from the returned records
    .map(({ el, ...rest }, idx) => ({ blockIdx: idx, ...rest }));

  return { blocks: deduped, debugLog };
}
|
|
662
|
+
|
|
663
|
+
// ---------------------------------------------------------------------------
|
|
664
|
+
// Scroll to bottom: trigger lazy-load / JS rendering
|
|
665
|
+
// ---------------------------------------------------------------------------
|
|
666
|
+
|
|
667
|
+
/**
|
|
668
|
+
* Incrementally scroll from top to bottom of the page,
|
|
669
|
+
* pausing at each step to let lazy content load.
|
|
670
|
+
*/
|
|
671
|
+
/**
 * Incrementally scroll from the current position to the bottom of the page,
 * pausing after each step so lazy content can load, then scroll back to the
 * top.
 *
 * Stops when the viewport bottom reaches the document height (within 1px),
 * when the document grows by more than two viewport heights between steps
 * (treated as infinite scroll), or after `maxScrolls` steps.
 *
 * @param {object} page - Page handle providing `evaluate(fn, arg)` and `waitForTimeout(ms)`.
 * @param {object} [options]
 * @param {number} [options.step=800] - Pixels scrolled per step.
 * @param {number} [options.delay=300] - Milliseconds to wait after each step.
 * @param {number} [options.maxScrolls=30] - Upper bound on the number of steps.
 * @returns {Promise<void>}
 */
export async function scrollToBottom(page, { step = 800, delay = 300, maxScrolls = 30 } = {}) {
  console.error('📜 Scrolling to bottom...');

  let lastHeight = 0;
  let scrollCount = 0;

  while (scrollCount < maxScrolls) {
    // One round-trip per step: all three measurements come from the same
    // moment instead of three separate evaluations.
    const { pageHeight, viewportBottom, viewportHeight } = await page.evaluate(() => ({
      pageHeight: document.documentElement.scrollHeight,
      viewportBottom: window.scrollY + window.innerHeight,
      viewportHeight: window.innerHeight,
    }));

    // Already at or past the bottom. scrollY can be fractional while
    // scrollHeight is rounded, so allow 1px of slack; an exact comparison can
    // stay false at the very bottom and burn every remaining step.
    if (viewportBottom >= pageHeight - 1) break;

    // Infinite scroll detection: if height grew since last check, page is extending
    if (scrollCount > 0 && pageHeight > lastHeight) {
      const growth = pageHeight - lastHeight;
      // If page grew more than 2x viewport in one scroll step → likely infinite scroll, stop
      if (growth > viewportHeight * 2) {
        console.error(`📜 Infinite scroll detected (grew ${growth}px), stopping`);
        break;
      }
    }
    lastHeight = pageHeight;

    await page.evaluate(s => window.scrollBy(0, s), step);
    await page.waitForTimeout(delay);
    scrollCount++;
  }

  console.error(`📜 Scrolled ${scrollCount} steps`);

  // Scroll back to top
  await page.evaluate(() => window.scrollTo(0, 0));
  await page.waitForTimeout(300);
}
|
|
707
|
+
|
|
708
|
+
// ---------------------------------------------------------------------------
|
|
709
|
+
// Wait strategy: poll until page height stabilises
|
|
710
|
+
// ---------------------------------------------------------------------------
|
|
711
|
+
|
|
712
|
+
/**
|
|
713
|
+
 * Poll document.documentElement.scrollHeight every `interval` ms.
|
|
714
|
+
* Once it stays the same for `stableCount` consecutive checks, return.
|
|
715
|
+
* Gives up after `maxWait` ms total.
|
|
716
|
+
*/
|
|
717
|
+
/**
 * Wait until the page height stops changing.
 *
 * Reads document.documentElement.scrollHeight every `interval` ms and returns
 * once the value has repeated `stableCount` times in a row. Gives up (and
 * returns normally) once `maxWait` ms have elapsed.
 *
 * @param {object} page - Page handle providing `evaluate(fn)` and `waitForTimeout(ms)`.
 * @param {object} [options]
 * @param {number} [options.interval=500] - Milliseconds between polls.
 * @param {number} [options.stableCount=3] - Consecutive unchanged readings required.
 * @param {number} [options.maxWait=15000] - Overall time budget in milliseconds.
 * @returns {Promise<void>}
 */
export async function waitForStableHeight(page, { interval = 500, stableCount = 3, maxWait = 15000 } = {}) {
  const startedAt = Date.now();
  let previous = 0;
  let streak = 0;

  while (Date.now() - startedAt < maxWait) {
    const measured = await page.evaluate(() => document.documentElement.scrollHeight);

    if (measured !== previous) {
      // Height moved: restart the streak from this reading.
      streak = 0;
      previous = measured;
    } else {
      streak += 1;
      if (streak >= stableCount) {
        console.error(`⏱ Page stable at height ${measured}px (${Date.now() - startedAt}ms)`);
        return;
      }
    }

    await page.waitForTimeout(interval);
  }

  console.error(`⏱ Max wait ${maxWait}ms reached, proceeding (height=${previous}px)`);
}
|
|
739
|
+
|
|
740
|
+
// ---------------------------------------------------------------------------
|
|
741
|
+
// Main
|
|
742
|
+
// ---------------------------------------------------------------------------
|
|
743
|
+
|
|
744
|
+
/**
 * CLI entry point: load a page in headless Chromium, extract its content
 * blocks and emit them as JSON or CSV.
 *
 * Steps, in order:
 *   1. Parse CLI options and load Playwright (exits with code 1 if missing).
 *   2. Validate a local-file input BEFORE launching the browser, so the
 *      "file not found" exit cannot leave a browser process behind.
 *   3. Open the URL / HTML file, scroll to the bottom and wait for the page
 *      height to stabilise.
 *   4. Force-reveal opacity:0 and display:none content that has real content.
 *   5. Optionally save the rendered HTML, then run `extractBlocksInBrowser`.
 *   6. Write the result to `opts.out`, or print it to stdout.
 *
 * All progress/diagnostic messages go to stderr; only the result goes to stdout.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const opts = parseArgs();
  const viewport = { width: opts.viewportWidth, height: opts.viewportHeight };

  // ── Load Playwright ──────────────────────────────────────────────────────
  let pw;
  try {
    const mod = await import('playwright');
    pw = mod.default || mod;
  } catch {
    console.error('❌ playwright is required. Install: npm i playwright');
    process.exit(1);
  }

  // ── Validate input ───────────────────────────────────────────────────────
  const inputLabel = opts.input;
  console.error(`🌐 Opening: ${inputLabel} (${viewport.width}x${viewport.height})`);

  // Check a local file up front: process.exit() inside the try/finally below
  // would skip `browser.close()`.
  const inputIsUrl = isUrl(opts.input);
  const inputPath = inputIsUrl ? null : path.resolve(opts.input);
  if (inputPath !== null && !fs.existsSync(inputPath)) {
    console.error(`File not found: ${inputPath}`);
    process.exit(1);
  }

  // ── Launch browser & load page ───────────────────────────────────────────
  const browser = await pw.chromium.launch({ headless: true });
  try {
    const page = await browser.newPage({ viewport });

    if (inputIsUrl) {
      await page.goto(opts.input, { waitUntil: 'domcontentloaded', timeout: 30000 });
    } else {
      const html = fs.readFileSync(inputPath, 'utf-8');
      await page.setContent(html, { waitUntil: 'domcontentloaded', timeout: 30000 });
    }

    // Scroll to bottom to trigger lazy-load content, then wait for stable height
    await scrollToBottom(page);
    await waitForStableHeight(page);

    // Force-reveal hidden content so block extraction can see real dimensions.
    // Two categories:
    //   1. opacity:0 — scroll-triggered animations (IntersectionObserver) that never fire in headless
    //   2. display:none — interactive panels (dropdown menus, accordions) with substantial content
    // NOTE: this callback is executed in the browser context, so it must not
    // reference anything from the surrounding Node scope.
    const revealStats = await page.evaluate(() => {
      const CONTENT_THRESHOLD = 20; // min text chars to keep a revealed element
      // Elements that are display:none by design and whose text is source
      // code / metadata, not page content. Revealing them would render raw
      // script or CSS text on the page and distort the layout.
      const NON_RENDERED_TAGS = new Set([
        'HEAD', 'TITLE', 'META', 'LINK', 'BASE',
        'SCRIPT', 'STYLE', 'NOSCRIPT', 'TEMPLATE',
      ]);
      let opacityCount = 0;
      let displayCount = 0;

      for (const el of document.querySelectorAll('*')) {
        const s = getComputedStyle(el);

        // Case 1: opacity:0 with real layout — animation that didn't trigger
        // Only reveal if the element (or its subtree) has actual content,
        // to avoid surfacing empty scroll-anchor / animation-placeholder divs.
        if (parseFloat(s.opacity) === 0 && el.getBoundingClientRect().height > 0) {
          const text = (el.innerText || '').trim();
          if (text.length >= CONTENT_THRESHOLD || el.querySelectorAll('img, video, picture').length > 0) {
            el.style.setProperty('opacity', '1', 'important');
            opacityCount++;
          }
          continue;
        }

        // Case 2: display:none — dropdown/accordion/menu panel
        if (s.display === 'none') {
          // tagName is lowercase for non-HTML (e.g. SVG) elements, so normalise.
          if (NON_RENDERED_TAGS.has(el.tagName.toUpperCase())) continue;

          // Only process "root" hidden elements (parent is visible).
          // Children of a display:none parent inherit none; they'll be handled
          // once we reveal their parent.
          const parent = el.parentElement;
          if (parent && getComputedStyle(parent).display === 'none') continue;

          // Temporarily reveal to measure content. Remember both the inline
          // value and its priority so a revert restores `!important` too.
          const origDisplay = el.style.getPropertyValue('display');
          const origPriority = el.style.getPropertyPriority('display');
          el.style.setProperty('display', 'block', 'important');
          const text = (el.innerText || '').trim();

          if (text.length >= CONTENT_THRESHOLD) {
            // Keep visible — has substantial content
            displayCount++;
          } else if (origDisplay) {
            // Revert — not enough content to qualify
            el.style.setProperty('display', origDisplay, origPriority);
          } else {
            el.style.removeProperty('display');
          }
        }
      }
      return { opacityCount, displayCount };
    });
    if (revealStats.opacityCount > 0 || revealStats.displayCount > 0) {
      console.error(`👁 Force-revealed: ${revealStats.opacityCount} opacity:0, ${revealStats.displayCount} display:none`);
    }

    // Save rendered HTML if requested
    if (opts.saveHtml) {
      const rendered = await page.content();
      const savePath = path.resolve(opts.saveHtml);
      fs.writeFileSync(savePath, rendered, 'utf-8');
      console.error(`💾 HTML saved to: ${savePath}`);
    }

    const pageSize = await page.evaluate(() => ({
      w: document.documentElement.scrollWidth,
      h: document.documentElement.scrollHeight
    }));
    console.error(`📄 Page size: ${pageSize.w}x${pageSize.h}`);

    // ── Extract blocks inside browser ────────────────────────────────────
    const minWidth = Math.round(viewport.width * opts.minWidthRatio);
    const result = await page.evaluate(extractBlocksInBrowser, {
      minHeight: opts.minHeight,
      minWidth,
      maxHeight: opts.maxHeight,
      maxDepth: 15,
      textPreviewMaxChars: 1200,
      debug: opts.debug
    });

    const blocks = result.blocks;
    if (opts.debug && result.debugLog.length > 0) {
      console.error('\n🔍 Walk tree:');
      for (const line of result.debugLog) {
        console.error(line);
      }
      console.error('');
    }

    console.error(`📦 Blocks found: ${blocks.length}`);

    // ── Output ───────────────────────────────────────────────────────────
    const output = opts.json ? JSON.stringify(blocks, null, 2) : blocksToCsv(blocks);

    if (opts.out) {
      const outPath = path.resolve(opts.out);
      fs.writeFileSync(outPath, output, 'utf-8');
      console.error(`💾 Written to: ${outPath}`);
    } else {
      console.log(output);
    }
  } finally {
    // Always release the browser, including when any step above throws.
    await browser.close();
  }
}
|
|
894
|
+
|
|
895
|
+
// Run main() only when this file is the process entry point, not when it is
// imported as a module.
//
// process.argv[1] may be a symlink (for example an npm `.bin` shim), whereas
// import.meta.url for the entry module normally points at the real file. The
// argv path is therefore resolved through realpath before comparing; if that
// fails (path does not exist as given), fall back to the plain resolved path.
const cliArgvPath = (() => {
  if (!process.argv[1]) return '';
  const resolved = path.resolve(process.argv[1]);
  try {
    return fs.realpathSync(resolved);
  } catch {
    return resolved;
  }
})();
const isCliEntry = cliArgvPath && pathToFileURL(cliArgvPath).href === import.meta.url;

if (isCliEntry) {
  main().catch(err => {
    // A rejection is not guaranteed to be an Error; avoid printing "undefined".
    const message = err instanceof Error ? err.message : String(err);
    console.error(`❌ ${message}`);
    process.exit(1);
  });
}
|