page-analyzer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,903 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * Standalone script: extract visual content blocks from a web page.
5
+ *
6
+ * Core logic runs inside Playwright (headless Chromium), identifying blocks
7
+ * by their rendered layout (height + width) rather than DOM structure.
8
+ *
9
+ * Usage:
10
+ * node scripts/extract-blocks.js <url-or-html-file> [options]
11
+ *
12
+ * Options:
13
+ * --min-height <px> Minimum block height in pixels (default: 40)
14
+ * --min-width-ratio <0-1> Minimum block width as ratio of viewport (default: 0.25)
15
+ * --max-height <px> Blocks taller than this get subdivided (default: viewport * 1.5)
16
+ * --viewport <WxH> Viewport size (default: 1440x900)
17
+ * --json Output JSON instead of CSV
18
+ * --out <file> Write output to file instead of stdout
19
+ * --save-html <file> Save rendered HTML to file
20
+ *
21
+ * Examples:
22
+ * node scripts/extract-blocks.js https://example.com
23
+ * node scripts/extract-blocks.js https://example.com --viewport 1920x1080 --out blocks.csv
24
+ * node scripts/extract-blocks.js page.html --min-height 100 --json
25
+ */
26
+
27
+ import fs from 'node:fs';
28
+ import path from 'node:path';
29
+ import { pathToFileURL } from 'node:url';
30
+
31
+ // ---------------------------------------------------------------------------
32
+ // CLI
33
+ // ---------------------------------------------------------------------------
34
+
35
/**
 * Parse the command line (process.argv) into an options object.
 *
 * The first positional argument is the input (URL or path to an HTML file);
 * everything after it is treated as `--flag [value]` pairs.
 *
 * Exits the process with status 1 (after printing to stderr) when:
 *   - no arguments are given (prints usage),
 *   - an unknown option is encountered,
 *   - an option that needs a value is missing it, or
 *   - a numeric option cannot be parsed as a number.
 *
 * @returns {{
 *   input: string, minHeight: number, minWidthRatio: number, maxHeight: number,
 *   viewportWidth: number, viewportHeight: number, json: boolean,
 *   out: (string|null), saveHtml: (string|null), debug: boolean
 * }} Parsed options. `maxHeight` falls back to 1.5 x viewport height when
 *   `--max-height` is not supplied.
 */
function parseArgs() {
  const args = process.argv.slice(2);

  if (args.length === 0) {
    console.error('Usage: node scripts/extract-blocks.js <url-or-html-file> [options]');
    console.error('');
    console.error('Options:');
    // The documented default must match the `minHeight` default below.
    console.error(' --min-height <px> Min block height (default: 40)');
    console.error(' --min-width-ratio <0-1> Min width as viewport ratio (default: 0.25)');
    console.error(' --max-height <px> Subdivide blocks taller than this (default: viewport*1.5)');
    console.error(' --viewport <WxH> Viewport size (default: 1440x900)');
    console.error(' --json Output JSON instead of CSV');
    console.error(' --out <file> Write to file instead of stdout');
    console.error(' --save-html <file> Save rendered HTML');
    console.error(' --debug Print the block walk tree to stderr');
    process.exit(1);
  }

  const opts = {
    input: args[0],
    minHeight: 40,
    minWidthRatio: 0.25,
    maxHeight: null, // computed from viewport if not set
    viewportWidth: 1440,
    viewportHeight: 900,
    json: false,
    out: null,
    saveHtml: null,
    debug: false
  };

  // Print a message and terminate; used for every malformed-argument case.
  const fail = (message) => {
    console.error(message);
    process.exit(1);
  };

  // Return the value that follows `flag`, or abort when it is absent or is
  // itself another `--option` (which means the value was forgotten).
  const takeValue = (flag, index) => {
    const value = args[index];
    if (value === undefined || value.startsWith('--')) {
      fail(`Missing value for ${flag}`);
    }
    return value;
  };

  // Parse a numeric option value; abort instead of silently storing NaN.
  const takeNumber = (flag, index, parse) => {
    const raw = takeValue(flag, index);
    const parsed = parse(raw);
    if (Number.isNaN(parsed)) {
      fail(`Invalid value for ${flag}: ${raw}`);
    }
    return parsed;
  };

  const parseInteger = (raw) => Number.parseInt(raw, 10);

  for (let i = 1; i < args.length; i++) {
    const flag = args[i];
    switch (flag) {
      case '--min-height':
        opts.minHeight = takeNumber(flag, ++i, parseInteger);
        break;
      case '--min-width-ratio':
        opts.minWidthRatio = takeNumber(flag, ++i, Number.parseFloat);
        break;
      case '--max-height':
        opts.maxHeight = takeNumber(flag, ++i, parseInteger);
        break;
      case '--viewport': {
        // Unparseable or missing halves (e.g. "1920" or "x800") keep the
        // per-dimension defaults, exactly as before.
        const [w, h] = takeValue(flag, ++i).split('x').map(Number);
        opts.viewportWidth = w || 1440;
        opts.viewportHeight = h || 900;
        break;
      }
      case '--json':
        opts.json = true;
        break;
      case '--out':
        opts.out = takeValue(flag, ++i);
        break;
      case '--save-html':
        opts.saveHtml = takeValue(flag, ++i);
        break;
      case '--debug':
        opts.debug = true;
        break;
      default:
        fail(`Unknown option: ${flag}`);
    }
  }

  if (opts.maxHeight === null) {
    opts.maxHeight = Math.round(opts.viewportHeight * 1.5);
  }

  return opts;
}
106
+
107
+ // ---------------------------------------------------------------------------
108
+ // Helpers
109
+ // ---------------------------------------------------------------------------
110
+
111
/**
 * Tell whether the CLI input should be treated as a remote URL.
 *
 * @param {string} input - Raw positional argument.
 * @returns {boolean} true when the input starts with http:// or https://
 *   (case-insensitive); anything else is handled as a local file path.
 */
function isUrl(input) {
  const httpSchemePrefix = /^https?:\/\//i;
  return httpSchemePrefix.test(input);
}
114
+
115
+ // ---------------------------------------------------------------------------
116
+ // CSV formatter
117
+ // ---------------------------------------------------------------------------
118
+
119
/**
 * Render a value as a single CSV cell.
 *
 * null/undefined become the empty string. A cell that contains a comma,
 * a double quote, or a CR/LF is wrapped in double quotes with embedded
 * quotes doubled; every other value is returned as its plain string form.
 *
 * @param {*} value - Cell value.
 * @returns {string} CSV-safe text.
 */
function escapeCsv(value) {
  const cell = String(value ?? '');
  const needsQuoting = /[",\n\r]/.test(cell);
  if (!needsQuoting) {
    return cell;
  }
  const doubledQuotes = cell.split('"').join('""');
  return `"${doubledQuotes}"`;
}
126
+
127
/**
 * Serialise extracted blocks as CSV text (header row + one row per block).
 *
 * Geometry columns are rounded to whole pixels. Free-text columns go through
 * escapeCsv. When a block carries no `blockPosition`, one is derived from its
 * own left/top/width/height before formatting.
 *
 * @param {object[]} blocks - Block records as produced by extractBlocksInBrowser.
 * @returns {string} Newline-joined CSV with no trailing newline.
 */
export function blocksToCsv(blocks) {
  const lines = [
    'blockIdx,branchPath,depth,domOrder,tag,fixed,top,left,width,height,blockCssPath,blockPosition,textPreview,childInteractiveCount'
  ];

  for (const block of blocks) {
    const position = block.blockPosition ?? {
      left: block.left,
      top: block.top,
      width: block.width,
      height: block.height
    };

    const cells = [
      block.blockIdx,
      escapeCsv(block.branchPath ?? ''),
      block.depth ?? 0,
      block.domOrder ?? 0,
      block.tag,
      block.fixed ? 1 : 0,
      Math.round(block.top),
      Math.round(block.left),
      Math.round(block.width),
      Math.round(block.height),
      escapeCsv(block.blockCssPath ?? ''),
      escapeCsv(formatBlockPositionForCsv(position)),
      escapeCsv(block.textPreview),
      block.childInteractiveCount
    ];

    lines.push(cells.join(','));
  }

  return lines.join('\n');
}
152
+
153
/**
 * Format a block's geometry as a compact `key=value;...` string for one CSV cell.
 *
 * Each of left/top/height/width is coerced with Number(); anything that is
 * not a usable number (NaN, missing, empty) is written as 0. Values are
 * rounded to whole pixels.
 *
 * @param {{left?: *, top?: *, width?: *, height?: *}} [position]
 * @returns {string} e.g. "left=0;top=120;height=300;width=1440"
 */
function formatBlockPositionForCsv(position = {}) {
  const roundedOrZero = (key) => Math.round(Number(position[key]) || 0);
  return ['left', 'top', 'height', 'width']
    .map((key) => `${key}=${roundedOrZero(key)}`)
    .join(';');
}
160
+
161
+ // ---------------------------------------------------------------------------
162
+ // Core: in-browser block extraction (runs inside page.evaluate)
163
+ // ---------------------------------------------------------------------------
164
+
165
/**
 * Identify visual content blocks in the current document.
 *
 * This function is serialised and executed inside Chromium via page.evaluate().
 * It has NO access to Node APIs — pure browser JS only — which is why every
 * helper it needs is declared inside its own body.
 *
 * @param {object} config
 * @param {number} config.minHeight - Visible elements shorter than this (px) are skipped.
 * @param {number} config.minWidth - Visible elements narrower than this (px) are skipped.
 * @param {number} config.maxHeight - Visible elements taller than this are subdivided into children.
 * @param {number} config.maxDepth - Elements deeper than this are ignored (body children are depth 0).
 * @param {boolean} [config.debug] - When truthy, a trace of the walk is collected in `debugLog`.
 * @param {number} [config.textPreviewMaxChars] - Integer cap for textPreview length;
 *   raised to at least 120, defaults to 1200 when absent or not an integer.
 * @returns {{blocks: object[], debugLog: string[]}} Serializable block records
 *   (the DOM element reference is stripped) and the debug trace.
 */
export function extractBlocksInBrowser(config) {
  const { minHeight, minWidth, maxHeight, maxDepth, debug } = config;
  const debugLog = [];
  const TEXT_PREVIEW_MAX_CHARS = Number.isInteger(config?.textPreviewMaxChars)
    ? Math.max(120, config.textPreviewMaxChars)
    : 1200;

  const SKIP_TAGS = new Set([
    'script', 'style', 'link', 'meta', 'noscript', 'svg', 'br', 'hr',
    'img', 'video', 'audio', 'canvas', 'iframe', 'object', 'embed'
  ]);

  const INTERACTIVE_SELECTOR = 'a, button, input, select, textarea, [onclick], [role="button"], [role="link"]';

  /** Minimum text length for a hidden element to qualify as a content block */
  const HIDDEN_MIN_TEXT = 20;

  /** Hidden elements with text longer than this get subdivided instead of treated as one block */
  const HIDDEN_SUBDIVIDE_TEXT = 500;

  /** Geometry recorded for hidden blocks, which are not measured. */
  const ZERO_BOX = { top: 0, left: 0, width: 0, height: 0 };

  const blocks = [];
  let domOrderCounter = 0;

  /**
   * Check computed visibility of an element.
   * `style` may be passed in to avoid a second getComputedStyle() call.
   */
  function isVisible(el, style = getComputedStyle(el)) {
    if (style.display === 'none') return false;
    if (style.visibility === 'hidden') return false;
    if (parseFloat(style.opacity) === 0) return false;
    return true;
  }

  /**
   * Extract image info from an element (for blocks with no/little text).
   * Returns a string like "img[alt=Photo of house](hero.jpg)" or empty string.
   */
  function extractImageInfo(el) {
    const imgs = el.querySelectorAll('img');
    if (imgs.length === 0) return '';
    const parts = [];
    for (const img of imgs) {
      const alt = (img.getAttribute('alt') || '').trim();
      const src = (img.getAttribute('src') || '').trim();
      // Shorten src to last path segment + query params stripped
      let shortSrc = '';
      try {
        const url = new URL(src, location.href);
        shortSrc = url.pathname.split('/').filter(Boolean).pop() || '';
      } catch {
        shortSrc = src.split('/').pop()?.split('?')[0] || '';
      }
      if (shortSrc.length > 60) shortSrc = shortSrc.slice(0, 57) + '...';
      const altStr = alt ? `alt="${alt}"` : 'no-alt';
      parts.push(`img[${altStr}](${shortSrc})`);
      if (parts.length >= 3) break; // limit to 3 images
    }
    return parts.join('; ');
  }

  /**
   * Reduce an href/action value to "pathname + query" resolved against the
   * current page; values that cannot be parsed as a URL are returned trimmed.
   */
  function normalizeActionValue(value) {
    const raw = String(value || '').trim();
    if (!raw) return '';
    try {
      const parsed = new URL(raw, location.href);
      return `${parsed.pathname || '/'}${parsed.search || ''}`;
    } catch {
      return raw;
    }
  }

  /**
   * Pick a short human label for an interactive node: the first non-empty of
   * innerText, textContent, aria-label, name, value, alt. Whitespace is
   * collapsed, the characters < > [ ] are removed (they delimit action
   * tokens), and the label is capped at 24 characters.
   */
  function normalizeActionLabel(node) {
    const candidates = [
      node?.innerText,
      node?.textContent,
      node?.getAttribute?.('aria-label'),
      node?.getAttribute?.('name'),
      node?.getAttribute?.('value'),
      node?.getAttribute?.('alt')
    ];
    for (const value of candidates) {
      const text = String(value || '')
        .replace(/\s+/g, ' ')
        .replace(/[<>\[\]]/g, '')
        .trim();
      if (!text) {
        continue;
      }
      if (text.length > 24) {
        return `${text.slice(0, 21)}...`;
      }
      return text;
    }
    return '';
  }

  /**
   * Summarise the interactive targets of `el` (itself plus descendants) as
   * space-separated `<action[label]=value>` tokens. Duplicate targets are
   * listed once; at most 10 tokens are produced.
   */
  function extractActionInfo(el) {
    const parts = [];
    const seenActionValues = new Set();
    const interactiveNodes = [];

    if (typeof el.matches === 'function' && el.matches(INTERACTIVE_SELECTOR)) {
      interactiveNodes.push(el);
    }
    for (const node of el.querySelectorAll(INTERACTIVE_SELECTOR)) {
      interactiveNodes.push(node);
    }

    for (const node of interactiveNodes) {
      const href = node.getAttribute('href') || '';
      const action = node.getAttribute('action') || '';
      const formAction = node.getAttribute('formaction') || '';
      const onClick = node.getAttribute('onclick') || '';

      let actionValue = normalizeActionValue(action || formAction || href);
      if (!actionValue && onClick) {
        actionValue = 'inline_onclick';
      }
      if (!actionValue) {
        continue;
      }
      if (actionValue.length > 120) {
        actionValue = `${actionValue.slice(0, 117)}...`;
      }
      if (seenActionValues.has(actionValue)) {
        continue;
      }
      seenActionValues.add(actionValue);

      const label = normalizeActionLabel(node);
      const token = label
        ? `<action[${label}]=${actionValue}>`
        : `<action=${actionValue}>`;
      parts.push(token);
      if (parts.length >= 10) break;
    }

    return parts.join(' ');
  }

  /**
   * Cap a preview at TEXT_PREVIEW_MAX_CHARS. If the cut lands inside an
   * unterminated `<action...>` token the partial token is dropped, then
   * trailing separators are trimmed.
   */
  function truncateTextPreview(value) {
    const text = String(value || '');
    if (text.length <= TEXT_PREVIEW_MAX_CHARS) {
      return text;
    }

    let out = text.slice(0, TEXT_PREVIEW_MAX_CHARS);
    const lastOpen = out.lastIndexOf('<');
    const lastClose = out.lastIndexOf('>');
    if (lastOpen > lastClose) {
      out = out.slice(0, lastOpen);
    }
    return out.replace(/[|;, ]+$/g, '').trim();
  }

  /**
   * Build the textPreview for a block, including image info if text is sparse.
   */
  function buildTextPreview(el) {
    const text = (el.innerText || '').replace(/\s+/g, ' ').trim();
    const actionInfo = extractActionInfo(el);
    if (text.length >= 10) {
      return actionInfo
        ? truncateTextPreview(`[interactive-actions] ${actionInfo} | ${text}`)
        : truncateTextPreview(text);
    }

    // Text is empty/sparse — try to provide image and action context
    const imgInfo = extractImageInfo(el);
    const segments = [];
    if (text) segments.push(text);
    if (imgInfo) segments.push(imgInfo);
    if (actionInfo) segments.push(`[interactive-actions] ${actionInfo}`);
    if (segments.length > 0) {
      return truncateTextPreview(segments.join(' | '));
    }

    return truncateTextPreview(text);
  }

  /** Indentation prefix for debug-log lines at depth `d`. */
  const indent = d => ' '.repeat(d);

  /** First `len` characters of the element's whitespace-collapsed text. */
  function textSnippet(el, len = 60) {
    return (el.innerText || '').replace(/\s+/g, ' ').trim().slice(0, len);
  }

  /** Length of the element's trimmed text, used to judge hidden content. */
  function hiddenTextLength(el) {
    return (el.innerText || '').trim().length;
  }

  function normalizePositionNumber(value) {
    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : 0;
  }

  function buildBlockPosition({ left, top, width, height } = {}) {
    return {
      left: normalizePositionNumber(left),
      top: normalizePositionNumber(top),
      height: Math.max(0, normalizePositionNumber(height)),
      width: Math.max(0, normalizePositionNumber(width))
    };
  }

  /**
   * Build a `body > tag:nth-of-type(n) > ...` selector for the element by
   * walking up its ancestors until <body> is reached.
   */
  function buildBlockCssPath(el) {
    if (!(el instanceof Element) || !el.tagName) {
      return '';
    }

    const parts = [];
    let current = el;
    while (current && current instanceof Element && current.tagName) {
      const tag = current.tagName.toLowerCase();
      if (tag === 'body') {
        break;
      }

      const parent = current.parentElement;
      if (!parent || parent.tagName.toLowerCase() === 'html') {
        break;
      }

      const siblings = Array.from(parent.children).filter((child) => {
        return child instanceof Element && child.tagName.toLowerCase() === tag;
      });
      const index = siblings.indexOf(current) + 1;
      parts.unshift(`${tag}:nth-of-type(${Math.max(1, index)})`);
      current = parent;
    }

    return parts.length > 0 ? `body > ${parts.join(' > ')}` : 'body';
  }

  function createBlockRecord(el, fields = {}) {
    return {
      ...fields,
      blockCssPath: buildBlockCssPath(el),
      blockPosition: buildBlockPosition(fields)
    };
  }

  /** Page-absolute geometry for a viewport-relative client rect. */
  function absoluteBox(rect) {
    return {
      top: rect.top + window.scrollY,
      left: rect.left + window.scrollX,
      width: rect.width,
      height: rect.height
    };
  }

  /**
   * Append one block record. Single place that defines the record's shape so
   * every code path emits the same fields in the same order. `flags` carries
   * `hidden` and, where the caller knows it, `fixed`.
   */
  function pushBlock(el, { tag, box, flags, branchPath, depth, domOrder, textPreview }) {
    blocks.push(createBlockRecord(el, {
      el,
      tag,
      top: box.top,
      left: box.left,
      width: box.width,
      height: box.height,
      ...flags,
      branchPath,
      depth,
      domOrder,
      textPreview,
      childInteractiveCount: el.querySelectorAll(INTERACTIVE_SELECTOR).length,
    }));
  }

  /**
   * Run walkHidden over each child of a hidden container.
   * Returns true when at least one block was produced.
   */
  function subdivideHidden(el, depth, branchPath) {
    const countBefore = blocks.length;
    const children = Array.from(el.children);
    for (let i = 0; i < children.length; i++) {
      walkHidden(children[i], depth + 1, `${branchPath}.${i}`);
    }
    return blocks.length > countBefore;
  }

  /**
   * Walk inside hidden containers — since we can't use visual dimensions,
   * we subdivide by text content length, recursing into children that are
   * large enough to split further.
   */
  function walkHidden(el, depth, branchPath) {
    if (depth > maxDepth) return;
    const tag = el.tagName?.toLowerCase();
    if (!tag || SKIP_TAGS.has(tag)) return;

    const myDomOrder = ++domOrderCounter;
    const textLen = hiddenTextLength(el);

    // Too little content → skip
    if (textLen < HIDDEN_MIN_TEXT) return;

    // Large child with multiple children → keep subdividing
    if (textLen >= HIDDEN_SUBDIVIDE_TEXT && el.children.length > 1) {
      if (debug) {
        debugLog.push(`${indent(depth)}HIDDEN-sub ↓ ${el.children.length} children (${textLen} chars)`);
      }
      if (subdivideHidden(el, depth, branchPath)) return; // subdivision succeeded
    }

    // Leaf-ish hidden block — accept
    if (debug) {
      const snippet = textSnippet(el);
      debugLog.push(`${indent(depth)}HIDDEN-sub ✅ | ${snippet}`);
    }
    pushBlock(el, {
      tag,
      box: ZERO_BOX,
      flags: { hidden: true },
      branchPath,
      depth,
      domOrder: myDomOrder,
      textPreview: buildTextPreview(el),
    });
  }

  /**
   * Recursive top-down block identification based on visual dimensions.
   *
   * Logic:
   * - Fixed/sticky → ONE block regardless of height (hidden + sparse → skip)
   * - Hidden + substantial content → one block, or subdivided by text length
   * - Hidden + sparse content → skip
   * - Visible + too small → skip
   * - Visible + good size (≤ maxHeight) → accept as block
   * - Visible + too tall (> maxHeight) → subdivide into children
   */
  function walk(el, depth, branchPath) {
    if (depth > maxDepth) return;

    const tag = el.tagName.toLowerCase();
    if (SKIP_TAGS.has(tag)) return;

    const myDomOrder = ++domOrderCounter;

    // One computed-style lookup serves both the position and visibility checks.
    const style = getComputedStyle(el);
    const visible = isVisible(el, style);

    // Detect fixed/sticky overlays (floating headers, chat widgets, modals).
    const position = style.position;
    const isFixed = position === 'fixed' || position === 'sticky';

    // Fixed/sticky → treat as a single block (no subdivision), so each overlay
    // maps to exactly one block row regardless of its height.
    if (isFixed) {
      // A hidden overlay must obey the same "hidden + sparse → skip" rule as
      // any other hidden element instead of being reported as a visible block.
      if (!visible && hiddenTextLength(el) < HIDDEN_MIN_TEXT) {
        if (debug && depth <= 4) {
          const snippet = textSnippet(el);
          if (snippet) debugLog.push(`${indent(depth)}HIDDEN → skip | ${snippet}`);
        }
        return;
      }

      const rect = el.getBoundingClientRect();
      const preview = buildTextPreview(el);
      if (debug) {
        debugLog.push(`${indent(depth)}FIXED(${position}) ${Math.round(rect.width)}x${Math.round(rect.height)} ✅ | ${preview || '(empty)'}`);
      }
      pushBlock(el, {
        tag,
        box: absoluteBox(rect),
        flags: { hidden: !visible, fixed: true },
        branchPath,
        depth,
        domOrder: myDomOrder,
        textPreview: preview,
      });
      return;
    }

    // Hidden element handling
    if (!visible) {
      const textLen = hiddenTextLength(el);
      const substantial = textLen >= HIDDEN_MIN_TEXT;

      if (!substantial) {
        if (debug && depth <= 4) {
          const snippet = textSnippet(el);
          if (snippet) debugLog.push(`${indent(depth)}HIDDEN → skip | ${snippet}`);
        }
        return;
      }

      // Large hidden container → subdivide into children
      if (textLen >= HIDDEN_SUBDIVIDE_TEXT && el.children.length > 1) {
        if (debug) {
          debugLog.push(`${indent(depth)}HIDDEN → subdivide (${textLen} chars, ${el.children.length} children)`);
        }
        // If subdivision produced nothing, fall through to treat as one block
        if (subdivideHidden(el, depth, branchPath)) return;
        if (debug) {
          debugLog.push(`${indent(depth)}HIDDEN → no children qualified, treating as one block`);
        }
      }

      // Small hidden element or subdivision failed → treat as one block
      if (debug && depth <= 4) {
        const snippet = textSnippet(el);
        debugLog.push(`${indent(depth)}HIDDEN → block | ${snippet}`);
      }
      pushBlock(el, {
        tag,
        box: ZERO_BOX,
        flags: { hidden: true },
        branchPath,
        depth,
        domOrder: myDomOrder,
        textPreview: buildTextPreview(el),
      });
      return;
    }

    const rect = el.getBoundingClientRect();
    const w = rect.width;
    const h = rect.height;

    // Too small → skip entirely
    if (w < minWidth || h < minHeight) {
      if (debug && depth <= 3) {
        const snippet = textSnippet(el);
        debugLog.push(`${indent(depth)}${Math.round(w)}x${Math.round(h)} skip | ${snippet || '(empty)'}`);
      }
      return;
    }

    // Good size → accept as block
    if (h <= maxHeight) {
      const preview = buildTextPreview(el);
      if (debug) {
        debugLog.push(`${indent(depth)}${Math.round(w)}x${Math.round(h)} ✅ | ${preview || '(empty)'}`);
      }
      pushBlock(el, {
        tag,
        box: absoluteBox(rect),
        flags: { hidden: false, fixed: isFixed },
        branchPath,
        depth,
        domOrder: myDomOrder,
        textPreview: preview,
      });
      return; // don't recurse further — this element is the block
    }

    // Too tall → try to subdivide into children
    if (debug) {
      debugLog.push(`${indent(depth)}${Math.round(w)}x${Math.round(h)} ↓ ${el.children.length} children`);
    }
    const countBefore = blocks.length;
    const children = Array.from(el.children);
    for (let i = 0; i < children.length; i++) {
      walk(children[i], depth + 1, `${branchPath}.${i}`);
    }

    // If subdivision produced nothing, accept the large element itself
    if (blocks.length === countBefore) {
      const preview = buildTextPreview(el);
      if (debug) {
        debugLog.push(`${indent(depth)}${Math.round(w)}x${Math.round(h)} ✅ (no children qualified) | ${preview || '(empty)'}`);
      }
      pushBlock(el, {
        tag,
        box: absoluteBox(rect),
        flags: { hidden: false, fixed: isFixed },
        branchPath,
        depth,
        domOrder: myDomOrder,
        textPreview: preview,
      });
    }
  }

  // Start from body's direct children (skip body itself — it's always full-page)
  const bodyChildren = Array.from(document.body.children);
  for (let i = 0; i < bodyChildren.length; i++) {
    walk(bodyChildren[i], 0, String(i));
  }

  // ── Deduplication: remove blocks fully contained by another block ─────────
  // Keep the inner (more specific) block, remove the outer container.
  const deduped = blocks
    .filter((outer, i) => !blocks.some((inner, j) => i !== j && outer.el.contains(inner.el)))
    // Drop empty blocks: no text, no images, no interactive children
    .filter(b => b.textPreview || b.childInteractiveCount > 0)
    // Strip the DOM reference (not serializable) and number the survivors.
    .map(({ el, ...rest }, idx) => ({ blockIdx: idx, ...rest }));

  return { blocks: deduped, debugLog };
}
662
+
663
+ // ---------------------------------------------------------------------------
664
+ // Scroll to bottom: trigger lazy-load / JS rendering
665
+ // ---------------------------------------------------------------------------
666
+
667
/**
 * Step the page from its current scroll position toward the bottom so that
 * lazily loaded content gets a chance to render, then return to the top.
 *
 * Scrolling stops when the viewport bottom reaches the document height, when
 * `maxScrolls` steps have been taken, or when the document grew by more than
 * two viewport heights between two consecutive steps (treated as infinite
 * scroll). Progress is reported on stderr.
 *
 * @param {object} page - Object exposing `evaluate(fn, arg)` and
 *   `waitForTimeout(ms)` (a Playwright page in this script).
 * @param {object} [options]
 * @param {number} [options.step=800] - Pixels scrolled per step.
 * @param {number} [options.delay=300] - Milliseconds to wait after each step.
 * @param {number} [options.maxScrolls=30] - Upper bound on the number of steps.
 * @returns {Promise<void>}
 */
export async function scrollToBottom(page, { step = 800, delay = 300, maxScrolls = 30 } = {}) {
  console.error('📜 Scrolling to bottom...');

  let previousHeight = 0;
  let stepsTaken = 0;

  for (; stepsTaken < maxScrolls; stepsTaken += 1) {
    const pageHeight = await page.evaluate(() => document.documentElement.scrollHeight);

    // Stop once the bottom edge of the viewport has reached the end of the page.
    const viewportBottom = await page.evaluate(() => window.scrollY + window.innerHeight);
    if (viewportBottom >= pageHeight) {
      break;
    }

    // After the first step, a jump of more than two viewports in document
    // height means the page keeps extending itself — give up on reaching the end.
    const grewSinceLastStep = stepsTaken > 0 && pageHeight > previousHeight;
    if (grewSinceLastStep) {
      const growth = pageHeight - previousHeight;
      const viewportH = await page.evaluate(() => window.innerHeight);
      if (growth > viewportH * 2) {
        console.error(`📜 Infinite scroll detected (grew ${growth}px), stopping`);
        break;
      }
    }
    previousHeight = pageHeight;

    await page.evaluate(s => window.scrollBy(0, s), step);
    await page.waitForTimeout(delay);
  }

  console.error(`📜 Scrolled ${stepsTaken} steps`);

  // Return to the top of the page and pause briefly.
  await page.evaluate(() => window.scrollTo(0, 0));
  await page.waitForTimeout(300);
}
707
+
708
+ // ---------------------------------------------------------------------------
709
+ // Wait strategy: poll until page height stabilises
710
+ // ---------------------------------------------------------------------------
711
+
712
/**
 * Wait until the page height stops changing.
 *
 * Reads document.documentElement.scrollHeight every `interval` ms. A reading
 * equal to the previous one counts toward stability; any different reading
 * resets the count. Returns as soon as `stableCount` consecutive equal
 * readings have been seen, or once `maxWait` ms have elapsed. Either outcome
 * is reported on stderr; the function never throws for a timeout.
 *
 * @param {object} page - Object exposing `evaluate(fn)` and
 *   `waitForTimeout(ms)` (a Playwright page in this script).
 * @param {object} [options]
 * @param {number} [options.interval=500] - Milliseconds between readings.
 * @param {number} [options.stableCount=3] - Consecutive equal readings required.
 * @param {number} [options.maxWait=15000] - Overall time budget in milliseconds.
 * @returns {Promise<void>}
 */
export async function waitForStableHeight(page, { interval = 500, stableCount = 3, maxWait = 15000 } = {}) {
  const startedAt = Date.now();
  const elapsed = () => Date.now() - startedAt;

  let previousHeight = 0;
  let unchangedReadings = 0;

  while (elapsed() < maxWait) {
    const height = await page.evaluate(() => document.documentElement.scrollHeight);

    if (height !== previousHeight) {
      // Height moved: start counting again from this new baseline.
      unchangedReadings = 0;
      previousHeight = height;
    } else {
      unchangedReadings += 1;
      if (unchangedReadings >= stableCount) {
        console.error(`⏱ Page stable at height ${height}px (${elapsed()}ms)`);
        return;
      }
    }

    await page.waitForTimeout(interval);
  }

  console.error(`⏱ Max wait ${maxWait}ms reached, proceeding (height=${previousHeight}px)`);
}
739
+
740
+ // ---------------------------------------------------------------------------
741
+ // Main
742
+ // ---------------------------------------------------------------------------
743
+
744
/**
 * CLI entry point: render the input in headless Chromium, extract content
 * blocks and write them as CSV or JSON.
 *
 * Steps:
 *   1. Parse CLI options and load Playwright (exits with status 1 if missing).
 *   2. Validate a local input file BEFORE launching the browser, so the
 *      "file not found" exit cannot leave a Chromium process behind.
 *   3. Load the page, scroll through it and wait for its height to settle.
 *   4. Force-reveal content hidden by opacity:0 or display:none.
 *   5. Optionally save the rendered HTML, run the in-browser extraction and
 *      write the result to `--out` or stdout.
 *
 * The browser is always closed in `finally`. Progress and diagnostics go to
 * stderr so stdout carries only the extraction result.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const opts = parseArgs();
  const viewport = { width: opts.viewportWidth, height: opts.viewportHeight };

  // ── Load Playwright ──────────────────────────────────────────────────────
  let pw;
  try {
    const mod = await import('playwright');
    pw = mod.default || mod;
  } catch {
    console.error('❌ playwright is required. Install: npm i playwright');
    process.exit(1);
  }

  // ── Launch browser & load page ───────────────────────────────────────────
  const inputLabel = opts.input;
  console.error(`🌐 Opening: ${inputLabel} (${viewport.width}x${viewport.height})`);

  // Resolve and check a local file up front: process.exit() inside the
  // try/finally below would skip the `finally` that closes the browser.
  const inputIsUrl = isUrl(opts.input);
  const inputPath = inputIsUrl ? null : path.resolve(opts.input);
  if (inputPath !== null && !fs.existsSync(inputPath)) {
    console.error(`File not found: ${inputPath}`);
    process.exit(1);
  }

  const browser = await pw.chromium.launch({ headless: true });
  try {
    const page = await browser.newPage({ viewport });

    if (inputIsUrl) {
      await page.goto(opts.input, { waitUntil: 'domcontentloaded', timeout: 30000 });
    } else {
      const html = fs.readFileSync(inputPath, 'utf-8');
      await page.setContent(html, { waitUntil: 'domcontentloaded', timeout: 30000 });
    }

    // Scroll to bottom to trigger lazy-load content, then wait for stable height
    await scrollToBottom(page);
    await waitForStableHeight(page);

    // Force-reveal hidden content so block extraction can see real dimensions.
    // Two categories:
    // 1. opacity:0 — scroll-triggered animations (IntersectionObserver) that never fire in headless
    // 2. display:none — interactive panels (dropdown menus, accordions) with substantial content
    const revealStats = await page.evaluate(() => {
      const CONTENT_THRESHOLD = 20; // min text chars to keep a revealed element
      let opacityCount = 0;
      let displayCount = 0;

      for (const el of document.querySelectorAll('*')) {
        const s = getComputedStyle(el);

        // Case 1: opacity:0 with real layout — animation that didn't trigger
        // Only reveal if the element (or its subtree) has actual content,
        // to avoid surfacing empty scroll-anchor / animation-placeholder divs.
        if (parseFloat(s.opacity) === 0 && el.getBoundingClientRect().height > 0) {
          const text = (el.innerText || '').trim();
          if (text.length >= CONTENT_THRESHOLD || el.querySelectorAll('img, video, picture').length > 0) {
            el.style.setProperty('opacity', '1', 'important');
            opacityCount++;
          }
          continue;
        }

        // Case 2: display:none — dropdown/accordion/menu panel
        if (s.display === 'none') {
          // Only process "root" hidden elements (parent is visible).
          // Elements nested inside a display:none parent are skipped here;
          // revealing the parent is what brings them back into layout.
          const parent = el.parentElement;
          if (parent && getComputedStyle(parent).display === 'none') continue;

          // Temporarily reveal to measure content
          const origDisplay = el.style.display;
          el.style.setProperty('display', 'block', 'important');
          const text = (el.innerText || '').trim();

          if (text.length >= CONTENT_THRESHOLD) {
            displayCount++;
            // Keep visible — has substantial content
          } else {
            // Revert — not enough content to qualify
            if (origDisplay) {
              el.style.display = origDisplay;
            } else {
              el.style.removeProperty('display');
            }
          }
        }
      }
      return { opacityCount, displayCount };
    });
    if (revealStats.opacityCount > 0 || revealStats.displayCount > 0) {
      console.error(`👁 Force-revealed: ${revealStats.opacityCount} opacity:0, ${revealStats.displayCount} display:none`);
    }

    // Save rendered HTML if requested
    if (opts.saveHtml) {
      const rendered = await page.content();
      const savePath = path.resolve(opts.saveHtml);
      fs.writeFileSync(savePath, rendered, 'utf-8');
      console.error(`💾 HTML saved to: ${savePath}`);
    }

    const pageSize = await page.evaluate(() => ({
      w: document.documentElement.scrollWidth,
      h: document.documentElement.scrollHeight
    }));
    console.error(`📄 Page size: ${pageSize.w}x${pageSize.h}`);

    // ── Extract blocks inside browser ────────────────────────────────────
    const minWidth = Math.round(viewport.width * opts.minWidthRatio);
    const result = await page.evaluate(extractBlocksInBrowser, {
      minHeight: opts.minHeight,
      minWidth,
      maxHeight: opts.maxHeight,
      maxDepth: 15,
      textPreviewMaxChars: 1200,
      debug: opts.debug
    });

    const blocks = result.blocks;
    if (opts.debug && result.debugLog.length > 0) {
      console.error('\n🔍 Walk tree:');
      for (const line of result.debugLog) {
        console.error(line);
      }
      console.error('');
    }

    console.error(`📦 Blocks found: ${blocks.length}`);

    // ── Output ───────────────────────────────────────────────────────────
    const output = opts.json
      ? JSON.stringify(blocks, null, 2)
      : blocksToCsv(blocks);

    if (opts.out) {
      const outPath = path.resolve(opts.out);
      fs.writeFileSync(outPath, output, 'utf-8');
      console.error(`💾 Written to: ${outPath}`);
    } else {
      console.log(output);
    }
  } finally {
    await browser.close();
  }
}
894
+
895
// Run main() only when this module is the script Node was started with
// (process.argv[1] resolves to this file's URL); when the module is merely
// imported for its exports, nothing is executed.
const cliArgvPath = process.argv[1] ? path.resolve(process.argv[1]) : '';
const isCliEntry = cliArgvPath && pathToFileURL(cliArgvPath).href === import.meta.url;

if (isCliEntry) {
  // Any failure that escapes main() is reported on stderr and turned into a
  // non-zero exit status.
  const reportAndExit = (err) => {
    console.error(`❌ ${err.message}`);
    process.exit(1);
  };
  main().catch(reportAndExit);
}