@d-zero/beholder 3.0.0 → 3.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,223 @@
1
+ /**
2
+ * DOM-side raw `<head>` collector.
3
+ *
4
+ * `collectHeadFromDocument` walks a `Document` (Puppeteer page realm or jsdom realm
5
+ * alike) and produces a serializable {@link RawHeadEntry}[] that
6
+ * {@link ../meta/classify.ts | classify} can turn into a typed `Meta`.
7
+ *
8
+ * WHY this function is realm-agnostic:
9
+ *
10
+ * - The Puppeteer path stringifies this function via `Function.prototype.toString`
11
+ * and runs it as a `page.evaluate(string)` expression, so any closure over
12
+ * module-scope bindings would throw a `ReferenceError` in the browser realm.
13
+ * - The jsdom (Node) path calls it directly with the jsdom `Window`. Because
14
+ * `HTMLLinkElement` (etc.) in jsdom is a *different constructor object* from the
15
+ * one in the page realm, `instanceof` only works when the constructor is read
16
+ * from the *passed* `window` rather than from bare globals.
17
+ *
18
+ * Together those constraints dictate that the function MUST:
19
+ *
20
+ * 1. Reference no module-level variables — only its own parameters and inner locals.
21
+ * 2. Take every HTML class constructor (`HTMLBaseElement`, …) from the passed
22
+ * `window` via destructuring instead of relying on ambient globals.
23
+ * 3. Stay in plain ES syntax (no TS-only constructs that need helper imports).
24
+ * @module
25
+ */
26
+ /**
27
+ * Curated list of `window` globals whose presence indicates that a third-party
28
+ * tag library has been loaded on the page. Surfaced as a single
29
+ * `kind: 'window-global'` entry so that downstream consumers (e.g. tag-detection)
30
+ * can cross-reference the script/iframe signals.
31
+ *
32
+ * Kept here (rather than in `dom-evaluation.ts`) so the Puppeteer path and the
33
+ * jsdom path share one source of truth.
34
+ */
35
+ export const WINDOW_GLOBALS_TO_CHECK = [
36
+ 'dataLayer',
37
+ 'gtag',
38
+ 'ga',
39
+ '_gaq',
40
+ 'fbq',
41
+ '_fbq',
42
+ 'clarity',
43
+ '_hjSettings',
44
+ '_hjid',
45
+ 'twq',
46
+ 'ttq',
47
+ '_linkedin_partner_id',
48
+ 'pintrk',
49
+ 'amplitude',
50
+ 'mixpanel',
51
+ 'analytics',
52
+ 'heap',
53
+ 'posthog',
54
+ 'plausible',
55
+ 'fathom',
56
+ '_paq',
57
+ 's_account',
58
+ 's',
59
+ 'ym',
60
+ 'UET',
61
+ 'optimizely',
62
+ '_hsq',
63
+ 'Sentry',
64
+ 'Intercom',
65
+ 'intercomSettings',
66
+ 'drift',
67
+ 'Tawk_API',
68
+ 'zE',
69
+ 'OneTrust',
70
+ 'Cookiebot',
71
+ 'Stripe',
72
+ 'grecaptcha',
73
+ ];
74
+ /**
75
+ * Walks the given window's `Document` and returns a serializable list of raw
76
+ * head entries.
77
+ *
78
+ * Two realms are supported:
79
+ *
80
+ * - Browser realm (Puppeteer): the function source is `.toString()`'d and run
81
+ * inside the page via `page.evaluate(string)`. Inside the page, `window`
82
+ * resolves to the page's global object, so destructured class constructors
83
+ * match `instanceof` checks against elements returned from `querySelectorAll`.
84
+ * - Node realm (jsdom et al.): the caller passes `dom.window` directly. jsdom's
85
+ * HTML element prototypes are distinct from the host Node's bare globals, so
86
+ * reading the constructors off the passed `window` is what makes `instanceof`
87
+ * succeed.
88
+ *
89
+ * The function MUST NOT close over any module-scope binding — all data it needs
90
+ * is reached through its two parameters.
91
+ * @param window - The window object whose `document` will be inspected. Provides
92
+ * both the DOM tree and the HTML element constructors used for
93
+ * `instanceof` narrowing.
94
+ * @param knownGlobals - Names of `window` properties that, when present,
95
+ * indicate a third-party tag library is loaded. Required
96
+ * (no default) so the Puppeteer-side string-eval path
97
+ * does not have to inline a default value list.
98
+ * @returns Serializable list of raw head entries for {@link ../meta/classify.ts | classify}.
99
+ */
100
+ export function collectHeadFromDocument(window, knownGlobals) {
101
+ const document = window.document;
102
+ // TypeScript's `Window` interface in lib.dom does not directly expose the
103
+ // HTML element constructors (`HTMLLinkElement`, `HTMLScriptElement`, …)
104
+ // even though every real window object — browser realm AND jsdom realm —
105
+ // carries them at runtime. Widening the type here lets us destructure them
106
+ // uniformly; the runtime values come straight from the passed window, so
107
+ // the cast is purely cosmetic for TS and erased at compile time.
108
+ const w = window;
109
+ const { HTMLBaseElement, HTMLMetaElement, HTMLLinkElement, HTMLScriptElement, HTMLIFrameElement, } = w;
110
+ const entries = [];
111
+ const html = document.documentElement;
112
+ entries.push({
113
+ kind: 'html',
114
+ lang: html.lang || undefined,
115
+ dir: html.dir || undefined,
116
+ xmlns: html.getAttribute('xmlns') ?? undefined,
117
+ prefix: html.getAttribute('prefix') ?? undefined,
118
+ vocab: html.getAttribute('vocab') ?? undefined,
119
+ typeOf: html.getAttribute('typeof') ?? undefined,
120
+ itemscope: html.hasAttribute('itemscope') || undefined,
121
+ itemtype: html.getAttribute('itemtype') ?? undefined,
122
+ amp: html.hasAttribute('amp') || undefined,
123
+ lightning: html.hasAttribute('⚡') || undefined,
124
+ }, { kind: 'title', content: document.title });
125
+ for (const base of document.querySelectorAll('base')) {
126
+ if (!(base instanceof HTMLBaseElement))
127
+ continue;
128
+ entries.push({
129
+ kind: 'base',
130
+ href: base.getAttribute('href') ?? undefined,
131
+ target: base.getAttribute('target') ?? undefined,
132
+ });
133
+ }
134
+ for (const meta of document.querySelectorAll('meta')) {
135
+ if (!(meta instanceof HTMLMetaElement))
136
+ continue;
137
+ const name = meta.getAttribute('name');
138
+ const property = meta.getAttribute('property');
139
+ const httpEquiv = meta.getAttribute('http-equiv');
140
+ const itemprop = meta.getAttribute('itemprop');
141
+ const charset = meta.getAttribute('charset');
142
+ const content = meta.getAttribute('content');
143
+ const media = meta.getAttribute('media');
144
+ entries.push({
145
+ kind: 'meta',
146
+ name: name ? name.toLowerCase() : undefined,
147
+ property: property ? property.toLowerCase() : undefined,
148
+ httpEquiv: httpEquiv ? httpEquiv.toLowerCase() : undefined,
149
+ itemprop: itemprop ?? undefined,
150
+ charset: charset ?? undefined,
151
+ content: content ?? undefined,
152
+ media: media ?? undefined,
153
+ });
154
+ }
155
+ for (const link of document.querySelectorAll('link[href]')) {
156
+ if (!(link instanceof HTMLLinkElement))
157
+ continue;
158
+ const relRaw = link.getAttribute('rel') ?? '';
159
+ const rel = relRaw.toLowerCase().split(/\s+/u).filter(Boolean);
160
+ entries.push({
161
+ kind: 'link',
162
+ rel,
163
+ href: link.getAttribute('href') ?? '',
164
+ type: link.getAttribute('type') ?? undefined,
165
+ media: link.getAttribute('media') ?? undefined,
166
+ sizes: link.getAttribute('sizes') ?? undefined,
167
+ title: link.getAttribute('title') ?? undefined,
168
+ hreflang: link.getAttribute('hreflang') ?? undefined,
169
+ as: link.getAttribute('as') ?? undefined,
170
+ crossorigin: link.getAttribute('crossorigin') ?? undefined,
171
+ color: link.getAttribute('color') ?? undefined,
172
+ blocking: link.getAttribute('blocking') ?? undefined,
173
+ imagesrcset: link.getAttribute('imagesrcset') ?? undefined,
174
+ });
175
+ }
176
+ const STRUCTURED_TYPES = new Set([
177
+ 'application/ld+json',
178
+ 'speculationrules',
179
+ 'application/json+oembed',
180
+ 'application/xml+oembed',
181
+ ]);
182
+ for (const script of document.querySelectorAll('script[type]')) {
183
+ if (!(script instanceof HTMLScriptElement))
184
+ continue;
185
+ const scriptType = (script.getAttribute('type') ?? '').toLowerCase();
186
+ if (!STRUCTURED_TYPES.has(scriptType))
187
+ continue;
188
+ const src = script.getAttribute('src') ?? undefined;
189
+ const text = script.textContent ?? '';
190
+ const inHead = !!script.closest('head');
191
+ const inNoscript = !!script.closest('noscript');
192
+ const location = inHead ? 'head' : inNoscript ? 'noscript' : 'body';
193
+ entries.push({
194
+ kind: 'script',
195
+ scriptType,
196
+ content: text || undefined,
197
+ src,
198
+ location,
199
+ });
200
+ }
201
+ for (const iframe of document.querySelectorAll('iframe[src]')) {
202
+ if (!(iframe instanceof HTMLIFrameElement))
203
+ continue;
204
+ const src = iframe.getAttribute('src') ?? '';
205
+ if (!src)
206
+ continue;
207
+ const inHead = !!iframe.closest('head');
208
+ const inNoscript = !!iframe.closest('noscript');
209
+ const location = inHead ? 'head' : inNoscript ? 'noscript' : 'body';
210
+ entries.push({ kind: 'iframe', src, location });
211
+ }
212
+ const win = window;
213
+ const presentGlobals = [];
214
+ for (const name of knownGlobals) {
215
+ if (win[name] !== undefined) {
216
+ presentGlobals.push(name);
217
+ }
218
+ }
219
+ if (presentGlobals.length > 0) {
220
+ entries.push({ kind: 'window-global', names: presentGlobals });
221
+ }
222
+ return entries;
223
+ }
package/dist/scraper.js CHANGED
@@ -51,6 +51,17 @@ import { parseUrl } from './parse-url.js';
51
51
  const pid = `${process.pid}`;
52
52
  const log = scraperLog.extend(pid);
53
53
  const rLog = resourceLog.extend(pid);
54
+ /**
55
+ * Upper bound for `document.body.scrollHeight` tolerated by `#fetchImages`.
56
+ * Pages exceeding this at a given device preset are skipped to keep
57
+ * `scrollAllOver` from running long enough to outlast the @retryable
58
+ * timeout and collide with a follow-up retry on the same Puppeteer page.
59
+ *
60
+ * 1,000,000 px is roughly 3× the worst real-world value we have measured
61
+ * (a responsive data-table page reached ~321k px at 320px viewport), so
62
+ * normal responsive sites complete well within the 20 min retry budget.
63
+ */
64
+ const MAX_SCROLL_HEIGHT = 1_000_000;
54
65
  let Scraper = (() => {
55
66
  let _classSuper = EventEmitter;
56
67
  let _instanceExtraInitializers = [];
@@ -62,7 +73,7 @@ let Scraper = (() => {
62
73
  static {
63
74
  const _metadata = typeof Symbol === "function" && Symbol.metadata ? Object.create(_classSuper[Symbol.metadata] ?? null) : void 0;
64
75
  _private_fetchData_decorators = [retryable({
65
- timeout: 3 * 60 * 1000,
76
+ timeout: 25 * 60 * 1000,
66
77
  onWait(determinedInterval, retryCount, methodName, error) {
67
78
  void this.emit('changePhase', {
68
79
  pid: process.pid,
@@ -83,7 +94,7 @@ let Scraper = (() => {
83
94
  },
84
95
  })];
85
96
  _private_fetchImages_decorators = [retryable({
86
- timeout: 5 * 60 * 1000,
97
+ timeout: 20 * 60 * 1000,
87
98
  fallback: [],
88
99
  onWait(determinedInterval, retryCount, methodName, error) {
89
100
  void this.emit('changePhase', {
@@ -402,13 +413,24 @@ let Scraper = (() => {
402
413
  isExternal,
403
414
  message: `📷 ${key} ↔️ ${preset.width}px`,
404
415
  });
405
- await beforePageScan(page, url, {
416
+ const scanResult = await beforePageScan(page, url, {
406
417
  name: key,
407
418
  width: preset.width,
408
419
  resolution: preset.resolution,
409
420
  listener,
410
421
  timeout: 5000,
422
+ maxScrollHeight: MAX_SCROLL_HEIGHT,
411
423
  });
424
+ if (!scanResult.scrolled) {
425
+ void this.emit('changePhase', {
426
+ pid: process.pid,
427
+ name: 'retryExhausted',
428
+ url: null,
429
+ isExternal: false,
430
+ message: `📷 ${key}: skipped — scrollHeight ${scanResult.scrollHeight} exceeds limit ${MAX_SCROLL_HEIGHT}`,
431
+ });
432
+ continue;
433
+ }
412
434
  void this.emit('changePhase', {
413
435
  pid: process.pid,
414
436
  name: 'waitImageLoad',
@@ -667,9 +689,18 @@ let Scraper = (() => {
667
689
  /**
668
690
  * Navigates the page to the target URL and extracts full page data.
669
691
  *
670
- * WHY retryable with 3-min timeout: Page navigation can fail due to transient
671
- * network issues or slow-loading pages. The decorator retries automatically,
672
- * emitting `retryWait` / `retryExhausted` phase events for progress monitoring.
692
+ * WHY retryable with 25-min timeout: Page navigation can fail due to
693
+ * transient network issues or slow-loading pages. The decorator retries
694
+ * automatically, emitting `retryWait` / `retryExhausted` phase events for
695
+ * progress monitoring. The timeout must accommodate the worst-case
696
+ * `#fetchImages` runtime (its own @retryable allows up to 20 min for
697
+ * pages with very large `scrollHeight` at narrow viewports). A shorter
698
+ * `#fetchData` timeout would race `#fetchImages` to completion: when the
699
+ * outer race fires first, `Promise.race` does not cancel the inner
700
+ * `#fetchImages`, so a new `#fetchData` retry starts while the previous
701
+ * attempt's scroll evaluates are still running on the same page —
702
+ * exactly the collision that surfaces as "Attempted to use detached
703
+ * Frame" or "Session closed".
673
704
  *
674
705
  * Flow:
675
706
  * 1. Register request/response/requestfailed listeners to capture sub-resources (internal pages only)
@@ -701,9 +732,23 @@ let Scraper = (() => {
701
732
  * changes and triggers a reload. Isolating each device preset allows partial
702
733
  * results — if one viewport fails, the other can still succeed.
703
734
  *
704
- * WHY retryable with 5-min timeout and `fallback: []`: Image extraction is
735
+ * WHY retryable with 20-min timeout and `fallback: []`: Image extraction is
705
736
  * best-effort. If all retries fail, an empty array is returned rather than
706
- * failing the entire page scrape.
737
+ * failing the entire page scrape. The 20-min wall clock accommodates pages
738
+ * whose mobile-small `scrollHeight` reaches ~300k px (observed on
739
+ * responsive data tables, which take ~5 min to scroll). A shorter timeout
740
+ * causes a second retry to start while the previous attempt's
741
+ * `scrollAllOver` is still running its `page.evaluate` calls in the
742
+ * background — `Promise.race` in `retry.ts` does not cancel `fn()`. The
743
+ * collision then surfaces as "Attempted to use detached Frame" or
744
+ * "Session closed" when the new attempt's reload / setViewport runs on
745
+ * the same page as the old attempt's pending evaluates.
746
+ *
747
+ * WHY pass `maxScrollHeight`: Even 20 min is not enough for pathological
748
+ * pages whose layout explodes at narrow viewports. Skipping the device
749
+ * preset entirely keeps the timeout-vs-background-evaluate collision from
750
+ * ever being triggered, at the cost of losing that viewport's image data
751
+ * for those pages. See {@link MAX_SCROLL_HEIGHT} for the chosen threshold.
707
752
  * @param page - Puppeteer page instance
708
753
  * @param url - The page URL string (without hash and auth)
709
754
  * @param isExternal - Whether the page is external
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@d-zero/beholder",
3
- "version": "3.0.0",
3
+ "version": "3.1.1",
4
4
  "description": "Page-level scraper for web crawling and auditing",
5
5
  "author": "D-ZERO",
6
6
  "license": "MIT",
@@ -20,19 +20,21 @@
20
20
  "clean": "tsc --build --clean"
21
21
  },
22
22
  "dependencies": {
23
- "@d-zero/puppeteer-page-scan": "4.5.2",
23
+ "@d-zero/puppeteer-page-scan": "4.6.0",
24
24
  "@d-zero/shared": "0.22.0",
25
25
  "debug": "4.4.3",
26
26
  "puppeteer": "24.37.5",
27
27
  "simple-wappalyzer": "1.1.99"
28
28
  },
29
29
  "devDependencies": {
30
- "@types/debug": "4.1.12"
30
+ "@types/debug": "4.1.12",
31
+ "@types/jsdom": "28.0.3",
32
+ "jsdom": "29.1.1"
31
33
  },
32
34
  "repository": {
33
35
  "type": "git",
34
36
  "url": "https://github.com/d-zero-dev/tools.git",
35
37
  "directory": "packages/@d-zero/beholder"
36
38
  },
37
- "gitHead": "16c831105a12bb635d49130e7f5add25b6643c40"
39
+ "gitHead": "d876ace142711051c337f7922931776526047cb0"
38
40
  }
@@ -22,6 +22,7 @@ import { raceWithTimeout } from '@d-zero/shared/race-with-timeout';
22
22
 
23
23
  import { domDetailsLog, domLog } from './debug.js';
24
24
  import { classify, emptyMeta } from './meta/classify.js';
25
+ import { WINDOW_GLOBALS_TO_CHECK, collectHeadFromDocument } from './meta/collect-head.js';
25
26
  import { detectTags } from './meta/tag-detection.js';
26
27
  import { parseUrl } from './parse-url.js';
27
28
 
@@ -515,46 +516,6 @@ export type GetMetaContext = {
515
516
  readonly includeRaw?: boolean;
516
517
  };
517
518
 
518
- const WINDOW_GLOBALS_TO_CHECK: readonly string[] = [
519
- 'dataLayer',
520
- 'gtag',
521
- 'ga',
522
- '_gaq',
523
- 'fbq',
524
- '_fbq',
525
- 'clarity',
526
- '_hjSettings',
527
- '_hjid',
528
- 'twq',
529
- 'ttq',
530
- '_linkedin_partner_id',
531
- 'pintrk',
532
- 'amplitude',
533
- 'mixpanel',
534
- 'analytics',
535
- 'heap',
536
- 'posthog',
537
- 'plausible',
538
- 'fathom',
539
- '_paq',
540
- 's_account',
541
- 's',
542
- 'ym',
543
- 'UET',
544
- 'optimizely',
545
- '_hsq',
546
- 'Sentry',
547
- 'Intercom',
548
- 'intercomSettings',
549
- 'drift',
550
- 'Tawk_API',
551
- 'zE',
552
- 'OneTrust',
553
- 'Cookiebot',
554
- 'Stripe',
555
- 'grecaptcha',
556
- ];
557
-
558
519
  /**
559
520
  * Extracts comprehensive metadata from the page.
560
521
  *
@@ -639,129 +600,27 @@ async function runGetMeta(page: Page, context: GetMetaContext): Promise<Meta | n
639
600
  }
640
601
 
641
602
  /**
603
+ * Collects raw `<head>` entries from a Puppeteer page by injecting
604
+ * {@link collectHeadFromDocument} into the page realm.
642
605
  *
643
- * @param page
606
+ * WHY string-eval instead of `page.evaluate(fn, args)`: the shared
607
+ * implementation lives in `./meta/collect-head.js` (`collectHeadFromDocument`), and a
608
+ * `page.evaluate(() => collectHeadFromDocument(window, …))` wrapper cannot
609
+ * reach that module-scope binding inside the page realm — only the wrapper's
610
+ * own source crosses the CDP boundary. Serializing the implementation via
611
+ * `Function.prototype.toString` and invoking it through
612
+ * `page.evaluate(string)` is what keeps the Puppeteer path and the
613
+ * jsdom path on one source of truth.
614
+ *
615
+ * The same {@link collectHeadFromDocument} function is also exposed via
616
+ * {@link ../extract-meta.ts | extractMetaFromDocument} for jsdom/Node callers,
617
+ * so the two paths cannot drift apart.
618
+ * @param page - The Puppeteer page whose document will be inspected.
644
619
  */
645
620
  async function collectHeadOnPage(page: Page): Promise<RawHeadEntry[]> {
646
- const raw = await page
647
- .evaluate((knownGlobals: readonly string[]) => {
648
- /* global document, HTMLLinkElement, HTMLMetaElement, HTMLBaseElement,
649
- HTMLScriptElement, HTMLIFrameElement */
650
- type Out = unknown;
651
- const entries: Out[] = [];
652
-
653
- const html = document.documentElement;
654
- entries.push(
655
- {
656
- kind: 'html',
657
- lang: html.lang || undefined,
658
- dir: html.dir || undefined,
659
- xmlns: html.getAttribute('xmlns') ?? undefined,
660
- prefix: html.getAttribute('prefix') ?? undefined,
661
- vocab: html.getAttribute('vocab') ?? undefined,
662
- typeOf: html.getAttribute('typeof') ?? undefined,
663
- itemscope: html.hasAttribute('itemscope') || undefined,
664
- itemtype: html.getAttribute('itemtype') ?? undefined,
665
- amp: html.hasAttribute('amp') || undefined,
666
- lightning: html.hasAttribute('⚡') || undefined,
667
- },
668
- { kind: 'title', content: document.title },
669
- );
670
-
671
- for (const base of document.querySelectorAll('base')) {
672
- if (!(base instanceof HTMLBaseElement)) continue;
673
- entries.push({
674
- kind: 'base',
675
- href: base.getAttribute('href') ?? undefined,
676
- target: base.getAttribute('target') ?? undefined,
677
- });
678
- }
679
-
680
- for (const meta of document.querySelectorAll('meta')) {
681
- if (!(meta instanceof HTMLMetaElement)) continue;
682
- const name = meta.getAttribute('name');
683
- const property = meta.getAttribute('property');
684
- const httpEquiv = meta.getAttribute('http-equiv');
685
- const itemprop = meta.getAttribute('itemprop');
686
- const charset = meta.getAttribute('charset');
687
- const content = meta.getAttribute('content');
688
- const media = meta.getAttribute('media');
689
- entries.push({
690
- kind: 'meta',
691
- name: name ? name.toLowerCase() : undefined,
692
- property: property ? property.toLowerCase() : undefined,
693
- httpEquiv: httpEquiv ? httpEquiv.toLowerCase() : undefined,
694
- itemprop: itemprop ?? undefined,
695
- charset: charset ?? undefined,
696
- content: content ?? undefined,
697
- media: media ?? undefined,
698
- });
699
- }
700
-
701
- for (const link of document.querySelectorAll('link[href]')) {
702
- if (!(link instanceof HTMLLinkElement)) continue;
703
- const relRaw = link.getAttribute('rel') ?? '';
704
- const rel = relRaw.toLowerCase().split(/\s+/u).filter(Boolean);
705
- entries.push({
706
- kind: 'link',
707
- rel,
708
- href: link.getAttribute('href') ?? '',
709
- type: link.getAttribute('type') ?? undefined,
710
- media: link.getAttribute('media') ?? undefined,
711
- sizes: link.getAttribute('sizes') ?? undefined,
712
- title: link.getAttribute('title') ?? undefined,
713
- hreflang: link.getAttribute('hreflang') ?? undefined,
714
- as: link.getAttribute('as') ?? undefined,
715
- crossorigin: link.getAttribute('crossorigin') ?? undefined,
716
- color: link.getAttribute('color') ?? undefined,
717
- blocking: link.getAttribute('blocking') ?? undefined,
718
- imagesrcset: link.getAttribute('imagesrcset') ?? undefined,
719
- });
720
- }
721
-
722
- const STRUCTURED_TYPES = new Set([
723
- 'application/ld+json',
724
- 'speculationrules',
725
- 'application/json+oembed',
726
- 'application/xml+oembed',
727
- ]);
728
- for (const script of document.querySelectorAll('script[type]')) {
729
- if (!(script instanceof HTMLScriptElement)) continue;
730
- const scriptType = (script.getAttribute('type') ?? '').toLowerCase();
731
- if (!STRUCTURED_TYPES.has(scriptType)) continue;
732
- const src = script.getAttribute('src') ?? undefined;
733
- const text = script.textContent ?? '';
734
- const inHead = !!script.closest('head');
735
- const inNoscript = !!script.closest('noscript');
736
- const location = inHead ? 'head' : inNoscript ? 'noscript' : 'body';
737
- entries.push({
738
- kind: 'script',
739
- scriptType,
740
- content: text || undefined,
741
- src,
742
- location,
743
- });
744
- }
745
-
746
- for (const iframe of document.querySelectorAll('iframe[src]')) {
747
- if (!(iframe instanceof HTMLIFrameElement)) continue;
748
- const src = iframe.getAttribute('src') ?? '';
749
- if (!src) continue;
750
- const inHead = !!iframe.closest('head');
751
- const inNoscript = !!iframe.closest('noscript');
752
- const location = inHead ? 'head' : inNoscript ? 'noscript' : 'body';
753
- entries.push({ kind: 'iframe', src, location });
754
- }
755
-
756
- const win = window as unknown as Record<string, unknown>;
757
- const presentGlobals = knownGlobals.filter((name) => win[name] !== undefined);
758
- if (presentGlobals.length > 0) {
759
- entries.push({ kind: 'window-global', names: presentGlobals });
760
- }
761
-
762
- return entries;
763
- }, WINDOW_GLOBALS_TO_CHECK)
764
- .catch(() => [] as unknown[]);
765
-
621
+ const fnSource = collectHeadFromDocument.toString();
622
+ const globalsLiteral = JSON.stringify(WINDOW_GLOBALS_TO_CHECK);
623
+ const expr = `(${fnSource})(window, ${globalsLiteral})`;
624
+ const raw = await page.evaluate(expr).catch(() => [] as unknown[]);
766
625
  return raw as RawHeadEntry[];
767
626
  }