@d-zero/beholder 3.0.0 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,223 @@
1
+ /**
2
+ * DOM-side raw `<head>` collector.
3
+ *
4
+ * `collectHeadFromDocument` walks a `Document` (Puppeteer page realm or jsdom realm
5
+ * alike) and produces a serializable {@link RawHeadEntry}[] that
6
+ * {@link ../meta/classify.ts | classify} can turn into a typed `Meta`.
7
+ *
8
+ * WHY this function is realm-agnostic:
9
+ *
10
+ * - The Puppeteer path stringifies this function via `Function.prototype.toString`
11
+ * and runs it as a `page.evaluate(string)` expression, so any closure over
12
+ * module-scope bindings would throw a `ReferenceError` in the browser realm.
13
+ * - The jsdom (Node) path calls it directly with the jsdom `Window`. Because
14
+ * `HTMLLinkElement` (etc.) in jsdom is a *different class instance* from the
15
+ * one in the page realm, `instanceof` only works when the constructor is read
16
+ * from the *passed* `window` rather than from bare globals.
17
+ *
18
+ * Together those constraints dictate that the function MUST:
19
+ *
20
+ * 1. Reference no module-level variables — only its own parameters and inner locals.
21
+ * 2. Take every HTML class constructor (`HTMLBaseElement`, …) from the passed
22
+ * `window` via destructuring instead of relying on ambient globals.
23
+ * 3. Stay in plain ES syntax (no TS-only constructs that need helper imports).
24
+ * @module
25
+ */
26
+ /**
27
+ * Curated list of `window` globals whose presence indicates that a third-party
28
+ * tag library has been loaded on the page. Surfaced as a single
29
+ * `kind: 'window-global'` entry so that downstream consumers (e.g. tag-detection)
30
+ * can cross-reference the script/iframe signals.
31
+ *
32
+ * Kept here (rather than in `dom-evaluation.ts`) so the Puppeteer path and the
33
+ * jsdom path share one source of truth.
34
+ */
35
+ export const WINDOW_GLOBALS_TO_CHECK = [
36
+ 'dataLayer',
37
+ 'gtag',
38
+ 'ga',
39
+ '_gaq',
40
+ 'fbq',
41
+ '_fbq',
42
+ 'clarity',
43
+ '_hjSettings',
44
+ '_hjid',
45
+ 'twq',
46
+ 'ttq',
47
+ '_linkedin_partner_id',
48
+ 'pintrk',
49
+ 'amplitude',
50
+ 'mixpanel',
51
+ 'analytics',
52
+ 'heap',
53
+ 'posthog',
54
+ 'plausible',
55
+ 'fathom',
56
+ '_paq',
57
+ 's_account',
58
+ 's',
59
+ 'ym',
60
+ 'UET',
61
+ 'optimizely',
62
+ '_hsq',
63
+ 'Sentry',
64
+ 'Intercom',
65
+ 'intercomSettings',
66
+ 'drift',
67
+ 'Tawk_API',
68
+ 'zE',
69
+ 'OneTrust',
70
+ 'Cookiebot',
71
+ 'Stripe',
72
+ 'grecaptcha',
73
+ ];
74
+ /**
75
+ * Walks the given window's `Document` and returns a serializable list of raw
76
+ * head entries.
77
+ *
78
+ * Two realms are supported:
79
+ *
80
+ * - Browser realm (Puppeteer): the function source is `.toString()`'d and run
81
+ * inside the page via `page.evaluate(string)`. Inside the page, `window`
82
+ * resolves to the page's global object, so destructured class constructors
83
+ * match `instanceof` checks against elements returned from `querySelectorAll`.
84
+ * - Node realm (jsdom et al.): the caller passes `dom.window` directly. jsdom's
85
+ * HTML element prototypes are distinct from the host Node's bare globals, so
86
+ * reading the constructors off the passed `window` is what makes `instanceof`
87
+ * succeed.
88
+ *
89
+ * The function MUST NOT close over any module-scope binding — all data it needs
90
+ * is reached through its two parameters.
91
+ * @param window - The window object whose `document` will be inspected. Provides
92
+ * both the DOM tree and the HTML element constructors used for
93
+ * `instanceof` narrowing.
94
+ * @param knownGlobals - Names of `window` properties that, when present,
95
+ * indicate a third-party tag library is loaded. Required
96
+ * (no default) so the Puppeteer-side string-eval path
97
+ * does not have to inline a default value list.
98
+ * @returns Serializable list of raw head entries for {@link ../meta/classify.ts | classify}.
99
+ */
100
+ export function collectHeadFromDocument(window, knownGlobals) {
101
+ const document = window.document;
102
+ // TypeScript's `Window` interface in lib.dom does not directly expose the
103
+ // HTML element constructors (`HTMLLinkElement`, `HTMLScriptElement`, …)
104
+ // even though every real window object — browser realm AND jsdom realm —
105
+ // carries them at runtime. Widening the type here lets us destructure them
106
+ // uniformly; the runtime values come straight from the passed window, so
107
+ // the cast is purely cosmetic for TS and erased at compile time.
108
+ const w = window;
109
+ const { HTMLBaseElement, HTMLMetaElement, HTMLLinkElement, HTMLScriptElement, HTMLIFrameElement, } = w;
110
+ const entries = [];
111
+ const html = document.documentElement;
112
+ entries.push({
113
+ kind: 'html',
114
+ lang: html.lang || undefined,
115
+ dir: html.dir || undefined,
116
+ xmlns: html.getAttribute('xmlns') ?? undefined,
117
+ prefix: html.getAttribute('prefix') ?? undefined,
118
+ vocab: html.getAttribute('vocab') ?? undefined,
119
+ typeOf: html.getAttribute('typeof') ?? undefined,
120
+ itemscope: html.hasAttribute('itemscope') || undefined,
121
+ itemtype: html.getAttribute('itemtype') ?? undefined,
122
+ amp: html.hasAttribute('amp') || undefined,
123
+ lightning: html.hasAttribute('⚡') || undefined,
124
+ }, { kind: 'title', content: document.title });
125
+ for (const base of document.querySelectorAll('base')) {
126
+ if (!(base instanceof HTMLBaseElement))
127
+ continue;
128
+ entries.push({
129
+ kind: 'base',
130
+ href: base.getAttribute('href') ?? undefined,
131
+ target: base.getAttribute('target') ?? undefined,
132
+ });
133
+ }
134
+ for (const meta of document.querySelectorAll('meta')) {
135
+ if (!(meta instanceof HTMLMetaElement))
136
+ continue;
137
+ const name = meta.getAttribute('name');
138
+ const property = meta.getAttribute('property');
139
+ const httpEquiv = meta.getAttribute('http-equiv');
140
+ const itemprop = meta.getAttribute('itemprop');
141
+ const charset = meta.getAttribute('charset');
142
+ const content = meta.getAttribute('content');
143
+ const media = meta.getAttribute('media');
144
+ entries.push({
145
+ kind: 'meta',
146
+ name: name ? name.toLowerCase() : undefined,
147
+ property: property ? property.toLowerCase() : undefined,
148
+ httpEquiv: httpEquiv ? httpEquiv.toLowerCase() : undefined,
149
+ itemprop: itemprop ?? undefined,
150
+ charset: charset ?? undefined,
151
+ content: content ?? undefined,
152
+ media: media ?? undefined,
153
+ });
154
+ }
155
+ for (const link of document.querySelectorAll('link[href]')) {
156
+ if (!(link instanceof HTMLLinkElement))
157
+ continue;
158
+ const relRaw = link.getAttribute('rel') ?? '';
159
+ const rel = relRaw.toLowerCase().split(/\s+/u).filter(Boolean);
160
+ entries.push({
161
+ kind: 'link',
162
+ rel,
163
+ href: link.getAttribute('href') ?? '',
164
+ type: link.getAttribute('type') ?? undefined,
165
+ media: link.getAttribute('media') ?? undefined,
166
+ sizes: link.getAttribute('sizes') ?? undefined,
167
+ title: link.getAttribute('title') ?? undefined,
168
+ hreflang: link.getAttribute('hreflang') ?? undefined,
169
+ as: link.getAttribute('as') ?? undefined,
170
+ crossorigin: link.getAttribute('crossorigin') ?? undefined,
171
+ color: link.getAttribute('color') ?? undefined,
172
+ blocking: link.getAttribute('blocking') ?? undefined,
173
+ imagesrcset: link.getAttribute('imagesrcset') ?? undefined,
174
+ });
175
+ }
176
+ const STRUCTURED_TYPES = new Set([
177
+ 'application/ld+json',
178
+ 'speculationrules',
179
+ 'application/json+oembed',
180
+ 'application/xml+oembed',
181
+ ]);
182
+ for (const script of document.querySelectorAll('script[type]')) {
183
+ if (!(script instanceof HTMLScriptElement))
184
+ continue;
185
+ const scriptType = (script.getAttribute('type') ?? '').toLowerCase();
186
+ if (!STRUCTURED_TYPES.has(scriptType))
187
+ continue;
188
+ const src = script.getAttribute('src') ?? undefined;
189
+ const text = script.textContent ?? '';
190
+ const inHead = !!script.closest('head');
191
+ const inNoscript = !!script.closest('noscript');
192
+ const location = inHead ? 'head' : inNoscript ? 'noscript' : 'body';
193
+ entries.push({
194
+ kind: 'script',
195
+ scriptType,
196
+ content: text || undefined,
197
+ src,
198
+ location,
199
+ });
200
+ }
201
+ for (const iframe of document.querySelectorAll('iframe[src]')) {
202
+ if (!(iframe instanceof HTMLIFrameElement))
203
+ continue;
204
+ const src = iframe.getAttribute('src') ?? '';
205
+ if (!src)
206
+ continue;
207
+ const inHead = !!iframe.closest('head');
208
+ const inNoscript = !!iframe.closest('noscript');
209
+ const location = inHead ? 'head' : inNoscript ? 'noscript' : 'body';
210
+ entries.push({ kind: 'iframe', src, location });
211
+ }
212
+ const win = window;
213
+ const presentGlobals = [];
214
+ for (const name of knownGlobals) {
215
+ if (win[name] !== undefined) {
216
+ presentGlobals.push(name);
217
+ }
218
+ }
219
+ if (presentGlobals.length > 0) {
220
+ entries.push({ kind: 'window-global', names: presentGlobals });
221
+ }
222
+ return entries;
223
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@d-zero/beholder",
3
- "version": "3.0.0",
3
+ "version": "3.1.0",
4
4
  "description": "Page-level scraper for web crawling and auditing",
5
5
  "author": "D-ZERO",
6
6
  "license": "MIT",
@@ -27,12 +27,14 @@
27
27
  "simple-wappalyzer": "1.1.99"
28
28
  },
29
29
  "devDependencies": {
30
- "@types/debug": "4.1.12"
30
+ "@types/debug": "4.1.12",
31
+ "@types/jsdom": "28.0.3",
32
+ "jsdom": "29.1.1"
31
33
  },
32
34
  "repository": {
33
35
  "type": "git",
34
36
  "url": "https://github.com/d-zero-dev/tools.git",
35
37
  "directory": "packages/@d-zero/beholder"
36
38
  },
37
- "gitHead": "16c831105a12bb635d49130e7f5add25b6643c40"
39
+ "gitHead": "e69344a9d4d45b0ec0ee942f920b84bbd0fb77ae"
38
40
  }
@@ -22,6 +22,7 @@ import { raceWithTimeout } from '@d-zero/shared/race-with-timeout';
22
22
 
23
23
  import { domDetailsLog, domLog } from './debug.js';
24
24
  import { classify, emptyMeta } from './meta/classify.js';
25
+ import { WINDOW_GLOBALS_TO_CHECK, collectHeadFromDocument } from './meta/collect-head.js';
25
26
  import { detectTags } from './meta/tag-detection.js';
26
27
  import { parseUrl } from './parse-url.js';
27
28
 
@@ -515,46 +516,6 @@ export type GetMetaContext = {
515
516
  readonly includeRaw?: boolean;
516
517
  };
517
518
 
518
- const WINDOW_GLOBALS_TO_CHECK: readonly string[] = [
519
- 'dataLayer',
520
- 'gtag',
521
- 'ga',
522
- '_gaq',
523
- 'fbq',
524
- '_fbq',
525
- 'clarity',
526
- '_hjSettings',
527
- '_hjid',
528
- 'twq',
529
- 'ttq',
530
- '_linkedin_partner_id',
531
- 'pintrk',
532
- 'amplitude',
533
- 'mixpanel',
534
- 'analytics',
535
- 'heap',
536
- 'posthog',
537
- 'plausible',
538
- 'fathom',
539
- '_paq',
540
- 's_account',
541
- 's',
542
- 'ym',
543
- 'UET',
544
- 'optimizely',
545
- '_hsq',
546
- 'Sentry',
547
- 'Intercom',
548
- 'intercomSettings',
549
- 'drift',
550
- 'Tawk_API',
551
- 'zE',
552
- 'OneTrust',
553
- 'Cookiebot',
554
- 'Stripe',
555
- 'grecaptcha',
556
- ];
557
-
558
519
  /**
559
520
  * Extracts comprehensive metadata from the page.
560
521
  *
@@ -639,129 +600,27 @@ async function runGetMeta(page: Page, context: GetMetaContext): Promise<Meta | n
639
600
  }
640
601
 
641
602
  /**
603
+ * Collects raw `<head>` entries from a Puppeteer page by injecting
604
+ * {@link collectHeadFromDocument} into the page realm.
642
605
  *
643
- * @param page
606
+ * WHY string-eval instead of `page.evaluate(fn, args)`: the shared
607
+ * implementation lives in `./meta/collect-head.js` (`collectHeadFromDocument`), and a
608
+ * `page.evaluate(() => collectHeadFromDocument(window, …))` wrapper cannot
609
+ * reach that module-scope binding inside the page realm — only the wrapper's
610
+ * own source crosses the CDP boundary. Serializing the implementation via
611
+ * `Function.prototype.toString` and invoking it through
612
+ * `page.evaluate(string)` is what keeps the Puppeteer path and the
613
+ * jsdom path on one source of truth.
614
+ *
615
+ * The same {@link collectHeadFromDocument} function is also exposed via
616
+ * {@link ../extract-meta.ts | extractMetaFromDocument} for jsdom/Node callers,
617
+ * so the two paths cannot drift apart.
618
+ * @param page - The Puppeteer page whose document will be inspected.
644
619
  */
645
620
  async function collectHeadOnPage(page: Page): Promise<RawHeadEntry[]> {
646
- const raw = await page
647
- .evaluate((knownGlobals: readonly string[]) => {
648
- /* global document, HTMLLinkElement, HTMLMetaElement, HTMLBaseElement,
649
- HTMLScriptElement, HTMLIFrameElement */
650
- type Out = unknown;
651
- const entries: Out[] = [];
652
-
653
- const html = document.documentElement;
654
- entries.push(
655
- {
656
- kind: 'html',
657
- lang: html.lang || undefined,
658
- dir: html.dir || undefined,
659
- xmlns: html.getAttribute('xmlns') ?? undefined,
660
- prefix: html.getAttribute('prefix') ?? undefined,
661
- vocab: html.getAttribute('vocab') ?? undefined,
662
- typeOf: html.getAttribute('typeof') ?? undefined,
663
- itemscope: html.hasAttribute('itemscope') || undefined,
664
- itemtype: html.getAttribute('itemtype') ?? undefined,
665
- amp: html.hasAttribute('amp') || undefined,
666
- lightning: html.hasAttribute('⚡') || undefined,
667
- },
668
- { kind: 'title', content: document.title },
669
- );
670
-
671
- for (const base of document.querySelectorAll('base')) {
672
- if (!(base instanceof HTMLBaseElement)) continue;
673
- entries.push({
674
- kind: 'base',
675
- href: base.getAttribute('href') ?? undefined,
676
- target: base.getAttribute('target') ?? undefined,
677
- });
678
- }
679
-
680
- for (const meta of document.querySelectorAll('meta')) {
681
- if (!(meta instanceof HTMLMetaElement)) continue;
682
- const name = meta.getAttribute('name');
683
- const property = meta.getAttribute('property');
684
- const httpEquiv = meta.getAttribute('http-equiv');
685
- const itemprop = meta.getAttribute('itemprop');
686
- const charset = meta.getAttribute('charset');
687
- const content = meta.getAttribute('content');
688
- const media = meta.getAttribute('media');
689
- entries.push({
690
- kind: 'meta',
691
- name: name ? name.toLowerCase() : undefined,
692
- property: property ? property.toLowerCase() : undefined,
693
- httpEquiv: httpEquiv ? httpEquiv.toLowerCase() : undefined,
694
- itemprop: itemprop ?? undefined,
695
- charset: charset ?? undefined,
696
- content: content ?? undefined,
697
- media: media ?? undefined,
698
- });
699
- }
700
-
701
- for (const link of document.querySelectorAll('link[href]')) {
702
- if (!(link instanceof HTMLLinkElement)) continue;
703
- const relRaw = link.getAttribute('rel') ?? '';
704
- const rel = relRaw.toLowerCase().split(/\s+/u).filter(Boolean);
705
- entries.push({
706
- kind: 'link',
707
- rel,
708
- href: link.getAttribute('href') ?? '',
709
- type: link.getAttribute('type') ?? undefined,
710
- media: link.getAttribute('media') ?? undefined,
711
- sizes: link.getAttribute('sizes') ?? undefined,
712
- title: link.getAttribute('title') ?? undefined,
713
- hreflang: link.getAttribute('hreflang') ?? undefined,
714
- as: link.getAttribute('as') ?? undefined,
715
- crossorigin: link.getAttribute('crossorigin') ?? undefined,
716
- color: link.getAttribute('color') ?? undefined,
717
- blocking: link.getAttribute('blocking') ?? undefined,
718
- imagesrcset: link.getAttribute('imagesrcset') ?? undefined,
719
- });
720
- }
721
-
722
- const STRUCTURED_TYPES = new Set([
723
- 'application/ld+json',
724
- 'speculationrules',
725
- 'application/json+oembed',
726
- 'application/xml+oembed',
727
- ]);
728
- for (const script of document.querySelectorAll('script[type]')) {
729
- if (!(script instanceof HTMLScriptElement)) continue;
730
- const scriptType = (script.getAttribute('type') ?? '').toLowerCase();
731
- if (!STRUCTURED_TYPES.has(scriptType)) continue;
732
- const src = script.getAttribute('src') ?? undefined;
733
- const text = script.textContent ?? '';
734
- const inHead = !!script.closest('head');
735
- const inNoscript = !!script.closest('noscript');
736
- const location = inHead ? 'head' : inNoscript ? 'noscript' : 'body';
737
- entries.push({
738
- kind: 'script',
739
- scriptType,
740
- content: text || undefined,
741
- src,
742
- location,
743
- });
744
- }
745
-
746
- for (const iframe of document.querySelectorAll('iframe[src]')) {
747
- if (!(iframe instanceof HTMLIFrameElement)) continue;
748
- const src = iframe.getAttribute('src') ?? '';
749
- if (!src) continue;
750
- const inHead = !!iframe.closest('head');
751
- const inNoscript = !!iframe.closest('noscript');
752
- const location = inHead ? 'head' : inNoscript ? 'noscript' : 'body';
753
- entries.push({ kind: 'iframe', src, location });
754
- }
755
-
756
- const win = window as unknown as Record<string, unknown>;
757
- const presentGlobals = knownGlobals.filter((name) => win[name] !== undefined);
758
- if (presentGlobals.length > 0) {
759
- entries.push({ kind: 'window-global', names: presentGlobals });
760
- }
761
-
762
- return entries;
763
- }, WINDOW_GLOBALS_TO_CHECK)
764
- .catch(() => [] as unknown[]);
765
-
621
+ const fnSource = collectHeadFromDocument.toString();
622
+ const globalsLiteral = JSON.stringify(WINDOW_GLOBALS_TO_CHECK);
623
+ const expr = `(${fnSource})(window, ${globalsLiteral})`;
624
+ const raw = await page.evaluate(expr).catch(() => [] as unknown[]);
766
625
  return raw as RawHeadEntry[];
767
626
  }