@d-zero/beholder 3.0.0 → 3.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +13 -0
- package/README.md +26 -0
- package/dist/dom-evaluation.js +20 -152
- package/dist/extract-meta.d.ts +98 -0
- package/dist/extract-meta.js +75 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +1 -0
- package/dist/meta/collect-head.d.ts +63 -0
- package/dist/meta/collect-head.js +223 -0
- package/dist/scraper.js +53 -8
- package/package.json +6 -4
- package/src/dom-evaluation.ts +20 -161
- package/src/extract-meta.spec.ts +247 -0
- package/src/extract-meta.ts +121 -0
- package/src/index.ts +2 -0
- package/src/meta/collect-head.ts +247 -0
- package/src/scraper.ts +55 -8
- package/tsconfig.tsbuildinfo +1 -1
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DOM-side raw `<head>` collector.
|
|
3
|
+
*
|
|
4
|
+
* `collectHeadFromDocument` walks a `Document` (Puppeteer page realm or jsdom realm
|
|
5
|
+
* alike) and produces a serializable {@link RawHeadEntry}[] that
|
|
6
|
+
* {@link ../meta/classify.ts | classify} can turn into a typed `Meta`.
|
|
7
|
+
*
|
|
8
|
+
* WHY this function is realm-agnostic:
|
|
9
|
+
*
|
|
10
|
+
* - The Puppeteer path stringifies this function via `Function.prototype.toString`
|
|
11
|
+
* and runs it as a `page.evaluate(string)` expression, so any closure over
|
|
12
|
+
* module-scope bindings would resolve to `undefined` in the browser realm.
|
|
13
|
+
* - The jsdom (Node) path calls it directly with the jsdom `Window`. Because
|
|
14
|
+
* `HTMLLinkElement` (etc.) in jsdom is a *different class instance* from the
|
|
15
|
+
* one in the page realm, `instanceof` only works when the constructor is read
|
|
16
|
+
* from the *passed* `window` rather than from bare globals.
|
|
17
|
+
*
|
|
18
|
+
* Together those constraints dictate that the function MUST:
|
|
19
|
+
*
|
|
20
|
+
* 1. Reference no module-level variables — only its own parameters and inner locals.
|
|
21
|
+
* 2. Take every HTML class constructor (`HTMLBaseElement`, …) from the passed
|
|
22
|
+
* `window` via destructuring instead of relying on ambient globals.
|
|
23
|
+
* 3. Stay in plain ES syntax (no TS-only constructs that need helper imports).
|
|
24
|
+
* @module
|
|
25
|
+
*/
|
|
26
|
+
/**
 * Curated list of `window` globals whose presence indicates that a third-party
 * tag library has been loaded on the page. Surfaced as a single
 * `kind: 'window-global'` entry so that downstream consumers (e.g. tag-detection)
 * can cross-reference the script/iframe signals.
 *
 * Kept here (rather than in `dom-evaluation.ts`) so the Puppeteer path and the
 * jsdom path share one source of truth.
 *
 * The array is frozen: it is a shared constant that is only meant to be read
 * (iterated or serialized with `JSON.stringify`), so accidental mutation by one
 * consumer must not leak into every other consumer of the module.
 */
export const WINDOW_GLOBALS_TO_CHECK = Object.freeze([
    'dataLayer',
    'gtag',
    'ga',
    '_gaq',
    'fbq',
    '_fbq',
    'clarity',
    '_hjSettings',
    '_hjid',
    'twq',
    'ttq',
    '_linkedin_partner_id',
    'pintrk',
    'amplitude',
    'mixpanel',
    'analytics',
    'heap',
    'posthog',
    'plausible',
    'fathom',
    '_paq',
    's_account',
    's',
    'ym',
    'UET',
    'optimizely',
    '_hsq',
    'Sentry',
    'Intercom',
    'intercomSettings',
    'drift',
    'Tawk_API',
    'zE',
    'OneTrust',
    'Cookiebot',
    'Stripe',
    'grecaptcha',
]);
|
|
74
|
+
/**
 * Walks the given window's `Document` and returns a serializable list of raw
 * head entries.
 *
 * Two realms are supported:
 *
 * - Browser realm (Puppeteer): the function source is `.toString()`'d and run
 *   inside the page via `page.evaluate(string)`. Inside the page, `window`
 *   resolves to the page's global object, so destructured class constructors
 *   match `instanceof` checks against elements returned from `querySelectorAll`.
 * - Node realm (jsdom et al.): the caller passes `dom.window` directly. jsdom's
 *   HTML element prototypes are distinct from the host Node's bare globals, so
 *   reading the constructors off the passed `window` is what makes `instanceof`
 *   succeed.
 *
 * The function MUST NOT close over any module-scope binding — all data it needs
 * is reached through its two parameters, and every helper is an inner local.
 *
 * Entries are emitted in this order: one `html` entry, one `title` entry, then
 * `base`, `meta`, `link`, `script` (structured-data types only), `iframe`, and
 * finally at most one `window-global` entry.
 *
 * @param window - The window object whose `document` will be inspected. Provides
 *                 both the DOM tree and the HTML element constructors used for
 *                 `instanceof` narrowing.
 * @param knownGlobals - Names of `window` properties that, when present,
 *                       indicate a third-party tag library is loaded. Required
 *                       (no default) so the Puppeteer-side string-eval path
 *                       does not have to inline a default value list.
 * @returns Serializable list of raw head entries for {@link ../meta/classify.ts | classify}.
 */
export function collectHeadFromDocument(window, knownGlobals) {
    const document = window.document;
    // Every constructor is taken from the passed window (never from ambient
    // globals) so that `instanceof` works in both the page realm and jsdom.
    const {
        HTMLBaseElement,
        HTMLMetaElement,
        HTMLLinkElement,
        HTMLScriptElement,
        HTMLIFrameElement,
        Element,
        HTMLCollection,
    } = window;
    const entries = [];

    // Classifies where an element sits; `head` wins over `noscript`, which wins over `body`.
    const locate = (element) => {
        if (element.closest('head')) {
            return 'head';
        }
        return element.closest('noscript') ? 'noscript' : 'body';
    };

    // HTML's "named access on the Window object" makes `window[key]` resolve to
    // an element (or an HTMLCollection when several match) for markup such as
    // `<div id="analytics">` or `<img name="s">`. Such values are page markup,
    // not a loaded tag library, so they must not count as a detected global.
    const isNamedDomAccess = (value) =>
        (typeof Element === 'function' && value instanceof Element) ||
        (typeof HTMLCollection === 'function' && value instanceof HTMLCollection);

    const html = document.documentElement;
    entries.push(
        {
            kind: 'html',
            // `||` (not `??`) on purpose: empty strings / `false` collapse to `undefined`
            // so they are dropped on serialization.
            lang: html.lang || undefined,
            dir: html.dir || undefined,
            xmlns: html.getAttribute('xmlns') ?? undefined,
            prefix: html.getAttribute('prefix') ?? undefined,
            vocab: html.getAttribute('vocab') ?? undefined,
            typeOf: html.getAttribute('typeof') ?? undefined,
            itemscope: html.hasAttribute('itemscope') || undefined,
            itemtype: html.getAttribute('itemtype') ?? undefined,
            amp: html.hasAttribute('amp') || undefined,
            lightning: html.hasAttribute('⚡') || undefined,
        },
        { kind: 'title', content: document.title },
    );

    for (const base of document.querySelectorAll('base')) {
        if (!(base instanceof HTMLBaseElement)) {
            continue;
        }
        entries.push({
            kind: 'base',
            href: base.getAttribute('href') ?? undefined,
            target: base.getAttribute('target') ?? undefined,
        });
    }

    for (const meta of document.querySelectorAll('meta')) {
        if (!(meta instanceof HTMLMetaElement)) {
            continue;
        }
        const name = meta.getAttribute('name');
        const property = meta.getAttribute('property');
        const httpEquiv = meta.getAttribute('http-equiv');
        entries.push({
            kind: 'meta',
            // Key-like attributes are lower-cased; value-like ones are passed through verbatim.
            name: name ? name.toLowerCase() : undefined,
            property: property ? property.toLowerCase() : undefined,
            httpEquiv: httpEquiv ? httpEquiv.toLowerCase() : undefined,
            itemprop: meta.getAttribute('itemprop') ?? undefined,
            charset: meta.getAttribute('charset') ?? undefined,
            content: meta.getAttribute('content') ?? undefined,
            media: meta.getAttribute('media') ?? undefined,
        });
    }

    for (const link of document.querySelectorAll('link[href]')) {
        if (!(link instanceof HTMLLinkElement)) {
            continue;
        }
        // `rel` is a whitespace-separated token list; normalize to lower-case tokens.
        const rel = (link.getAttribute('rel') ?? '')
            .toLowerCase()
            .split(/\s+/u)
            .filter(Boolean);
        entries.push({
            kind: 'link',
            rel,
            href: link.getAttribute('href') ?? '',
            type: link.getAttribute('type') ?? undefined,
            media: link.getAttribute('media') ?? undefined,
            sizes: link.getAttribute('sizes') ?? undefined,
            title: link.getAttribute('title') ?? undefined,
            hreflang: link.getAttribute('hreflang') ?? undefined,
            as: link.getAttribute('as') ?? undefined,
            crossorigin: link.getAttribute('crossorigin') ?? undefined,
            color: link.getAttribute('color') ?? undefined,
            blocking: link.getAttribute('blocking') ?? undefined,
            imagesrcset: link.getAttribute('imagesrcset') ?? undefined,
        });
    }

    // Only data-carrying script types are collected; executable scripts are ignored.
    const STRUCTURED_TYPES = new Set([
        'application/ld+json',
        'speculationrules',
        'application/json+oembed',
        'application/xml+oembed',
    ]);
    for (const script of document.querySelectorAll('script[type]')) {
        if (!(script instanceof HTMLScriptElement)) {
            continue;
        }
        const scriptType = (script.getAttribute('type') ?? '').toLowerCase();
        if (!STRUCTURED_TYPES.has(scriptType)) {
            continue;
        }
        entries.push({
            kind: 'script',
            scriptType,
            content: (script.textContent ?? '') || undefined,
            src: script.getAttribute('src') ?? undefined,
            location: locate(script),
        });
    }

    for (const iframe of document.querySelectorAll('iframe[src]')) {
        if (!(iframe instanceof HTMLIFrameElement)) {
            continue;
        }
        const src = iframe.getAttribute('src') ?? '';
        if (!src) {
            continue;
        }
        entries.push({ kind: 'iframe', src, location: locate(iframe) });
    }

    const presentGlobals = [];
    for (const name of knownGlobals) {
        const value = window[name];
        if (value === undefined || isNamedDomAccess(value)) {
            continue;
        }
        presentGlobals.push(name);
    }
    if (presentGlobals.length > 0) {
        entries.push({ kind: 'window-global', names: presentGlobals });
    }
    return entries;
}
|
package/dist/scraper.js
CHANGED
|
@@ -51,6 +51,17 @@ import { parseUrl } from './parse-url.js';
|
|
|
51
51
|
const pid = `${process.pid}`;
|
|
52
52
|
const log = scraperLog.extend(pid);
|
|
53
53
|
const rLog = resourceLog.extend(pid);
|
|
54
|
+
/**
|
|
55
|
+
* Upper bound for `document.body.scrollHeight` tolerated by `#fetchImages`.
|
|
56
|
+
* Pages exceeding this at a given device preset are skipped to keep
|
|
57
|
+
* `scrollAllOver` from running long enough to outlast the @retryable
|
|
58
|
+
* timeout and collide with a follow-up retry on the same Puppeteer page.
|
|
59
|
+
*
|
|
60
|
+
* 1,000,000 px is roughly 3× the worst real-world value we have measured
|
|
61
|
+
* (a responsive data-table page reached ~321k px at 320px viewport), so
|
|
62
|
+
* normal responsive sites complete well within the 20 min retry budget.
|
|
63
|
+
*/
|
|
64
|
+
const MAX_SCROLL_HEIGHT = 1_000_000;
|
|
54
65
|
let Scraper = (() => {
|
|
55
66
|
let _classSuper = EventEmitter;
|
|
56
67
|
let _instanceExtraInitializers = [];
|
|
@@ -62,7 +73,7 @@ let Scraper = (() => {
|
|
|
62
73
|
static {
|
|
63
74
|
const _metadata = typeof Symbol === "function" && Symbol.metadata ? Object.create(_classSuper[Symbol.metadata] ?? null) : void 0;
|
|
64
75
|
_private_fetchData_decorators = [retryable({
|
|
65
|
-
timeout:
|
|
76
|
+
timeout: 25 * 60 * 1000,
|
|
66
77
|
onWait(determinedInterval, retryCount, methodName, error) {
|
|
67
78
|
void this.emit('changePhase', {
|
|
68
79
|
pid: process.pid,
|
|
@@ -83,7 +94,7 @@ let Scraper = (() => {
|
|
|
83
94
|
},
|
|
84
95
|
})];
|
|
85
96
|
_private_fetchImages_decorators = [retryable({
|
|
86
|
-
timeout:
|
|
97
|
+
timeout: 20 * 60 * 1000,
|
|
87
98
|
fallback: [],
|
|
88
99
|
onWait(determinedInterval, retryCount, methodName, error) {
|
|
89
100
|
void this.emit('changePhase', {
|
|
@@ -402,13 +413,24 @@ let Scraper = (() => {
|
|
|
402
413
|
isExternal,
|
|
403
414
|
message: `📷 ${key} ↔️ ${preset.width}px`,
|
|
404
415
|
});
|
|
405
|
-
await beforePageScan(page, url, {
|
|
416
|
+
const scanResult = await beforePageScan(page, url, {
|
|
406
417
|
name: key,
|
|
407
418
|
width: preset.width,
|
|
408
419
|
resolution: preset.resolution,
|
|
409
420
|
listener,
|
|
410
421
|
timeout: 5000,
|
|
422
|
+
maxScrollHeight: MAX_SCROLL_HEIGHT,
|
|
411
423
|
});
|
|
424
|
+
if (!scanResult.scrolled) {
|
|
425
|
+
void this.emit('changePhase', {
|
|
426
|
+
pid: process.pid,
|
|
427
|
+
name: 'retryExhausted',
|
|
428
|
+
url: null,
|
|
429
|
+
isExternal: false,
|
|
430
|
+
message: `📷 ${key}: skipped — scrollHeight ${scanResult.scrollHeight} exceeds limit ${MAX_SCROLL_HEIGHT}`,
|
|
431
|
+
});
|
|
432
|
+
continue;
|
|
433
|
+
}
|
|
412
434
|
void this.emit('changePhase', {
|
|
413
435
|
pid: process.pid,
|
|
414
436
|
name: 'waitImageLoad',
|
|
@@ -667,9 +689,18 @@ let Scraper = (() => {
|
|
|
667
689
|
/**
|
|
668
690
|
* Navigates the page to the target URL and extracts full page data.
|
|
669
691
|
*
|
|
670
|
-
* WHY retryable with
|
|
671
|
-
* network issues or slow-loading pages. The decorator retries
|
|
672
|
-
* emitting `retryWait` / `retryExhausted` phase events for
|
|
692
|
+
* WHY retryable with 25-min timeout: Page navigation can fail due to
|
|
693
|
+
* transient network issues or slow-loading pages. The decorator retries
|
|
694
|
+
* automatically, emitting `retryWait` / `retryExhausted` phase events for
|
|
695
|
+
* progress monitoring. The timeout must accommodate the worst-case
|
|
696
|
+
* `#fetchImages` runtime (its own @retryable allows up to 20 min for
|
|
697
|
+
* pages with very large `scrollHeight` at narrow viewports). A shorter
|
|
698
|
+
* `#fetchData` timeout would race `#fetchImages` to completion: when the
|
|
699
|
+
* outer race fires first, `Promise.race` does not cancel the inner
|
|
700
|
+
* `#fetchImages`, so a new `#fetchData` retry starts while the previous
|
|
701
|
+
* attempt's scroll evaluates are still running on the same page —
|
|
702
|
+
* exactly the collision that surfaces as "Attempted to use detached
|
|
703
|
+
* Frame" or "Session closed".
|
|
673
704
|
*
|
|
674
705
|
* Flow:
|
|
675
706
|
* 1. Register request/response/requestfailed listeners to capture sub-resources (internal pages only)
|
|
@@ -701,9 +732,23 @@ let Scraper = (() => {
|
|
|
701
732
|
* changes and triggers a reload. Isolating each device preset allows partial
|
|
702
733
|
* results — if one viewport fails, the other can still succeed.
|
|
703
734
|
*
|
|
704
|
-
* WHY retryable with
|
|
735
|
+
* WHY retryable with 20-min timeout and `fallback: []`: Image extraction is
|
|
705
736
|
* best-effort. If all retries fail, an empty array is returned rather than
|
|
706
|
-
* failing the entire page scrape.
|
|
737
|
+
* failing the entire page scrape. The 20-min wall clock accommodates pages
|
|
738
|
+
* whose mobile-small `scrollHeight` reaches ~300k px (observed on
|
|
739
|
+
* responsive data tables, which take ~5 min to scroll). A shorter timeout
|
|
740
|
+
* causes a second retry to start while the previous attempt's
|
|
741
|
+
* `scrollAllOver` is still running its `page.evaluate` calls in the
|
|
742
|
+
* background — `Promise.race` in `retry.ts` does not cancel `fn()`. The
|
|
743
|
+
* collision then surfaces as "Attempted to use detached Frame" or
|
|
744
|
+
* "Session closed" when the new attempt's reload / setViewport runs on
|
|
745
|
+
* the same page as the old attempt's pending evaluates.
|
|
746
|
+
*
|
|
747
|
+
* WHY pass `maxScrollHeight`: Even 20 min is not enough for pathological
|
|
748
|
+
* pages whose layout explodes at narrow viewports. Skipping the device
|
|
749
|
+
* preset entirely keeps the timeout-vs-background-evaluate collision from
|
|
750
|
+
* ever being triggered, at the cost of losing that viewport's image data
|
|
751
|
+
* for those pages. See {@link MAX_SCROLL_HEIGHT} for the chosen threshold.
|
|
707
752
|
* @param page - Puppeteer page instance
|
|
708
753
|
* @param url - The page URL string (without hash and auth)
|
|
709
754
|
* @param isExternal - Whether the page is external
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@d-zero/beholder",
|
|
3
|
-
"version": "3.
|
|
3
|
+
"version": "3.1.1",
|
|
4
4
|
"description": "Page-level scraper for web crawling and auditing",
|
|
5
5
|
"author": "D-ZERO",
|
|
6
6
|
"license": "MIT",
|
|
@@ -20,19 +20,21 @@
|
|
|
20
20
|
"clean": "tsc --build --clean"
|
|
21
21
|
},
|
|
22
22
|
"dependencies": {
|
|
23
|
-
"@d-zero/puppeteer-page-scan": "4.
|
|
23
|
+
"@d-zero/puppeteer-page-scan": "4.6.0",
|
|
24
24
|
"@d-zero/shared": "0.22.0",
|
|
25
25
|
"debug": "4.4.3",
|
|
26
26
|
"puppeteer": "24.37.5",
|
|
27
27
|
"simple-wappalyzer": "1.1.99"
|
|
28
28
|
},
|
|
29
29
|
"devDependencies": {
|
|
30
|
-
"@types/debug": "4.1.12"
|
|
30
|
+
"@types/debug": "4.1.12",
|
|
31
|
+
"@types/jsdom": "28.0.3",
|
|
32
|
+
"jsdom": "29.1.1"
|
|
31
33
|
},
|
|
32
34
|
"repository": {
|
|
33
35
|
"type": "git",
|
|
34
36
|
"url": "https://github.com/d-zero-dev/tools.git",
|
|
35
37
|
"directory": "packages/@d-zero/beholder"
|
|
36
38
|
},
|
|
37
|
-
"gitHead": "
|
|
39
|
+
"gitHead": "d876ace142711051c337f7922931776526047cb0"
|
|
38
40
|
}
|
package/src/dom-evaluation.ts
CHANGED
|
@@ -22,6 +22,7 @@ import { raceWithTimeout } from '@d-zero/shared/race-with-timeout';
|
|
|
22
22
|
|
|
23
23
|
import { domDetailsLog, domLog } from './debug.js';
|
|
24
24
|
import { classify, emptyMeta } from './meta/classify.js';
|
|
25
|
+
import { WINDOW_GLOBALS_TO_CHECK, collectHeadFromDocument } from './meta/collect-head.js';
|
|
25
26
|
import { detectTags } from './meta/tag-detection.js';
|
|
26
27
|
import { parseUrl } from './parse-url.js';
|
|
27
28
|
|
|
@@ -515,46 +516,6 @@ export type GetMetaContext = {
|
|
|
515
516
|
readonly includeRaw?: boolean;
|
|
516
517
|
};
|
|
517
518
|
|
|
518
|
-
const WINDOW_GLOBALS_TO_CHECK: readonly string[] = [
|
|
519
|
-
'dataLayer',
|
|
520
|
-
'gtag',
|
|
521
|
-
'ga',
|
|
522
|
-
'_gaq',
|
|
523
|
-
'fbq',
|
|
524
|
-
'_fbq',
|
|
525
|
-
'clarity',
|
|
526
|
-
'_hjSettings',
|
|
527
|
-
'_hjid',
|
|
528
|
-
'twq',
|
|
529
|
-
'ttq',
|
|
530
|
-
'_linkedin_partner_id',
|
|
531
|
-
'pintrk',
|
|
532
|
-
'amplitude',
|
|
533
|
-
'mixpanel',
|
|
534
|
-
'analytics',
|
|
535
|
-
'heap',
|
|
536
|
-
'posthog',
|
|
537
|
-
'plausible',
|
|
538
|
-
'fathom',
|
|
539
|
-
'_paq',
|
|
540
|
-
's_account',
|
|
541
|
-
's',
|
|
542
|
-
'ym',
|
|
543
|
-
'UET',
|
|
544
|
-
'optimizely',
|
|
545
|
-
'_hsq',
|
|
546
|
-
'Sentry',
|
|
547
|
-
'Intercom',
|
|
548
|
-
'intercomSettings',
|
|
549
|
-
'drift',
|
|
550
|
-
'Tawk_API',
|
|
551
|
-
'zE',
|
|
552
|
-
'OneTrust',
|
|
553
|
-
'Cookiebot',
|
|
554
|
-
'Stripe',
|
|
555
|
-
'grecaptcha',
|
|
556
|
-
];
|
|
557
|
-
|
|
558
519
|
/**
|
|
559
520
|
* Extracts comprehensive metadata from the page.
|
|
560
521
|
*
|
|
@@ -639,129 +600,27 @@ async function runGetMeta(page: Page, context: GetMetaContext): Promise<Meta | n
|
|
|
639
600
|
}
|
|
640
601
|
|
|
641
602
|
/**
 * Collects raw `<head>` entries from a Puppeteer page by injecting
 * {@link collectHeadFromDocument} into the page realm.
 *
 * WHY string-eval instead of `page.evaluate(fn, args)`: the shared
 * implementation lives in `./meta/collect-head.js` (`collectHeadFromDocument`),
 * and a `page.evaluate(() => collectHeadFromDocument(window, …))` wrapper cannot
 * reach that imported binding inside the page realm — only the wrapper's
 * own source crosses the CDP boundary. Serializing the implementation via
 * `Function.prototype.toString` and invoking it through
 * `page.evaluate(string)` is what keeps the Puppeteer path and the
 * jsdom path on one source of truth.
 *
 * The same {@link collectHeadFromDocument} function is also exposed via
 * {@link ../extract-meta.ts | extractMetaFromDocument} for jsdom/Node callers,
 * so the two paths cannot drift apart.
 *
 * Collection is best-effort: if the in-page evaluation rejects, an empty list
 * is returned. The failure is logged first, because with string-eval a
 * build step that rewrites the function source (so that it references a
 * helper missing from the page realm) would otherwise show up only as
 * silently empty metadata.
 * @param page - The Puppeteer page whose document will be inspected.
 * @returns The raw head entries produced in the page, or `[]` when evaluation fails.
 */
async function collectHeadOnPage(page: Page): Promise<RawHeadEntry[]> {
	const fnSource = collectHeadFromDocument.toString();
	// JSON is a valid JS array literal, so the list can be inlined into the expression.
	const globalsLiteral = JSON.stringify(WINDOW_GLOBALS_TO_CHECK);
	const expr = `(${fnSource})(window, ${globalsLiteral})`;
	try {
		const raw = await page.evaluate(expr);
		return raw as RawHeadEntry[];
	} catch (error) {
		// NOTE(review): `domLog` is imported from './debug.js' at the top of this
		// file and is assumed to be a `debug`-style logger — confirm.
		domLog('collectHeadOnPage: in-page head collection failed; falling back to []: %O', error);
		return [];
	}
}
|