@d-zero/beholder 3.0.0 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,247 @@
1
+ import { JSDOM } from 'jsdom';
2
+ import { describe, expect, it } from 'vitest';
3
+
4
+ import { extractMetaFromDocument } from './extract-meta.js';
5
+
6
+ const URL = 'https://example.com/';
7
+
8
+ /**
9
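+ * Parses `html` into a jsdom instance whose document URL is the shared test `URL`.
10
+ * @param html - HTML markup to load into the jsdom document.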
11
+ */
12
+ function mkDom(html: string): JSDOM {
13
+ return new JSDOM(html, { url: URL });
14
+ }
15
+
16
+ /**
17
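+ * Casts a jsdom window to lib.dom's `Window` type so it can be passed to `extractMetaFromDocument`.
18
+ * @param dom - The jsdom instance whose `window` is returned.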
19
+ */
20
+ function asWindow(dom: JSDOM): Window {
21
+ return dom.window as unknown as Window;
22
+ }
23
+
24
+ describe('extractMetaFromDocument', () => {
25
+ it('extracts <title>, lang and basic <meta name=description>', async () => {
26
+ const html = `<!doctype html>
27
+ <html lang="ja">
28
+ <head>
29
+ <title>Example Title</title>
30
+ <meta name="description" content="An example page">
31
+ <meta name="keywords" content="a, b, c">
32
+ </head>
33
+ <body></body>
34
+ </html>`;
35
+ const dom = mkDom(html);
36
+ const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
37
+
38
+ expect(meta.title).toBe('Example Title');
39
+ expect(meta.lang).toBe('ja');
40
+ expect(meta.description).toBe('An example page');
41
+ expect(meta.keywords).toBe('a, b, c');
42
+ });
43
+
44
+ it('parses og:* and twitter:* meta tags', async () => {
45
+ const html = `<!doctype html>
46
+ <html>
47
+ <head>
48
+ <title>OG</title>
49
+ <meta property="og:title" content="OG Title">
50
+ <meta property="og:type" content="article">
51
+ <meta property="og:image" content="https://example.com/a.png">
52
+ <meta property="og:image" content="https://example.com/b.png">
53
+ <meta name="twitter:card" content="summary_large_image">
54
+ <meta name="twitter:site" content="@example">
55
+ </head>
56
+ <body></body>
57
+ </html>`;
58
+ const dom = mkDom(html);
59
+ const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
60
+
61
+ expect(meta.og?.title).toBe('OG Title');
62
+ expect(meta.og?.type).toBe('article');
63
+ expect(meta.og?.image).toEqual([
64
+ 'https://example.com/a.png',
65
+ 'https://example.com/b.png',
66
+ ]);
67
+ expect(meta.twitter?.card).toBe('summary_large_image');
68
+ expect(meta.twitter?.site).toBe('@example');
69
+ });
70
+
71
+ it('parses viewport, robots and theme-color (with media branches)', async () => {
72
+ const html = `<!doctype html>
73
+ <html>
74
+ <head>
75
+ <title>X</title>
76
+ <meta name="viewport" content="width=device-width, initial-scale=1">
77
+ <meta name="robots" content="noindex, nofollow">
78
+ <meta name="theme-color" content="#000000">
79
+ <meta name="theme-color" media="(prefers-color-scheme: dark)" content="#111111">
80
+ <meta name="theme-color" media="(prefers-color-scheme: light)" content="#eeeeee">
81
+ </head>
82
+ </html>`;
83
+ const dom = mkDom(html);
84
+ const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
85
+
86
+ expect(meta.viewport?.width).toBe('device-width');
87
+ expect(meta.viewport?.initialScale).toBe(1);
88
+ expect(meta.robots?.noindex).toBe(true);
89
+ expect(meta.robots?.nofollow).toBe(true);
90
+ expect(meta.themeColor).toBe('#000000');
91
+ expect(meta.themeColorDark).toBe('#111111');
92
+ expect(meta.themeColorLight).toBe('#eeeeee');
93
+ });
94
+
95
+ it('captures <link rel="canonical"> and alternate hreflang', async () => {
96
+ const html = `<!doctype html>
97
+ <html>
98
+ <head>
99
+ <title>L</title>
100
+ <link rel="canonical" href="https://example.com/canonical">
101
+ <link rel="alternate" hreflang="en" href="https://example.com/en">
102
+ <link rel="alternate" hreflang="ja" href="https://example.com/ja">
103
+ </head>
104
+ </html>`;
105
+ const dom = mkDom(html);
106
+ const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
107
+
108
+ expect(meta.link?.canonical).toBe('https://example.com/canonical');
109
+ const hreflangs = meta.link?.alternateHreflang.map((e) => e.hreflang) ?? [];
110
+ expect(hreflangs).toEqual(['en', 'ja']);
111
+ });
112
+
113
+ it('parses inline JSON-LD scripts', async () => {
114
+ const data = { '@context': 'https://schema.org', '@type': 'WebPage', name: 'X' };
115
+ const html = `<!doctype html>
116
+ <html>
117
+ <head>
118
+ <title>J</title>
119
+ <script type="application/ld+json">${JSON.stringify(data)}</script>
120
+ </head>
121
+ </html>`;
122
+ const dom = mkDom(html);
123
+ const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
124
+
125
+ expect(meta.jsonLd).toHaveLength(1);
126
+ const first = meta.jsonLd[0];
127
+ expect(first?.parsed).toEqual(data);
128
+ });
129
+
130
+ it('captures itemtype/itemscope (microdata) and prefix/vocab (RDFa) from <html>', async () => {
131
+ const html = `<!doctype html>
132
+ <html itemscope itemtype="https://schema.org/WebPage" prefix="og: https://ogp.me/ns#" vocab="https://schema.org/" typeof="WebPage">
133
+ <head><title>M</title></head>
134
+ </html>`;
135
+ const dom = mkDom(html);
136
+ const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
137
+
138
+ expect(meta.microdata?.itemscope).toBe(true);
139
+ expect(meta.microdata?.itemtype).toBe('https://schema.org/WebPage');
140
+ expect(meta.rdfa?.prefix).toBe('og: https://ogp.me/ns#');
141
+ expect(meta.rdfa?.vocab).toBe('https://schema.org/');
142
+ expect(meta.rdfa?.typeOf).toBe('WebPage');
143
+ });
144
+
145
+ it('captures <base href> and <iframe src>', async () => {
146
+ const html = `<!doctype html>
147
+ <html>
148
+ <head>
149
+ <title>B</title>
150
+ <base href="https://example.com/sub/">
151
+ </head>
152
+ <body>
153
+ <iframe src="https://www.youtube.com/embed/abc"></iframe>
154
+ </body>
155
+ </html>`;
156
+ const dom = mkDom(html);
157
+ const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
158
+
159
+ expect(meta.baseHref).toBe('https://example.com/sub/');
160
+ expect(meta.others.iframe).toEqual([
161
+ { src: 'https://www.youtube.com/embed/abc', location: 'body' },
162
+ ]);
163
+ });
164
+
165
+ it('falls back to documentElement.outerHTML when context.html is omitted', async () => {
166
+ const html = `<!doctype html><html><head><title>FB</title></head></html>`;
167
+ const dom = mkDom(html);
168
+ const meta = await extractMetaFromDocument(asWindow(dom), { url: URL });
169
+ expect(meta.title).toBe('FB');
170
+ expect(meta.tags).toBeDefined();
171
+ expect(meta.tags.entries).toBeInstanceOf(Array);
172
+ });
173
+
174
+ it('returns includeRaw when requested', async () => {
175
+ const html = `<!doctype html><html><head><title>R</title></head></html>`;
176
+ const dom = mkDom(html);
177
+ const meta = await extractMetaFromDocument(asWindow(dom), {
178
+ url: URL,
179
+ html,
180
+ includeRaw: true,
181
+ });
182
+ expect(meta._raw).toBeInstanceOf(Array);
183
+ expect(meta._raw?.some((e) => e.kind === 'title')).toBe(true);
184
+ });
185
+
186
+ it("emits a 'window-global' raw entry when known globals are present on the window", async () => {
187
+ const html = `<!doctype html><html><head><title>WG</title></head></html>`;
188
+ const dom = mkDom(html);
189
+ // jsdom does not execute scripts by default, so simulate a tag library
190
+ // having installed itself onto `window` (the production trigger for the
191
+ // `window-global` branch in `collectHeadFromDocument`).
192
+ (dom.window as unknown as Record<string, unknown>).dataLayer = [];
193
+ (dom.window as unknown as Record<string, unknown>).fbq = () => {};
194
+
195
+ const meta = await extractMetaFromDocument(asWindow(dom), {
196
+ url: URL,
197
+ html,
198
+ includeRaw: true,
199
+ });
200
+
201
+ const globalEntry = meta._raw?.find((e) => e.kind === 'window-global');
202
+ expect(globalEntry).toBeDefined();
203
+ // Throw (failing the test) if the narrow ever fails, rather than letting the
203
+ // trailing `expect` calls silently skip via an `if` branch.
205
+ if (globalEntry === undefined || globalEntry.kind !== 'window-global') {
206
+ throw new Error('expected a window-global raw entry');
207
+ }
208
+ expect(globalEntry.names).toContain('dataLayer');
209
+ expect(globalEntry.names).toContain('fbq');
210
+ });
211
+
212
+ it('forwards headers and statusCode to the tag-detection layer', async () => {
213
+ // We can't assert Wappalyzer's internal decisions without coupling to its
214
+ // signature table, but we can at least verify that supplying headers and
215
+ // statusCode does not throw and that the returned Meta is still well-formed.
216
+ const html = `<!doctype html><html><head><title>H</title></head></html>`;
217
+ const dom = mkDom(html);
218
+ const meta = await extractMetaFromDocument(asWindow(dom), {
219
+ url: URL,
220
+ html,
221
+ headers: {
222
+ 'content-type': 'text/html; charset=utf-8',
223
+ 'x-powered-by': 'Express',
224
+ },
225
+ statusCode: 200,
226
+ });
227
+ expect(meta.title).toBe('H');
228
+ expect(Array.isArray(meta.tags.entries)).toBe(true);
229
+ });
230
+
231
+ it('records parseError for malformed inline JSON-LD', async () => {
232
+ const html = `<!doctype html>
233
+ <html>
234
+ <head>
235
+ <title>JE</title>
236
+ <script type="application/ld+json">{ this is not valid json</script>
237
+ </head>
238
+ </html>`;
239
+ const dom = mkDom(html);
240
+ const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
241
+
242
+ expect(meta.jsonLd).toHaveLength(1);
243
+ const first = meta.jsonLd[0];
244
+ expect(first?.parsed).toBeUndefined();
245
+ expect(typeof first?.parseError).toBe('string');
246
+ });
247
+ });
@@ -0,0 +1,121 @@
1
+ /**
2
+ * Public, Puppeteer-free entry point for extracting {@link Meta} from an
3
+ * already-parsed DOM (e.g. jsdom).
4
+ *
5
+ * WHY this exists alongside `Scraper.scrapeStart()` / `getMeta(page, …)`:
6
+ * callers who already have an HTML string (from `fetch`, a fixture, an
7
+ * archive) should not be forced to spin up Chromium just to read a few `<meta>`
8
+ * tags. This module reuses the same `collectHead → detectTags → classify`
9
+ * pipeline as the Puppeteer path — the `Meta` shape returned here is
10
+ * identical to what `Scraper` produces, so downstream consumers do not branch
11
+ * on the source.
12
+ *
13
+ * See {@link extractMetaFromDocument} for the usage example.
14
+ * @module
15
+ */
16
+
17
+ import type { Meta } from './types.js';
18
+
19
+ import { classify } from './meta/classify.js';
20
+ import { collectHeadFromDocument, WINDOW_GLOBALS_TO_CHECK } from './meta/collect-head.js';
21
+ import { detectTags } from './meta/tag-detection.js';
22
+
23
+ /**
24
+ * Inputs for {@link extractMetaFromDocument}.
25
+ *
26
+ * `url`/`statusCode`/`headers` mirror the inputs to the underlying
27
+ * `simple-wappalyzer` driver. They are not consumed by the DOM-walk side of
28
+ * the pipeline.
29
+ *
30
+ * `html` is optional: when omitted, `document.documentElement.outerHTML` is
31
+ * read off the passed window — matching the fallback `getMeta(page, …)` does
32
+ * via `page.content()`.
33
+ */
34
+ export type ExtractMetaContext = {
35
+ /** The fully resolved URL of the page; forwarded to `detectTags` for technology detection. */
36
+ readonly url: string;
37
+ /**
38
+ * Rendered HTML used for technology detection. Defaults to
39
+ * `window.document.documentElement.outerHTML` when omitted.
40
+ *
41
+ * WHY allow override: callers that fetched the raw HTML string from the
42
+ * network already have the *pre-script-execution* markup, which is what
43
+ * Wappalyzer's HTML patterns are tuned for. The serialized DOM from
44
+ * `outerHTML` reflects any mutations scripts have already made; provide the
45
+ * raw string to get more stable detections.
46
+ */
47
+ readonly html?: string;
48
+ /** HTTP status code, surfaced to the Wappalyzer driver. */
49
+ readonly statusCode?: number;
50
+ /**
51
+ * Response headers; case is preserved by the caller, lowercased internally
52
+ * by `detectTags`.
53
+ */
54
+ readonly headers?: Record<string, string | string[] | undefined>;
55
+ /**
56
+ * When `true`, the returned `Meta` includes `_raw: RawHeadEntry[]` for
57
+ * debugging. Default `false` to keep the serialized payload small.
58
+ */
59
+ readonly includeRaw?: boolean;
60
+ };
61
+
62
+ /**
63
+ * Extracts a `Meta` object from a DOM provided by the caller.
64
+ *
65
+ * Pipeline:
66
+ *
67
+ * 1. {@link collectHeadFromDocument} walks `window.document` and returns a
68
+ * serializable `RawHeadEntry[]`.
69
+ * 2. {@link detectTags} runs `simple-wappalyzer` over the HTML + headers to
70
+ * detect third-party technologies.
71
+ * 3. {@link classify} folds the two signals together into a typed `Meta`.
72
+ *
73
+ * Step (1) is synchronous and runs first; step (2) is awaited next. The two
74
+ * are independent in principle, but the current shape is sequential — keeping
75
+ * it that way avoids forcing the synchronous DOM walk into a microtask just to
76
+ * gain a few milliseconds of overlap with the Wappalyzer call.
77
+ * @param window - The window whose `document` will be walked. jsdom's
78
+ * `dom.window` works; pass any object satisfying the `Window`
79
+ * type. The function never mutates the document.
80
+ * @param context - URL / HTML / headers / status code context. See
81
+ * {@link ExtractMetaContext}.
82
+ * @returns The extracted `Meta` (always defined; empty fields stay empty).
83
+ * @example
84
+ * ```ts
85
+ * import { JSDOM } from 'jsdom';
86
+ * import { extractMetaFromDocument } from '@d-zero/beholder';
87
+ *
88
+ * const url = 'https://example.com/';
89
+ * const html = await (await fetch(url)).text();
90
+ * const dom = new JSDOM(html, { url });
91
+ *
92
+ * // The `as unknown as Window` cast is needed because jsdom's `DOMWindow` is
93
+ * // not structurally identical to lib.dom's `Window` (a few rare globals
94
+ * // differ), but the runtime shape is compatible for this function's needs.
95
+ * const meta = await extractMetaFromDocument(dom.window as unknown as Window, {
96
+ * url,
97
+ * html,
98
+ * });
99
+ *
100
+ * meta.title; // <title>
101
+ * meta.og?.image; // og:image[]
102
+ * meta.tags.entries; // Wappalyzer detections + extracted IDs
103
+ * ```
104
+ */
105
+ export async function extractMetaFromDocument(
106
+ window: Window,
107
+ context: ExtractMetaContext,
108
+ ): Promise<Meta> {
109
+ const raw = collectHeadFromDocument(window, WINDOW_GLOBALS_TO_CHECK);
110
+ const html = context.html ?? window.document.documentElement.outerHTML;
111
+ const tags = await detectTags({
112
+ url: context.url,
113
+ html,
114
+ ...(context.statusCode === undefined ? {} : { statusCode: context.statusCode }),
115
+ ...(context.headers === undefined ? {} : { headers: context.headers }),
116
+ });
117
+ return classify(raw, {
118
+ tags,
119
+ ...(context.includeRaw ? { includeRaw: true } : {}),
120
+ });
121
+ }
package/src/index.ts CHANGED
@@ -12,6 +12,8 @@
12
12
  */
13
13
  export { default as default } from './scraper.js';
14
14
  export { isError } from './is-error.js';
15
+ export { extractMetaFromDocument } from './extract-meta.js';
16
+ export type { ExtractMetaContext } from './extract-meta.js';
15
17
  export { detectCompress } from '@d-zero/shared/detect-compress';
16
18
  export type { CompressType } from '@d-zero/shared/detect-compress';
17
19
  export { detectCDN } from '@d-zero/shared/detect-cdn';
@@ -0,0 +1,247 @@
1
+ /**
2
+ * DOM-side raw `<head>` collector.
3
+ *
4
+ * `collectHeadFromDocument` walks a `Document` (Puppeteer page realm or jsdom realm
5
+ * alike) and produces a serializable {@link RawHeadEntry}[] that
6
+ * {@link ../meta/classify.ts | classify} can turn into a typed `Meta`.
7
+ *
8
+ * WHY this function is realm-agnostic:
9
+ *
10
+ * - The Puppeteer path stringifies this function via `Function.prototype.toString`
11
+ * and runs it as a `page.evaluate(string)` expression, so any closure over
12
+ * module-scope bindings would resolve to `undefined` in the browser realm.
13
+ * - The jsdom (Node) path calls it directly with the jsdom `Window`. Because
14
+ * `HTMLLinkElement` (etc.) in jsdom is a *different class instance* from the
15
+ * one in the page realm, `instanceof` only works when the constructor is read
16
+ * from the *passed* `window` rather than from bare globals.
17
+ *
18
+ * Together those constraints dictate that the function MUST:
19
+ *
20
+ * 1. Reference no module-level variables — only its own parameters and inner locals.
21
+ * 2. Take every HTML class constructor (`HTMLBaseElement`, …) from the passed
22
+ * `window` via destructuring instead of relying on ambient globals.
23
+ * 3. Stay in plain ES syntax (no TS-only constructs that need helper imports).
24
+ * @module
25
+ */
26
+
27
+ import type { RawHeadEntry } from './types.js';
28
+
29
+ /**
30
+ * Curated list of `window` globals whose presence indicates that a third-party
31
+ * tag library has been loaded on the page. Surfaced as a single
32
+ * `kind: 'window-global'` entry so that downstream consumers (e.g. tag-detection)
33
+ * can cross-reference the script/iframe signals.
34
+ *
35
+ * Kept here (rather than in `dom-evaluation.ts`) so the Puppeteer path and the
36
+ * jsdom path share one source of truth.
37
+ */
38
+ export const WINDOW_GLOBALS_TO_CHECK: readonly string[] = [
39
+ 'dataLayer',
40
+ 'gtag',
41
+ 'ga',
42
+ '_gaq',
43
+ 'fbq',
44
+ '_fbq',
45
+ 'clarity',
46
+ '_hjSettings',
47
+ '_hjid',
48
+ 'twq',
49
+ 'ttq',
50
+ '_linkedin_partner_id',
51
+ 'pintrk',
52
+ 'amplitude',
53
+ 'mixpanel',
54
+ 'analytics',
55
+ 'heap',
56
+ 'posthog',
57
+ 'plausible',
58
+ 'fathom',
59
+ '_paq',
60
+ 's_account',
61
+ 's',
62
+ 'ym',
63
+ 'UET',
64
+ 'optimizely',
65
+ '_hsq',
66
+ 'Sentry',
67
+ 'Intercom',
68
+ 'intercomSettings',
69
+ 'drift',
70
+ 'Tawk_API',
71
+ 'zE',
72
+ 'OneTrust',
73
+ 'Cookiebot',
74
+ 'Stripe',
75
+ 'grecaptcha',
76
+ ];
77
+
78
+ /**
79
+ * Walks the given window's `Document` and returns a serializable list of raw
80
+ * head entries.
81
+ *
82
+ * Two realms are supported:
83
+ *
84
+ * - Browser realm (Puppeteer): the function source is `.toString()`'d and run
85
+ * inside the page via `page.evaluate(string)`. Inside the page, `window`
86
+ * resolves to the page's global object, so destructured class constructors
87
+ * match `instanceof` checks against elements returned from `querySelectorAll`.
88
+ * - Node realm (jsdom et al.): the caller passes `dom.window` directly. jsdom's
89
+ * HTML element prototypes are distinct from the host Node's bare globals, so
90
+ * reading the constructors off the passed `window` is what makes `instanceof`
91
+ * succeed.
92
+ *
93
+ * The function MUST NOT close over any module-scope binding — all data it needs
94
+ * is reached through its two parameters.
95
+ * @param window - The window object whose `document` will be inspected. Provides
96
+ * both the DOM tree and the HTML element constructors used for
97
+ * `instanceof` narrowing.
98
+ * @param knownGlobals - Names of `window` properties that, when present,
99
+ * indicate a third-party tag library is loaded. Required
100
+ * (no default) so the Puppeteer-side string-eval path
101
+ * does not have to inline a default value list.
102
+ * @returns Serializable list of raw head entries for {@link ../meta/classify.ts | classify}.
103
+ */
104
+ export function collectHeadFromDocument(
105
+ window: Window,
106
+ knownGlobals: readonly string[],
107
+ ): RawHeadEntry[] {
108
+ const document = window.document;
109
+ // TypeScript's `Window` interface in lib.dom does not directly expose the
110
+ // HTML element constructors (`HTMLLinkElement`, `HTMLScriptElement`, …)
111
+ // even though every real window object — browser realm AND jsdom realm —
112
+ // carries them at runtime. Widening the type here lets us destructure them
113
+ // uniformly; the runtime values come straight from the passed window, so
114
+ // the cast is purely cosmetic for TS and erased at compile time.
115
+ const w = window as Window & {
116
+ HTMLBaseElement: typeof globalThis.HTMLBaseElement;
117
+ HTMLMetaElement: typeof globalThis.HTMLMetaElement;
118
+ HTMLLinkElement: typeof globalThis.HTMLLinkElement;
119
+ HTMLScriptElement: typeof globalThis.HTMLScriptElement;
120
+ HTMLIFrameElement: typeof globalThis.HTMLIFrameElement;
121
+ };
122
+ const {
123
+ HTMLBaseElement,
124
+ HTMLMetaElement,
125
+ HTMLLinkElement,
126
+ HTMLScriptElement,
127
+ HTMLIFrameElement,
128
+ } = w;
129
+
130
+ const entries: RawHeadEntry[] = [];
131
+
132
+ const html = document.documentElement;
133
+ entries.push(
134
+ {
135
+ kind: 'html',
136
+ lang: html.lang || undefined,
137
+ dir: html.dir || undefined,
138
+ xmlns: html.getAttribute('xmlns') ?? undefined,
139
+ prefix: html.getAttribute('prefix') ?? undefined,
140
+ vocab: html.getAttribute('vocab') ?? undefined,
141
+ typeOf: html.getAttribute('typeof') ?? undefined,
142
+ itemscope: html.hasAttribute('itemscope') || undefined,
143
+ itemtype: html.getAttribute('itemtype') ?? undefined,
144
+ amp: html.hasAttribute('amp') || undefined,
145
+ lightning: html.hasAttribute('⚡') || undefined,
146
+ },
147
+ { kind: 'title', content: document.title },
148
+ );
149
+
150
+ for (const base of document.querySelectorAll('base')) {
151
+ if (!(base instanceof HTMLBaseElement)) continue;
152
+ entries.push({
153
+ kind: 'base',
154
+ href: base.getAttribute('href') ?? undefined,
155
+ target: base.getAttribute('target') ?? undefined,
156
+ });
157
+ }
158
+
159
+ for (const meta of document.querySelectorAll('meta')) {
160
+ if (!(meta instanceof HTMLMetaElement)) continue;
161
+ const name = meta.getAttribute('name');
162
+ const property = meta.getAttribute('property');
163
+ const httpEquiv = meta.getAttribute('http-equiv');
164
+ const itemprop = meta.getAttribute('itemprop');
165
+ const charset = meta.getAttribute('charset');
166
+ const content = meta.getAttribute('content');
167
+ const media = meta.getAttribute('media');
168
+ entries.push({
169
+ kind: 'meta',
170
+ name: name ? name.toLowerCase() : undefined,
171
+ property: property ? property.toLowerCase() : undefined,
172
+ httpEquiv: httpEquiv ? httpEquiv.toLowerCase() : undefined,
173
+ itemprop: itemprop ?? undefined,
174
+ charset: charset ?? undefined,
175
+ content: content ?? undefined,
176
+ media: media ?? undefined,
177
+ });
178
+ }
179
+
180
+ for (const link of document.querySelectorAll('link[href]')) {
181
+ if (!(link instanceof HTMLLinkElement)) continue;
182
+ const relRaw = link.getAttribute('rel') ?? '';
183
+ const rel = relRaw.toLowerCase().split(/\s+/u).filter(Boolean);
184
+ entries.push({
185
+ kind: 'link',
186
+ rel,
187
+ href: link.getAttribute('href') ?? '',
188
+ type: link.getAttribute('type') ?? undefined,
189
+ media: link.getAttribute('media') ?? undefined,
190
+ sizes: link.getAttribute('sizes') ?? undefined,
191
+ title: link.getAttribute('title') ?? undefined,
192
+ hreflang: link.getAttribute('hreflang') ?? undefined,
193
+ as: link.getAttribute('as') ?? undefined,
194
+ crossorigin: link.getAttribute('crossorigin') ?? undefined,
195
+ color: link.getAttribute('color') ?? undefined,
196
+ blocking: link.getAttribute('blocking') ?? undefined,
197
+ imagesrcset: link.getAttribute('imagesrcset') ?? undefined,
198
+ });
199
+ }
200
+
201
+ const STRUCTURED_TYPES = new Set([
202
+ 'application/ld+json',
203
+ 'speculationrules',
204
+ 'application/json+oembed',
205
+ 'application/xml+oembed',
206
+ ]);
207
+ for (const script of document.querySelectorAll('script[type]')) {
208
+ if (!(script instanceof HTMLScriptElement)) continue;
209
+ const scriptType = (script.getAttribute('type') ?? '').toLowerCase();
210
+ if (!STRUCTURED_TYPES.has(scriptType)) continue;
211
+ const src = script.getAttribute('src') ?? undefined;
212
+ const text = script.textContent ?? '';
213
+ const inHead = !!script.closest('head');
214
+ const inNoscript = !!script.closest('noscript');
215
+ const location = inHead ? 'head' : inNoscript ? 'noscript' : 'body';
216
+ entries.push({
217
+ kind: 'script',
218
+ scriptType,
219
+ content: text || undefined,
220
+ src,
221
+ location,
222
+ });
223
+ }
224
+
225
+ for (const iframe of document.querySelectorAll('iframe[src]')) {
226
+ if (!(iframe instanceof HTMLIFrameElement)) continue;
227
+ const src = iframe.getAttribute('src') ?? '';
228
+ if (!src) continue;
229
+ const inHead = !!iframe.closest('head');
230
+ const inNoscript = !!iframe.closest('noscript');
231
+ const location = inHead ? 'head' : inNoscript ? 'noscript' : 'body';
232
+ entries.push({ kind: 'iframe', src, location });
233
+ }
234
+
235
+ const win = window as unknown as Record<string, unknown>;
236
+ const presentGlobals: string[] = [];
237
+ for (const name of knownGlobals) {
238
+ if (win[name] !== undefined) {
239
+ presentGlobals.push(name);
240
+ }
241
+ }
242
+ if (presentGlobals.length > 0) {
243
+ entries.push({ kind: 'window-global', names: presentGlobals });
244
+ }
245
+
246
+ return entries;
247
+ }