@d-zero/beholder 2.1.5 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,293 @@
1
+ /**
2
+ * Value normalizers used by `classify()` to turn raw `content` strings into
3
+ * structured objects (viewport, robots, format-detection, etc.).
4
+ *
5
+ * Each parser is a pure function that takes the raw `content` string and
6
+ * returns a normalized structure. They never throw; on unrecognizable input
7
+ * they fall back to keeping the `raw` field only.
8
+ * @module
9
+ */
10
/**
 * Parses `<meta name="viewport">` content into a structured `ViewportMeta`.
 *
 * The content is read as a list of `key=value` pairs. Keys are matched
 * case-insensitively and unknown keys are ignored. Only the text between the
 * first and second `=` of a pair is used as its value.
 * @param raw Raw `content` attribute value.
 * @returns An object that always carries `raw`, plus one property for every
 * recognized key (`width`, `height`, `initialScale`, `minimumScale`,
 * `maximumScale`, `userScalable`, `viewportFit`, `interactiveWidget`).
 * @example parseViewport('width=device-width, initial-scale=1.0')
 *   → { raw: '...', width: 'device-width', initialScale: 1 }
 */
export function parseViewport(raw) {
	const meta = { raw };
	// `;` is accepted next to `,` so that `width=device-width; initial-scale=1`
	// is not swallowed into a single bogus `width` value. This mirrors the
	// separators `parseFormatDetection` already tolerates.
	for (const part of raw.split(/[,;]/)) {
		const split = part.split('=');
		const key = (split[0] ?? '').trim().toLowerCase();
		const value = (split[1] ?? '').trim();
		if (!key) {
			continue;
		}
		switch (key) {
			case 'width': {
				meta.width = value;
				break;
			}
			case 'height': {
				meta.height = value;
				break;
			}
			case 'initial-scale': {
				// Scale factors are only recorded when they parse as a number.
				const n = Number.parseFloat(value);
				if (!Number.isNaN(n)) {
					meta.initialScale = n;
				}
				break;
			}
			case 'minimum-scale': {
				const n = Number.parseFloat(value);
				if (!Number.isNaN(n)) {
					meta.minimumScale = n;
				}
				break;
			}
			case 'maximum-scale': {
				const n = Number.parseFloat(value);
				if (!Number.isNaN(n)) {
					meta.maximumScale = n;
				}
				break;
			}
			case 'user-scalable': {
				// `yes`/`1` and `no`/`0` become booleans; anything else is kept
				// verbatim so the caller can still inspect it.
				const lower = value.toLowerCase();
				if (lower === 'no' || lower === '0') {
					meta.userScalable = false;
				} else if (lower === 'yes' || lower === '1') {
					meta.userScalable = true;
				} else {
					meta.userScalable = value;
				}
				break;
			}
			case 'viewport-fit': {
				meta.viewportFit = value;
				break;
			}
			case 'interactive-widget': {
				meta.interactiveWidget = value;
				break;
			}
		}
	}
	return meta;
}
75
/** Robots directives that carry no argument and are recorded as `true`. */
const ROBOTS_BOOLEAN_FLAGS = new Set([
	'index',
	'noindex',
	'follow',
	'nofollow',
	'none',
	'all',
	'noarchive',
	'nosnippet',
	'noimageindex',
	'nocache',
	'notranslate',
	'noodp',
	'noydir',
	'indexifembedded',
]);

/**
 * Parses `<meta name="robots">` content into a structured `RobotsMeta`.
 *
 * Directives are comma-separated. Known flag directives become `true`
 * properties named after the lower-cased directive; `name:value` directives
 * are mapped to camel-cased properties. Anything else is ignored.
 * @param raw Raw `content` attribute value.
 * @returns An object that always carries `raw` plus the recognized directives.
 * @example parseRobots('noindex, max-snippet:50, unavailable_after:2026-01-01')
 *   → { raw: '...', noindex: true, maxSnippet: 50, unavailableAfter: '2026-01-01' }
 */
export function parseRobots(raw) {
	const meta = { raw };
	const directives = raw.split(',');
	for (const directive of directives) {
		const normalized = directive.trim().toLowerCase();
		if (normalized === '') {
			continue;
		}
		if (ROBOTS_BOOLEAN_FLAGS.has(normalized)) {
			meta[normalized] = true;
			continue;
		}
		const separatorAt = normalized.indexOf(':');
		if (separatorAt === -1) {
			continue;
		}
		const name = normalized.slice(0, separatorAt).trim();
		// The argument is cut from the untouched directive so its original
		// letter case is preserved.
		const argument = directive.slice(directive.indexOf(':') + 1).trim();
		if (name === 'max-snippet') {
			const limit = Number.parseInt(argument, 10);
			if (!Number.isNaN(limit)) {
				meta.maxSnippet = limit;
			}
		} else if (name === 'max-image-preview') {
			meta.maxImagePreview = argument;
		} else if (name === 'max-video-preview') {
			const limit = Number.parseInt(argument, 10);
			if (!Number.isNaN(limit)) {
				meta.maxVideoPreview = limit;
			}
		} else if (name === 'unavailable_after' || name === 'unavailable-after') {
			meta.unavailableAfter = argument;
		}
	}
	return meta;
}
139
/** Maps each recognized referrer policy token to its `ReferrerMeta` property name. */
const REFERRER_POLICY_KEYS = {
	'no-referrer': 'noReferrer',
	origin: 'origin',
	'origin-when-cross-origin': 'originWhenCrossOrigin',
	'strict-origin': 'strictOrigin',
	'strict-origin-when-cross-origin': 'strictOriginWhenCrossOrigin',
	'unsafe-url': 'unsafeUrl',
	'same-origin': 'sameOrigin',
	'no-referrer-when-downgrade': 'noReferrerWhenDowngrade',
};

/**
 * Parses `<meta name="referrer">` content into a structured `ReferrerMeta`.
 *
 * The content is trimmed and lower-cased, then looked up in
 * `REFERRER_POLICY_KEYS`. A recognized policy sets the matching property to
 * `true`; any other content yields `{ raw }` only.
 * @param raw Raw `content` attribute value.
 * @returns An object that always carries `raw`, plus at most one policy flag.
 */
export function parseReferrer(raw) {
	const meta = { raw };
	const policy = raw.trim().toLowerCase();
	// Own-property check: a plain lookup would also hit inherited members such
	// as `constructor` or `__proto__` and write a junk key onto the result.
	if (Object.prototype.hasOwnProperty.call(REFERRER_POLICY_KEYS, policy)) {
		meta[REFERRER_POLICY_KEYS[policy]] = true;
	}
	return meta;
}
161
/**
 * Parses `<meta name="format-detection">` content (e.g. `'telephone=no, address=no'`).
 *
 * Pairs may be separated by `,` or `;`. For the keys `telephone`, `email`,
 * `address` and `date` the result holds a boolean that is `false` only when
 * the value is `no`, `false` or `0` (case-insensitive) and `true` otherwise,
 * including when the value is missing. Other keys are ignored.
 * @param raw Raw `content` attribute value.
 * @returns An object that always carries `raw` plus the recognized flags.
 */
export function parseFormatDetection(raw) {
	const disablingValues = ['no', 'false', '0'];
	const knownKeys = ['telephone', 'email', 'address', 'date'];
	const meta = { raw };
	for (const pair of raw.split(/[,;]/)) {
		const [namePart = '', valuePart = ''] = pair.split('=');
		const name = namePart.trim().toLowerCase();
		if (name === '' || !knownKeys.includes(name)) {
			continue;
		}
		const setting = valuePart.trim().toLowerCase();
		meta[name] = !disablingValues.includes(setting);
	}
	return meta;
}
197
/**
 * Parses `<meta http-equiv="refresh">` content (e.g. `'5; url=https://...'`).
 *
 * Everything before the first `;` is read as the delay in seconds. The rest is
 * the redirect target: an optional leading `url=` prefix (case-insensitive,
 * whitespace around `=` allowed) is removed, then one surrounding quote
 * character on each side is stripped.
 * @param raw Raw `content` attribute value.
 * @returns An object that always carries `raw`, plus `seconds` when the delay
 * is numeric and `url` when a non-empty target is present.
 */
export function parseRefresh(raw) {
	const refresh = { raw };
	const [secondsRaw = '', ...targetParts] = raw.split(';');
	const seconds = Number.parseFloat(secondsRaw.trim());
	if (!Number.isNaN(seconds)) {
		refresh.seconds = seconds;
	}
	// Re-join on `;` so a target that itself contains `;` survives intact.
	const target = targetParts.join(';').trim();
	// The prefix is anchored to the start: an unanchored search would pick up
	// `url=` inside a query string (`https://a.test/?url=b`). The prefix is also
	// optional, so a bare target such as `0; https://a.test/` is still captured.
	const url = target.replace(/^url\s*=\s*/i, '').replaceAll(/^['"]|['"]$/g, '');
	if (url) {
		refresh.url = url;
	}
	return refresh;
}
216
/**
 * Parses a `<script type="application/ld+json">` (or speculationrules) body
 * into a {@link JsonLdEntry}.
 *
 * The input text is always returned as `raw`. When `JSON.parse` succeeds the
 * decoded value is returned as `parsed`; when it throws, the error message
 * (or the stringified thrown value) is returned as `parseError` instead.
 * @param content Script body text to decode.
 * @returns `{ raw, parsed }` on success, `{ raw, parseError }` on failure.
 */
export function parseJsonLd(content) {
	let parsed;
	try {
		parsed = JSON.parse(content);
	} catch (error) {
		let parseError;
		if (error instanceof Error) {
			parseError = error.message;
		} else {
			parseError = String(error);
		}
		return { raw: content, parseError };
	}
	return { raw: content, parsed };
}
233
/**
 * Normalizes a string value according to a {@link KeyTransform}.
 *
 * `'boolean-yes'`: `'yes'` → `true`, `'no'` → `false`, anything else → raw string
 * `'boolean-on'`: `'on'`/`'true'`/`'1'` → `true`, `'off'`/`'false'`/`'0'` → `false`, else raw
 * `'boolean-true'`: `'true'` → `true`, `'false'` → `false`, else raw
 * `'number'`: parsed via `Number.parseFloat`, falls back to raw on NaN
 * `'string'` (default): returns the value unchanged
 *
 * Keyword matching ignores surrounding whitespace and letter case. An
 * unrecognized transform name is treated like `'string'`.
 * @param value Raw string to normalize.
 * @param transform Name of the transform to apply; may be omitted.
 * @returns A boolean or number when the transform recognizes the value,
 * otherwise the original `value`.
 */
export function normalizeValue(value, transform) {
	if (!transform || transform === 'string') {
		return value;
	}
	const lower = value.trim().toLowerCase();
	switch (transform) {
		case 'boolean-yes': {
			if (lower === 'yes') {
				return true;
			}
			if (lower === 'no') {
				return false;
			}
			return value;
		}
		case 'boolean-on': {
			if (lower === 'on' || lower === 'true' || lower === '1') {
				return true;
			}
			if (lower === 'off' || lower === 'false' || lower === '0') {
				return false;
			}
			return value;
		}
		case 'boolean-true': {
			if (lower === 'true') {
				return true;
			}
			if (lower === 'false') {
				return false;
			}
			return value;
		}
		case 'number': {
			const n = Number.parseFloat(value);
			return Number.isNaN(n) ? value : n;
		}
		default: {
			// Without this branch an unknown transform would fall out of the
			// switch and silently turn the value into `undefined`.
			return value;
		}
	}
}
277
/**
 * JSON-LD / speculationrules content size caps.
 *
 * `capJsonLdContent` compares the per-entry cap against `String.length`, so
 * it is counted in UTF-16 code units rather than encoded bytes.
 * `JSON_LD_TOTAL_LIMIT` is only declared here; it is presumably enforced by
 * the caller across all entries, and the `truncated` marker is presumably
 * surfaced via `parseError` — verify against the call site.
 */
export const JSON_LD_PER_ENTRY_LIMIT = 200_000;
export const JSON_LD_TOTAL_LIMIT = 1_000_000;

/**
 * Caps a single JSON-LD entry's raw content to {@link JSON_LD_PER_ENTRY_LIMIT}.
 *
 * Content at or below the limit is returned untouched. Longer content is cut
 * at the limit, or one code unit earlier when the cut would otherwise fall in
 * the middle of a surrogate pair.
 * @param content Raw entry text.
 * @returns `{ content, truncated }` where `truncated` tells whether text was
 * dropped.
 */
export function capJsonLdContent(content) {
	if (content.length <= JSON_LD_PER_ENTRY_LIMIT) {
		return { content, truncated: false };
	}
	let end = JSON_LD_PER_ENTRY_LIMIT;
	const lastKept = content.charCodeAt(end - 1);
	const firstDropped = content.charCodeAt(end);
	const splitsPair =
		lastKept >= 0xd800 && lastKept <= 0xdbff && firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
	if (splitsPair) {
		// Keeping the high half alone would leave an unpaired surrogate, which
		// is not well-formed text; drop it together with its low half.
		end -= 1;
	}
	return { content: content.slice(0, end), truncated: true };
}
@@ -0,0 +1,59 @@
1
+ /**
2
+ * Third-party tag detection layer.
3
+ *
4
+ * Combines two signals to populate {@link TagsMeta}:
5
+ * 1. `simple-wappalyzer` runs over the page HTML + headers to identify
6
+ * the technologies present (and their Wappalyzer categories).
7
+ * 2. {@link extractIds} from `./id-extractors.js` finds the real account
8
+ * / measurement IDs (e.g. `G-XXXXXXXX`, `GTM-XXXXX`) for each detected
9
+ * provider.
10
+ *
11
+ * Returned shape is documented on {@link TagsMeta} in `./types.ts`.
12
+ * @module
13
+ */
14
+ import type { TagsMeta } from './types.js';
15
/**
 * Structural type for one technology reported by `simple-wappalyzer`.
 * Only the fields this module reads are declared; anything else on the
 * runtime object is ignored.
 */
interface WappalyzerTech {
	/** Technology name. */
	readonly name: string;
	/** Detected version, when reported. */
	readonly version?: string;
	/** Detection confidence, when reported. */
	readonly confidence?: number;
	/** Categories the technology belongs to, when reported. */
	readonly categories?: readonly {
		readonly name?: string;
		readonly id?: number;
	}[];
}
28
/**
 * Inputs required to drive `simple-wappalyzer`.
 *
 * `headers` keys should be lowercase; `simple-wappalyzer` is case-insensitive
 * but normalizing up front avoids ambiguity.
 */
export type DetectTagsInput = {
	/** URL of the page. */
	readonly url: string;
	/** HTML of the page. */
	readonly html: string;
	/** Response status code, if known. */
	readonly statusCode?: number;
	/** Response headers; a header may hold one value, several values, or `undefined`. */
	readonly headers?: {
		[name: string]: string | string[] | undefined;
	};
};
40
/**
 * Runs `simple-wappalyzer` on the given page and enriches the detections with
 * provider-specific IDs.
 *
 * When the `simple-wappalyzer` run fails, the promise resolves to an empty
 * `TagsMeta` instead of rejecting.
 * @param input Page URL, HTML and optional response metadata.
 * @returns The assembled `TagsMeta`.
 */
export declare function detectTags(input: DetectTagsInput): Promise<TagsMeta>;
47
/**
 * Turns raw `simple-wappalyzer` detections into a `TagsMeta`, using the page
 * HTML for ID extraction.
 *
 * Exported so unit tests can feed pre-recorded detections without running
 * `simple-wappalyzer`.
 * @param detections Technology entries as returned by `simple-wappalyzer`.
 * @param html Page HTML handed to the ID extractors.
 * @returns The assembled `TagsMeta`.
 */
export declare function assembleTagsMeta(detections: readonly WappalyzerTech[], html: string): TagsMeta;
57
/** Shared empty `TagsMeta` instance (exported for tests). */
export declare const EMPTY_TAGS_META: TagsMeta;
59
+ export {};
@@ -0,0 +1,120 @@
1
+ /**
2
+ * Third-party tag detection layer.
3
+ *
4
+ * Combines two signals to populate {@link TagsMeta}:
5
+ * 1. `simple-wappalyzer` runs over the page HTML + headers to identify
6
+ * the technologies present (and their Wappalyzer categories).
7
+ * 2. {@link extractIds} from `./id-extractors.js` finds the real account
8
+ * / measurement IDs (e.g. `G-XXXXXXXX`, `GTM-XXXXX`) for each detected
9
+ * provider.
10
+ *
11
+ * Returned shape is documented on {@link TagsMeta} in `./types.ts`.
12
+ * @module
13
+ */
14
+ import wappalyzer from 'simple-wappalyzer';
15
+ import { domLog } from '../debug.js';
16
+ import { extractIds } from './id-extractors.js';
17
/** Logger derived from `domLog`, namespaced with the current process id. */
const log = domLog.extend(`${process.pid}`);
/** Module-level empty `TagsMeta`; re-exported as `EMPTY_TAGS_META`. */
const EMPTY_TAGS = { detected: {}, entries: [] };

/**
 * Drives `simple-wappalyzer` and post-processes the result with the
 * provider-specific ID extractors.
 *
 * A failure in either stage — the `simple-wappalyzer` run or the assembly of
 * its output — is logged and answered with a fresh empty `TagsMeta`, so the
 * caller does not need to wrap the call.
 * @param input Page URL, HTML and optional response headers. `statusCode` is
 * not read here.
 * @returns The assembled `TagsMeta`, or an empty one on failure.
 */
export async function detectTags(input) {
	const headers = normalizeHeaders(input.headers);
	let detections;
	try {
		const result = await wappalyzer({
			url: input.url,
			html: input.html,
			headers,
		});
		// Anything other than an array is treated as "nothing detected".
		detections = Array.isArray(result) ? result : [];
	} catch (error) {
		log('detectTags: simple-wappalyzer failed; returning empty TagsMeta. Error: %O', error);
		return cloneEmpty();
	}
	// Assembly is guarded as well: a malformed detection entry or a throwing
	// ID extractor would otherwise reject the promise despite the documented
	// fall-back-to-empty contract.
	try {
		return assembleTagsMeta(detections, input.html);
	} catch (error) {
		log('detectTags: assembling TagsMeta failed; returning empty TagsMeta. Error: %O', error);
		return cloneEmpty();
	}
}
42
/**
 * Builds a `TagsMeta` from the raw `simple-wappalyzer` output and the page
 * HTML used for ID extraction.
 *
 * For every detection with a name this fills two views:
 * - `detected[category][name]` holds `{ ids, version?, confidence? }`, using
 *   the category `'Other'` when the detection has no string category name;
 * - `entries` receives one record per extracted ID, or a single record
 *   without `id` when no ID was extracted.
 *
 * Exported for unit tests that bypass `simple-wappalyzer` and feed
 * pre-recorded detections directly.
 * @param detections Technology entries as returned by `simple-wappalyzer`.
 * @param html Page HTML handed to `extractIds`.
 * @returns `{ detected, entries }`.
 */
export function assembleTagsMeta(detections, html) {
	const detected = {};
	const entries = [];
	for (const tech of detections) {
		const provider = tech.name;
		if (!provider) {
			continue;
		}
		const ids = extractIds(provider, html);
		const categoryNames = tech.categories?.map((category) => category.name);
		const categories = categoryNames?.filter((label) => typeof label === 'string') ?? [];

		// Optional fields are only present when the detection reported them.
		const optional = {};
		if (tech.version !== undefined) {
			optional.version = tech.version;
		}
		if (tech.confidence !== undefined) {
			optional.confidence = tech.confidence;
		}

		const detail = { ids, ...optional };
		const buckets = categories.length > 0 ? categories : ['Other'];
		for (const bucket of buckets) {
			if (detected[bucket] === undefined) {
				detected[bucket] = {};
			}
			detected[bucket][provider] = detail;
		}

		// One `sources` array is shared by all entries of this detection.
		const baseSources = [{ type: 'html' }];
		const toEntry = (idField) => ({
			provider,
			categories,
			...idField,
			...optional,
			sources: baseSources,
		});
		if (ids.length === 0) {
			entries.push(toEntry({}));
			continue;
		}
		for (const id of ids) {
			entries.push(toEntry({ id }));
		}
	}
	return { detected, entries };
}
97
/**
 * Creates a brand-new empty `TagsMeta`-shaped object on every call, so the
 * result never shares its `detected` map or `entries` list with another one.
 * @returns `{ detected: {}, entries: [] }`.
 */
function cloneEmpty() {
	const detected = {};
	const entries = [];
	return { detected, entries };
}
103
/**
 * Flattens a header map into plain `name → string` pairs.
 *
 * Header names are lower-cased, array values are joined with `', '`, and
 * headers whose value is `undefined` are left out. A missing header map
 * yields an empty object.
 * @param headers Header map whose values may be a string, an array of strings,
 * or `undefined`; may itself be absent.
 * @returns A new object with lower-cased header names.
 */
function normalizeHeaders(headers) {
	const normalized = {};
	if (headers) {
		for (const name of Object.keys(headers)) {
			const value = headers[name];
			if (value !== undefined) {
				normalized[name.toLowerCase()] = Array.isArray(value) ? value.join(', ') : value;
			}
		}
	}
	return normalized;
}
119
/**
 * Singleton empty `TagsMeta` value (exported for tests).
 * This is the module-level `EMPTY_TAGS` object itself, not a copy.
 */
export const EMPTY_TAGS_META = EMPTY_TAGS;