@d-zero/beholder 2.1.6 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +44 -0
- package/README.md +26 -0
- package/dist/dom-evaluation.d.ts +72 -24
- package/dist/dom-evaluation.js +310 -84
- package/dist/extract-meta.d.ts +98 -0
- package/dist/extract-meta.js +75 -0
- package/dist/index.d.ts +3 -1
- package/dist/index.js +1 -0
- package/dist/meta/classify.d.ts +52 -0
- package/dist/meta/classify.js +731 -0
- package/dist/meta/collect-head.d.ts +63 -0
- package/dist/meta/collect-head.js +223 -0
- package/dist/meta/id-extractors.d.ts +40 -0
- package/dist/meta/id-extractors.js +196 -0
- package/dist/meta/keys.d.ts +41 -0
- package/dist/meta/keys.js +507 -0
- package/dist/meta/parsers.d.ts +74 -0
- package/dist/meta/parsers.js +293 -0
- package/dist/meta/tag-detection.d.ts +59 -0
- package/dist/meta/tag-detection.js +120 -0
- package/dist/meta/types.d.ts +874 -0
- package/dist/meta/types.js +12 -0
- package/dist/scraper.js +15 -13
- package/dist/types.d.ts +3 -38
- package/package.json +8 -5
- package/src/dom-evaluation.spec.ts +301 -73
- package/src/dom-evaluation.ts +417 -88
- package/src/extract-meta.spec.ts +247 -0
- package/src/extract-meta.ts +121 -0
- package/src/index.ts +45 -0
- package/src/meta/classify.spec.ts +281 -0
- package/src/meta/classify.ts +810 -0
- package/src/meta/collect-head.ts +247 -0
- package/src/meta/id-extractors.spec.ts +69 -0
- package/src/meta/id-extractors.ts +206 -0
- package/src/meta/keys.ts +568 -0
- package/src/meta/parsers.spec.ts +178 -0
- package/src/meta/parsers.ts +304 -0
- package/src/meta/simple-wappalyzer.d.ts +37 -0
- package/src/meta/tag-detection.spec.ts +134 -0
- package/src/meta/tag-detection.ts +161 -0
- package/src/meta/types.ts +949 -0
- package/src/scraper.ts +19 -13
- package/src/types.ts +49 -55
- package/tsconfig.tsbuildinfo +1 -1
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
import { describe, expect, it } from 'vitest';
|
|
2
|
+
|
|
3
|
+
import {
|
|
4
|
+
capJsonLdContent,
|
|
5
|
+
JSON_LD_PER_ENTRY_LIMIT,
|
|
6
|
+
normalizeValue,
|
|
7
|
+
parseFormatDetection,
|
|
8
|
+
parseJsonLd,
|
|
9
|
+
parseRefresh,
|
|
10
|
+
parseReferrer,
|
|
11
|
+
parseRobots,
|
|
12
|
+
parseViewport,
|
|
13
|
+
} from './parsers.js';
|
|
14
|
+
|
|
15
|
+
describe('parseViewport', () => {
	// Typical mobile viewport: the width keyword stays a string, the scale becomes a number.
	it('parses width=device-width and initial-scale', () => {
		const content = 'width=device-width, initial-scale=1.0';
		const viewport = parseViewport(content);
		expect(viewport.raw).toBe(content);
		expect(viewport.width).toBe('device-width');
		expect(viewport.initialScale).toBe(1);
	});

	it('parses user-scalable=no as boolean false', () => {
		const { userScalable } = parseViewport('user-scalable=no');
		expect(userScalable).toBe(false);
	});

	it('parses minimum-scale/maximum-scale as numbers', () => {
		const { minimumScale, maximumScale } = parseViewport(
			'minimum-scale=0.5, maximum-scale=2',
		);
		expect(minimumScale).toBe(0.5);
		expect(maximumScale).toBe(2);
	});

	// These two directives are keyword-valued and must be passed through untouched.
	it('preserves viewport-fit and interactive-widget literally', () => {
		const viewport = parseViewport('viewport-fit=cover, interactive-widget=resizes-visual');
		expect(viewport.viewportFit).toBe('cover');
		expect(viewport.interactiveWidget).toBe('resizes-visual');
	});

	it('keeps raw on unrecognizable input', () => {
		const viewport = parseViewport('garbage');
		expect(viewport.raw).toBe('garbage');
		expect(viewport.width).toBeUndefined();
	});
});
|
|
46
|
+
|
|
47
|
+
describe('parseRobots', () => {
	it('flags noindex/nofollow/noarchive', () => {
		// Mixed case on purpose: directive names are matched case-insensitively.
		const robots = parseRobots('noindex, NOFOLLOW, noarchive');
		expect(robots.noindex).toBe(true);
		expect(robots.nofollow).toBe(true);
		expect(robots.noarchive).toBe(true);
	});

	it('extracts max-snippet, max-image-preview, max-video-preview', () => {
		const content = 'max-snippet:50, max-image-preview:large, max-video-preview:120';
		const { maxSnippet, maxImagePreview, maxVideoPreview } = parseRobots(content);
		expect(maxSnippet).toBe(50);
		expect(maxImagePreview).toBe('large');
		expect(maxVideoPreview).toBe(120);
	});

	it('extracts unavailable_after', () => {
		const { unavailableAfter } = parseRobots('unavailable_after:2026-12-31');
		expect(unavailableAfter).toBe('2026-12-31');
	});

	it('flags index/follow positives', () => {
		const robots = parseRobots('index, follow');
		expect(robots.index).toBe(true);
		expect(robots.follow).toBe(true);
	});
});
|
|
75
|
+
|
|
76
|
+
describe('parseReferrer', () => {
	// Each recognized policy token is surfaced as a camelCased boolean flag.
	it('flags strict-origin-when-cross-origin', () => {
		const { strictOriginWhenCrossOrigin } = parseReferrer('strict-origin-when-cross-origin');
		expect(strictOriginWhenCrossOrigin).toBe(true);
	});

	it('flags no-referrer', () => {
		const { noReferrer } = parseReferrer('no-referrer');
		expect(noReferrer).toBe(true);
	});
});
|
|
87
|
+
|
|
88
|
+
describe('parseFormatDetection', () => {
	it('parses telephone=no, address=no', () => {
		const detection = parseFormatDetection('telephone=no, address=no');
		expect(detection.telephone).toBe(false);
		expect(detection.address).toBe(false);
	});

	// Semicolons are accepted as an alternative to commas between directives.
	it('parses date=no via semicolon separator', () => {
		const detection = parseFormatDetection('telephone=no; date=no');
		expect(detection.telephone).toBe(false);
		expect(detection.date).toBe(false);
	});
});
|
|
101
|
+
|
|
102
|
+
describe('parseRefresh', () => {
	it('parses seconds and url', () => {
		const { seconds, url } = parseRefresh('5; url=https://example.com/');
		expect(seconds).toBe(5);
		expect(url).toBe('https://example.com/');
	});

	// A bare delay means "reload this page": there is no target URL to report.
	it('handles missing url', () => {
		const { seconds, url } = parseRefresh('30');
		expect(seconds).toBe(30);
		expect(url).toBeUndefined();
	});

	it('strips surrounding quotes in url', () => {
		const quoted = `0; url='https://example.com/'`;
		expect(parseRefresh(quoted).url).toBe('https://example.com/');
	});
});
|
|
120
|
+
|
|
121
|
+
describe('parseJsonLd', () => {
	it('returns parsed object on valid JSON', () => {
		const { parsed, parseError } = parseJsonLd('{"@type":"WebSite","name":"Site"}');
		expect(parsed).toEqual({ '@type': 'WebSite', name: 'Site' });
		expect(parseError).toBeUndefined();
	});

	// Malformed bodies must not throw; the failure is reported on the entry itself.
	it('records parseError on invalid JSON', () => {
		const { parsed, parseError } = parseJsonLd('{ not valid }');
		expect(parsed).toBeUndefined();
		expect(parseError).toBeDefined();
	});
});
|
|
134
|
+
|
|
135
|
+
describe('normalizeValue', () => {
	it('passes through string by default', () => {
		const input = 'hello';
		expect(normalizeValue(input)).toBe('hello');
		expect(normalizeValue(input, 'string')).toBe('hello');
	});

	it('boolean-yes maps yes/no', () => {
		const transform = 'boolean-yes';
		expect(normalizeValue('yes', transform)).toBe(true);
		expect(normalizeValue('no', transform)).toBe(false);
		// Anything outside the yes/no vocabulary is returned untouched.
		expect(normalizeValue('unknown', transform)).toBe('unknown');
	});

	it('boolean-on maps on/off/true/false/1/0', () => {
		const transform = 'boolean-on';
		expect(normalizeValue('on', transform)).toBe(true);
		expect(normalizeValue('off', transform)).toBe(false);
		expect(normalizeValue('true', transform)).toBe(true);
		expect(normalizeValue('0', transform)).toBe(false);
	});

	it('boolean-true maps true/false only', () => {
		const transform = 'boolean-true';
		expect(normalizeValue('true', transform)).toBe(true);
		expect(normalizeValue('false', transform)).toBe(false);
		// '1' is not part of the strict true/false vocabulary.
		expect(normalizeValue('1', transform)).toBe('1');
	});

	it('number parses floats and falls back to raw', () => {
		const transform = 'number';
		expect(normalizeValue('3.14', transform)).toBe(3.14);
		expect(normalizeValue('NaN-ish', transform)).toBe('NaN-ish');
	});
});
|
|
165
|
+
|
|
166
|
+
describe('capJsonLdContent', () => {
	it('returns content unchanged when under the limit', () => {
		const capped = capJsonLdContent('{}');
		expect(capped).toEqual({ content: '{}', truncated: false });
	});

	it('truncates content over the per-entry limit', () => {
		// 100 characters past the cap is enough to force truncation.
		const oversized = 'a'.repeat(JSON_LD_PER_ENTRY_LIMIT + 100);
		const { content, truncated } = capJsonLdContent(oversized);
		expect(truncated).toBe(true);
		expect(content.length).toBe(JSON_LD_PER_ENTRY_LIMIT);
	});
});
|
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Value normalizers used by `classify()` to turn raw `content` strings into
|
|
3
|
+
* structured objects (viewport, robots, format-detection, etc.).
|
|
4
|
+
*
|
|
5
|
+
* Each parser is a pure function that takes the raw `content` string and
|
|
6
|
+
* returns a normalized structure. They never throw; on unrecognizable input
|
|
7
|
+
* they fall back to keeping the `raw` field only.
|
|
8
|
+
* @module
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import type { KeyTransform } from './keys.js';
|
|
12
|
+
import type {
|
|
13
|
+
FormatDetectionMeta,
|
|
14
|
+
HttpEquivRefresh,
|
|
15
|
+
JsonLdEntry,
|
|
16
|
+
ReferrerMeta,
|
|
17
|
+
RobotsMeta,
|
|
18
|
+
ViewportMeta,
|
|
19
|
+
} from './types.js';
|
|
20
|
+
|
|
21
|
+
/**
 * Parses `<meta name="viewport">` content into a structured `ViewportMeta`.
 *
 * The content is a list of `key=value` directives. Both `,` and `;` are
 * accepted as directive separators: semicolons are common in real-world
 * markup, and splitting on commas alone would fold the next directive into
 * the previous value (e.g. `width` becoming `'device-width; initial-scale'`).
 * Keys are matched case-insensitively; unknown keys are ignored. The function
 * never throws and always preserves the original string in `raw`.
 * @param raw The raw `content` attribute value.
 * @returns The parsed viewport; only directives present in `raw` are set.
 * @example parseViewport('width=device-width, initial-scale=1.0')
 *   → { raw: '...', width: 'device-width', initialScale: 1 }
 */
export function parseViewport(raw: string): ViewportMeta {
	const meta: ViewportMeta = { raw };
	for (const part of raw.split(/[,;]/)) {
		const [keyRaw = '', valueRaw = ''] = part.split('=');
		const key = keyRaw.trim().toLowerCase();
		const value = valueRaw.trim();
		if (!key) continue;
		switch (key) {
			case 'width': {
				meta.width = value;
				break;
			}
			case 'height': {
				meta.height = value;
				break;
			}
			case 'initial-scale': {
				const scale = parseViewportScale(value);
				if (scale !== undefined) meta.initialScale = scale;
				break;
			}
			case 'minimum-scale': {
				const scale = parseViewportScale(value);
				if (scale !== undefined) meta.minimumScale = scale;
				break;
			}
			case 'maximum-scale': {
				const scale = parseViewportScale(value);
				if (scale !== undefined) meta.maximumScale = scale;
				break;
			}
			case 'user-scalable': {
				// yes/no and 1/0 become booleans; any other token is kept verbatim.
				const lower = value.toLowerCase();
				if (lower === 'no' || lower === '0') meta.userScalable = false;
				else if (lower === 'yes' || lower === '1') meta.userScalable = true;
				else meta.userScalable = value;
				break;
			}
			case 'viewport-fit': {
				meta.viewportFit = value;
				break;
			}
			case 'interactive-widget': {
				meta.interactiveWidget = value;
				break;
			}
		}
	}
	return meta;
}

/**
 * Reads a viewport scale factor; returns `undefined` when the value is not numeric.
 * @param value The trimmed directive value.
 */
function parseViewportScale(value: string): number | undefined {
	const parsed = Number.parseFloat(value);
	return Number.isNaN(parsed) ? undefined : parsed;
}
|
|
79
|
+
|
|
80
|
+
/** Valueless robots directives; each one becomes a `true` flag of the same name. */
const ROBOTS_BOOLEAN_FLAGS = new Set<keyof RobotsMeta>([
	'index',
	'noindex',
	'follow',
	'nofollow',
	'none',
	'all',
	'noarchive',
	'nosnippet',
	'noimageindex',
	'nocache',
	'notranslate',
	'noodp',
	'noydir',
	'indexifembedded',
]);

/**
 * Matches a value that consists solely of a weekday name (`Wed`, `Wednesday`).
 * Used to spot an RFC 822 / RFC 850 date that the comma split cut in two.
 */
const ROBOTS_WEEKDAY_ONLY = /^(?:mon|tues?|wed(?:nes)?|thu(?:rs)?|fri|sat(?:ur)?|sun)(?:day)?$/i;

/**
 * Parses `<meta name="robots">` content into a structured `RobotsMeta`.
 *
 * The content is split on commas. Known valueless directives are set to
 * `true`; `key:value` directives (`max-snippet`, `max-image-preview`,
 * `max-video-preview`, `unavailable_after` / `unavailable-after`) are
 * extracted; everything else is ignored. Directive names are matched
 * case-insensitively while values keep their original casing.
 *
 * `unavailable_after` dates written with a leading weekday contain a comma
 * themselves (`Wed, 02 Oct 2002 13:00:00 GMT`), so when the value is only a
 * weekday name the following token is re-joined to it instead of being
 * dropped.
 * @param raw The raw `content` attribute value.
 * @returns The parsed directives; `raw` always holds the original string.
 * @example parseRobots('noindex, max-snippet:50, unavailable_after:2026-01-01')
 *   → { raw: '...', noindex: true, maxSnippet: 50, unavailableAfter: '2026-01-01' }
 */
export function parseRobots(raw: string): RobotsMeta {
	const meta: RobotsMeta = { raw };
	const tokens = raw.split(',');
	for (let i = 0; i < tokens.length; i++) {
		const token = tokens[i] ?? '';
		const trimmed = token.trim().toLowerCase();
		if (!trimmed) continue;

		if (ROBOTS_BOOLEAN_FLAGS.has(trimmed as keyof RobotsMeta)) {
			(meta as Record<string, unknown>)[trimmed] = true;
			continue;
		}

		const colonIndex = trimmed.indexOf(':');
		if (colonIndex === -1) {
			continue;
		}
		const key = trimmed.slice(0, colonIndex).trim();
		// Take the value from the un-lowercased token so its casing is preserved.
		const value = token.slice(token.indexOf(':') + 1).trim();
		switch (key) {
			case 'max-snippet': {
				const n = Number.parseInt(value, 10);
				if (!Number.isNaN(n)) meta.maxSnippet = n;
				break;
			}
			case 'max-image-preview': {
				meta.maxImagePreview = value;
				break;
			}
			case 'max-video-preview': {
				const n = Number.parseInt(value, 10);
				if (!Number.isNaN(n)) meta.maxVideoPreview = n;
				break;
			}
			case 'unavailable_after':
			case 'unavailable-after': {
				const next = tokens[i + 1]?.trim() ?? '';
				const nextIsFlag = ROBOTS_BOOLEAN_FLAGS.has(next.toLowerCase() as keyof RobotsMeta);
				if (ROBOTS_WEEKDAY_ONLY.test(value) && next && !nextIsFlag) {
					meta.unavailableAfter = `${value}, ${next}`;
					// The next token was the rest of the date; do not parse it again.
					i++;
				} else {
					meta.unavailableAfter = value;
				}
				break;
			}
		}
	}
	return meta;
}
|
|
144
|
+
|
|
145
|
+
/**
 * Maps each referrer-policy token to the `ReferrerMeta` flag it sets.
 *
 * A `Map` is used instead of a plain object because the lookup key comes
 * straight from page content: on an object literal, tokens such as
 * `constructor` or `__proto__` would resolve to inherited properties and be
 * mistaken for a policy.
 */
const REFERRER_POLICY_KEYS: ReadonlyMap<string, keyof ReferrerMeta> = new Map<
	string,
	keyof ReferrerMeta
>([
	['no-referrer', 'noReferrer'],
	['origin', 'origin'],
	['origin-when-cross-origin', 'originWhenCrossOrigin'],
	['strict-origin', 'strictOrigin'],
	['strict-origin-when-cross-origin', 'strictOriginWhenCrossOrigin'],
	['unsafe-url', 'unsafeUrl'],
	['same-origin', 'sameOrigin'],
	['no-referrer-when-downgrade', 'noReferrerWhenDowngrade'],
]);

/**
 * Parses `<meta name="referrer">` content into a structured `ReferrerMeta`.
 *
 * The trimmed, lowercased content is looked up as a single policy token. A
 * recognized token sets the matching boolean flag; anything else leaves only
 * `raw` populated.
 * @param raw The raw `content` attribute value.
 * @returns The parsed policy; `raw` always holds the original string.
 */
export function parseReferrer(raw: string): ReferrerMeta {
	const meta: ReferrerMeta = { raw };
	const key = REFERRER_POLICY_KEYS.get(raw.trim().toLowerCase());
	if (key) {
		(meta as Record<string, unknown>)[key] = true;
	}
	return meta;
}
|
|
168
|
+
|
|
169
|
+
/**
 * Parses `<meta name="format-detection">` content (e.g. `'telephone=no, address=no'`).
 *
 * Directives may be separated by `,` or `;`. A directive is treated as
 * disabled only when its value is `no`, `false` or `0` (case-insensitive);
 * every other value, including a missing one, counts as enabled. Directives
 * other than `telephone`, `email`, `address` and `date` are ignored.
 * @param raw The raw `content` attribute value.
 * @returns The parsed flags; `raw` always holds the original string.
 */
export function parseFormatDetection(raw: string): FormatDetectionMeta {
	const result: FormatDetectionMeta = { raw };
	const segments = raw.split(/[,;]/);
	for (const segment of segments) {
		const pieces = segment.split('=');
		const name = (pieces[0] ?? '').trim().toLowerCase();
		if (name === '') continue;

		const setting = (pieces[1] ?? '').trim().toLowerCase();
		const isOn = !(setting === 'no' || setting === 'false' || setting === '0');

		if (name === 'telephone') {
			result.telephone = isOn;
		} else if (name === 'email') {
			result.email = isOn;
		} else if (name === 'address') {
			result.address = isOn;
		} else if (name === 'date') {
			result.date = isOn;
		}
	}
	return result;
}
|
|
204
|
+
|
|
205
|
+
/**
 * Parses `<meta http-equiv="refresh">` content (e.g. `'5; url=https://...'`).
 *
 * The delay is everything before the first `;` or `,` (both separators occur
 * in practice). The remainder is the target URL, optionally introduced by a
 * `url=` prefix and optionally wrapped in quotes. The prefix is only
 * recognized at the start of the remainder, so a `...url=` fragment inside
 * the target's own query string is never mistaken for it.
 * @param raw The raw `content` attribute value.
 * @returns The parsed refresh; `seconds` is set when the delay is numeric and
 *   `url` when a non-empty target is present. `raw` always holds the original string.
 */
export function parseRefresh(raw: string): HttpEquivRefresh {
	const refresh: HttpEquivRefresh = { raw };

	const separatorIndex = raw.search(/[;,]/);
	const secondsRaw = separatorIndex === -1 ? raw : raw.slice(0, separatorIndex);
	const rest = separatorIndex === -1 ? '' : raw.slice(separatorIndex + 1);

	const seconds = Number.parseFloat(secondsRaw.trim());
	if (!Number.isNaN(seconds)) {
		refresh.seconds = seconds;
	}

	const target = rest
		.trim()
		// Drop the optional `url=` prefix, anchored so it cannot match mid-URL.
		.replace(/^url\s*=\s*/i, '')
		// Strip one leading and/or trailing quote character.
		.replace(/^['"]|['"]$/g, '');
	if (target) {
		refresh.url = target;
	}
	return refresh;
}
|
|
224
|
+
|
|
225
|
+
/**
 * Turns the body of a `<script type="application/ld+json">` (or
 * speculationrules) element into a {@link JsonLdEntry}.
 *
 * The body is fed to `JSON.parse`. On success the entry carries the decoded
 * value in `parsed`; on failure it carries the error message in `parseError`.
 * Either way the untouched input is kept in `raw`, and the function does not throw.
 * @param content The script element's text content.
 * @returns An entry with `raw` plus either `parsed` or `parseError`.
 */
export function parseJsonLd(content: string): JsonLdEntry {
	let parsed: unknown;
	try {
		parsed = JSON.parse(content);
	} catch (error) {
		// Non-Error throwables are stringified so `parseError` is always a string.
		const parseError = error instanceof Error ? error.message : String(error);
		return { raw: content, parseError };
	}
	return { raw: content, parsed };
}
|
|
241
|
+
|
|
242
|
+
/**
 * Converts a raw string into the representation requested by a {@link KeyTransform}.
 *
 * - `'string'` or no transform: the value is returned as-is.
 * - `'boolean-yes'`: `yes` → `true`, `no` → `false`.
 * - `'boolean-on'`: `on` / `true` / `1` → `true`, `off` / `false` / `0` → `false`.
 * - `'boolean-true'`: `true` → `true`, `false` → `false`.
 * - `'number'`: result of `Number.parseFloat`, unless that is `NaN`.
 *
 * Boolean keywords are compared after trimming and lowercasing. Whenever the
 * value does not fit the requested transform, the original string is returned
 * unchanged.
 * @param value The raw string to normalize.
 * @param transform The transform to apply; `undefined` behaves like `'string'`.
 * @returns The normalized boolean or number, or the original string as a fallback.
 */
export function normalizeValue(
	value: string,
	transform: KeyTransform | undefined,
): string | number | boolean {
	if (transform === 'string' || !transform) {
		return value;
	}
	const word = value.trim().toLowerCase();
	switch (transform) {
		case 'boolean-yes': {
			return matchBooleanWord(word, ['yes'], ['no']) ?? value;
		}
		case 'boolean-on': {
			return matchBooleanWord(word, ['on', 'true', '1'], ['off', 'false', '0']) ?? value;
		}
		case 'boolean-true': {
			return matchBooleanWord(word, ['true'], ['false']) ?? value;
		}
		case 'number': {
			const parsed = Number.parseFloat(value);
			return Number.isNaN(parsed) ? value : parsed;
		}
	}
}

/**
 * Classifies a lowercased keyword against two vocabularies.
 * @param word The keyword to classify.
 * @param truthy Keywords meaning `true`.
 * @param falsy Keywords meaning `false`.
 * @returns `true` / `false` on a match, `undefined` when the word is in neither list.
 */
function matchBooleanWord(
	word: string,
	truthy: readonly string[],
	falsy: readonly string[],
): boolean | undefined {
	if (truthy.includes(word)) return true;
	if (falsy.includes(word)) return false;
	return undefined;
}
|
|
283
|
+
|
|
284
|
+
/**
 * JSON-LD / speculationrules content size caps, measured in UTF-16 code units
 * (i.e. `String.prototype.length`), not bytes. `JSON_LD_PER_ENTRY_LIMIT`
 * bounds a single entry; `JSON_LD_TOTAL_LIMIT` is the corresponding budget
 * for all entries together.
 */
export const JSON_LD_PER_ENTRY_LIMIT = 200_000;
export const JSON_LD_TOTAL_LIMIT = 1_000_000;

/**
 * Caps a single JSON-LD entry's raw content to {@link JSON_LD_PER_ENTRY_LIMIT}.
 *
 * Content at or under the limit is returned unchanged. Longer content is cut
 * at the limit; if that cut would fall between the two halves of a surrogate
 * pair, the dangling high surrogate is dropped as well, so the result never
 * ends in half a character (it is then one code unit shorter than the limit).
 * @param content The entry's raw text.
 * @returns The (possibly shortened) content and whether it was truncated.
 */
export function capJsonLdContent(content: string): {
	content: string;
	truncated: boolean;
} {
	if (content.length <= JSON_LD_PER_ENTRY_LIMIT) {
		return { content, truncated: false };
	}
	let end = JSON_LD_PER_ENTRY_LIMIT;
	const lastKept = content.charCodeAt(end - 1);
	const firstDropped = content.charCodeAt(end);
	const splitsSurrogatePair =
		lastKept >= 0xd8_00 &&
		lastKept <= 0xdb_ff &&
		firstDropped >= 0xdc_00 &&
		firstDropped <= 0xdf_ff;
	if (splitsSurrogatePair) {
		end -= 1;
	}
	return { content: content.slice(0, end), truncated: true };
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Ambient type declarations for `simple-wappalyzer` (no upstream types).
|
|
3
|
+
*
|
|
4
|
+
* Mirrors the runtime shape verified against the installed
|
|
5
|
+
* `simple-wappalyzer@1.1.99` (`node_modules/simple-wappalyzer/src/index.js`):
|
|
6
|
+
* the module exports a single async function taking `{ url, headers, html }`
|
|
7
|
+
* and returning the resolved Wappalyzer technology list.
|
|
8
|
+
*
|
|
9
|
+
* Only the subset of fields actually consumed in {@link tag-detection.ts} is
|
|
10
|
+
 * declared; the runtime value may carry additional keys beyond these, which
|
|
11
|
+
* we ignore.
|
|
12
|
+
*/
|
|
13
|
+
declare module 'simple-wappalyzer' {
	/** Argument object accepted by the default export. */
	export type WappalyzerInput = {
		/** Address of the page being analysed. */
		readonly url: string;
		/** Markup of the page being analysed. */
		readonly html: string;
		/** Optional response headers, one string value per header name. */
		readonly headers?: Record<string, string>;
	};

	/** A category attached to a detection; every field is optional. */
	export type WappalyzerCategory = {
		readonly name?: string;
		readonly slug?: string;
		readonly id?: number;
	};

	/** One detected technology; only `name` is guaranteed by this declaration. */
	export type WappalyzerDetection = {
		readonly name: string;
		readonly categories?: readonly WappalyzerCategory[];
		readonly version?: string;
		readonly confidence?: number;
		readonly website?: string;
		readonly icon?: string;
	};

	/** Runs detection for the given input and resolves to the list of detections. */
	const wappalyzer: (input: WappalyzerInput) => Promise<WappalyzerDetection[]>;
	export default wappalyzer;
}
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
import { describe, expect, it, vi } from 'vitest';
|
|
2
|
+
|
|
3
|
+
import { assembleTagsMeta, detectTags, EMPTY_TAGS_META } from './tag-detection.js';
|
|
4
|
+
|
|
5
|
+
// Replace the real detector with a bare mock so each test can script its result.
vi.mock('simple-wappalyzer', () => {
	return { default: vi.fn() };
});

// Imported dynamically, after the mock registration, so the mocked default is what we get.
const { default: mockedWappalyzer } = await import('simple-wappalyzer');
const wappalyzerMock = mockedWappalyzer as unknown as ReturnType<typeof vi.fn>;
|
|
11
|
+
|
|
12
|
+
describe('assembleTagsMeta', () => {
	it('groups detections under their Wappalyzer categories', () => {
		const analytics = {
			name: 'Google Analytics',
			version: 'GA4',
			confidence: 100,
			categories: [{ name: 'Analytics' }],
		};
		// Belongs to two categories, so it must show up under both.
		const tagManager = {
			name: 'Google Tag Manager',
			confidence: 100,
			categories: [{ name: 'Tag Managers' }, { name: 'Analytics' }],
		};
		const { detected } = assembleTagsMeta([analytics, tagManager], '');
		expect(detected.Analytics).toBeDefined();
		expect(detected.Analytics?.['Google Analytics']?.version).toBe('GA4');
		expect(detected['Tag Managers']?.['Google Tag Manager']).toBeDefined();
		expect(detected.Analytics?.['Google Tag Manager']).toBeDefined();
	});

	it('attaches extracted IDs to detail and emits one entry per ID', () => {
		const html = `<script>gtag('config', 'G-XYZ123')</script><script>gtag('config', 'G-AAA999')</script>`;
		const expectedIds = ['G-XYZ123', 'G-AAA999'];
		const detections = [{ name: 'Google Analytics', categories: [{ name: 'Analytics' }] }];
		const { detected, entries } = assembleTagsMeta(detections, html);
		expect(detected.Analytics?.['Google Analytics']?.ids).toEqual(expectedIds);
		const idsFromEntries = entries
			.filter((entry) => entry.provider === 'Google Analytics')
			.map((entry) => entry.id);
		expect(idsFromEntries).toEqual(expectedIds);
	});

	it('emits one entry without id when no IDs are extracted', () => {
		const detections = [
			{
				name: 'jQuery',
				version: '3.6.0',
				categories: [{ name: 'JavaScript Libraries' }],
			},
		];
		const { entries } = assembleTagsMeta(detections, '<html></html>');
		expect(entries).toHaveLength(1);
		const [only] = entries;
		expect(only?.id).toBeUndefined();
		expect(only?.version).toBe('3.6.0');
	});

	it('falls back to "Other" category when no categories are present', () => {
		const uncategorized = { name: 'Unknown', categories: [] };
		const { detected } = assembleTagsMeta([uncategorized], '<html></html>');
		expect(detected['Other']?.['Unknown']).toBeDefined();
	});

	it('skips detections without a name', () => {
		const nameless = { name: '', categories: [{ name: 'Analytics' }] };
		const { detected, entries } = assembleTagsMeta([nameless], '<html></html>');
		expect(entries).toHaveLength(0);
		expect(Object.keys(detected)).toHaveLength(0);
	});
});
|
|
85
|
+
|
|
86
|
+
describe('detectTags', () => {
	it('falls back to empty TagsMeta when simple-wappalyzer throws', async () => {
		wappalyzerMock.mockRejectedValueOnce(new Error('wappalyzer boom'));
		const input = { url: 'https://example.com/', html: '<html></html>' };
		const tags = await detectTags(input);
		expect(tags).toEqual(EMPTY_TAGS_META);
	});

	it('falls back to empty TagsMeta when simple-wappalyzer returns non-array', async () => {
		// Deliberately violates the declared return type to exercise the guard.
		wappalyzerMock.mockResolvedValueOnce(null as unknown as never);
		const input = { url: 'https://example.com/', html: '<html></html>' };
		const tags = await detectTags(input);
		expect(tags).toEqual(EMPTY_TAGS_META);
	});

	it('passes detections through assembleTagsMeta', async () => {
		const detection = {
			name: 'Google Analytics',
			version: 'GA4',
			categories: [{ name: 'Analytics' }],
		};
		wappalyzerMock.mockResolvedValueOnce([detection] as never);
		const { entries } = await detectTags({
			url: 'https://example.com/',
			html: `<script>gtag('config', 'G-XYZ123')</script>`,
		});
		expect(entries).toHaveLength(1);
		expect(entries[0]?.id).toBe('G-XYZ123');
	});

	it('normalizes headers to lowercase before calling wappalyzer', async () => {
		wappalyzerMock.mockResolvedValueOnce([] as never);
		await detectTags({
			url: 'https://example.com/',
			html: '<html></html>',
			headers: { 'Content-Type': 'text/html', 'X-Custom': ['a', 'b'] },
		});
		// Inspect the argument of the most recent call to the mocked detector.
		const lastCall = wappalyzerMock.mock.calls.at(-1);
		const arg = lastCall?.[0] as {
			headers?: Record<string, string>;
		};
		const forwarded = arg?.headers;
		expect(forwarded?.['content-type']).toBe('text/html');
		expect(forwarded?.['x-custom']).toBe('a, b');
	});
});
|