@d-zero/beholder 2.1.6 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +44 -0
- package/README.md +26 -0
- package/dist/dom-evaluation.d.ts +72 -24
- package/dist/dom-evaluation.js +310 -84
- package/dist/extract-meta.d.ts +98 -0
- package/dist/extract-meta.js +75 -0
- package/dist/index.d.ts +3 -1
- package/dist/index.js +1 -0
- package/dist/meta/classify.d.ts +52 -0
- package/dist/meta/classify.js +731 -0
- package/dist/meta/collect-head.d.ts +63 -0
- package/dist/meta/collect-head.js +223 -0
- package/dist/meta/id-extractors.d.ts +40 -0
- package/dist/meta/id-extractors.js +196 -0
- package/dist/meta/keys.d.ts +41 -0
- package/dist/meta/keys.js +507 -0
- package/dist/meta/parsers.d.ts +74 -0
- package/dist/meta/parsers.js +293 -0
- package/dist/meta/tag-detection.d.ts +59 -0
- package/dist/meta/tag-detection.js +120 -0
- package/dist/meta/types.d.ts +874 -0
- package/dist/meta/types.js +12 -0
- package/dist/scraper.js +15 -13
- package/dist/types.d.ts +3 -38
- package/package.json +8 -5
- package/src/dom-evaluation.spec.ts +301 -73
- package/src/dom-evaluation.ts +417 -88
- package/src/extract-meta.spec.ts +247 -0
- package/src/extract-meta.ts +121 -0
- package/src/index.ts +45 -0
- package/src/meta/classify.spec.ts +281 -0
- package/src/meta/classify.ts +810 -0
- package/src/meta/collect-head.ts +247 -0
- package/src/meta/id-extractors.spec.ts +69 -0
- package/src/meta/id-extractors.ts +206 -0
- package/src/meta/keys.ts +568 -0
- package/src/meta/parsers.spec.ts +178 -0
- package/src/meta/parsers.ts +304 -0
- package/src/meta/simple-wappalyzer.d.ts +37 -0
- package/src/meta/tag-detection.spec.ts +134 -0
- package/src/meta/tag-detection.ts +161 -0
- package/src/meta/types.ts +949 -0
- package/src/scraper.ts +19 -13
- package/src/types.ts +49 -55
- package/tsconfig.tsbuildinfo +1 -1
|
@@ -0,0 +1,810 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pure-function classifier that turns `RawHeadEntry[]` (collected on the browser
|
|
3
|
+
* side by `collectHead`) into a typed `Meta` object.
|
|
4
|
+
*
|
|
5
|
+
* The classifier is the **only place** where dot-paths from `keys.ts` get
|
|
6
|
+
* resolved against the `Meta` shape. Parsers (viewport/robots/refresh/etc.)
|
|
7
|
+
* are dispatched on the fly for the few entries that need value normalization.
|
|
8
|
+
*
|
|
9
|
+
* Unknown entries (names/properties/rels not in the lookup tables) are
|
|
10
|
+
* preserved in {@link Meta.others} so consumers never lose information.
|
|
11
|
+
* @module
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import type { KeyDef, LinkRelDef } from './keys.js';
|
|
15
|
+
import type {
|
|
16
|
+
JsonLdEntry,
|
|
17
|
+
LinkEntry,
|
|
18
|
+
Meta,
|
|
19
|
+
OthersBucket,
|
|
20
|
+
RawHeadEntry,
|
|
21
|
+
TagsMeta,
|
|
22
|
+
} from './types.js';
|
|
23
|
+
|
|
24
|
+
import {
|
|
25
|
+
HTTP_EQUIV_MAP,
|
|
26
|
+
ITEMPROP_MAP,
|
|
27
|
+
LINK_REL_MAP,
|
|
28
|
+
META_NAME_MAP,
|
|
29
|
+
META_PROPERTY_MAP,
|
|
30
|
+
} from './keys.js';
|
|
31
|
+
import {
|
|
32
|
+
JSON_LD_TOTAL_LIMIT,
|
|
33
|
+
capJsonLdContent,
|
|
34
|
+
normalizeValue,
|
|
35
|
+
parseFormatDetection,
|
|
36
|
+
parseJsonLd,
|
|
37
|
+
parseRefresh,
|
|
38
|
+
parseReferrer,
|
|
39
|
+
parseRobots,
|
|
40
|
+
parseViewport,
|
|
41
|
+
} from './parsers.js';
|
|
42
|
+
|
|
43
|
+
/** Matches a `media` attribute value that mentions `prefers-color-scheme: dark` (case-insensitive). */
const THEME_COLOR_DARK_MEDIA = /prefers-color-scheme:\s*dark/i;
/** Matches a `media` attribute value that mentions `prefers-color-scheme: light` (case-insensitive). */
const THEME_COLOR_LIGHT_MEDIA = /prefers-color-scheme:\s*light/i;
|
|
45
|
+
|
|
46
|
+
/**
 * Optional settings accepted by {@link classify}.
 */
export type ClassifyOptions = {
  /**
   * Set to `true` to attach the input `raw` entries to `Meta._raw` for
   * debugging. The array is attached as-is (it is not cloned). Leave it
   * unset or `false` to keep the serialized `Meta` small.
   */
  readonly includeRaw?: boolean;
  /**
   * `TagsMeta` already computed by `tag-detection.ts`. When omitted, the
   * result keeps the empty default (`detected: {}` and `entries: []`).
   */
  readonly tags?: TagsMeta;
};
|
|
61
|
+
|
|
62
|
+
/**
 * Creates a new `Meta` skeleton: blank title, empty `originTrial`, `jsonLd`
 * and `speculationRules` lists, an empty tag summary and an empty `others`
 * bucket. Every call returns fresh objects and arrays.
 *
 * @returns The newly created `Meta`.
 */
export function emptyMeta(): Meta {
  const skeleton: Meta = {
    title: '',
    originTrial: [],
    jsonLd: [],
    speculationRules: [],
    tags: { detected: {}, entries: [] },
    others: emptyOthersBucket(),
  };
  return skeleton;
}
|
|
76
|
+
|
|
77
|
+
/**
 * Returns `meta.httpEquiv`, creating it first when it is missing.
 *
 * The returned object is guaranteed to carry an `originTrialToken` array: a
 * missing container is created with an empty one, and an existing container
 * whose `originTrialToken` is not an array gets it reset to `[]`.
 *
 * @param meta - Result object, mutated in place.
 * @returns The (possibly just created) `httpEquiv` container.
 */
function ensureHttpEquiv(meta: Meta): NonNullable<Meta['httpEquiv']> {
  const existing = meta.httpEquiv;
  if (existing === undefined) {
    const created: NonNullable<Meta['httpEquiv']> = { originTrialToken: [] };
    meta.httpEquiv = created;
    return created;
  }
  if (!Array.isArray(existing.originTrialToken)) {
    existing.originTrialToken = [];
  }
  return existing;
}
|
|
89
|
+
|
|
90
|
+
/**
 * Creates the empty container for unrecognized head entries: one empty map
 * each for `meta`, `property`, `httpEquiv` and `itemprop`, and one empty list
 * each for `link`, `script` and `iframe`.
 *
 * @returns A new bucket; nothing is shared between calls.
 */
function emptyOthersBucket(): OthersBucket {
  const bucket: OthersBucket = {
    // keyed collections
    meta: {},
    property: {},
    httpEquiv: {},
    itemprop: {},
    // ordered collections
    link: [],
    script: [],
    iframe: [],
  };
  return bucket;
}
|
|
104
|
+
|
|
105
|
+
/**
 * Writes `value` into `target` at the location named by `dotPath`.
 *
 * Intermediate objects are created on demand; an intermediate that is
 * `null`/`undefined`, not an object, or an array is replaced by a fresh object.
 *
 * - `multi === true`: the leaf is treated as an array and `value` is appended
 *   (a non-array leaf is replaced by `[value]`).
 * - `multi === false`: the first assignment wins; the leaf is written only
 *   while it is still `undefined`.
 *
 * A path that contains a `__proto__` segment is ignored entirely, so the walk
 * can never step onto (or replace) a prototype object. A path with an empty
 * segment is abandoned at that segment; intermediates created before it stay.
 *
 * Exported for the unit tests in `classify.spec.ts`.
 *
 * @param target - Object to write into; mutated in place.
 * @param dotPath - Dot-separated property path, e.g. `og.title`.
 * @param value - Value to store or append.
 * @param multi - Whether the leaf collects multiple values.
 */
export function setByPath(
  target: Record<string, unknown>,
  dotPath: string,
  value: unknown,
  multi: boolean,
): void {
  const segments = dotPath.split('.');
  // `obj['__proto__']` resolves to the prototype, so following it would let a
  // path write onto `Object.prototype` and leak into every other object.
  if (segments.includes('__proto__')) return;

  let cursor: Record<string, unknown> = target;
  for (let i = 0; i < segments.length - 1; i++) {
    const seg = segments[i] ?? '';
    if (!seg) return;
    const next = cursor[seg];
    if (next == null || typeof next !== 'object' || Array.isArray(next)) {
      const created: Record<string, unknown> = {};
      cursor[seg] = created;
      cursor = created;
    } else {
      cursor = next as Record<string, unknown>;
    }
  }

  const leaf = segments[segments.length - 1] ?? '';
  if (!leaf) return;

  if (multi) {
    const existing = cursor[leaf];
    if (Array.isArray(existing)) {
      existing.push(value);
    } else {
      cursor[leaf] = [value];
    }
    return;
  }
  if (cursor[leaf] === undefined) {
    cursor[leaf] = value;
  }
}
|
|
150
|
+
|
|
151
|
+
/**
 * Applies one key definition to `meta`: normalizes `rawValue` with
 * `normalizeValue` (using `def.transform`) and writes the result to every
 * dot-path listed in `def.paths` via `setByPath`.
 *
 * @param meta - Result object, mutated in place.
 * @param def - Key definition supplying `paths`, `transform` and `multi`.
 * @param rawValue - Attribute value as collected from the page.
 */
function applyKeyDef(meta: Meta, def: KeyDef, rawValue: string): void {
  const record = meta as unknown as Record<string, unknown>;
  const normalized = normalizeValue(rawValue, def.transform);
  // Only an explicit `multi: true` switches the leaf to append mode.
  const append = def.multi === true;
  for (const dotPath of def.paths) {
    setByPath(record, dotPath, normalized, append);
  }
}
|
|
168
|
+
|
|
169
|
+
/**
 * Classifies one `<meta name="…">` entry onto `meta`.
 *
 * Some names are handled inline: `viewport`, `robots`, `referrer` and
 * `format-detection` go through their parsers, `theme-color` is routed by its
 * `media` attribute, and the `google` / `googlebot` flags are recorded under
 * `meta.google`. Every other name is looked up in `META_NAME_MAP`.
 *
 * Single-valued fields keep the first value seen; a later duplicate is
 * ignored but still reported as handled.
 *
 * @param meta - Result object, mutated in place.
 * @param name - Value of the `name` attribute. It is compared as-is
 *   (NOTE(review): presumably already lowercased by the collector — confirm).
 * @param content - Value of the `content` attribute.
 * @param media - Value of the `media` attribute, if any.
 * @returns `true` when the entry was recognized, `false` when the caller
 *   should keep it as an unknown entry.
 */
function classifyMetaName(
  meta: Meta,
  name: string,
  content: string,
  media: string | undefined,
): boolean {
  if (name === 'viewport') {
    if (meta.viewport === undefined) {
      meta.viewport = parseViewport(content);
    }
    return true;
  }
  if (name === 'robots') {
    if (meta.robots === undefined) {
      meta.robots = parseRobots(content);
    }
    return true;
  }
  if (name === 'referrer') {
    if (meta.referrer === undefined) {
      meta.referrer = parseReferrer(content);
    }
    return true;
  }
  if (name === 'format-detection') {
    const parsed = parseFormatDetection(content);
    if (meta.formatDetection === undefined) {
      meta.formatDetection = parsed;
    }
    // `telephone=no` is additionally mirrored onto the Apple section.
    if (parsed.telephone === false) {
      if (meta.apple === undefined) {
        meta.apple = { formatDetectionTelephone: false };
      } else {
        meta.apple.formatDetectionTelephone = false;
      }
    }
    return true;
  }
  if (name === 'theme-color') {
    // The dark test runs first, so a media query naming both schemes counts as dark.
    const target =
      media && THEME_COLOR_DARK_MEDIA.test(media)
        ? 'themeColorDark'
        : media && THEME_COLOR_LIGHT_MEDIA.test(media)
          ? 'themeColorLight'
          : 'themeColor';
    if ((meta as Record<string, unknown>)[target] === undefined) {
      (meta as Record<string, unknown>)[target] = content;
    }
    return true;
  }
  if (name === 'google') {
    const flag = content.trim().toLowerCase();
    if (
      flag === 'notranslate' ||
      flag === 'nositelinkssearchbox' ||
      flag === 'nopagereadaloud'
    ) {
      if (meta.google === undefined) {
        meta.google = {};
      }
      // None of the accepted flags contains a hyphen, so the flag itself is
      // the property name.
      (meta.google as Record<string, unknown>)[flag] = true;
      return true;
    }
    // Any other `google` content falls through to the table lookup below.
  }
  if (name === 'googlebot' && content.trim().toLowerCase() === 'notranslate') {
    if (meta.google === undefined) {
      meta.google = {};
    }
    meta.google.googlebotNotranslate = true;
    // No return: the table lookup below still gets to handle `googlebot`.
  }

  // `name` comes from the page. Only own entries of the table count, so names
  // such as `constructor` or `__proto__` cannot resolve to inherited members.
  const def = Object.prototype.hasOwnProperty.call(META_NAME_MAP, name)
    ? META_NAME_MAP[name]
    : undefined;
  if (def) {
    applyKeyDef(meta, def, content);
    return true;
  }
  return false;
}
|
|
256
|
+
|
|
257
|
+
/**
 * Classifies one `<meta property="…">` entry by looking `property` up in
 * `META_PROPERTY_MAP` and applying the matching key definition.
 *
 * @param meta - Result object, mutated in place.
 * @param property - Value of the `property` attribute.
 * @param content - Value of the `content` attribute.
 * @returns `true` when the property is known, `false` when the caller should
 *   keep the entry as unknown.
 */
function classifyMetaProperty(meta: Meta, property: string, content: string): boolean {
  // `property` comes from the page. Only own entries of the table count, so
  // values such as `constructor` or `__proto__` cannot resolve to inherited
  // members of the table object.
  const def = Object.prototype.hasOwnProperty.call(META_PROPERTY_MAP, property)
    ? META_PROPERTY_MAP[property]
    : undefined;
  if (!def) {
    return false;
  }
  applyKeyDef(meta, def, content);
  return true;
}
|
|
271
|
+
|
|
272
|
+
/**
 * Classifies one `<meta http-equiv="…">` entry.
 *
 * `refresh` is parsed with `parseRefresh` and stored on the `httpEquiv`
 * container (first occurrence wins). Every other key is looked up in
 * `HTTP_EQUIV_MAP`.
 *
 * @param meta - Result object, mutated in place.
 * @param key - Value of the `http-equiv` attribute.
 * @param content - Value of the `content` attribute.
 * @returns `true` when the key is known, `false` when the caller should keep
 *   the entry as unknown.
 */
function classifyHttpEquiv(meta: Meta, key: string, content: string): boolean {
  if (key === 'refresh') {
    const slot = ensureHttpEquiv(meta);
    if (slot.refresh === undefined) {
      slot.refresh = parseRefresh(content);
    }
    return true;
  }
  // `key` comes from the page. Only own entries of the table count, so values
  // such as `constructor` or `__proto__` cannot resolve to inherited members.
  const def = Object.prototype.hasOwnProperty.call(HTTP_EQUIV_MAP, key)
    ? HTTP_EQUIV_MAP[key]
    : undefined;
  if (!def) {
    return false;
  }
  applyKeyDef(meta, def, content);
  return true;
}
|
|
293
|
+
|
|
294
|
+
/**
 * Classifies one `<meta itemprop="…">` entry by looking `key` up in
 * `ITEMPROP_MAP` and applying the matching key definition.
 *
 * @param meta - Result object, mutated in place.
 * @param key - Value of the `itemprop` attribute.
 * @param content - Value of the `content` attribute.
 * @returns `true` when the key is known, `false` when the caller should keep
 *   the entry as unknown.
 */
function classifyItemprop(meta: Meta, key: string, content: string): boolean {
  // `key` comes from the page. Only own entries of the table count, so values
  // such as `constructor` or `__proto__` cannot resolve to inherited members.
  const def = Object.prototype.hasOwnProperty.call(ITEMPROP_MAP, key)
    ? ITEMPROP_MAP[key]
    : undefined;
  if (!def) {
    return false;
  }
  applyKeyDef(meta, def, content);
  return true;
}
|
|
308
|
+
|
|
309
|
+
/**
 * Projects a raw `link` head entry onto the `LinkEntry` shape by copying the
 * twelve link attributes listed below. Other fields of the raw entry (such as
 * `kind`) are not carried over. Values are copied by reference, so the `rel`
 * value is shared with the raw entry.
 *
 * @param entry - Raw `link` entry collected from the page.
 * @returns The `LinkEntry` holding the copied attributes.
 */
function makeLinkEntry(entry: Extract<RawHeadEntry, { kind: 'link' }>): LinkEntry {
  const {
    href,
    rel,
    type,
    media,
    sizes,
    title,
    hreflang,
    as: asAttr,
    crossorigin,
    color,
    blocking,
    imagesrcset,
  } = entry;
  return {
    href,
    rel,
    type,
    media,
    sizes,
    title,
    hreflang,
    as: asAttr,
    crossorigin,
    color,
    blocking,
    imagesrcset,
  };
}
|
|
329
|
+
|
|
330
|
+
/**
 * Stores `entry` on `meta.link` under `def.path`, following `def.cardinality`:
 *
 * - `href-only`: stores only `entry.href`; the first value wins.
 * - `single`: stores the whole entry; the first value wins.
 * - `array`: appends the entry to the list at that path (creating the list
 *   when the slot does not hold an array).
 * - `icon-sized`: like `array`, but only for entries that carry a `sizes`
 *   value; entries without one are not stored by this function.
 *
 * `meta.link` is created when missing. Any other cardinality stores nothing.
 *
 * @param meta - Result object, mutated in place.
 * @param def - Link-rel definition supplying `path` and `cardinality`.
 * @param entry - The link entry to store.
 */
function applyLinkRel(meta: Meta, def: LinkRelDef, entry: LinkEntry): void {
  if (meta.link === undefined) {
    meta.link = createEmptyLinkMeta();
  }
  const slots = meta.link as unknown as Record<string, unknown>;
  const mode = def.cardinality;

  if (mode === 'href-only' || mode === 'single') {
    if (slots[def.path] === undefined) {
      slots[def.path] = mode === 'href-only' ? entry.href : entry;
    }
    return;
  }

  if (mode === 'array' || mode === 'icon-sized') {
    if (mode === 'icon-sized' && !entry.sizes) {
      return;
    }
    const current = slots[def.path];
    if (Array.isArray(current)) {
      current.push(entry);
    } else {
      slots[def.path] = [entry];
    }
  }
}
|
|
376
|
+
|
|
377
|
+
/**
 * Creates the `meta.link` container with each list-valued field below
 * initialized to a new empty array. Fields not listed here are left unset.
 *
 * @returns A new container; no array is shared between calls.
 */
function createEmptyLinkMeta(): NonNullable<Meta['link']> {
  const link: NonNullable<Meta['link']> = {
    // rel="alternate" variants
    alternateHreflang: [],
    alternateMedia: [],
    alternateRss: [],
    alternateAtom: [],
    alternateJsonFeed: [],
    // document structure and identity
    tag: [],
    archives: [],
    appendix: [],
    chapter: [],
    section: [],
    subsection: [],
    profile: [],
    me: [],
    // link annotations
    enclosure: [],
    external: [],
    nofollow: [],
    sponsored: [],
    ugc: [],
    noopener: [],
    noreferrer: [],
    opener: [],
    // resource hints
    dnsPrefetch: [],
    preconnect: [],
    prefetch: [],
    prerender: [],
    preload: [],
    modulepreload: [],
    expect: [],
    // misc
    stylesheet: [],
    syndication: [],
    related: [],
    // icons
    iconSized: [],
    appleTouchIconSized: [],
    appleTouchIconPrecomposed: [],
    appleTouchStartupImage: [],
  };
  return link;
}
|
|
419
|
+
|
|
420
|
+
/**
 * Sorts a `rel="alternate"` link into a more specific slot of `meta.link`,
 * chosen by the lowercased `type` attribute:
 *
 * - RSS, Atom and JSON Feed types are appended to their feed lists.
 * - oEmbed (JSON/XML) and ActivityPub JSON types fill single slots; the first
 *   entry wins.
 * - Anything else goes to `alternateMedia` when the entry has a `media` value,
 *   otherwise to `alternateHreflang`.
 *
 * `meta.link` is created when missing.
 *
 * @param meta - Result object, mutated in place.
 * @param entry - The link entry to file.
 */
function refineAlternate(meta: Meta, entry: LinkEntry): void {
  if (meta.link === undefined) {
    meta.link = createEmptyLinkMeta();
  }
  const link = meta.link;
  const mime = entry.type?.toLowerCase();

  // Feed types: every entry is kept.
  if (mime === 'application/rss+xml') {
    link.alternateRss.push(entry);
    return;
  }
  if (mime === 'application/atom+xml') {
    link.alternateAtom.push(entry);
    return;
  }
  if (mime === 'application/feed+json') {
    link.alternateJsonFeed.push(entry);
    return;
  }

  // Single-slot types: only the first entry is kept.
  if (mime === 'application/json+oembed') {
    if (link.oembedJson === undefined) {
      link.oembedJson = entry;
    }
    return;
  }
  if (mime === 'application/xml+oembed') {
    if (link.oembedXml === undefined) {
      link.oembedXml = entry;
    }
    return;
  }
  if (mime === 'application/activity+json') {
    if (link.alternateActivityJson === undefined) {
      link.alternateActivityJson = entry;
    }
    return;
  }

  // No recognized type: split on the presence of `media`.
  const fallback = entry.media ? link.alternateMedia : link.alternateHreflang;
  fallback.push(entry);
}
|
|
476
|
+
|
|
477
|
+
/**
 * Sorts a `rel="icon"` link into a slot of `meta.link`, checked in this order:
 *
 * 1. `type` is `image/svg+xml` → `iconSvg` (first entry wins).
 * 2. `sizes` is `any` → `iconAny` (first entry wins).
 * 3. any other non-empty `sizes` → appended to `iconSized`.
 * 4. otherwise → `icon` (first entry wins).
 *
 * Both `type` and `sizes` are compared case-insensitively. `meta.link` is
 * created when missing.
 *
 * @param meta - Result object, mutated in place.
 * @param entry - The link entry to file.
 */
function refineIcon(meta: Meta, entry: LinkEntry): void {
  if (meta.link === undefined) {
    meta.link = createEmptyLinkMeta();
  }
  // MIME types are case-insensitive, so normalize `type` the same way as
  // `sizes` before comparing.
  const type = entry.type?.toLowerCase();
  const sizes = entry.sizes?.toLowerCase();
  if (type === 'image/svg+xml') {
    if (meta.link.iconSvg === undefined) {
      meta.link.iconSvg = entry;
    }
    return;
  }
  if (sizes === 'any') {
    if (meta.link.iconAny === undefined) {
      meta.link.iconAny = entry;
    }
    return;
  }
  if (entry.sizes) {
    meta.link.iconSized.push(entry);
    return;
  }
  if (meta.link.icon === undefined) {
    meta.link.icon = entry;
  }
}
|
|
507
|
+
|
|
508
|
+
/**
 * Files a `rel="apple-touch-icon"` link: entries with a `sizes` value are
 * appended to `appleTouchIconSized`; an entry without one fills
 * `appleTouchIcon` unless that slot is already taken. `meta.link` is created
 * when missing.
 *
 * @param meta - Result object, mutated in place.
 * @param entry - The link entry to file.
 */
function refineAppleTouchIcon(meta: Meta, entry: LinkEntry): void {
  if (meta.link === undefined) {
    meta.link = createEmptyLinkMeta();
  }
  const link = meta.link;
  if (!entry.sizes) {
    // Unsized icon: first one wins.
    if (link.appleTouchIcon === undefined) {
      link.appleTouchIcon = entry;
    }
    return;
  }
  link.appleTouchIconSized.push(entry);
}
|
|
525
|
+
|
|
526
|
+
/**
 * Files a `rel="apple-touch-startup-image"` link. Every entry is appended to
 * `appleTouchStartupImage`. In addition, the `media` attribute is inspected
 * and the first matching entry is remembered in a device-specific slot:
 *
 * - `device-width: 320px` → `appleTouchStartupImageIphone`
 * - `device-width: 768px` with `portrait` → `appleTouchStartupImageIpadPortrait`
 * - `device-width: 768px` with `landscape` → `appleTouchStartupImageIpadLandscape`
 *
 * The checks run in that order and only the first matching rule applies.
 * `meta.link` is created when missing.
 *
 * @param meta - Result object, mutated in place.
 * @param entry - The link entry to file.
 */
function refineAppleTouchStartupImage(meta: Meta, entry: LinkEntry): void {
  if (meta.link === undefined) {
    meta.link = createEmptyLinkMeta();
  }
  const link = meta.link;
  link.appleTouchStartupImage.push(entry);

  const media = entry.media ?? '';
  if (/device-width:\s*320px/i.test(media)) {
    if (link.appleTouchStartupImageIphone === undefined) {
      link.appleTouchStartupImageIphone = entry;
    }
    return;
  }
  if (!/device-width:\s*768px/i.test(media)) {
    return;
  }
  if (/portrait/i.test(media)) {
    if (link.appleTouchStartupImageIpadPortrait === undefined) {
      link.appleTouchStartupImageIpadPortrait = entry;
    }
    return;
  }
  if (/landscape/i.test(media) && link.appleTouchStartupImageIpadLandscape === undefined) {
    link.appleTouchStartupImageIpadLandscape = entry;
  }
}
|
|
553
|
+
|
|
554
|
+
/**
 * Classifies one raw `<link>` entry. Each value in `entry.rel` is lowercased
 * and handled independently:
 *
 * - `alternate`, `icon`, `apple-touch-icon` and `apple-touch-startup-image`
 *   are delegated to their `refine*` helpers.
 * - `me` records the href in `meta.microformats.relMe` and then still goes
 *   through the table lookup.
 * - Everything else is looked up in `LINK_REL_MAP` and stored via
 *   `applyLinkRel`.
 *
 * When none of the rel values is recognized, the entry is kept in
 * `meta.others.link`.
 *
 * @param meta - Result object, mutated in place.
 * @param entry - Raw `link` entry collected from the page.
 */
function classifyLink(meta: Meta, entry: Extract<RawHeadEntry, { kind: 'link' }>): void {
  const linkEntry = makeLinkEntry(entry);
  let anyKnown = false;
  for (const rel of entry.rel) {
    const lower = rel.toLowerCase();
    if (lower === 'alternate') {
      refineAlternate(meta, linkEntry);
      anyKnown = true;
      continue;
    }
    if (lower === 'icon') {
      refineIcon(meta, linkEntry);
      anyKnown = true;
      continue;
    }
    if (lower === 'apple-touch-icon') {
      refineAppleTouchIcon(meta, linkEntry);
      anyKnown = true;
      continue;
    }
    if (lower === 'apple-touch-startup-image') {
      refineAppleTouchStartupImage(meta, linkEntry);
      anyKnown = true;
      continue;
    }
    if (lower === 'me') {
      if (meta.microformats === undefined) {
        meta.microformats = { relMe: [linkEntry.href] };
      } else {
        meta.microformats.relMe.push(linkEntry.href);
      }
      anyKnown = true;
      // No `continue`: `me` may also have an entry in the table below.
    }
    // `rel` comes from the page. Only own entries of the table count;
    // otherwise a value such as `constructor` would resolve to an inherited
    // member, mark the link as known and drop it from `others.link`.
    const def = Object.prototype.hasOwnProperty.call(LINK_REL_MAP, lower)
      ? LINK_REL_MAP[lower]
      : undefined;
    if (def) {
      applyLinkRel(meta, def, linkEntry);
      anyKnown = true;
    }
  }
  if (!anyKnown) {
    meta.others.link.push(linkEntry);
  }
}
|
|
601
|
+
|
|
602
|
+
/**
 * Classifies one raw `<script>` entry.
 *
 * Scripts whose lowercased type is `application/ld+json` or `speculationrules`
 * are stored through `pushJsonLd`; both kinds draw from the same running total
 * in `totals.jsonLdBytes`, which is advanced by the string length of the
 * content actually kept:
 *
 * - If adding the full content would exceed `JSON_LD_TOTAL_LIMIT`, the content
 *   is cut to the remaining budget and stored unparsed with a truncation
 *   `parseError`.
 * - Otherwise the content goes through `capJsonLdContent` and `parseJsonLd`;
 *   when it was capped and parsing reported no error, a per-entry truncation
 *   `parseError` is set.
 *
 * All other scripts are recorded in `meta.others.script` with their original
 * (not lowercased) type.
 *
 * @param meta - Result object, mutated in place.
 * @param entry - Raw `script` entry collected from the page.
 * @param totals - Running counters shared across entries; mutated in place.
 * @param totals.jsonLdBytes - Length of structured-script content stored so far.
 */
function classifyScript(
  meta: Meta,
  entry: Extract<RawHeadEntry, { kind: 'script' }>,
  totals: { jsonLdBytes: number },
): void {
  const type = entry.scriptType.toLowerCase();
  const isStructured = type === 'application/ld+json' || type === 'speculationrules';
  if (!isStructured) {
    meta.others.script.push({
      type: entry.scriptType,
      content: entry.content,
      src: entry.src,
      location: entry.location,
    });
    return;
  }

  const raw = entry.content ?? '';
  const overBudget = totals.jsonLdBytes + raw.length > JSON_LD_TOTAL_LIMIT;
  if (overBudget) {
    // Keep only what still fits into the shared budget; do not try to parse it.
    const remaining = Math.max(0, JSON_LD_TOTAL_LIMIT - totals.jsonLdBytes);
    const clipped = raw.slice(0, remaining);
    totals.jsonLdBytes += clipped.length;
    const truncatedEntry: JsonLdEntry = {
      raw: clipped,
      parseError: 'truncated: total jsonLd bytes exceeded limit',
    };
    pushJsonLd(meta, type, truncatedEntry);
    return;
  }

  const capResult = capJsonLdContent(raw);
  totals.jsonLdBytes += capResult.content.length;
  const parsedEntry = parseJsonLd(capResult.content);
  if (capResult.truncated && parsedEntry.parseError === undefined) {
    parsedEntry.parseError = 'truncated: per-entry size limit exceeded';
  }
  pushJsonLd(meta, type, parsedEntry);
}
|
|
644
|
+
|
|
645
|
+
/**
 * Appends `entry` to the list that matches `type`: `meta.jsonLd` for
 * `application/ld+json`, `meta.speculationRules` for `speculationrules`.
 * Any other type is ignored.
 *
 * @param meta - Result object, mutated in place.
 * @param type - Script type, expected to be lowercased already by the caller.
 * @param entry - The entry to append.
 */
function pushJsonLd(meta: Meta, type: string, entry: JsonLdEntry): void {
  switch (type) {
    case 'application/ld+json': {
      meta.jsonLd.push(entry);
      break;
    }
    case 'speculationrules': {
      meta.speculationRules.push(entry);
      break;
    }
    default: {
      break;
    }
  }
}
|
|
658
|
+
|
|
659
|
+
/**
 * Entry point of the classifier: builds a `Meta` from the raw entries
 * collected from the page.
 *
 * Starts from `emptyMeta()`, installs `options.tags` when provided, feeds
 * every entry through `classifyEntry` in input order, and finally attaches
 * the input array as `_raw` when `options.includeRaw` is truthy.
 *
 * @param raw - Raw head entries, processed in order.
 * @param options - Optional settings; defaults to `{}`.
 * @returns The populated `Meta`.
 */
export function classify(
  raw: readonly RawHeadEntry[],
  options: ClassifyOptions = {},
): Meta {
  const result = emptyMeta();
  if (options.tags) {
    result.tags = options.tags;
  }

  // Shared across entries so the structured-script budget is global.
  const totals = { jsonLdBytes: 0 };
  for (const entry of raw) {
    classifyEntry(result, entry, totals);
  }

  if (options.includeRaw) {
    result._raw = raw;
  }
  return result;
}
|
|
682
|
+
|
|
683
|
+
/**
 * Dispatches a single raw entry on its `kind` and records it on `meta`.
 *
 * - `html`: copies the root-element attributes that are set; `prefix`, `vocab`
 *   and `typeOf` are mirrored under `meta.rdfa`, `itemtype`/`itemscope` under
 *   `meta.microdata`, and `amp`/`lightning` under `meta.amp`.
 * - `title`: stored while `meta.title` is still the empty string.
 * - `base`: `href` and `target` are each stored once (first value wins).
 * - `meta`: `charset` is stored once; then each of `name`, `property`,
 *   `httpEquiv` and `itemprop` that is set is classified independently, and
 *   any that is not recognized is kept in the matching `meta.others` map.
 * - `link` / `script`: delegated to `classifyLink` / `classifyScript`.
 * - `iframe`: recorded in `meta.others.iframe`.
 * - `window-global`: ignored here.
 *
 * @param meta - Result object, mutated in place.
 * @param entry - The raw entry to classify.
 * @param totals - Running counters shared across entries.
 * @param totals.jsonLdBytes - Length of structured-script content stored so far.
 */
function classifyEntry(
  meta: Meta,
  entry: RawHeadEntry,
  totals: { jsonLdBytes: number },
): void {
  switch (entry.kind) {
    case 'html': {
      const { lang, dir, xmlns, prefix, vocab, typeOf, itemtype, itemscope, amp, lightning } =
        entry;
      if (lang) {
        meta.lang = lang;
      }
      if (dir) {
        meta.dir = dir;
      }
      if (xmlns) {
        meta.xmlns = xmlns;
      }
      // RDFa attributes are stored both at the top level and under `rdfa`.
      if (prefix) {
        meta.prefix = prefix;
        if (meta.rdfa === undefined) {
          meta.rdfa = {};
        }
        meta.rdfa.prefix = prefix;
      }
      if (vocab) {
        meta.vocab = vocab;
        if (meta.rdfa === undefined) {
          meta.rdfa = {};
        }
        meta.rdfa.vocab = vocab;
      }
      if (typeOf) {
        meta.typeOf = typeOf;
        if (meta.rdfa === undefined) {
          meta.rdfa = {};
        }
        meta.rdfa.typeOf = typeOf;
      }
      // Microdata attributes.
      if (itemtype) {
        meta.itemType = itemtype;
        if (meta.microdata === undefined) {
          meta.microdata = {};
        }
        meta.microdata.itemtype = itemtype;
      }
      if (itemscope) {
        if (meta.microdata === undefined) {
          meta.microdata = {};
        }
        meta.microdata.itemscope = true;
      }
      // AMP markers.
      if (amp || lightning) {
        if (meta.amp === undefined) {
          meta.amp = {};
        }
        if (amp) {
          meta.amp.enabled = true;
        }
        if (lightning) {
          meta.amp.lightning = true;
        }
      }
      return;
    }
    case 'title': {
      if (meta.title === '') {
        meta.title = entry.content;
      }
      return;
    }
    case 'base': {
      if (entry.href && meta.baseHref === undefined) {
        meta.baseHref = entry.href;
      }
      if (entry.target && meta.baseTarget === undefined) {
        meta.baseTarget = entry.target;
      }
      return;
    }
    case 'meta': {
      if (entry.charset && meta.charset === undefined) {
        meta.charset = entry.charset;
      }
      const content = entry.content ?? '';
      // One element may carry several of these attributes; each is handled on
      // its own and falls back to its own `others` map.
      if (entry.name && !classifyMetaName(meta, entry.name, content, entry.media)) {
        pushMulti(meta.others.meta, entry.name, content);
      }
      if (entry.property && !classifyMetaProperty(meta, entry.property, content)) {
        pushMulti(meta.others.property, entry.property, content);
      }
      if (entry.httpEquiv && !classifyHttpEquiv(meta, entry.httpEquiv, content)) {
        pushMulti(meta.others.httpEquiv, entry.httpEquiv, content);
      }
      if (entry.itemprop && !classifyItemprop(meta, entry.itemprop, content)) {
        pushMulti(meta.others.itemprop, entry.itemprop, content);
      }
      return;
    }
    case 'link': {
      classifyLink(meta, entry);
      return;
    }
    case 'script': {
      classifyScript(meta, entry, totals);
      return;
    }
    case 'iframe': {
      meta.others.iframe.push({ src: entry.src, location: entry.location });
      return;
    }
    case 'window-global': {
      // `window-global` entries are consumed by the tag-detection layer,
      // not by classify itself. Ignored here.
      return;
    }
  }
}
|
|
796
|
+
|
|
797
|
+
/**
 * Appends `value` to the list stored under `key` in `bucket`, creating the
 * list on first use.
 *
 * Keys come from page-supplied attribute values, so only the bucket's own
 * properties are consulted. Keys that collide with inherited members
 * (`constructor`, `toString`, `__proto__`, …) are stored as ordinary own
 * entries instead of throwing or changing the bucket's prototype.
 *
 * @param bucket - Map from key to collected values; mutated in place.
 * @param key - Attribute value used as the map key.
 * @param value - Value to append.
 */
function pushMulti(bucket: Record<string, string[]>, key: string, value: string): void {
  const existing = Object.prototype.hasOwnProperty.call(bucket, key)
    ? bucket[key]
    : undefined;
  if (Array.isArray(existing)) {
    existing.push(value);
    return;
  }
  // `defineProperty` always creates an own data property. A plain assignment
  // to `__proto__` would replace the bucket's prototype instead.
  Object.defineProperty(bucket, key, {
    value: [value],
    writable: true,
    enumerable: true,
    configurable: true,
  });
}
|