@d-zero/beholder 2.1.5 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +44 -0
- package/README.md +9 -276
- package/dist/dom-evaluation.d.ts +100 -62
- package/dist/dom-evaluation.js +498 -195
- package/dist/index.d.ts +1 -1
- package/dist/meta/classify.d.ts +52 -0
- package/dist/meta/classify.js +731 -0
- package/dist/meta/id-extractors.d.ts +40 -0
- package/dist/meta/id-extractors.js +196 -0
- package/dist/meta/keys.d.ts +41 -0
- package/dist/meta/keys.js +507 -0
- package/dist/meta/parsers.d.ts +74 -0
- package/dist/meta/parsers.js +293 -0
- package/dist/meta/tag-detection.d.ts +59 -0
- package/dist/meta/tag-detection.js +120 -0
- package/dist/meta/types.d.ts +874 -0
- package/dist/meta/types.js +12 -0
- package/dist/scraper.js +22 -18
- package/dist/types.d.ts +8 -37
- package/package.json +5 -4
- package/src/dom-evaluation.spec.ts +521 -0
- package/src/dom-evaluation.ts +655 -227
- package/src/index.ts +43 -0
- package/src/meta/classify.spec.ts +281 -0
- package/src/meta/classify.ts +810 -0
- package/src/meta/id-extractors.spec.ts +69 -0
- package/src/meta/id-extractors.ts +206 -0
- package/src/meta/keys.ts +568 -0
- package/src/meta/parsers.spec.ts +178 -0
- package/src/meta/parsers.ts +304 -0
- package/src/meta/simple-wappalyzer.d.ts +37 -0
- package/src/meta/tag-detection.spec.ts +134 -0
- package/src/meta/tag-detection.ts +161 -0
- package/src/meta/types.ts +949 -0
- package/src/scraper.ts +32 -16
- package/src/types.ts +54 -54
- package/tsconfig.tsbuildinfo +1 -1
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Provider-specific real-ID extraction rules.
|
|
3
|
+
*
|
|
4
|
+
* `simple-wappalyzer` identifies the *technology* (e.g., "Google Analytics") but
|
|
5
|
+
* does not surface the actual account/measurement ID. We layer real-ID
|
|
6
|
+
* extraction on top: for each detected provider, apply the registered regex
|
|
7
|
+
* over the page HTML and surface what we find.
|
|
8
|
+
*
|
|
9
|
+
* Provider keys must match the names produced by `simple-wappalyzer` exactly;
|
|
10
|
+
* these in turn track `wappalyzer-core@6` (the MIT-licensed fingerprint set).
|
|
11
|
+
*
|
|
12
|
+
* Keep the table **manually maintained**, not generated from Wappalyzer data.
|
|
13
|
+
* @module
|
|
14
|
+
*/
|
|
15
|
+
/** Extraction rule for a single provider: the regexes that `extractIds` runs over the page HTML. */
export type IdExtractor = {
|
|
16
|
+
/**
|
|
17
|
+
* Each regex MUST contain at most one capturing group; the captured text
|
|
18
|
+
* becomes the ID. Patterns without a capturing group fall back to
|
|
19
|
+
* `match[0]`. Only the first capturing group is ever read.
|
|
20
|
+
*/
|
|
21
|
+
readonly patterns: readonly RegExp[];
|
|
22
|
+
};
|
|
23
|
+
/**
|
|
24
|
+
* Lookup table keyed by Wappalyzer provider name.
|
|
25
|
+
*
|
|
26
|
+
* When extending: keep regexes anchored on stable, high-signal substrings
|
|
27
|
+
* (the surrounding API call, not just the bare ID character class). Otherwise
|
|
28
|
+
* the same regex will hit unrelated strings on pages that happen to share the
|
|
29
|
+
* shape (e.g., AWS ARNs containing `GA-...`).
|
|
30
|
+
*/
|
|
31
|
+
export declare const ID_EXTRACTORS: Record<string, IdExtractor>;
|
|
32
|
+
/**
|
|
33
|
+
* Extracts real IDs for `provider` from the page HTML.
|
|
34
|
+
*
|
|
35
|
+
* Returns a de-duplicated, insertion-ordered list of IDs. Returns `[]` for
|
|
36
|
+
* unknown providers (so callers can compose freely).
|
|
37
|
+
* @param provider Provider name used as the `ID_EXTRACTORS` key (the name reported by `simple-wappalyzer`).
|
|
38
|
+
* @param html Page HTML to search.
* @returns The IDs in first-seen order; empty when nothing matches or the provider is unknown.
|
|
39
|
+
*/
|
|
40
|
+
export declare function extractIds(provider: string, html: string): string[];
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Provider-specific real-ID extraction rules.
|
|
3
|
+
*
|
|
4
|
+
* `simple-wappalyzer` identifies the *technology* (e.g., "Google Analytics") but
|
|
5
|
+
* does not surface the actual account/measurement ID. We layer real-ID
|
|
6
|
+
* extraction on top: for each detected provider, apply the registered regex
|
|
7
|
+
* over the page HTML and surface what we find.
|
|
8
|
+
*
|
|
9
|
+
* Provider keys must match the names produced by `simple-wappalyzer` exactly;
|
|
10
|
+
* these in turn track `wappalyzer-core@6` (the MIT-licensed fingerprint set).
|
|
11
|
+
*
|
|
12
|
+
* Keep the table **manually maintained**, not generated from Wappalyzer data.
|
|
13
|
+
* @module
|
|
14
|
+
*/
|
|
15
|
+
/**
 * Lookup table keyed by Wappalyzer provider name.
 *
 * When extending: keep regexes anchored on stable, high-signal substrings
 * (the surrounding API call, not just the bare ID character class). Otherwise
 * the same regex will hit unrelated strings on pages that happen to share the
 * shape (e.g., AWS ARNs containing `GA-...`).
 *
 * Conventions followed by every entry:
 * - each pattern carries the `g` flag so it can be handed to `matchAll` as-is;
 * - each pattern has at most one capturing group, which holds the ID;
 * - a pattern without a capturing group (see `Stripe`) reports the whole
 *   matched text instead.
 */
export const ID_EXTRACTORS = {
    'Google Analytics': {
        patterns: [
            /gtag\(\s*['"]config['"]\s*,\s*['"](G-[A-Z0-9]{4,20})['"]/g,
            /googletagmanager\.com\/gtag\/js\?id=(G-[A-Z0-9]{4,20})/g,
            /\bga\(\s*['"]create['"]\s*,\s*['"](UA-\d{4,10}-\d{1,4})['"]/g,
            /['"](UA-\d{4,10}-\d{1,4})['"]/g,
        ],
    },
    'Google Tag Manager': {
        patterns: [
            /googletagmanager\.com\/(?:gtm|ns)\.[a-z]+\?id=(GTM-[A-Z0-9]{4,12})/g,
            /['"](GTM-[A-Z0-9]{4,12})['"]/g,
        ],
    },
    'Google Ads': {
        patterns: [/['"](AW-\d{4,12})['"]/g],
    },
    'Facebook Pixel': {
        patterns: [
            /fbq\(\s*['"]init['"]\s*,\s*['"](\d{6,20})['"]/g,
            /connect\.facebook\.net\/[^"']+\/fbevents\.js\D*(\d{6,20})/g,
        ],
    },
    Hotjar: {
        patterns: [
            /hjid\s*[:=]\s*(\d{4,10})/g,
            /static\.hotjar\.com\/c\/hotjar-(\d{4,10})\.js/g,
        ],
    },
    'Microsoft Clarity': {
        patterns: [
            /clarity\.ms\/tag\/([a-z0-9]{6,20})/g,
            /clarity\(\s*['"]start['"]\s*,\s*['"]([a-z0-9]{6,20})['"]/gi,
        ],
    },
    Mixpanel: {
        patterns: [/mixpanel\.init\(\s*['"]([a-f0-9]{16,40})['"]/g],
    },
    Segment: {
        patterns: [
            /analytics\.load\(\s*['"]([a-zA-Z0-9]{8,40})['"]/g,
            /cdn\.segment\.com\/analytics\.js\/v1\/([a-zA-Z0-9]{8,40})/g,
        ],
    },
    Amplitude: {
        patterns: [
            /amplitude\.init\(\s*['"]([a-f0-9]{16,40})['"]/g,
            /getInstance\(\)\.init\(\s*['"]([a-f0-9]{16,40})['"]/g,
        ],
    },
    Heap: {
        patterns: [
            /heap\.load\(\s*['"](\d{6,20})['"]/g,
            /heap\.appid\s*=\s*['"](\d{6,20})['"]/g,
        ],
    },
    PostHog: {
        patterns: [/posthog\.init\(\s*['"]([\w-]{16,80})['"]/g],
    },
    Plausible: {
        patterns: [/plausible\.io\/js\/script\.js[?&]domain=([a-zA-Z0-9.,-]+)/g],
    },
    Matomo: {
        patterns: [
            /_paq\.push\(\s*\[\s*['"]setSiteId['"]\s*,\s*['"]?(\d{1,6})['"]?\s*\]/g,
            /matomo\.php\?siteId=(\d{1,6})/g,
        ],
    },
    'Adobe Analytics': {
        patterns: [
            /s_account\s*=\s*['"]([a-z0-9,]{3,50})['"]/gi,
            /s\.account\s*=\s*['"]([a-z0-9,]{3,50})['"]/gi,
        ],
    },
    'Yandex Metrica': {
        patterns: [/ym\(\s*(\d{6,12})\s*,\s*['"]init['"]/g],
    },
    'LinkedIn Insight Tag': {
        patterns: [/_linkedin_partner_id\s*=\s*['"](\d{4,10})['"]/g],
    },
    'Twitter Ads': {
        patterns: [/twq\(\s*['"]config['"]\s*,\s*['"]([a-z0-9]{4,12})['"]/g],
    },
    'TikTok Pixel': {
        patterns: [
            /ttq\.load\(\s*['"]([A-Z0-9]{12,30})['"]/g,
            /tiktok\.com\/i18n\/pixel\/events\.js\?sdkid=([A-Z0-9]{12,30})/g,
        ],
    },
    'Pinterest Tag': {
        patterns: [/pintrk\(\s*['"]load['"]\s*,\s*['"](\d{12,20})['"]/g],
    },
    'Bing Universal Event Tracking': {
        patterns: [
            /setAttribute\(\s*['"]data-tag['"]\s*,\s*['"](\d{6,20})['"]/g,
            /UET\(\{\s*ti:\s*['"](\d{6,20})['"]/g,
        ],
    },
    Optimizely: {
        patterns: [/cdn\.optimizely\.com\/js\/(\d{6,20})\.js/g],
    },
    HubSpot: {
        patterns: [
            /js\.hs-?scripts\.com\/(\d{4,12})\.js/g,
            /js\.hubspot\.com\/web-interactives\/v1\/embeds\/(\d{4,12})/g,
        ],
    },
    Sentry: {
        patterns: [
            /(https:\/\/[a-f0-9]+@[a-zA-Z0-9.-]+\.ingest\.sentry\.io\/\d+)/g,
            /(https:\/\/[a-f0-9]+@[a-zA-Z0-9.-]+\.sentry\.io\/\d+)/g,
        ],
    },
    Intercom: {
        patterns: [
            /intercomSettings\s*=\s*\{[^}]*?app_id:\s*['"]([a-z0-9]{4,10})['"]/g,
            /Intercom\(\s*['"]boot['"]\s*,\s*\{[^}]*?app_id:\s*['"]([a-z0-9]{4,10})['"]/g,
        ],
    },
    Drift: {
        patterns: [/drift\.load\(\s*['"]([a-z0-9]{6,30})['"]/g],
    },
    'Tawk.to': {
        patterns: [/embed\.tawk\.to\/([a-f0-9]{16,40})/g],
    },
    'Zendesk Chat': {
        patterns: [/static\.zdassets\.com\/ekr\/snippet\.js\?key=([a-f0-9-]{16,40})/g],
    },
    Cookiebot: {
        patterns: [/consent\.cookiebot\.com\/uc\.js[^"']*?cbid=([a-f0-9-]{16,40})/g],
    },
    OneTrust: {
        patterns: [/dataDomain['"=]\s*['"]?([a-z0-9-]{16,80})['"]?/gi],
    },
    Stripe: {
        // No capturing group: the matched script URL fragment itself is reported.
        patterns: [/js\.stripe\.com\/v\d+\//g],
    },
    'Google reCAPTCHA': {
        // `render=` is required. When it was optional, any 20+ character token in
        // the script URL (e.g. a long `onload=` callback name next to
        // `render=explicit`) was reported as the site key.
        patterns: [/google\.com\/recaptcha\/api\.js\?[^"']*?\brender=([\w-]{20,60})/g],
    },
    'Facebook for WordPress': {
        patterns: [/fbq\(\s*['"]init['"]\s*,\s*['"](\d{6,20})['"]/g],
    },
};
|
|
168
|
+
/**
 * Extracts real IDs for `provider` from the page HTML.
 *
 * Returns a de-duplicated, insertion-ordered list of IDs. Returns `[]` for
 * unknown providers (so callers can compose freely); this includes names that
 * exist only on `Object.prototype`, such as `constructor` or `__proto__`.
 * @param provider Provider name used as the `ID_EXTRACTORS` key.
 * @param html Page HTML to search.
 * @returns The IDs in first-seen order; empty when nothing matches.
 */
export function extractIds(provider, html) {
    // Own-property check: a bare `ID_EXTRACTORS[provider]` lookup also resolves
    // inherited members (`constructor`, `toString`, `__proto__`, ...), which are
    // truthy but have no `patterns`, so iterating them would throw.
    const extractor = Object.prototype.hasOwnProperty.call(ID_EXTRACTORS, provider)
        ? ID_EXTRACTORS[provider]
        : undefined;
    if (!extractor) {
        return [];
    }
    // A Set keeps first-insertion order, so it both de-duplicates and orders.
    const ids = new Set();
    for (const pattern of extractor.patterns) {
        // `matchAll` throws on a non-global regex, so add the `g` flag when it
        // is missing. `matchAll` works on an internal copy of the regex, so the
        // shared table entries are never mutated through `lastIndex`.
        const safe = pattern.flags.includes('g')
            ? pattern
            : new RegExp(pattern.source, pattern.flags + 'g');
        for (const match of html.matchAll(safe)) {
            // First capturing group when present, otherwise the whole match.
            const id = match[1] ?? match[0];
            if (id) {
                ids.add(id);
            }
        }
    }
    return [...ids];
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Lookup tables mapping `<meta name>`, `<meta property>`, `<meta http-equiv>`,
|
|
3
|
+
* `<meta itemprop>`, and `<link rel>` to their dot-path in `Meta`.
|
|
4
|
+
*
|
|
5
|
+
* Each key has a single canonical lowercase form. Cross-reference keys
|
|
6
|
+
* (e.g., `format-detection` writes to both `formatDetection.*` and
|
|
7
|
+
* `apple.formatDetectionTelephone`) use `paths` with more than one entry.
|
|
8
|
+
*
|
|
9
|
+
* Values referenced from `frontmatter-keys.md` in `../../frontend-env/`.
|
|
10
|
+
* @module
|
|
11
|
+
*/
|
|
12
|
+
/**
 * Value-normalization modes selectable through `KeyDef.transform`.
 * NOTE(review): the `boolean-*` variants presumably name the literal that maps to `true` — confirm against the parser.
 */
export type KeyTransform = 'string' | 'number' | 'boolean-yes' | 'boolean-on' | 'boolean-true';
|
|
13
|
+
/** Storage rule for one lookup key: the target dot-path(s) in `Meta`, plus optional accumulation and value normalization. */
export type KeyDef = {
|
|
14
|
+
/** One or more dot-paths under `Meta` to write the value into. */
|
|
15
|
+
readonly paths: readonly string[];
|
|
16
|
+
/** When `true`, repeated occurrences accumulate into an array at the path. */
|
|
17
|
+
readonly multi?: boolean;
|
|
18
|
+
/** Value normalization to apply. Defaults to `'string'`. */
|
|
19
|
+
readonly transform?: KeyTransform;
|
|
20
|
+
};
|
|
21
|
+
/** Defines how a `<link rel>` is stored under `Meta.link`. */
|
|
22
|
+
export type LinkRelDef = {
|
|
23
|
+
/** Dot-path under `Meta.link` (e.g., `'canonical'`, `'preload'`). */
|
|
24
|
+
readonly path: string;
|
|
25
|
+
/**
|
|
26
|
+
* `'single'` keeps the first; `'href-only'` stores the href string only;
|
|
27
|
+
* `'array'` accumulates `LinkEntry[]`; `'icon-sized'` accumulates only when
|
|
28
|
+
* `sizes` is set.
|
|
29
|
+
*/
|
|
30
|
+
readonly cardinality: 'single' | 'href-only' | 'array' | 'icon-sized';
|
|
31
|
+
};
|
|
32
|
+
/** `<meta name="X">` → dot-path in `Meta`. */
|
|
33
|
+
export declare const META_NAME_MAP: Record<string, KeyDef>;
|
|
34
|
+
/** `<meta property="X">` → dot-path in `Meta`. */
|
|
35
|
+
export declare const META_PROPERTY_MAP: Record<string, KeyDef>;
|
|
36
|
+
/** `<meta http-equiv="X">` → dot-path in `Meta.httpEquiv`. */
|
|
37
|
+
export declare const HTTP_EQUIV_MAP: Record<string, KeyDef>;
|
|
38
|
+
/** `<meta itemprop="X">` → dot-path in `Meta.itemprop`. */
|
|
39
|
+
export declare const ITEMPROP_MAP: Record<string, KeyDef>;
|
|
40
|
+
/** `<link rel="X">` → dot-path in `Meta.link`. */
|
|
41
|
+
export declare const LINK_REL_MAP: Record<string, LinkRelDef>;
|