@d-zero/beholder 2.1.6 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +44 -0
- package/README.md +26 -0
- package/dist/dom-evaluation.d.ts +72 -24
- package/dist/dom-evaluation.js +310 -84
- package/dist/extract-meta.d.ts +98 -0
- package/dist/extract-meta.js +75 -0
- package/dist/index.d.ts +3 -1
- package/dist/index.js +1 -0
- package/dist/meta/classify.d.ts +52 -0
- package/dist/meta/classify.js +731 -0
- package/dist/meta/collect-head.d.ts +63 -0
- package/dist/meta/collect-head.js +223 -0
- package/dist/meta/id-extractors.d.ts +40 -0
- package/dist/meta/id-extractors.js +196 -0
- package/dist/meta/keys.d.ts +41 -0
- package/dist/meta/keys.js +507 -0
- package/dist/meta/parsers.d.ts +74 -0
- package/dist/meta/parsers.js +293 -0
- package/dist/meta/tag-detection.d.ts +59 -0
- package/dist/meta/tag-detection.js +120 -0
- package/dist/meta/types.d.ts +874 -0
- package/dist/meta/types.js +12 -0
- package/dist/scraper.js +15 -13
- package/dist/types.d.ts +3 -38
- package/package.json +8 -5
- package/src/dom-evaluation.spec.ts +301 -73
- package/src/dom-evaluation.ts +417 -88
- package/src/extract-meta.spec.ts +247 -0
- package/src/extract-meta.ts +121 -0
- package/src/index.ts +45 -0
- package/src/meta/classify.spec.ts +281 -0
- package/src/meta/classify.ts +810 -0
- package/src/meta/collect-head.ts +247 -0
- package/src/meta/id-extractors.spec.ts +69 -0
- package/src/meta/id-extractors.ts +206 -0
- package/src/meta/keys.ts +568 -0
- package/src/meta/parsers.spec.ts +178 -0
- package/src/meta/parsers.ts +304 -0
- package/src/meta/simple-wappalyzer.d.ts +37 -0
- package/src/meta/tag-detection.spec.ts +134 -0
- package/src/meta/tag-detection.ts +161 -0
- package/src/meta/types.ts +949 -0
- package/src/scraper.ts +19 -13
- package/src/types.ts +49 -55
- package/tsconfig.tsbuildinfo +1 -1
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DOM-side raw `<head>` collector.
|
|
3
|
+
*
|
|
4
|
+
* `collectHeadFromDocument` walks a `Document` (Puppeteer page realm or jsdom realm
|
|
5
|
+
* alike) and produces a serializable {@link RawHeadEntry}[] that
|
|
6
|
+
* {@link ../meta/classify.ts | classify} can turn into a typed `Meta`.
|
|
7
|
+
*
|
|
8
|
+
* WHY this function is realm-agnostic:
|
|
9
|
+
*
|
|
10
|
+
* - The Puppeteer path stringifies this function via `Function.prototype.toString`
|
|
11
|
+
* and runs it as a `page.evaluate(string)` expression, so any closure over
|
|
12
|
+
* module-scope bindings would resolve to `undefined` in the browser realm.
|
|
13
|
+
* - The jsdom (Node) path calls it directly with the jsdom `Window`. Because
|
|
14
|
+
* `HTMLLinkElement` (etc.) in jsdom is a *different class instance* from the
|
|
15
|
+
* one in the page realm, `instanceof` only works when the constructor is read
|
|
16
|
+
* from the *passed* `window` rather than from bare globals.
|
|
17
|
+
*
|
|
18
|
+
* Together those constraints dictate that the function MUST:
|
|
19
|
+
*
|
|
20
|
+
* 1. Reference no module-level variables — only its own parameters and inner locals.
|
|
21
|
+
* 2. Take every HTML class constructor (`HTMLBaseElement`, …) from the passed
|
|
22
|
+
* `window` via destructuring instead of relying on ambient globals.
|
|
23
|
+
* 3. Stay in plain ES syntax (no TS-only constructs that need helper imports).
|
|
24
|
+
* @module
|
|
25
|
+
*/
|
|
26
|
+
|
|
27
|
+
import type { RawHeadEntry } from './types.js';
|
|
28
|
+
|
|
29
|
+
/**
 * Names of `window` properties that are probed to decide whether a third-party
 * tag library is present on the page.
 *
 * Every name found on the window is reported together in one
 * `kind: 'window-global'` entry, which downstream consumers (e.g. tag-detection)
 * can cross-reference against the script/iframe signals.
 *
 * The list lives in this module (not in `dom-evaluation.ts`) so that the
 * Puppeteer path and the jsdom path read from a single source of truth.
 * The line grouping below is purely visual; order is preserved as-is.
 */
export const WINDOW_GLOBALS_TO_CHECK: readonly string[] = [
	// Tag managers, analytics and advertising pixels
	'dataLayer', 'gtag', 'ga', '_gaq',
	'fbq', '_fbq',
	'clarity',
	'_hjSettings', '_hjid',
	'twq', 'ttq', '_linkedin_partner_id', 'pintrk',
	'amplitude', 'mixpanel', 'analytics', 'heap', 'posthog',
	'plausible', 'fathom', '_paq',
	's_account', 's',
	'ym', 'UET', 'optimizely', '_hsq',
	// Error reporting, chat and support widgets
	'Sentry',
	'Intercom', 'intercomSettings', 'drift', 'Tawk_API', 'zE',
	// Consent, payments and bot protection
	'OneTrust', 'Cookiebot',
	'Stripe', 'grecaptcha',
];
|
|
77
|
+
|
|
78
|
+
/**
 * Walks the given window's `Document` and returns a serializable list of raw
 * head entries.
 *
 * Two realms are supported:
 *
 * - Browser realm (Puppeteer): the function source is `.toString()`'d and run
 *   inside the page via `page.evaluate(string)`. Inside the page, `window`
 *   resolves to the page's global object, so destructured class constructors
 *   match `instanceof` checks against elements returned from `querySelectorAll`.
 * - Node realm (jsdom et al.): the caller passes `dom.window` directly. jsdom's
 *   HTML element prototypes are distinct from the host Node's bare globals, so
 *   reading the constructors off the passed `window` is what makes `instanceof`
 *   succeed.
 *
 * The function MUST NOT close over any module-scope binding — all data it needs
 * is reached through its two parameters, and every helper is declared *inside*
 * the function body so that it survives stringification.
 *
 * Entries are emitted in this order: one `html` entry, one `title` entry, then
 * every `base`, `meta`, `link[href]`, structured-data `script[type]` and
 * `iframe[src]` element in document order, and finally (only when at least one
 * name matched) a single `window-global` entry.
 *
 * A known global is reported only when the window property is defined AND is
 * not a DOM node produced by named access on the window (an element with a
 * matching `id`/`name` makes `window[name]` return that element, or an
 * `HTMLCollection` when several match). Named `<iframe>` windows are not
 * filtered out.
 * @param window - The window object whose `document` will be inspected. Provides
 *                 both the DOM tree and the HTML element constructors used for
 *                 `instanceof` narrowing.
 * @param knownGlobals - Names of `window` properties that, when present,
 *                       indicate a third-party tag library is loaded. Required
 *                       (no default) so the Puppeteer-side string-eval path
 *                       does not have to inline a default value list.
 * @returns Serializable list of raw head entries for {@link ../meta/classify.ts | classify}.
 */
export function collectHeadFromDocument(
	window: Window,
	knownGlobals: readonly string[],
): RawHeadEntry[] {
	const document = window.document;
	// TypeScript's `Window` interface in lib.dom does not directly expose the
	// DOM constructors (`HTMLLinkElement`, `Element`, …) even though real window
	// objects carry them at runtime. Widening the type lets us destructure them
	// from the passed window; the cast is erased at compile time. `Element` and
	// `HTMLCollection` are typed optional so a minimal window stub still works.
	const w = window as Window & {
		HTMLBaseElement: typeof globalThis.HTMLBaseElement;
		HTMLMetaElement: typeof globalThis.HTMLMetaElement;
		HTMLLinkElement: typeof globalThis.HTMLLinkElement;
		HTMLScriptElement: typeof globalThis.HTMLScriptElement;
		HTMLIFrameElement: typeof globalThis.HTMLIFrameElement;
		Element?: typeof globalThis.Element;
		HTMLCollection?: typeof globalThis.HTMLCollection;
	};
	const {
		HTMLBaseElement,
		HTMLMetaElement,
		HTMLLinkElement,
		HTMLScriptElement,
		HTMLIFrameElement,
		Element: ElementCtor,
		HTMLCollection: HTMLCollectionCtor,
	} = w;

	// Inner helpers only: module-level helpers would be lost when the function
	// is stringified for the browser realm.

	/** Attribute value, with a missing attribute mapped to `undefined`. */
	const attr = (el: Element, attrName: string): string | undefined =>
		el.getAttribute(attrName) ?? undefined;

	/** Lower-cased value; `null` and the empty string both become `undefined`. */
	const lower = (value: string | null): string | undefined =>
		value ? value.toLowerCase() : undefined;

	/** Nearest container of interest; `head` wins over `noscript`. */
	const locate = (el: Element): 'head' | 'noscript' | 'body' => {
		if (el.closest('head')) return 'head';
		if (el.closest('noscript')) return 'noscript';
		return 'body';
	};

	/** True when a window property is a DOM node exposed through named access. */
	const isNamedDomAccess = (value: unknown): boolean =>
		(typeof ElementCtor === 'function' && value instanceof ElementCtor) ||
		(typeof HTMLCollectionCtor === 'function' && value instanceof HTMLCollectionCtor);

	const entries: RawHeadEntry[] = [];

	const html = document.documentElement;
	entries.push(
		{
			kind: 'html',
			// `lang`/`dir` are reflected properties; '' means "not set".
			lang: html.lang || undefined,
			dir: html.dir || undefined,
			xmlns: attr(html, 'xmlns'),
			prefix: attr(html, 'prefix'),
			vocab: attr(html, 'vocab'),
			typeOf: attr(html, 'typeof'),
			// Boolean attributes are reported as `true` or omitted, never `false`.
			itemscope: html.hasAttribute('itemscope') || undefined,
			itemtype: attr(html, 'itemtype'),
			amp: html.hasAttribute('amp') || undefined,
			lightning: html.hasAttribute('⚡') || undefined,
		},
		{ kind: 'title', content: document.title },
	);

	for (const base of document.querySelectorAll('base')) {
		if (!(base instanceof HTMLBaseElement)) continue;
		entries.push({
			kind: 'base',
			href: attr(base, 'href'),
			target: attr(base, 'target'),
		});
	}

	for (const meta of document.querySelectorAll('meta')) {
		if (!(meta instanceof HTMLMetaElement)) continue;
		entries.push({
			kind: 'meta',
			// Key-like attributes are lower-cased; value-like ones are passed through.
			name: lower(meta.getAttribute('name')),
			property: lower(meta.getAttribute('property')),
			httpEquiv: lower(meta.getAttribute('http-equiv')),
			itemprop: attr(meta, 'itemprop'),
			charset: attr(meta, 'charset'),
			content: attr(meta, 'content'),
			media: attr(meta, 'media'),
		});
	}

	for (const link of document.querySelectorAll('link[href]')) {
		if (!(link instanceof HTMLLinkElement)) continue;
		// `rel` is a whitespace-separated token list; normalise to lower-case tokens.
		const rel = (link.getAttribute('rel') ?? '')
			.toLowerCase()
			.split(/\s+/u)
			.filter(Boolean);
		entries.push({
			kind: 'link',
			rel,
			href: link.getAttribute('href') ?? '',
			type: attr(link, 'type'),
			media: attr(link, 'media'),
			sizes: attr(link, 'sizes'),
			title: attr(link, 'title'),
			hreflang: attr(link, 'hreflang'),
			as: attr(link, 'as'),
			crossorigin: attr(link, 'crossorigin'),
			color: attr(link, 'color'),
			blocking: attr(link, 'blocking'),
			imagesrcset: attr(link, 'imagesrcset'),
		});
	}

	// Only data-block scripts are collected; executable scripts are ignored here.
	const STRUCTURED_TYPES = new Set([
		'application/ld+json',
		'speculationrules',
		'application/json+oembed',
		'application/xml+oembed',
	]);
	for (const script of document.querySelectorAll('script[type]')) {
		if (!(script instanceof HTMLScriptElement)) continue;
		// Trim before comparing so `type=" application/ld+json "` is still recognised.
		const scriptType = (script.getAttribute('type') ?? '').trim().toLowerCase();
		if (!STRUCTURED_TYPES.has(scriptType)) continue;
		entries.push({
			kind: 'script',
			scriptType,
			content: (script.textContent ?? '') || undefined,
			src: attr(script, 'src'),
			location: locate(script),
		});
	}

	for (const iframe of document.querySelectorAll('iframe[src]')) {
		if (!(iframe instanceof HTMLIFrameElement)) continue;
		const src = iframe.getAttribute('src') ?? '';
		// `iframe[src]` also matches `src=""`, which carries no signal.
		if (!src) continue;
		entries.push({ kind: 'iframe', src, location: locate(iframe) });
	}

	const win = window as unknown as Record<string, unknown>;
	const presentGlobals: string[] = [];
	for (const name of knownGlobals) {
		const value = win[name];
		// Skip DOM nodes surfaced by named access (e.g. `<div id="heap">`): they
		// are markup, not evidence that a library was loaded.
		if (value !== undefined && !isNamedDomAccess(value)) {
			presentGlobals.push(name);
		}
	}
	if (presentGlobals.length > 0) {
		entries.push({ kind: 'window-global', names: presentGlobals });
	}

	return entries;
}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import { describe, expect, it } from 'vitest';
|
|
2
|
+
|
|
3
|
+
import { extractIds } from './id-extractors.js';
|
|
4
|
+
|
|
5
|
+
describe('extractIds', () => {
	/**
	 * Shared assertion: `expected` must be among the IDs that `extractIds`
	 * returns for `provider` when run over `html`.
	 */
	const expectExtracted = (provider: string, html: string, expected: string): void => {
		const ids = extractIds(provider, html);
		expect(ids).toContain(expected);
	};

	it('returns [] for unknown provider', () => {
		const ids = extractIds('NonExistentProvider', '<html></html>');
		expect(ids).toEqual([]);
	});

	it('extracts GA4 measurement ID from gtag config', () => {
		expectExtracted(
			'Google Analytics',
			`<script>gtag('config', 'G-ABCD1234XY')</script>`,
			'G-ABCD1234XY',
		);
	});

	it('extracts GA4 measurement ID from script src', () => {
		expectExtracted(
			'Google Analytics',
			`<script src="https://www.googletagmanager.com/gtag/js?id=G-XYZW9876AB"></script>`,
			'G-XYZW9876AB',
		);
	});

	it('extracts UA tracking ID', () => {
		expectExtracted(
			'Google Analytics',
			`<script>ga('create', 'UA-12345678-1', 'auto');</script>`,
			'UA-12345678-1',
		);
	});

	it('extracts GTM container ID from src and inline', () => {
		// The same container ID appears twice (script src + noscript iframe);
		// it must be reported exactly once.
		const markup = [
			'',
			'<script src="https://www.googletagmanager.com/gtm.js?id=GTM-ABCD123"></script>',
			'<noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-ABCD123"></iframe></noscript>',
			'',
		].join('\n');
		const found = extractIds('Google Tag Manager', markup);
		expect(found).toContain('GTM-ABCD123');
		expect(found.length).toBe(1);
	});

	it('extracts Facebook Pixel ID from fbq init', () => {
		expectExtracted(
			'Facebook Pixel',
			`<script>fbq('init', '123456789012345');</script>`,
			'123456789012345',
		);
	});

	it('extracts Hotjar site ID from inline', () => {
		expectExtracted(
			'Hotjar',
			`<script>(function(h,o,t,j,a,r){h.hj=h.hj||function(){};h._hjSettings={hjid:1234567,hjsv:6};})(window,document)</script>`,
			'1234567',
		);
	});

	it('extracts Microsoft Clarity project ID from src', () => {
		expectExtracted(
			'Microsoft Clarity',
			`<script src="https://www.clarity.ms/tag/abc123xyz"></script>`,
			'abc123xyz',
		);
	});

	it('extracts TikTok pixel ID from ttq.load', () => {
		expectExtracted(
			'TikTok Pixel',
			`<script>ttq.load('ABCDEFGH12345678')</script>`,
			'ABCDEFGH12345678',
		);
	});

	it('deduplicates IDs across multiple patterns', () => {
		// Both the loader URL and the inline config mention the same ID.
		const markup = [
			'',
			'<script src="https://www.googletagmanager.com/gtag/js?id=G-DUP12345A"></script>',
			`<script>gtag('config', 'G-DUP12345A');</script>`,
			'',
		].join('\n');
		const occurrences = extractIds('Google Analytics', markup).filter(
			(id) => id === 'G-DUP12345A',
		);
		expect(occurrences.length).toBe(1);
	});

	it('extracts Yandex Metrica counter ID from ym init', () => {
		expectExtracted(
			'Yandex Metrica',
			`<script>ym(12345678, 'init', { clickmap:true });</script>`,
			'12345678',
		);
	});
});
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Provider-specific real-ID extraction rules.
|
|
3
|
+
*
|
|
4
|
+
* `simple-wappalyzer` identifies the *technology* (e.g., "Google Analytics") but
|
|
5
|
+
* does not surface the actual account/measurement ID. We layer real-ID
|
|
6
|
+
* extraction on top: for each detected provider, apply the registered regex
|
|
7
|
+
* over the page HTML and surface what we find.
|
|
8
|
+
*
|
|
9
|
+
* Provider keys must match the names produced by `simple-wappalyzer` exactly;
|
|
10
|
+
* these in turn track `wappalyzer-core@6` (the MIT-licensed fingerprint set).
|
|
11
|
+
*
|
|
12
|
+
* Keep the table **manually maintained**, not generated from Wappalyzer data.
|
|
13
|
+
* @module
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
/**
 * Set of regexes used to pull one provider's IDs out of page HTML.
 */
export type IdExtractor = Readonly<{
	/**
	 * Regexes applied to the page HTML. Each one MUST contain at most one
	 * capturing group: when present, the captured text is the ID; when the
	 * regex has no capturing group, the whole match (`match[0]`) is used
	 * instead.
	 */
	patterns: readonly RegExp[];
}>;
|
|
24
|
+
|
|
25
|
+
/**
 * Lookup table keyed by Wappalyzer provider name.
 *
 * Each entry lists the regexes that are run over the page HTML to recover the
 * provider's account / measurement / site ID. The first capturing group is the
 * ID; a pattern without a capturing group contributes its whole match.
 *
 * When extending: keep regexes anchored on stable, high-signal substrings
 * (the surrounding API call or loader URL, not just the bare ID character
 * class). Otherwise the same regex will hit unrelated strings on pages that
 * happen to share the shape (e.g., AWS ARNs containing `GA-...`). In
 * particular, never make the anchoring part of a pattern optional — an optional
 * anchor degrades the pattern to "any token of the right length".
 */
export const ID_EXTRACTORS: Record<string, IdExtractor> = {
	'Google Analytics': {
		patterns: [
			/gtag\(\s*['"]config['"]\s*,\s*['"](G-[A-Z0-9]{4,20})['"]/g,
			/googletagmanager\.com\/gtag\/js\?id=(G-[A-Z0-9]{4,20})/g,
			/\bga\(\s*['"]create['"]\s*,\s*['"](UA-\d{4,10}-\d{1,4})['"]/g,
			/['"](UA-\d{4,10}-\d{1,4})['"]/g,
		],
	},
	'Google Tag Manager': {
		patterns: [
			/googletagmanager\.com\/(?:gtm|ns)\.[a-z]+\?id=(GTM-[A-Z0-9]{4,12})/g,
			/['"](GTM-[A-Z0-9]{4,12})['"]/g,
		],
	},
	'Google Ads': {
		patterns: [/['"](AW-\d{4,12})['"]/g],
	},
	'Facebook Pixel': {
		patterns: [
			/fbq\(\s*['"]init['"]\s*,\s*['"](\d{6,20})['"]/g,
			/connect\.facebook\.net\/[^"']+\/fbevents\.js\D*(\d{6,20})/g,
		],
	},
	Hotjar: {
		patterns: [
			/hjid\s*[:=]\s*(\d{4,10})/g,
			/static\.hotjar\.com\/c\/hotjar-(\d{4,10})\.js/g,
		],
	},
	'Microsoft Clarity': {
		patterns: [
			/clarity\.ms\/tag\/([a-z0-9]{6,20})/g,
			/clarity\(\s*['"]start['"]\s*,\s*['"]([a-z0-9]{6,20})['"]/gi,
		],
	},
	Mixpanel: {
		patterns: [/mixpanel\.init\(\s*['"]([a-f0-9]{16,40})['"]/g],
	},
	Segment: {
		patterns: [
			/analytics\.load\(\s*['"]([a-zA-Z0-9]{8,40})['"]/g,
			/cdn\.segment\.com\/analytics\.js\/v1\/([a-zA-Z0-9]{8,40})/g,
		],
	},
	Amplitude: {
		patterns: [
			/amplitude\.init\(\s*['"]([a-f0-9]{16,40})['"]/g,
			/getInstance\(\)\.init\(\s*['"]([a-f0-9]{16,40})['"]/g,
		],
	},
	Heap: {
		patterns: [
			/heap\.load\(\s*['"](\d{6,20})['"]/g,
			/heap\.appid\s*=\s*['"](\d{6,20})['"]/g,
		],
	},
	PostHog: {
		patterns: [/posthog\.init\(\s*['"]([\w-]{16,80})['"]/g],
	},
	Plausible: {
		patterns: [/plausible\.io\/js\/script\.js[?&]domain=([a-zA-Z0-9.,-]+)/g],
	},
	Matomo: {
		patterns: [
			/_paq\.push\(\s*\[\s*['"]setSiteId['"]\s*,\s*['"]?(\d{1,6})['"]?\s*\]/g,
			/matomo\.php\?siteId=(\d{1,6})/g,
			// Matomo's tracking endpoint names the site parameter `idsite`
			// (e.g. the `<noscript>` pixel `matomo.php?idsite=1&rec=1`).
			/matomo\.php\?idsite=(\d{1,6})/g,
		],
	},
	'Adobe Analytics': {
		patterns: [
			/s_account\s*=\s*['"]([a-z0-9,]{3,50})['"]/gi,
			/s\.account\s*=\s*['"]([a-z0-9,]{3,50})['"]/gi,
		],
	},
	'Yandex Metrica': {
		patterns: [/ym\(\s*(\d{6,12})\s*,\s*['"]init['"]/g],
	},
	'LinkedIn Insight Tag': {
		patterns: [/_linkedin_partner_id\s*=\s*['"](\d{4,10})['"]/g],
	},
	'Twitter Ads': {
		patterns: [/twq\(\s*['"]config['"]\s*,\s*['"]([a-z0-9]{4,12})['"]/g],
	},
	'TikTok Pixel': {
		patterns: [
			/ttq\.load\(\s*['"]([A-Z0-9]{12,30})['"]/g,
			/tiktok\.com\/i18n\/pixel\/events\.js\?sdkid=([A-Z0-9]{12,30})/g,
		],
	},
	'Pinterest Tag': {
		patterns: [/pintrk\(\s*['"]load['"]\s*,\s*['"](\d{12,20})['"]/g],
	},
	'Bing Universal Event Tracking': {
		patterns: [
			/setAttribute\(\s*['"]data-tag['"]\s*,\s*['"](\d{6,20})['"]/g,
			/UET\(\{\s*ti:\s*['"](\d{6,20})['"]/g,
		],
	},
	Optimizely: {
		patterns: [/cdn\.optimizely\.com\/js\/(\d{6,20})\.js/g],
	},
	HubSpot: {
		patterns: [
			/js\.hs-?scripts\.com\/(\d{4,12})\.js/g,
			/js\.hubspot\.com\/web-interactives\/v1\/embeds\/(\d{4,12})/g,
		],
	},
	Sentry: {
		patterns: [
			/(https:\/\/[a-f0-9]+@[a-zA-Z0-9.-]+\.ingest\.sentry\.io\/\d+)/g,
			/(https:\/\/[a-f0-9]+@[a-zA-Z0-9.-]+\.sentry\.io\/\d+)/g,
		],
	},
	Intercom: {
		patterns: [
			/intercomSettings\s*=\s*\{[^}]*?app_id:\s*['"]([a-z0-9]{4,10})['"]/g,
			/Intercom\(\s*['"]boot['"]\s*,\s*\{[^}]*?app_id:\s*['"]([a-z0-9]{4,10})['"]/g,
		],
	},
	Drift: {
		patterns: [/drift\.load\(\s*['"]([a-z0-9]{6,30})['"]/g],
	},
	'Tawk.to': {
		patterns: [/embed\.tawk\.to\/([a-f0-9]{16,40})/g],
	},
	'Zendesk Chat': {
		patterns: [/static\.zdassets\.com\/ekr\/snippet\.js\?key=([a-f0-9-]{16,40})/g],
	},
	Cookiebot: {
		patterns: [/consent\.cookiebot\.com\/uc\.js[^"']*?cbid=([a-f0-9-]{16,40})/g],
	},
	OneTrust: {
		patterns: [/dataDomain['"=]\s*['"]?([a-z0-9-]{16,80})['"]?/gi],
	},
	Stripe: {
		// No capturing group: the matched loader URL fragment itself is reported.
		patterns: [/js\.stripe\.com\/v\d+\//g],
	},
	'Google reCAPTCHA': {
		// `render=` is mandatory. If it were optional, any 20–60 character token
		// in the query string (e.g. an `onload=` callback name) would be
		// reported as a site key. `render=explicit` is too short to match.
		patterns: [/google\.com\/recaptcha\/api\.js\?[^"']*?\brender=([\w-]{20,60})/g],
	},
	'Facebook for WordPress': {
		patterns: [/fbq\(\s*['"]init['"]\s*,\s*['"](\d{6,20})['"]/g],
	},
};
|
|
178
|
+
|
|
179
|
+
/**
 * Extracts real IDs for `provider` from the page HTML.
 *
 * Every pattern registered for the provider in `ID_EXTRACTORS` is run over the
 * whole of `html`. For each match the first capturing group is taken as the
 * ID, falling back to the full match when the pattern has no capturing group
 * (or the group did not participate). Empty IDs are dropped.
 *
 * The provider is looked up as an *own* key of `ID_EXTRACTORS`, so names that
 * only exist on `Object.prototype` (`constructor`, `toString`, `__proto__`, …)
 * are treated like any other unknown provider.
 * @param provider - Provider name, exactly as keyed in `ID_EXTRACTORS`.
 * @param html - Page HTML to scan.
 * @returns De-duplicated IDs in first-seen order (patterns in table order,
 *          matches in document order); `[]` for unknown providers so callers
 *          can compose freely.
 */
export function extractIds(provider: string, html: string): string[] {
	// Own-property check: a bare `ID_EXTRACTORS[provider]` would return inherited
	// members for names such as 'constructor' and then crash on `.patterns`.
	if (!Object.prototype.hasOwnProperty.call(ID_EXTRACTORS, provider)) return [];
	const extractor = ID_EXTRACTORS[provider];
	if (!extractor) return [];

	// A Set keeps insertion order and gives de-duplication for free.
	const ids = new Set<string>();
	for (const pattern of extractor.patterns) {
		// `matchAll` requires the `g` flag and starts at the regex's current
		// `lastIndex`. The table's regexes are shared, exported objects, so scan
		// with a fresh clone (lastIndex 0) to stay independent of whatever other
		// code may have done with them, and to leave them untouched.
		const flags = pattern.global ? pattern.flags : `${pattern.flags}g`;
		const scanner = new RegExp(pattern.source, flags);
		for (const match of html.matchAll(scanner)) {
			const id = match[1] ?? match[0];
			if (id) ids.add(id);
		}
	}
	return Array.from(ids);
}
|