@d-zero/beholder 2.1.5 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,69 @@
1
+ import { describe, expect, it } from 'vitest';
2
+
3
+ import { extractIds } from './id-extractors.js';
4
+
// Behavioural checks for `extractIds`: one representative HTML snippet per
// provider, plus the unknown-provider and de-duplication contracts.
describe('extractIds', () => {
	/**
	 * Asserts that scanning `markup` for `provider` yields `wanted` among the
	 * extracted IDs.
	 */
	const expectExtracted = (provider: string, markup: string, wanted: string): void => {
		expect(extractIds(provider, markup)).toContain(wanted);
	};

	it('returns [] for unknown provider', () => {
		const found = extractIds('NonExistentProvider', '<html></html>');
		expect(found).toEqual([]);
	});

	it('extracts GA4 measurement ID from gtag config', () => {
		expectExtracted(
			'Google Analytics',
			`<script>gtag('config', 'G-ABCD1234XY')</script>`,
			'G-ABCD1234XY',
		);
	});

	it('extracts GA4 measurement ID from script src', () => {
		expectExtracted(
			'Google Analytics',
			`<script src="https://www.googletagmanager.com/gtag/js?id=G-XYZW9876AB"></script>`,
			'G-XYZW9876AB',
		);
	});

	it('extracts UA tracking ID', () => {
		expectExtracted(
			'Google Analytics',
			`<script>ga('create', 'UA-12345678-1', 'auto');</script>`,
			'UA-12345678-1',
		);
	});

	it('extracts GTM container ID from src and inline', () => {
		// The same container ID appears in both the script and the noscript
		// iframe; it must be reported exactly once.
		const markup = `
			<script src="https://www.googletagmanager.com/gtm.js?id=GTM-ABCD123"></script>
			<noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-ABCD123"></iframe></noscript>
		`;
		const found = extractIds('Google Tag Manager', markup);
		expect(found).toContain('GTM-ABCD123');
		expect(found).toHaveLength(1);
	});

	it('extracts Facebook Pixel ID from fbq init', () => {
		expectExtracted(
			'Facebook Pixel',
			`<script>fbq('init', '123456789012345');</script>`,
			'123456789012345',
		);
	});

	it('extracts Hotjar site ID from inline', () => {
		expectExtracted(
			'Hotjar',
			`<script>(function(h,o,t,j,a,r){h.hj=h.hj||function(){};h._hjSettings={hjid:1234567,hjsv:6};})(window,document)</script>`,
			'1234567',
		);
	});

	it('extracts Microsoft Clarity project ID from src', () => {
		expectExtracted(
			'Microsoft Clarity',
			`<script src="https://www.clarity.ms/tag/abc123xyz"></script>`,
			'abc123xyz',
		);
	});

	it('extracts TikTok pixel ID from ttq.load', () => {
		expectExtracted(
			'TikTok Pixel',
			`<script>ttq.load('ABCDEFGH12345678')</script>`,
			'ABCDEFGH12345678',
		);
	});

	it('deduplicates IDs across multiple patterns', () => {
		// Both the loader URL pattern and the gtag('config', …) pattern hit the
		// same measurement ID.
		const markup = `
			<script src="https://www.googletagmanager.com/gtag/js?id=G-DUP12345A"></script>
			<script>gtag('config', 'G-DUP12345A');</script>
		`;
		let occurrences = 0;
		for (const id of extractIds('Google Analytics', markup)) {
			if (id === 'G-DUP12345A') occurrences += 1;
		}
		expect(occurrences).toBe(1);
	});

	it('extracts Yandex Metrica counter ID from ym init', () => {
		expectExtracted(
			'Yandex Metrica',
			`<script>ym(12345678, 'init', { clickmap:true });</script>`,
			'12345678',
		);
	});
});
@@ -0,0 +1,206 @@
1
+ /**
2
+ * Provider-specific real-ID extraction rules.
3
+ *
4
+ * `simple-wappalyzer` identifies the *technology* (e.g., "Google Analytics") but
5
+ * does not surface the actual account/measurement ID. We layer real-ID
6
+ * extraction on top: for each detected provider, apply the registered regex
7
+ * over the page HTML and surface what we find.
8
+ *
9
+ * Provider keys must match the names produced by `simple-wappalyzer` exactly;
10
+ * these in turn track `wappalyzer-core@6` (the MIT-licensed fingerprint set).
11
+ *
12
+ * Keep the table **manually maintained**, not generated from Wappalyzer data.
13
+ * @module
14
+ */
15
+
/**
 * Extraction rule set for a single provider.
 */
export type IdExtractor = {
	/**
	 * Regexes scanned against the page HTML.
	 *
	 * A regex MUST NOT declare more than one capturing group: the text captured
	 * by that group is reported as the ID. A regex with no capturing group
	 * contributes its whole match (`match[0]`) instead.
	 */
	readonly patterns: ReadonlyArray<RegExp>;
};
24
+
/**
 * Lookup table keyed by Wappalyzer provider name.
 *
 * Each entry lists the regexes that are run over the page HTML for that
 * provider. The first capturing group of a regex is the reported ID; a regex
 * without a capturing group reports its whole match.
 *
 * When extending: keep regexes anchored on stable, high-signal substrings
 * (the surrounding API call, not just the bare ID character class). Otherwise
 * the same regex will hit unrelated strings on pages that happen to share the
 * shape (e.g., AWS ARNs containing `GA-...`).
 */
export const ID_EXTRACTORS: Record<string, IdExtractor> = {
	'Google Analytics': {
		patterns: [
			/gtag\(\s*['"]config['"]\s*,\s*['"](G-[A-Z0-9]{4,20})['"]/g,
			/googletagmanager\.com\/gtag\/js\?id=(G-[A-Z0-9]{4,20})/g,
			/\bga\(\s*['"]create['"]\s*,\s*['"](UA-\d{4,10}-\d{1,4})['"]/g,
			/['"](UA-\d{4,10}-\d{1,4})['"]/g,
		],
	},
	'Google Tag Manager': {
		patterns: [
			/googletagmanager\.com\/(?:gtm|ns)\.[a-z]+\?id=(GTM-[A-Z0-9]{4,12})/g,
			/['"](GTM-[A-Z0-9]{4,12})['"]/g,
		],
	},
	'Google Ads': {
		patterns: [/['"](AW-\d{4,12})['"]/g],
	},
	'Facebook Pixel': {
		patterns: [
			/fbq\(\s*['"]init['"]\s*,\s*['"](\d{6,20})['"]/g,
			/connect\.facebook\.net\/[^"']+\/fbevents\.js\D*(\d{6,20})/g,
		],
	},
	Hotjar: {
		patterns: [
			/hjid\s*[:=]\s*(\d{4,10})/g,
			/static\.hotjar\.com\/c\/hotjar-(\d{4,10})\.js/g,
		],
	},
	'Microsoft Clarity': {
		patterns: [
			/clarity\.ms\/tag\/([a-z0-9]{6,20})/g,
			/clarity\(\s*['"]start['"]\s*,\s*['"]([a-z0-9]{6,20})['"]/gi,
		],
	},
	Mixpanel: {
		patterns: [/mixpanel\.init\(\s*['"]([a-f0-9]{16,40})['"]/g],
	},
	Segment: {
		patterns: [
			/analytics\.load\(\s*['"]([a-zA-Z0-9]{8,40})['"]/g,
			/cdn\.segment\.com\/analytics\.js\/v1\/([a-zA-Z0-9]{8,40})/g,
		],
	},
	Amplitude: {
		patterns: [
			/amplitude\.init\(\s*['"]([a-f0-9]{16,40})['"]/g,
			/getInstance\(\)\.init\(\s*['"]([a-f0-9]{16,40})['"]/g,
		],
	},
	Heap: {
		patterns: [
			/heap\.load\(\s*['"](\d{6,20})['"]/g,
			/heap\.appid\s*=\s*['"](\d{6,20})['"]/g,
		],
	},
	PostHog: {
		patterns: [/posthog\.init\(\s*['"]([\w-]{16,80})['"]/g],
	},
	Plausible: {
		patterns: [/plausible\.io\/js\/script\.js[?&]domain=([a-zA-Z0-9.,-]+)/g],
	},
	Matomo: {
		patterns: [
			/_paq\.push\(\s*\[\s*['"]setSiteId['"]\s*,\s*['"]?(\d{1,6})['"]?\s*\]/g,
			// Matomo's tracking endpoint identifies the site with the `idsite`
			// query parameter, which need not be the first one (and `&` may be
			// HTML-escaped as `&amp;`). `siteId` is still accepted so anything
			// the previous rule matched keeps matching.
			/matomo\.php\?(?:[^"'\s<>]*?&(?:amp;)?)?(?:idsite|siteId)=(\d{1,6})/g,
		],
	},
	'Adobe Analytics': {
		patterns: [
			/s_account\s*=\s*['"]([a-z0-9,]{3,50})['"]/gi,
			/s\.account\s*=\s*['"]([a-z0-9,]{3,50})['"]/gi,
		],
	},
	'Yandex Metrica': {
		patterns: [/ym\(\s*(\d{6,12})\s*,\s*['"]init['"]/g],
	},
	'LinkedIn Insight Tag': {
		patterns: [/_linkedin_partner_id\s*=\s*['"](\d{4,10})['"]/g],
	},
	'Twitter Ads': {
		patterns: [/twq\(\s*['"]config['"]\s*,\s*['"]([a-z0-9]{4,12})['"]/g],
	},
	'TikTok Pixel': {
		patterns: [
			/ttq\.load\(\s*['"]([A-Z0-9]{12,30})['"]/g,
			/tiktok\.com\/i18n\/pixel\/events\.js\?sdkid=([A-Z0-9]{12,30})/g,
		],
	},
	'Pinterest Tag': {
		patterns: [/pintrk\(\s*['"]load['"]\s*,\s*['"](\d{12,20})['"]/g],
	},
	'Bing Universal Event Tracking': {
		patterns: [
			/setAttribute\(\s*['"]data-tag['"]\s*,\s*['"](\d{6,20})['"]/g,
			/UET\(\{\s*ti:\s*['"](\d{6,20})['"]/g,
		],
	},
	Optimizely: {
		patterns: [/cdn\.optimizely\.com\/js\/(\d{6,20})\.js/g],
	},
	HubSpot: {
		patterns: [
			/js\.hs-?scripts\.com\/(\d{4,12})\.js/g,
			/js\.hubspot\.com\/web-interactives\/v1\/embeds\/(\d{4,12})/g,
		],
	},
	Sentry: {
		// Both rules capture the full DSN URL; an `ingest` DSN satisfies both,
		// so consumers are expected to de-duplicate.
		patterns: [
			/(https:\/\/[a-f0-9]+@[a-zA-Z0-9.-]+\.ingest\.sentry\.io\/\d+)/g,
			/(https:\/\/[a-f0-9]+@[a-zA-Z0-9.-]+\.sentry\.io\/\d+)/g,
		],
	},
	Intercom: {
		patterns: [
			/intercomSettings\s*=\s*\{[^}]*?app_id:\s*['"]([a-z0-9]{4,10})['"]/g,
			/Intercom\(\s*['"]boot['"]\s*,\s*\{[^}]*?app_id:\s*['"]([a-z0-9]{4,10})['"]/g,
		],
	},
	Drift: {
		patterns: [/drift\.load\(\s*['"]([a-z0-9]{6,30})['"]/g],
	},
	'Tawk.to': {
		patterns: [/embed\.tawk\.to\/([a-f0-9]{16,40})/g],
	},
	'Zendesk Chat': {
		patterns: [/static\.zdassets\.com\/ekr\/snippet\.js\?key=([a-f0-9-]{16,40})/g],
	},
	Cookiebot: {
		patterns: [/consent\.cookiebot\.com\/uc\.js[^"']*?cbid=([a-f0-9-]{16,40})/g],
	},
	OneTrust: {
		patterns: [/dataDomain['"=]\s*['"]?([a-z0-9-]{16,80})['"]?/gi],
	},
	Stripe: {
		// No capturing group: the reported value is the matched loader URL
		// fragment (e.g. `js.stripe.com/v3/`), not an account identifier.
		patterns: [/js\.stripe\.com\/v\d+\//g],
	},
	'Google reCAPTCHA': {
		// `render=` is required. When it was optional, any run of 20+ word
		// characters in the loader URL (such as a long `onload=` callback name)
		// was reported as a site key. `render=explicit` is too short to match.
		patterns: [
			/google\.com\/recaptcha\/api\.js\?(?:[^"'\s<>]*?&(?:amp;)?)?render=([\w-]{20,60})/g,
		],
	},
	'Facebook for WordPress': {
		patterns: [/fbq\(\s*['"]init['"]\s*,\s*['"](\d{6,20})['"]/g],
	},
};
178
+
/**
 * Extracts real IDs for `provider` from the page HTML.
 *
 * Every pattern registered for the provider in `ID_EXTRACTORS` is run over
 * `html`. For each match the first capturing group is taken as the ID, falling
 * back to the whole match when the group is absent; empty IDs are skipped.
 *
 * @param provider Provider name used as the key into `ID_EXTRACTORS`.
 * @param html Page HTML to scan.
 * @returns A de-duplicated, insertion-ordered list of IDs; `[]` for unknown
 *   providers (so callers can compose freely).
 */
export function extractIds(provider: string, html: string): string[] {
	// Own-property check: on a plain object, names such as `toString`,
	// `constructor` or `__proto__` resolve to inherited members, which would
	// pass a truthiness test and then fail when `.patterns` is iterated.
	if (!Object.prototype.hasOwnProperty.call(ID_EXTRACTORS, provider)) return [];
	const extractor = ID_EXTRACTORS[provider];
	if (!extractor) return [];

	// A Set iterates in insertion order, giving de-duplication and ordering at once.
	const seen = new Set<string>();
	for (const pattern of extractor.patterns) {
		// `matchAll` throws on a non-global regex, so add the flag when missing.
		const safe = pattern.global ? pattern : new RegExp(pattern.source, pattern.flags + 'g');
		for (const match of html.matchAll(safe)) {
			const id = match[1] ?? match[0];
			if (id) seen.add(id);
		}
	}
	return [...seen];
}