@d-zero/beholder 2.1.6 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,161 @@
1
+ /**
2
+ * Third-party tag detection layer.
3
+ *
4
+ * Combines two signals to populate {@link TagsMeta}:
5
+ * 1. `simple-wappalyzer` runs over the page HTML + headers to identify
6
+ * the technologies present (and their Wappalyzer categories).
7
+ * 2. {@link extractIds} from `./id-extractors.js` finds the real account
8
+ * / measurement IDs (e.g. `G-XXXXXXXX`, `GTM-XXXXX`) for each detected
9
+ * provider.
10
+ *
11
+ * Returned shape is documented on {@link TagsMeta} in `./types.ts`.
12
+ * @module
13
+ */
14
+
15
+ import type { TagDetail, TagEntry, TagsMeta } from './types.js';
16
+
17
+ import wappalyzer from 'simple-wappalyzer';
18
+
19
+ import { domLog } from '../debug.js';
20
+
21
+ import { extractIds } from './id-extractors.js';
22
+
23
/** Debug logger for this module, namespaced under `domLog` by the current process id. */
const log = domLog.extend(String(process.pid));
24
+
25
/**
 * One technology record as reported by `simple-wappalyzer`.
 *
 * Only the fields this module reads are declared; any other properties on
 * the runtime object are ignored.
 */
interface WappalyzerTech {
	/** Technology name; also passed to the ID extractors as the provider name. */
	readonly name: string;
	/** Detected version, when reported. */
	readonly version?: string;
	/** Detection confidence, when reported. */
	readonly confidence?: number;
	/** Wappalyzer categories the technology belongs to, when reported. */
	readonly categories?: readonly { readonly name?: string; readonly id?: number }[];
}
35
+
36
/**
 * Arguments accepted by {@link detectTags}.
 *
 * `headers` keys should be lowercase; `simple-wappalyzer` is case-insensitive
 * but normalizing up front avoids ambiguity. Header values may be a single
 * string, a list of strings, or `undefined`.
 */
export type DetectTagsInput = Readonly<{
	/** URL of the analysed page. */
	url: string;
	/** Page HTML; used both for detection and for ID extraction. */
	html: string;
	/** HTTP status code of the page response, if known. */
	statusCode?: number;
	/** Response headers of the page, if known. */
	headers?: Record<string, string | string[] | undefined>;
}>;
48
+
49
/** Module-level empty `TagsMeta`: no detected technologies and no entries. */
const EMPTY_TAGS: TagsMeta = {
	detected: {},
	entries: [],
};
50
+
51
/**
 * Drives `simple-wappalyzer` and post-processes the result with the
 * provider-specific ID extractors.
 *
 * Never rejects: any failure — while normalizing headers, while running
 * `simple-wappalyzer`, or while assembling the result — is logged and
 * answered with a fresh empty `TagsMeta`, so the caller does not need to
 * wrap the call. A non-array result from `simple-wappalyzer` is treated as
 * "nothing detected".
 * @param input Page URL, HTML and (optionally) response headers to analyse.
 * @returns The assembled `TagsMeta`, or an empty one on failure.
 */
export async function detectTags(input: DetectTagsInput): Promise<TagsMeta> {
	let detections: WappalyzerTech[];
	try {
		const headers = normalizeHeaders(input.headers);
		const result = (await wappalyzer({
			url: input.url,
			html: input.html,
			headers,
		})) as unknown;
		detections = Array.isArray(result) ? (result as WappalyzerTech[]) : [];
	} catch (error) {
		log(
			'detectTags: simple-wappalyzer failed; returning empty TagsMeta. Error: %O',
			error,
		);
		return cloneEmpty();
	}

	// Assembly runs the ID extractors over arbitrary page HTML and walks
	// unvalidated detection objects; keep it under the same no-throw contract.
	try {
		return assembleTagsMeta(detections, input.html);
	} catch (error) {
		log(
			'detectTags: assembling TagsMeta failed; returning empty TagsMeta. Error: %O',
			error,
		);
		return cloneEmpty();
	}
}
76
+
77
/**
 * Builds a `TagsMeta` from the raw `simple-wappalyzer` output and the page
 * HTML used for ID extraction.
 *
 * For every detection with a non-empty name:
 * - `detected[category][name]` receives the extracted IDs plus the optional
 *   `version` / `confidence`; a technology with no named category is filed
 *   under `'Other'`.
 * - `entries` receives one record per extracted ID, or a single record
 *   without `id` when no ID was found.
 *
 * Each entry owns its `categories` and `sources` arrays, so mutating one
 * entry never affects another. Category and technology names are always
 * stored as own properties, so names that collide with `Object.prototype`
 * members (e.g. `constructor`, `__proto__`) are handled like any other key.
 *
 * Exported for unit tests that bypass `simple-wappalyzer` and feed
 * pre-recorded detections directly.
 * @param detections Technology records as returned by `simple-wappalyzer`.
 * @param html Page HTML handed to the ID extractors.
 * @returns The assembled `TagsMeta`.
 */
export function assembleTagsMeta(
	detections: readonly WappalyzerTech[],
	html: string,
): TagsMeta {
	const detected: Record<string, Record<string, TagDetail>> = {};
	const entries: TagEntry[] = [];

	for (const tech of detections) {
		// Detections arrive through an unchecked cast; tolerate null elements.
		if (!tech?.name) continue;

		const ids = extractIds(tech.name, html);
		const categories = (tech.categories ?? [])
			.map((c) => c?.name)
			.filter((name): name is string => typeof name === 'string');

		const detail: TagDetail = {
			ids,
			...(tech.version === undefined ? {} : { version: tech.version }),
			...(tech.confidence === undefined ? {} : { confidence: tech.confidence }),
		};
		for (const category of categories.length > 0 ? categories : ['Other']) {
			// Own-property lookup: a plain `detected[category]` would find
			// inherited members such as `constructor`.
			let bucket = Object.prototype.hasOwnProperty.call(detected, category)
				? detected[category]
				: undefined;
			if (bucket === undefined) {
				bucket = {};
				defineOwn(detected, category, bucket);
			}
			defineOwn(bucket, tech.name, detail);
		}

		if (ids.length === 0) {
			entries.push({
				provider: tech.name,
				categories: [...categories],
				...(tech.version === undefined ? {} : { version: tech.version }),
				...(tech.confidence === undefined ? {} : { confidence: tech.confidence }),
				sources: [{ type: 'html' as const }],
			});
		} else {
			for (const id of ids) {
				entries.push({
					provider: tech.name,
					categories: [...categories],
					id,
					...(tech.version === undefined ? {} : { version: tech.version }),
					...(tech.confidence === undefined ? {} : { confidence: tech.confidence }),
					sources: [{ type: 'html' as const }],
				});
			}
		}
	}

	return { detected, entries };
}

/**
 * Sets `key` on `target` as a normal (writable, enumerable, configurable)
 * own data property, bypassing inherited setters such as `__proto__`.
 * @param target Object to write to.
 * @param key Property name.
 * @param value Value to store.
 */
function defineOwn<T>(target: Record<string, T>, key: string, value: T): void {
	Object.defineProperty(target, key, {
		value,
		writable: true,
		enumerable: true,
		configurable: true,
	});
}
137
+
138
/**
 * Produces a brand-new empty `TagsMeta` on every call, so callers can mutate
 * the result without affecting any other empty value.
 * @returns A `TagsMeta` with no detected technologies and no entries.
 */
function cloneEmpty(): TagsMeta {
	const fresh: TagsMeta = { detected: {}, entries: [] };
	return fresh;
}
144
+
145
/**
 * Flattens a header map into lowercase-keyed string values.
 *
 * Entries whose value is `undefined` are dropped, array values are joined
 * with `', '`, and every key is lowercased. When two keys differ only in
 * case, the one iterated last wins.
 * @param headers Header map to normalize; a missing map yields `{}`.
 * @returns A new object with lowercase keys and string values.
 */
function normalizeHeaders(headers: DetectTagsInput['headers']): Record<string, string> {
	const normalized: Record<string, string> = {};
	if (!headers) return normalized;

	for (const name of Object.keys(headers)) {
		const raw = headers[name];
		if (raw === undefined) continue;

		const lowered = name.toLowerCase();
		if (Array.isArray(raw)) {
			normalized[lowered] = raw.join(', ');
		} else {
			normalized[lowered] = raw;
		}
	}
	return normalized;
}
159
+
160
/**
 * Singleton empty `TagsMeta` value (exported for tests).
 *
 * This is the same object as the module-level `EMPTY_TAGS`, not a copy.
 */
export const EMPTY_TAGS_META: TagsMeta = EMPTY_TAGS;