@d-zero/beholder 2.1.6 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +38 -0
- package/dist/dom-evaluation.d.ts +72 -24
- package/dist/dom-evaluation.js +442 -84
- package/dist/index.d.ts +1 -1
- package/dist/meta/classify.d.ts +52 -0
- package/dist/meta/classify.js +731 -0
- package/dist/meta/id-extractors.d.ts +40 -0
- package/dist/meta/id-extractors.js +196 -0
- package/dist/meta/keys.d.ts +41 -0
- package/dist/meta/keys.js +507 -0
- package/dist/meta/parsers.d.ts +74 -0
- package/dist/meta/parsers.js +293 -0
- package/dist/meta/tag-detection.d.ts +59 -0
- package/dist/meta/tag-detection.js +120 -0
- package/dist/meta/types.d.ts +874 -0
- package/dist/meta/types.js +12 -0
- package/dist/scraper.js +15 -13
- package/dist/types.d.ts +3 -38
- package/package.json +5 -4
- package/src/dom-evaluation.spec.ts +301 -73
- package/src/dom-evaluation.ts +558 -88
- package/src/index.ts +43 -0
- package/src/meta/classify.spec.ts +281 -0
- package/src/meta/classify.ts +810 -0
- package/src/meta/id-extractors.spec.ts +69 -0
- package/src/meta/id-extractors.ts +206 -0
- package/src/meta/keys.ts +568 -0
- package/src/meta/parsers.spec.ts +178 -0
- package/src/meta/parsers.ts +304 -0
- package/src/meta/simple-wappalyzer.d.ts +37 -0
- package/src/meta/tag-detection.spec.ts +134 -0
- package/src/meta/tag-detection.ts +161 -0
- package/src/meta/types.ts +949 -0
- package/src/scraper.ts +19 -13
- package/src/types.ts +49 -55
- package/tsconfig.tsbuildinfo +1 -1
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Third-party tag detection layer.
|
|
3
|
+
*
|
|
4
|
+
* Combines two signals to populate {@link TagsMeta}:
|
|
5
|
+
* 1. `simple-wappalyzer` runs over the page HTML + headers to identify
|
|
6
|
+
* the technologies present (and their Wappalyzer categories).
|
|
7
|
+
* 2. {@link extractIds} from `./id-extractors.js` finds the real account
|
|
8
|
+
* / measurement IDs (e.g. `G-XXXXXXXX`, `GTM-XXXXX`) for each detected
|
|
9
|
+
* provider.
|
|
10
|
+
*
|
|
11
|
+
* Returned shape is documented on {@link TagsMeta} in `./types.ts`.
|
|
12
|
+
* @module
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import type { TagDetail, TagEntry, TagsMeta } from './types.js';
|
|
16
|
+
|
|
17
|
+
import wappalyzer from 'simple-wappalyzer';
|
|
18
|
+
|
|
19
|
+
import { domLog } from '../debug.js';
|
|
20
|
+
|
|
21
|
+
import { extractIds } from './id-extractors.js';
|
|
22
|
+
|
|
23
|
+
// Debug logger namespaced by the current process id.
const log = domLog.extend(String(process.pid));
|
|
24
|
+
|
|
25
|
+
/**
 * Minimal view of one technology record as returned by `simple-wappalyzer`.
 * Only the fields this module reads are declared; any other fields on the
 * runtime object are ignored.
 */
interface WappalyzerTech {
	/** Technology name; used as the provider key and passed to the ID extractors. */
	readonly name: string;
	/** Detected version, copied to the output only when defined. */
	readonly version?: string;
	/** Detection confidence, copied to the output only when defined. */
	readonly confidence?: number;
	/** Categories of the technology; only entries with a string `name` are used. */
	readonly categories?: readonly { readonly name?: string; readonly id?: number }[];
}
|
|
35
|
+
|
|
36
|
+
/**
 * Everything {@link detectTags} needs to run `simple-wappalyzer` on one page.
 *
 * Header names are lowercased (and array values flattened) by this module
 * before they are handed to `simple-wappalyzer`, so callers may pass them in
 * any case; passing them already lowercased avoids ambiguity.
 */
export type DetectTagsInput = {
	/** URL of the page being analysed. */
	readonly url: string;
	/** Page HTML; also the text the ID extractors are run against. */
	readonly html: string;
	/** HTTP status of the response. Accepted, but not read by `detectTags` in this module. */
	readonly statusCode?: number;
	/** Response headers; `undefined` values are skipped. */
	readonly headers?: Record<string, string | string[] | undefined>;
};
|
|
48
|
+
|
|
49
|
+
// Module-level empty `TagsMeta`, re-exported at the bottom of this file as
// `EMPTY_TAGS_META`. `cloneEmpty()` builds its own literal instead of
// returning this object.
const EMPTY_TAGS: TagsMeta = { detected: {}, entries: [] };
|
|
50
|
+
|
|
51
|
+
/**
 * Detects third-party tags on a page.
 *
 * Runs `simple-wappalyzer` over the URL, HTML and normalized headers, then
 * hands the detections to {@link assembleTagsMeta}, which adds the
 * provider-specific IDs found in the HTML. `input.statusCode` is not read.
 *
 * Never rejects: a failure in either stage (detection or assembly) is logged
 * and an empty `TagsMeta` is returned, so the caller does not need to wrap
 * the call.
 * @param input URL, HTML and optional headers of the page to analyse.
 * @returns The assembled `TagsMeta`, or a fresh empty one on failure.
 */
export async function detectTags(input: DetectTagsInput): Promise<TagsMeta> {
	let detections: WappalyzerTech[];
	try {
		const result: unknown = await wappalyzer({
			url: input.url,
			html: input.html,
			headers: normalizeHeaders(input.headers),
		});
		// Anything other than an array is treated as "nothing detected".
		detections = Array.isArray(result) ? (result as WappalyzerTech[]) : [];
	} catch (error) {
		log(
			'detectTags: simple-wappalyzer failed; returning empty TagsMeta. Error: %O',
			error,
		);
		return cloneEmpty();
	}

	// Assembly is guarded as well: the detections are unvalidated external
	// data, and a throw here would otherwise break the no-throw contract.
	try {
		return assembleTagsMeta(detections, input.html);
	} catch (error) {
		log('detectTags: assembling TagsMeta failed; returning empty TagsMeta. Error: %O', error);
		return cloneEmpty();
	}
}
|
|
76
|
+
|
|
77
|
+
/**
 * Builds a `TagsMeta` from raw `simple-wappalyzer` detections and the page
 * HTML used for ID extraction.
 *
 * For every detection with a non-empty name:
 * - `detected[category][name]` receives the IDs plus `version` /
 *   `confidence` when defined. A detection without any named category is
 *   filed under `'Other'`.
 * - `entries` receives one entry per extracted ID, or a single entry without
 *   `id` when no ID was found. `entries[].categories` holds the named
 *   categories only (it stays empty for the `'Other'` case).
 *
 * Every detail object, `categories` array and `sources` array in the result
 * is a separate instance, so mutating one entry does not affect another.
 *
 * Exported for unit tests that bypass `simple-wappalyzer` and feed
 * pre-recorded detections directly.
 * @param detections Technology records as returned by `simple-wappalyzer`.
 * @param html Page HTML the ID extractors are run against.
 * @returns The assembled `TagsMeta`.
 */
export function assembleTagsMeta(
	detections: readonly WappalyzerTech[],
	html: string,
): TagsMeta {
	// Accumulate in Maps rather than plain objects: with `{}` a category or
	// provider named `__proto__` / `constructor` resolves to an inherited
	// member and the write would land on a shared built-in object.
	const buckets = new Map<string, Map<string, TagDetail>>();
	const entries: TagEntry[] = [];

	for (const tech of detections) {
		if (!tech.name) continue;
		const ids = extractIds(tech.name, html);
		const categories = (tech.categories ?? [])
			.map((c) => c.name)
			.filter((name): name is string => typeof name === 'string');

		for (const category of categories.length > 0 ? categories : ['Other']) {
			let bucket = buckets.get(category);
			if (bucket === undefined) {
				bucket = new Map<string, TagDetail>();
				buckets.set(category, bucket);
			}
			// Fresh detail per category so the buckets do not alias each other.
			bucket.set(tech.name, {
				ids: [...ids],
				...(tech.version === undefined ? {} : { version: tech.version }),
				...(tech.confidence === undefined ? {} : { confidence: tech.confidence }),
			});
		}

		// One entry per ID; a single ID-less entry when nothing was extracted.
		for (const id of ids.length > 0 ? ids : [undefined]) {
			entries.push({
				provider: tech.name,
				categories: [...categories],
				...(id === undefined ? {} : { id }),
				...(tech.version === undefined ? {} : { version: tech.version }),
				...(tech.confidence === undefined ? {} : { confidence: tech.confidence }),
				sources: [{ type: 'html' as const }],
			});
		}
	}

	// `Object.fromEntries` defines own data properties, so even a key such as
	// `__proto__` becomes a regular entry of the resulting plain object.
	const detected: Record<string, Record<string, TagDetail>> = Object.fromEntries(
		Array.from(
			buckets,
			([category, techs]): [string, Record<string, TagDetail>] => [
				category,
				Object.fromEntries(techs),
			],
		),
	);

	return { detected, entries };
}
|
|
137
|
+
|
|
138
|
+
/**
 * Creates a brand-new empty `TagsMeta` (no detections, no entries).
 * A new object is built on every call, so callers may mutate the result
 * without affecting anyone else.
 * @returns A fresh `{ detected: {}, entries: [] }`.
 */
function cloneEmpty(): TagsMeta {
	const empty: TagsMeta = { detected: {}, entries: [] };
	return empty;
}
|
|
144
|
+
|
|
145
|
+
/**
 * Flattens a header bag into the `Record<string, string>` shape passed to
 * `simple-wappalyzer`.
 *
 * - Header names are lowercased.
 * - `undefined` values are skipped.
 * - Array values are joined with `', '`.
 * - Names that differ only in case (e.g. `Set-Cookie` and `set-cookie`) are
 *   merged with `', '` in iteration order instead of the later one silently
 *   replacing the earlier one.
 * @param headers Raw headers, or `undefined` when none are available.
 * @returns A new plain object; empty when `headers` is missing.
 */
function normalizeHeaders(headers: DetectTagsInput['headers']): Record<string, string> {
	const out: Record<string, string> = {};
	if (!headers) return out;

	for (const [key, value] of Object.entries(headers)) {
		if (value === undefined) continue;
		const flat = Array.isArray(value) ? value.join(', ') : value;
		const name = key.toLowerCase();
		// Own-property check: inherited members of `out` must not count as
		// previously seen headers.
		out[name] = Object.prototype.hasOwnProperty.call(out, name)
			? `${out[name]}, ${flat}`
			: flat;
	}
	return out;
}
|
|
159
|
+
|
|
160
|
+
/** Singleton empty `TagsMeta` value (exported for tests). */
|
|
161
|
+
// Alias of `EMPTY_TAGS` (same object reference, not a copy): mutating it is
// visible to every importer. `cloneEmpty()` does not return this object.
export const EMPTY_TAGS_META = EMPTY_TAGS;
|