@paywalls-net/filter 1.3.9 → 1.3.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/jest.config.js +7 -0
- package/package.json +6 -2
- package/src/index.js +58 -40
- package/src/signal-extraction.js +524 -0
- package/tests/proxy-vai-request.test.js +379 -0
- package/tests/signal-extraction.test.js +1002 -0
|
@@ -0,0 +1,524 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Signal Extraction Module — Tier 2 + Tier 3 feature extractors
|
|
3
|
+
*
|
|
4
|
+
* Transforms raw browser headers into compact RFC 8941 Structured Field
|
|
5
|
+
* Dictionary strings for privacy-preserving VAI signal forwarding.
|
|
6
|
+
*
|
|
7
|
+
* Spec: specs/vai-privacy-v2.spec.md §6.2–§6.4
|
|
8
|
+
*
|
|
9
|
+
* Each function returns an SF-Dictionary string (e.g. "html, wildcard")
|
|
10
|
+
* or null if the input is absent/empty. null means the caller should
|
|
11
|
+
* omit the header entirely (not send an empty value).
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
// ── VAI Metadata: dynamic loading with hardcoded fallbacks ──────────────────
|
|
15
|
+
// (paywalls-site-fc4)
|
|
16
|
+
//
|
|
17
|
+
// These module-level vars are initialized from hardcoded defaults below.
|
|
18
|
+
// When loadVAIMetadata() is called, they are updated from the cloud-api
|
|
19
|
+
// /pw/vai/metadata endpoint. If the fetch fails, the hardcoded defaults
|
|
20
|
+
// remain in effect — no data loss, no crash.
|
|
21
|
+
|
|
22
|
+
// ── Hardcoded defaults (bootstrap / fallback) ──────────────────────────────
|
|
23
|
+
|
|
24
|
+
// Autonomous-system numbers treated as datacenter / cloud networks until
// remote metadata replaces them.
const DEFAULT_DC_ASNS = [
  // ── Major IaaS ───────────────────────────────────────────────────────────
  16509, 14618,          // Amazon AWS (primary + secondary)
  396982, 36492, 15169,  // Google Cloud + Google infra
  8075, 8069, 8068,      // Microsoft Azure
  31898,                 // Oracle Cloud
  36351,                 // IBM Cloud / SoftLayer
  45102,                 // Alibaba Cloud
  132203,                // Tencent Cloud

  // ── VPS / Hosting ────────────────────────────────────────────────────────
  14061,                 // DigitalOcean
  24940, 213230,         // Hetzner (dedicated + cloud)
  16276,                 // OVH
  63949,                 // Linode / Akamai Connected Cloud
  20473,                 // Vultr / The Constant Company
  12876,                 // Scaleway
  51167,                 // Contabo
  60781, 28753,          // Leaseweb (NL + global)
];

// Regex sources (compiled case-insensitively) that mark automation tooling
// and scripted HTTP clients in a User-Agent.
const DEFAULT_AUTOMATION_PATTERNS = [
  'Puppeteer',
  'Playwright',
  'Selenium',
  'WebDriver',
  'PhantomJS',
  'CasperJS',
  'python-requests',
  'python-urllib',
  'Go-http-client',
  'okhttp',
  'Apache-HttpClient',
  'libcurl',
  '\\bcurl\\/',
  '\\bwget\\/',
  'HTTPie',
  'node-fetch',
  'undici',
  'axios\\/',
  '\\bgot\\/',
  'superagent',
  'Cypress',
  'TestCafe',
  'Nightwatch',
  'WebdriverIO',
  'Scrapy',
  'Java\\/|Java HttpURLConnection',
  'PostmanRuntime\\/',
  '\\bDeno\\/',
  '\\bhttpx\\b|python-httpx',
];

// Regex sources that mark a headless browser build.
const DEFAULT_HEADLESS_PATTERNS = [
  'HeadlessChrome',
  '\\bHeadless\\b',
];

// Crawler names; joined into one alternation for BOT_FAMILY_RE.
const DEFAULT_BOT_PATTERNS = [
  'Googlebot',
  'bingbot',
  'Baiduspider',
  'YandexBot',
  'DuckDuckBot',
  'Slurp',
  'ia_archiver',
  'GPTBot',
  'ClaudeBot',
  'CCBot',
  'Bytespider',
  'Applebot',
  'PetalBot',
  'SemrushBot',
  'AhrefsBot',
  'DotBot',
];

// ── Mutable state: updated by loadVAIMetadata() ────────────────────────────

/** @type {Set<number>} */
let DC_ASN_SET = new Set(DEFAULT_DC_ASNS);

/** @type {RegExp[]} */
let AUTOMATION_MARKERS = DEFAULT_AUTOMATION_PATTERNS.map((source) => new RegExp(source, 'i'));

/** @type {RegExp[]} */
let HEADLESS_MARKERS = DEFAULT_HEADLESS_PATTERNS.map((source) => new RegExp(source, 'i'));

/** @type {RegExp} — single combined regex for bot family detection */
let BOT_FAMILY_RE = new RegExp(`\\b(${DEFAULT_BOT_PATTERNS.join('|')})\\b`, 'i');

// ── Metadata cache ─────────────────────────────────────────────────────────

// Last load attempt as { data, ts }; null until loadVAIMetadata() has run.
let _vaiMetadataCache = null;

// Lifetime of a successful metadata load: 1 hour, in milliseconds.
const VAI_METADATA_TTL = 1000 * 60 * 60;
|
|
85
|
+
|
|
86
|
+
/**
 * Build case-insensitive RegExp objects from regex source strings, such as
 * the pattern lists delivered in the metadata JSON. Output order matches
 * input order. An invalid source makes the RegExp constructor throw.
 *
 * @param {string[]} patterns  Regex sources
 * @returns {RegExp[]} One compiled expression per entry
 */
function compilePatterns(patterns) {
  const toRegExp = (source) => new RegExp(source, 'i');
  return patterns.map(toRegExp);
}
|
|
95
|
+
|
|
96
|
+
/**
 * Fetch VAI metadata from `${cfg.paywallsAPIHost}/pw/vai/metadata` and
 * replace the module-level detection state (DC_ASN_SET, AUTOMATION_MARKERS,
 * HEADLESS_MARKERS, BOT_FAMILY_RE) with the lists it contains.
 *
 * A successful load is cached for VAI_METADATA_TTL. After a failure the
 * next attempt is allowed 5 minutes later. The function never rejects:
 * any fetch, schema or pattern-compilation error is logged and swallowed.
 *
 * The update is all-or-nothing. Every list is validated and compiled
 * first; module state is assigned only after all of them succeeded, so one
 * malformed pattern cannot leave the module half updated.
 *
 * @param {Object} cfg  Config with paywallsAPIHost (cloud-api base URL)
 * @returns {Promise<void>}
 */
export async function loadVAIMetadata(cfg) {
  const now = Date.now();

  // Cache entry (success or back-off marker) still fresh: nothing to do.
  if (_vaiMetadataCache && (now - _vaiMetadataCache.ts) < VAI_METADATA_TTL) {
    return;
  }

  try {
    const response = await fetch(`${cfg.paywallsAPIHost}/pw/vai/metadata`, {
      method: 'GET',
      headers: { 'Accept': 'application/json' },
    });

    if (!response.ok) {
      throw new Error(`VAI metadata fetch failed: ${response.status} ${response.statusText}`);
    }

    const data = await response.json();

    // Validate minimal schema
    if (!data || typeof data.version !== 'number') {
      throw new Error('VAI metadata: invalid schema (missing version)');
    }

    const isNonEmptyArray = (value) => Array.isArray(value) && value.length > 0;

    // ── Stage: build every replacement before touching module state ──────
    let nextAsnSet = null;
    if (isNonEmptyArray(data.dc_asns)) {
      // ASN lookups are done with numbers, so numeric strings are converted
      // and anything that is not a positive integer is dropped.
      const asns = data.dc_asns
        .map((value) => (typeof value === 'string' && value.trim() !== '' ? Number(value) : value))
        .filter((value) => Number.isInteger(value) && value > 0);
      if (asns.length > 0) {
        nextAsnSet = new Set(asns);
      }
    }

    const nextAutomation = isNonEmptyArray(data.automation_patterns)
      ? compilePatterns(data.automation_patterns)
      : null;

    const nextHeadless = isNonEmptyArray(data.headless_patterns)
      ? compilePatterns(data.headless_patterns)
      : null;

    const nextBotFamily = isNonEmptyArray(data.bot_patterns)
      ? new RegExp('\\b(' + data.bot_patterns.join('|') + ')\\b', 'i')
      : null;

    // ── Commit: nothing below can throw ──────────────────────────────────
    if (nextAsnSet) DC_ASN_SET = nextAsnSet;
    if (nextAutomation) AUTOMATION_MARKERS = nextAutomation;
    if (nextHeadless) HEADLESS_MARKERS = nextHeadless;
    if (nextBotFamily) BOT_FAMILY_RE = nextBotFamily;

    _vaiMetadataCache = { data, ts: now };
  } catch (error) {
    // Guard the property read: a thrown value is not necessarily an Error.
    console.error('loadVAIMetadata: fetch failed, using hardcoded defaults.', (error && error.message) || error);
    // Back-date the timestamp so the entry expires 5 minutes from now,
    // which delays the retry without waiting a full TTL.
    _vaiMetadataCache = { data: null, ts: now - VAI_METADATA_TTL + (5 * 60 * 1000) };
  }
}
|
|
151
|
+
|
|
152
|
+
/**
 * Test-only hook: drop the metadata cache and rebuild every piece of
 * mutable detection state from the hardcoded default tables.
 */
export function _resetVAIMetadata() {
  const asRegExp = (source) => new RegExp(source, 'i');
  const botAlternation = DEFAULT_BOT_PATTERNS.join('|');

  _vaiMetadataCache = null;
  BOT_FAMILY_RE = new RegExp(`\\b(${botAlternation})\\b`, 'i');
  HEADLESS_MARKERS = DEFAULT_HEADLESS_PATTERNS.map(asRegExp);
  AUTOMATION_MARKERS = DEFAULT_AUTOMATION_PATTERNS.map(asRegExp);
  DC_ASN_SET = new Set(DEFAULT_DC_ASNS);
}
|
|
163
|
+
|
|
164
|
+
// ── §6.2.1 Accept → X-PW-Accept ──────────────────────────────────────────
|
|
165
|
+
/**
 * Reduce an Accept header to presence flags.
 *
 * Each flag is emitted when its substring occurs anywhere in the header
 * (case-sensitive), always in the order html, wildcard, json, image.
 *
 * @param {string|null|undefined} accept  Raw Accept header value
 * @returns {string|null} Comma-separated flags, or null when the header is
 *   absent/empty or none of the probes match
 */
export function extractAcceptFeatures(accept) {
  if (!accept) return null;

  // [substring to look for, flag to emit]; table order is output order.
  const probes = [
    ['text/html', 'html'],
    ['*/*', 'wildcard'],
    ['application/json', 'json'],
    ['image/', 'image'],
  ];

  const flags = probes
    .filter(([needle]) => accept.includes(needle))
    .map(([, flag]) => flag);

  return flags.length === 0 ? null : flags.join(', ');
}
|
|
182
|
+
|
|
183
|
+
// ── §6.2.2 Accept-Encoding → X-PW-Enc ────────────────────────────────────
|
|
184
|
+
/**
 * Reduce an Accept-Encoding header to presence flags.
 *
 * `br` and `gzip` are detected by case-sensitive substring search; when
 * both are present the extra flag `modern` is appended.
 *
 * @param {string|null|undefined} acceptEncoding  Raw Accept-Encoding value
 * @returns {string|null} Comma-separated flags, or null when the header is
 *   absent/empty or names neither coding
 */
export function extractEncodingFeatures(acceptEncoding) {
  if (!acceptEncoding) return null;

  const flags = ['br', 'gzip'].filter((coding) => acceptEncoding.includes(coding));
  if (flags.length === 2) {
    flags.push('modern');
  }

  return flags.length === 0 ? null : flags.join(', ');
}
|
|
203
|
+
|
|
204
|
+
// ── §6.2.3 Accept-Language → X-PW-Lang ───────────────────────────────────
|
|
205
|
+
/**
 * Summarise an Accept-Language header as presence, primary language and
 * number of listed locales.
 *
 * The primary language is the first two characters of the first locale,
 * lower-cased. Quality values (`;q=…`) are ignored when counting.
 *
 * @param {string|null|undefined} acceptLanguage  Raw Accept-Language value
 * @returns {string|null} `present, primary=<xx>, count=<n>`, or null when the
 *   header is absent, blank, a lone `*`, or contains no locale
 */
export function extractLanguageFeatures(acceptLanguage) {
  if (!acceptLanguage) return null;

  const header = acceptLanguage.trim();
  if (header === '' || header === '*') return null;

  // Collect the language tags, dropping parameters and empty list members.
  const tags = [];
  for (const entry of header.split(',')) {
    const tag = entry.trim().split(';')[0].trim();
    if (tag) {
      tags.push(tag);
    }
  }
  if (tags.length === 0) return null;

  // slice() leaves a one-character tag unchanged.
  const primary = tags[0].toLowerCase().slice(0, 2);

  return `present, primary=${primary}, count=${tags.length}`;
}
|
|
230
|
+
|
|
231
|
+
// ── §6.2.4 ASN → X-PW-Net ────────────────────────────────────────────────
|
|
232
|
+
/**
 * Map an ASN to the network category forwarded upstream.
 *
 * Numbers are used as-is; any other value is parsed with parseInt (base 10).
 * An ASN found in DC_ASN_SET is `cloud`, every other numeric ASN is
 * `consumer`.
 *
 * @param {string|number|null|undefined} asn  Numeric ASN value
 * @returns {string|null} `asn=cloud` / `asn=consumer`, or null when the ASN is
 *   absent, empty, or not numeric
 */
export function extractNetFeatures(asn) {
  if (asn === null || asn === undefined || asn === '') return null;

  const numeric = typeof asn === 'number' ? asn : parseInt(asn, 10);
  if (Number.isNaN(numeric)) return null;

  return DC_ASN_SET.has(numeric) ? 'asn=cloud' : 'asn=consumer';
}
|
|
247
|
+
|
|
248
|
+
// ── §6.2.5 Sec-CH-UA → X-PW-CH ───────────────────────────────────────────
|
|
249
|
+
|
|
250
|
+
/**
 * Read the major version advertised for the "Google Chrome" or "Chromium"
 * brand in a Sec-CH-UA value. The first matching brand entry wins.
 *
 * @param {string} secChUA  Raw Sec-CH-UA header
 * @returns {number|null} Major version, or null when neither brand is listed
 */
function extractChromeVersionFromCH(secChUA) {
  // Brand entries look like: "Brand";v="version"
  const brandEntry = secChUA.match(/"(?:Google Chrome|Chromium)";v="(\d+)"/);
  if (!brandEntry) return null;

  const [, major] = brandEntry;
  return parseInt(major, 10);
}
|
|
262
|
+
|
|
263
|
+
/**
 * Read the major version following the first `Chrome/` token of a
 * User-Agent string (e.g. `Chrome/134.0.0.0` gives 134).
 *
 * @param {string} userAgent  Raw User-Agent string
 * @returns {number|null} Major version, or null when there is no such token
 */
function extractChromeVersionFromUA(userAgent) {
  const chromeToken = userAgent.match(/Chrome\/(\d+)/);
  if (!chromeToken) return null;

  const [, major] = chromeToken;
  return parseInt(major, 10);
}
|
|
274
|
+
|
|
275
|
+
/**
 * Summarise a Sec-CH-UA header and cross-check it against the User-Agent.
 *
 * Emitted members, in order:
 *   - `present`        always
 *   - `brands=<n>`     number of comma-separated entries
 *   - `grease`         at least one entry looks like a GREASE "Not…Brand" brand
 *   - `consistent`     a User-Agent was supplied and its Chrome major
 *                      version equals the one in the hints
 *
 * @param {string|null|undefined} secChUA    Raw Sec-CH-UA header value
 * @param {string|null|undefined} userAgent  Raw User-Agent (consistency check only)
 * @returns {string|null} Comma-separated members, or null when the hints
 *   header is absent or blank
 */
export function extractCHFeatures(secChUA, userAgent) {
  if (!secChUA) return null;

  const header = secChUA.trim();
  if (header === '') return null;

  const entries = header
    .split(',')
    .map((entry) => entry.trim())
    .filter(Boolean);

  const looksGreased = (entry) => /not[^"]*brand/i.test(entry) || /not[:\-_.]/i.test(entry);

  const features = ['present', `brands=${entries.length}`];
  if (entries.some(looksGreased)) {
    features.push('grease');
  }

  if (userAgent) {
    const fromHints = extractChromeVersionFromCH(header);
    const fromUA = extractChromeVersionFromUA(userAgent);
    const versionsAgree = fromHints != null && fromUA != null && fromHints === fromUA;
    if (versionsAgree) {
      features.push('consistent');
    }
  }

  return features.join(', ');
}
|
|
311
|
+
|
|
312
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
313
|
+
// Tier 3 — Replace User-Agent with derived features (§6.3) + CT (§6.4)
|
|
314
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
315
|
+
|
|
316
|
+
// ── §6.3.3 Automation marker detection ────────────────────────────────────
|
|
317
|
+
// HeadlessChrome triggers 'headless' only (via HEADLESS_MARKERS).
|
|
318
|
+
// Explicit automation tools (Puppeteer, Selenium, etc.) trigger 'automation'.
|
|
319
|
+
// AUTOMATION_MARKERS and HEADLESS_MARKERS are now module-level mutable vars
|
|
320
|
+
// initialized from hardcoded defaults (top of file) and updated dynamically
|
|
321
|
+
// by loadVAIMetadata(). See paywalls-site-fc4.
|
|
322
|
+
|
|
323
|
+
// ── §6.3.4 Entropy bucketing ──────────────────────────────────────────────
|
|
324
|
+
/**
 * Bucket a User-Agent string's structural complexity.
 *
 * Decision order:
 *   1. `low`    — missing, shorter than 40 or longer than 300 characters,
 *                 or fewer than 3 of the 4 character classes
 *                 (upper, lower, digit, punctuation/space).
 *   2. `medium` — typical browser shape: all 4 classes, 60–250 characters,
 *                 and a parenthesised section.
 *   3. `high`   — more than 70% of the characters are distinct, i.e. it
 *                 looks like a random string.
 *   4. `medium` — everything else.
 *
 * The previous rule order made `high` unreachable: the generic "medium"
 * rule and the "low" rule together covered every input, so the
 * distinct-character check never ran. Checking `low` first and testing for
 * randomness before the generic fallback keeps every earlier `low` result
 * and every browser-shaped `medium` result, and lets random strings reach
 * `high`.
 *
 * @param {string} userAgent
 * @returns {'low'|'medium'|'high'}
 */
function computeUAEntropy(userAgent) {
  if (!userAgent || userAgent.length < 10) return 'low';

  const len = userAgent.length;
  const classProbes = [/[A-Z]/, /[a-z]/, /\d/, /[\/\.;()\s,_\-]/];
  const classCount = classProbes.filter((re) => re.test(userAgent)).length;

  // Very short, very long, or missing structure
  if (len < 40 || len > 300 || classCount < 3) return 'low';

  // Typical browser UA: 60-250 chars, 4 char classes, has parens
  const hasParens = /\([^)]+\)/.test(userAgent);
  if (classCount >= 4 && len >= 60 && len <= 250 && hasParens) return 'medium';

  // Unusual: high-entropy random strings
  const uniqueChars = new Set(userAgent).size;
  if (uniqueChars / len > 0.7) return 'high';

  return 'medium';
}
|
|
354
|
+
|
|
355
|
+
// ── §6.3.1 UA dpf/version parsing ─────────────────────────────────────────
|
|
356
|
+
|
|
357
|
+
/**
 * Classify the device class of a User-Agent. Rules are tried top to bottom
 * and the first match wins, so specific devices (TV, console, VR, …) are
 * listed ahead of the generic tablet / mobile / desktop rules.
 *
 * @param {string} ua  Trimmed User-Agent string
 * @returns {'desktop'|'mobile'|'tablet'|'smarttv'|'console'|'car'|'wearable'|'vr'|'server'|'unknown'}
 */
function detectDevice(ua) {
  const has = (re) => re.test(ua);

  const rules = [
    // Smart TV first: some TV user agents also contain "Android".
    ['smarttv', () => has(/SmartTV|SMART-TV|\bTizen\b|\bWebOS\b|\bBRAVIA\b|\bVizio\b|\bRoku\b|\bAppleTV\b|\bFire TV\b|\bAndroidTV\b|\btvOS\b|\bHBBTV\b/i)],
    // Gaming consoles
    ['console', () => has(/\b(PlayStation|PLAYSTATION|Xbox|Nintendo)\b/i)],
    // VR headsets (Meta Quest / Oculus)
    ['vr', () => has(/OculusBrowser|\bQuest\b/i)],
    // Wearables (Apple Watch, etc.)
    ['wearable', () => has(/\bWatch\b|\bwearable\b/i)],
    // Automotive
    ['car', () => has(/\bTesla\b|\bCarPlay\b/i)],
    ['tablet', () => has(/\b(iPad|Tablet|PlayBook|Silk|Kindle)\b/i)],
    ['mobile', () => has(/\b(iPhone|iPod|Android.*Mobile|Mobile.*Android|webOS|BlackBerry|Opera Mini|IEMobile|Windows Phone)\b/i)],
    // Android without a "Mobile" token is treated as a tablet.
    ['tablet', () => has(/\b(Android)\b/i) && !has(/Mobile/i)],
    ['desktop', () => has(/\b(Macintosh|Windows NT|X11|Linux(?!.*Android))\b/i)],
    // Crawlers that matched no platform token above.
    ['server', () => has(/\b(Googlebot|bingbot|Baiduspider|YandexBot|DuckDuckBot)\b/i)],
  ];

  for (const [device, applies] of rules) {
    if (applies()) return device;
  }
  return 'unknown';
}
|
|
376
|
+
|
|
377
|
+
/**
 * Classify the operating-system platform of a User-Agent. Rules are tried
 * in order and the first match wins (iOS and Android ahead of the desktop
 * systems, ChromeOS and FreeBSD ahead of the generic Linux/X11 rule).
 *
 * @param {string} ua  Trimmed User-Agent string
 * @returns {'windows'|'mac'|'ios'|'android'|'linux'|'chromeos'|'freebsd'|'other'}
 */
function detectPlatform(ua) {
  const rules = [
    ['ios', [/\b(iPhone|iPad|iPod)\b/i]],
    ['android', [/\bAndroid\b/i]],
    ['chromeos', [/\bCrOS\b/i]],
    ['mac', [/\bMacintosh\b/i]],
    ['windows', [/\bWindows\b/i]],
    ['freebsd', [/\bFreeBSD\b/i]],
    ['linux', [/\bLinux\b/i, /\bX11\b/i]],
  ];

  const hit = rules.find(([, probes]) => probes.some((re) => re.test(ua)));
  return hit ? hit[0] : 'other';
}
|
|
388
|
+
|
|
389
|
+
/**
 * Classify the browser family of a User-Agent. Rules are tried in order
 * and the first match wins.
 *
 * @param {string} ua  Trimmed User-Agent string
 * @returns {'chrome'|'safari'|'firefox'|'edge'|'ucbrowser'|'other'|'bot'}
 */
function detectFamily(ua) {
  const rules = [
    // Crawlers; BOT_FAMILY_RE is module state that loadVAIMetadata() may replace.
    ['bot', () => BOT_FAMILY_RE.test(ua)],
    // UC Browser must precede Chrome: its UA carries a Chrome token too.
    ['ucbrowser', () => /UCBrowser|UCWEB/i.test(ua)],
    // Edge must precede Chrome for the same reason.
    ['edge', () => /\bEdg(?:e|A)?\/\d/i.test(ua)],
    ['firefox', () => /\bFirefox\//i.test(ua)],
    // Safari only when no Chrome-like token is present.
    ['safari', () => /\bSafari\//i.test(ua) && !/Chrome|Chromium|HeadlessChrome/i.test(ua)],
    // Chrome, Chromium and HeadlessChrome; other Chromium-based browsers
    // that expose a Chrome/ token (Opera, Brave) land here as well.
    ['chrome', () => /(?:\b|Headless)Chrom(?:e|ium)\//i.test(ua)],
  ];

  for (const [family, applies] of rules) {
    if (applies()) return family;
  }
  return 'other';
}
|
|
405
|
+
|
|
406
|
+
/**
 * Extract the major browser version from a User-Agent string.
 *
 * Probes are tried from most to least specific and the first hit wins:
 * Edge, Firefox, Chrome/Chromium/HeadlessChrome, Safari's `Version/`
 * token (not the `Safari/605` build number), and finally the first
 * `/<digits>` found anywhere.
 *
 * The browser-specific probes are case-insensitive so they recognise the
 * same tokens detectFamily() does. Otherwise a UA such as `chrome/120`
 * would be classified as Chrome while its version fell through to the
 * generic probe and reported an unrelated number.
 *
 * @param {string} ua
 * @returns {number|null} Major version, or null when no `/<digits>` token exists
 */
function extractMajorVersion(ua) {
  const probes = [
    /\bEdg(?:e|A)?\/(\d+)/i,
    /\bFirefox\/(\d+)/i,
    // Chrome / Chromium / HeadlessChrome
    /(?:\b|Headless)Chrom(?:e|ium)\/(\d+)/i,
    // Safari: Version/17.x
    /\bVersion\/(\d+)/i,
    // Generic: first thing/number pattern
    /\/(\d+)/,
  ];

  for (const probe of probes) {
    const found = ua.match(probe);
    if (found) return parseInt(found[1], 10);
  }
  return null;
}
|
|
428
|
+
|
|
429
|
+
/**
 * Map a major version to a coarse range token.
 *
 * Everything below 80 (or a missing version) is `0-79`, 420 and above is
 * `420+`, and the range in between is cut into 20-wide buckets aligned at
 * 80: `80-99`, `100-119`, …, `400-419`.
 *
 * @param {number|null} ver
 * @returns {string}
 */
function bucketVersion(ver) {
  const FIRST_BUCKET_START = 80;
  const OPEN_ENDED_START = 420;
  const BUCKET_WIDTH = 20;

  if (ver == null || ver < FIRST_BUCKET_START) return '0-79';
  if (ver >= OPEN_ENDED_START) return '420+';

  // Snap down to the start of the enclosing 20-wide bucket.
  const bucketsAboveFirst = Math.floor((ver - FIRST_BUCKET_START) / BUCKET_WIDTH);
  const lower = FIRST_BUCKET_START + bucketsAboveFirst * BUCKET_WIDTH;
  const upper = lower + BUCKET_WIDTH - 1;

  return `${lower}-${upper}`;
}
|
|
447
|
+
|
|
448
|
+
// ── §6.3.1 extractUAFeatures ──────────────────────────────────────────────
|
|
449
|
+
/**
 * Derive the feature summary that stands in for the raw User-Agent.
 *
 * Emitted members, in order:
 *   - `dpf=<device>/<platform>/<family>`
 *   - `ver=<bucket>`       bucketed major version
 *   - `browser`            UA starts with "Mozilla/"
 *   - `headless`           any HEADLESS_MARKERS entry matches
 *   - `automation`         any AUTOMATION_MARKERS entry matches
 *   - `entropy=<bucket>`
 *
 * NOTE(review): bucket values such as `80-99` begin with a digit, so they
 * do not look like an RFC 8941 token or number. Confirm the consumer
 * parses `ver=` leniently.
 *
 * @param {string|null|undefined} userAgent  Raw User-Agent string
 * @returns {string|null} Comma-separated members, or null if absent/blank
 */
export function extractUAFeatures(userAgent) {
  if (!userAgent) return null;

  const ua = userAgent.trim();
  if (ua === '') return null;

  const matchesAny = (markers) => markers.some((re) => re.test(ua));

  const dpf = [detectDevice(ua), detectPlatform(ua), detectFamily(ua)].join('/');
  const versionBucket = bucketVersion(extractMajorVersion(ua));

  const features = [`dpf=${dpf}`, `ver=${versionBucket}`];

  if (/^Mozilla\//i.test(ua)) {
    features.push('browser');
  }
  if (matchesAny(HEADLESS_MARKERS)) {
    features.push('headless');
  }
  if (matchesAny(AUTOMATION_MARKERS)) {
    features.push('automation');
  }

  features.push(`entropy=${computeUAEntropy(ua)}`);

  return features.join(', ');
}
|
|
476
|
+
|
|
477
|
+
// ── §6.3.2 computeUAHMAC ─────────────────────────────────────────────────
|
|
478
|
+
/**
|
|
479
|
+
* Compute HMAC-SHA256 of the raw User-Agent, returned as an RFC 8941
|
|
480
|
+
* Byte Sequence string (:base64:).
|
|
481
|
+
*
|
|
482
|
+
* Uses crypto.subtle — compatible with Cloudflare Workers and modern Node.
|
|
483
|
+
*
|
|
484
|
+
* @param {string} userAgent Raw User-Agent string
|
|
485
|
+
* @param {string} hmacKey HMAC secret key (plain text)
|
|
486
|
+
* @returns {Promise<string|null>} RFC 8941 Byte Sequence or null if inputs missing
|
|
487
|
+
*/
|
|
488
|
+
export async function computeUAHMAC(userAgent, hmacKey) {
|
|
489
|
+
if (!userAgent || !hmacKey) return null;
|
|
490
|
+
|
|
491
|
+
const enc = new TextEncoder();
|
|
492
|
+
const key = await crypto.subtle.importKey(
|
|
493
|
+
'raw', enc.encode(hmacKey),
|
|
494
|
+
{ name: 'HMAC', hash: 'SHA-256' },
|
|
495
|
+
false, ['sign']
|
|
496
|
+
);
|
|
497
|
+
const sig = await crypto.subtle.sign('HMAC', key, enc.encode(userAgent));
|
|
498
|
+
const b64 = btoa(String.fromCharCode(...new Uint8Array(sig)));
|
|
499
|
+
return `:${b64}:`;
|
|
500
|
+
}
|
|
501
|
+
|
|
502
|
+
// ── §6.4 computeConfidenceToken ───────────────────────────────────────────
|
|
503
|
+
/**
|
|
504
|
+
* Compute the confidence token.
|
|
505
|
+
* ct = SHA-256(userAgent + acceptLanguage + secChUA)[0:8] hex
|
|
506
|
+
*
|
|
507
|
+
* Matches the logic in cloud-api computeConfidenceFingerprint().
|
|
508
|
+
*
|
|
509
|
+
* @param {string|null|undefined} userAgent Raw User-Agent
|
|
510
|
+
* @param {string|null|undefined} acceptLanguage Raw Accept-Language
|
|
511
|
+
* @param {string|null|undefined} secChUA Raw Sec-CH-UA
|
|
512
|
+
* @returns {Promise<string>} 8-char hex token, never null
|
|
513
|
+
*/
|
|
514
|
+
export async function computeConfidenceToken(userAgent, acceptLanguage, secChUA) {
|
|
515
|
+
const ua = userAgent || '';
|
|
516
|
+
const lang = acceptLanguage || '';
|
|
517
|
+
const ch = secChUA || '';
|
|
518
|
+
|
|
519
|
+
const msgBuffer = new TextEncoder().encode(ua + lang + ch);
|
|
520
|
+
const hashBuffer = await crypto.subtle.digest('SHA-256', msgBuffer);
|
|
521
|
+
const hashArray = Array.from(new Uint8Array(hashBuffer));
|
|
522
|
+
const hex = hashArray.map(b => b.toString(16).padStart(2, '0')).join('');
|
|
523
|
+
return hex.slice(0, 8);
|
|
524
|
+
}
|