@paywalls-net/filter 1.3.10 → 1.3.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -3,7 +3,7 @@
3
3
  "description": "Client SDK for integrating paywalls.net bot filtering and authorization services into your server or CDN.",
4
4
  "author": "paywalls.net",
5
5
  "license": "MIT",
6
- "version": "1.3.10",
6
+ "version": "1.3.11",
7
7
  "publishConfig": {
8
8
  "access": "public"
9
9
  },
package/src/index.js CHANGED
@@ -8,6 +8,7 @@ import {
8
8
  extractAcceptFeatures, extractEncodingFeatures, extractLanguageFeatures,
9
9
  extractNetFeatures, extractCHFeatures, extractUAFeatures,
10
10
  computeUAHMAC, computeConfidenceToken,
11
+ loadVAIMetadata,
11
12
  } from './signal-extraction.js';
12
13
 
13
14
  const PAYWALLS_CLOUD_API_HOST = "https://cloud-api.paywalls.net";
@@ -428,7 +429,11 @@ async function cloudflare(config = null) {
428
429
  return await proxyVAIRequest(paywallsConfig, request);
429
430
  }
430
431
 
431
- await loadAgentPatterns(paywallsConfig);
432
+ // Load agent patterns + VAI metadata in parallel (both self-cache for 1 hour)
433
+ await Promise.all([
434
+ loadAgentPatterns(paywallsConfig),
435
+ loadVAIMetadata(paywallsConfig),
436
+ ]);
432
437
 
433
438
  if (await isRecognizedBot(paywallsConfig, request)) {
434
439
  const authz = await checkAgentStatus(paywallsConfig, request);
@@ -461,7 +466,11 @@ async function fastly() {
461
466
  return await proxyVAIRequest(paywallsConfig, request);
462
467
  }
463
468
 
464
- await loadAgentPatterns(paywallsConfig);
469
+ // Load agent patterns + VAI metadata in parallel (both self-cache for 1 hour)
470
+ await Promise.all([
471
+ loadAgentPatterns(paywallsConfig),
472
+ loadVAIMetadata(paywallsConfig),
473
+ ]);
465
474
 
466
475
  if (await isRecognizedBot(paywallsConfig, request)) {
467
476
  const authz = await checkAgentStatus(paywallsConfig, request);
@@ -538,7 +547,11 @@ async function cloudfront(config) {
538
547
  vaiPath: config.PAYWALLS_VAI_PATH || '/pw',
539
548
  vaiUAHmacKey: config.VAI_UA_HMAC_KEY || null,
540
549
  };
541
- await loadAgentPatterns(paywallsConfig);
550
+ // Load agent patterns + VAI metadata in parallel (both self-cache for 1 hour)
551
+ await Promise.all([
552
+ loadAgentPatterns(paywallsConfig),
553
+ loadVAIMetadata(paywallsConfig),
554
+ ]);
542
555
 
543
556
  return async function handle(event, ctx) {
544
557
  let request = event.Records[0].cf.request;
@@ -11,11 +11,17 @@
11
11
  * omit the header entirely (not send an empty value).
12
12
  */
13
13
 
14
- // ── §6.2.4 / Appendix A: Data-center ASN set ──────────────────────────────
15
- // Comprehensive cloud/hosting provider ASNs for DC classification.
16
- // Kept in sync with cloud-api DC_ASN_LIST (cloudflare/vai.js).
17
- // Source: public ASN registries (PeeringDB, RIPE, ARIN).
18
- const DC_ASN_SET = new Set([
14
+ // ── VAI Metadata: dynamic loading with hardcoded fallbacks ──────────────────
15
+ // (paywalls-site-fc4)
16
+ //
17
+ // These module-level vars are initialized from hardcoded defaults below.
18
+ // When loadVAIMetadata() is called, they are updated from the cloud-api
19
+ // /pw/vai/metadata endpoint. If the fetch fails, the hardcoded defaults
20
+ // remain in effect — no data loss, no crash.
21
+
22
+ // ── Hardcoded defaults (bootstrap / fallback) ──────────────────────────────
23
+
24
+ const DEFAULT_DC_ASNS = [
19
25
  // ── Major IaaS ───────────────────────────────────────────────────────────
20
26
  16509, 14618, // Amazon AWS (primary + secondary)
21
27
  396982, 36492, 15169, // Google Cloud + Google infra
@@ -34,7 +40,126 @@ const DC_ASN_SET = new Set([
34
40
  12876, // Scaleway
35
41
  51167, // Contabo
36
42
  60781, 28753, // Leaseweb (NL + global)
37
- ]);
43
+ ];
44
+
/**
 * Default User-Agent regex sources that flag explicit automation tooling
 * (browser drivers, HTTP client libraries, test runners, scrapers).
 * Entries are RegExp *source strings*; they are compiled with the
 * case-insensitive flag where they are consumed.
 * Frozen so the bootstrap/fallback list cannot be mutated at runtime.
 */
const DEFAULT_AUTOMATION_PATTERNS = Object.freeze([
  'Puppeteer', 'Playwright', 'Selenium', 'WebDriver',
  'PhantomJS', 'CasperJS',
  'python-requests', 'python-urllib', 'Go-http-client',
  'okhttp', 'Apache-HttpClient', 'libcurl',
  '\\bcurl\\/', '\\bwget\\/', 'HTTPie',
  'node-fetch', 'undici', 'axios\\/', '\\bgot\\/', 'superagent',
  'Cypress', 'TestCafe', 'Nightwatch', 'WebdriverIO',
  'Scrapy', 'Java\\/|Java HttpURLConnection', 'PostmanRuntime\\/',
  '\\bDeno\\/', '\\bhttpx\\b|python-httpx',
]);

/**
 * Default regex sources that flag a headless browser.
 * Frozen for the same reason as the automation list.
 */
const DEFAULT_HEADLESS_PATTERNS = Object.freeze([
  'HeadlessChrome', '\\bHeadless\\b',
]);

/**
 * Default crawler names (search, AI and SEO bots). The entries are joined
 * with `|` into a single word-bounded alternation, so each one must be a
 * valid regex fragment.
 */
const DEFAULT_BOT_PATTERNS = Object.freeze([
  'Googlebot', 'bingbot', 'Baiduspider', 'YandexBot', 'DuckDuckBot',
  'Slurp', 'ia_archiver', 'GPTBot', 'ClaudeBot', 'CCBot', 'Bytespider',
  'Applebot', 'PetalBot', 'SemrushBot', 'AhrefsBot', 'DotBot',
]);
66
+
// ── Mutable state: updated by loadVAIMetadata() ────────────────────────────
// Every binding below starts from the hardcoded defaults and is replaced
// wholesale (never mutated in place) when fresh metadata arrives.

/**
 * ASN membership set, seeded from DEFAULT_DC_ASNS.
 * @type {Set<number>}
 */
let DC_ASN_SET = new Set(DEFAULT_DC_ASNS);

/**
 * Compiled (case-insensitive) automation-tool markers.
 * @type {RegExp[]}
 */
let AUTOMATION_MARKERS = compilePatterns(DEFAULT_AUTOMATION_PATTERNS);

/**
 * Compiled (case-insensitive) headless-browser markers.
 * @type {RegExp[]}
 */
let HEADLESS_MARKERS = compilePatterns(DEFAULT_HEADLESS_PATTERNS);

/**
 * One combined, word-bounded alternation used for bot family detection.
 * @type {RegExp}
 */
let BOT_FAMILY_RE = new RegExp(`\\b(${DEFAULT_BOT_PATTERNS.join('|')})\\b`, 'i');

// ── Metadata cache ─────────────────────────────────────────────────────────

// Shape: { data, ts } — `ts` is the epoch-ms timestamp used for TTL checks.
let _vaiMetadataCache = null;

// Cache lifetime in milliseconds (1 hour).
const VAI_METADATA_TTL = 60 * 60 * 1000;
85
+
/**
 * Turn regex source strings (as delivered in the metadata JSON) into
 * case-insensitive RegExp objects, one per input entry and in the same order.
 *
 * An invalid source makes the RegExp constructor throw, and that error
 * propagates to the caller.
 *
 * @param {string[]} patterns  Regex sources to compile.
 * @returns {RegExp[]}         Compiled expressions, all carrying the `i` flag.
 */
function compilePatterns(patterns) {
  const toCaseInsensitive = (source) => new RegExp(source, 'i');
  return patterns.map(toCaseInsensitive);
}
95
+
/**
 * Fetch VAI metadata from cloud-api and update the mutable module state
 * (DC_ASN_SET, AUTOMATION_MARKERS, HEADLESS_MARKERS, BOT_FAMILY_RE).
 *
 * - A successful load is cached for VAI_METADATA_TTL (1 hour); calls inside
 *   that window return immediately without fetching.
 * - Fields that are missing, not arrays, or empty are ignored, so the value
 *   currently in effect for that field is kept.
 * - The update is all-or-nothing: every replacement is built first and only
 *   then assigned. If any field fails to compile (e.g. an invalid regex
 *   source), no state is changed.
 * - This function never rejects. On any failure it logs, keeps the values
 *   currently in effect, and backs off for 5 minutes before the next attempt.
 *
 * Pattern: matches loadAgentPatterns() in user-agent-classification.js.
 *
 * @param {Object} cfg  Config with paywallsAPIHost (cloud-api base URL)
 * @returns {Promise<void>}
 */
export async function loadVAIMetadata(cfg) {
  const now = Date.now();

  // Still inside the TTL (or the post-failure back-off window): nothing to do.
  if (_vaiMetadataCache && (now - _vaiMetadataCache.ts) < VAI_METADATA_TTL) {
    return;
  }

  try {
    const response = await fetch(`${cfg.paywallsAPIHost}/pw/vai/metadata`, {
      method: 'GET',
      headers: { 'Accept': 'application/json' },
    });

    if (!response.ok) {
      throw new Error(`VAI metadata fetch failed: ${response.status} ${response.statusText}`);
    }

    const data = await response.json();

    // Validate minimal schema
    if (!data || typeof data.version !== 'number') {
      throw new Error('VAI metadata: invalid schema (missing version)');
    }

    const hasEntries = (value) => Array.isArray(value) && value.length > 0;

    // Stage every replacement before touching module state. RegExp
    // compilation can throw on a malformed pattern; doing it up front means a
    // bad field can never leave the module half-updated.
    const nextAsns = hasEntries(data.dc_asns) ? new Set(data.dc_asns) : null;
    const nextAutomation = hasEntries(data.automation_patterns)
      ? compilePatterns(data.automation_patterns)
      : null;
    const nextHeadless = hasEntries(data.headless_patterns)
      ? compilePatterns(data.headless_patterns)
      : null;
    const nextBotFamily = hasEntries(data.bot_patterns)
      ? new RegExp('\\b(' + data.bot_patterns.join('|') + ')\\b', 'i')
      : null;

    // Commit: plain assignments only, nothing below can throw.
    if (nextAsns) DC_ASN_SET = nextAsns;
    if (nextAutomation) AUTOMATION_MARKERS = nextAutomation;
    if (nextHeadless) HEADLESS_MARKERS = nextHeadless;
    if (nextBotFamily) BOT_FAMILY_RE = nextBotFamily;

    _vaiMetadataCache = { data, ts: now };
  } catch (error) {
    // `error` may be any thrown value (including null/undefined), so guard
    // the property access instead of assuming an Error instance.
    console.error('loadVAIMetadata: fetch failed, using hardcoded defaults.', (error && error.message) || error);
    // Mark cache so we don't retry immediately (back off for 5 minutes):
    // the timestamp is placed so that the TTL check above expires 5 minutes
    // from now.
    _vaiMetadataCache = { data: null, ts: now - VAI_METADATA_TTL + (5 * 60 * 1000) };
  }
}
151
+
/**
 * Test-only hook: restore every metadata-driven binding to its hardcoded
 * default and drop the cache, so the next loadVAIMetadata() call is not
 * short-circuited by the TTL check.
 */
export function _resetVAIMetadata() {
  const asRegExp = (source) => new RegExp(source, 'i');
  const botAlternation = DEFAULT_BOT_PATTERNS.join('|');

  _vaiMetadataCache = null;
  DC_ASN_SET = new Set(DEFAULT_DC_ASNS);
  AUTOMATION_MARKERS = DEFAULT_AUTOMATION_PATTERNS.map(asRegExp);
  HEADLESS_MARKERS = DEFAULT_HEADLESS_PATTERNS.map(asRegExp);
  BOT_FAMILY_RE = new RegExp(`\\b(${botAlternation})\\b`, 'i');
}
38
163
 
39
164
  // ── §6.2.1 Accept → X-PW-Accept ──────────────────────────────────────────
40
165
  /**
@@ -191,17 +316,9 @@ export function extractCHFeatures(secChUA, userAgent) {
191
316
  // ── §6.3.3 Automation marker detection ────────────────────────────────────
192
317
  // HeadlessChrome triggers 'headless' only (via HEADLESS_MARKERS).
193
318
  // Explicit automation tools (Puppeteer, Selenium, etc.) trigger 'automation'.
194
- const AUTOMATION_MARKERS = [
195
- /Puppeteer/i, /Playwright/i, /Selenium/i, /WebDriver/i,
196
- /PhantomJS/i, /CasperJS/i,
197
- /python-requests/i, /python-urllib/i, /Go-http-client/i,
198
- /okhttp/i, /Apache-HttpClient/i, /libcurl/i,
199
- /\bcurl\//i, /\bwget\//i, /HTTPie/i,
200
- /node-fetch/i, /undici/i, /axios\//i, /\bgot\//i, /superagent/i,
201
- /Cypress/i, /TestCafe/i, /Nightwatch/i, /WebdriverIO/i,
202
- ];
203
-
204
- const HEADLESS_MARKERS = [/HeadlessChrome/i, /\bHeadless\b/i];
319
+ // AUTOMATION_MARKERS and HEADLESS_MARKERS are now module-level mutable vars
320
+ // initialized from hardcoded defaults (top of file) and updated dynamically
321
+ // by loadVAIMetadata(). See paywalls-site-fc4.
205
322
 
206
323
  // ── §6.3.4 Entropy bucketing ──────────────────────────────────────────────
207
324
  /**
@@ -237,8 +354,18 @@ function computeUAEntropy(userAgent) {
237
354
 
238
355
  // ── §6.3.1 UA dpf/version parsing ─────────────────────────────────────────
239
356
 
240
- /** @returns {'desktop'|'mobile'|'tablet'|'server'|'unknown'} */
357
+ /** @returns {'desktop'|'mobile'|'tablet'|'smarttv'|'console'|'car'|'wearable'|'vr'|'server'|'unknown'} */
241
358
  function detectDevice(ua) {
359
+ // Smart TV: check before tablet/mobile (some TVs include Android)
360
+ if (/SmartTV|SMART-TV|\bTizen\b|\bWebOS\b|\bBRAVIA\b|\bVizio\b|\bRoku\b|\bAppleTV\b|\bFire TV\b|\bAndroidTV\b|\btvOS\b|\bHBBTV\b/i.test(ua)) return 'smarttv';
361
+ // Gaming consoles
362
+ if (/\b(PlayStation|PLAYSTATION|Xbox|Nintendo)\b/i.test(ua)) return 'console';
363
+ // VR headsets (Meta Quest / Oculus)
364
+ if (/OculusBrowser|\bQuest\b/i.test(ua)) return 'vr';
365
+ // Wearables (Apple Watch, etc.)
366
+ if (/\bWatch\b|\bwearable\b/i.test(ua)) return 'wearable';
367
+ // Automotive
368
+ if (/\bTesla\b|\bCarPlay\b/i.test(ua)) return 'car';
242
369
  if (/\b(iPad|Tablet|PlayBook|Silk|Kindle)\b/i.test(ua)) return 'tablet';
243
370
  if (/\b(iPhone|iPod|Android.*Mobile|Mobile.*Android|webOS|BlackBerry|Opera Mini|IEMobile|Windows Phone)\b/i.test(ua)) return 'mobile';
244
371
  if (/\b(Android)\b/i.test(ua) && !/Mobile/i.test(ua)) return 'tablet';
@@ -247,24 +374,31 @@ function detectDevice(ua) {
247
374
  return 'unknown';
248
375
  }
249
376
 
/**
 * Classify the platform advertised by a User-Agent string.
 *
 * Rules are tried top to bottom and the first match wins, so order matters:
 * Android precedes the generic Linux rule, and CrOS / FreeBSD precede the
 * X11 → linux fallback.
 *
 * @param {string} ua  Raw User-Agent header value.
 * @returns {'windows'|'mac'|'ios'|'android'|'linux'|'chromeos'|'freebsd'|'other'}
 */
function detectPlatform(ua) {
  const rules = [
    [/\b(iPhone|iPad|iPod)\b/i, 'ios'],
    [/\bAndroid\b/i, 'android'],
    [/\bCrOS\b/i, 'chromeos'],
    [/\bMacintosh\b/i, 'mac'],
    [/\bWindows\b/i, 'windows'],
    [/\bFreeBSD\b/i, 'freebsd'],
    [/\bLinux\b/i, 'linux'],
    [/\bX11\b/i, 'linux'],
  ];

  for (const [pattern, platform] of rules) {
    if (pattern.test(ua)) return platform;
  }
  return 'other';
}
259
388
 
/**
 * Classify the browser family advertised by a User-Agent string.
 *
 * Checks run lazily in the listed order and the first hit wins. The order
 * is significant:
 *  - bots first (BOT_FAMILY_RE is refreshed dynamically by loadVAIMetadata);
 *  - UC Browser before the Chrome-like checks (mobile-heavy, no Client Hints);
 *  - Edge before Chrome, because an Edge UA also contains "Chrome";
 *  - Safari only when no Chrome/Chromium/HeadlessChrome token is present.
 * Opera (OPR/) and Brave share the Chromium engine and stay in the 'chrome'
 * family since they support Client Hints and score the same.
 *
 * @param {string} ua  Raw User-Agent header value.
 * @returns {'chrome'|'safari'|'firefox'|'edge'|'ucbrowser'|'other'|'bot'}
 */
function detectFamily(ua) {
  const checks = [
    [() => BOT_FAMILY_RE.test(ua), 'bot'],
    [() => /UCBrowser|UCWEB/i.test(ua), 'ucbrowser'],
    [() => /\bEdg(?:e|A)?\/\d/i.test(ua), 'edge'],
    [() => /\bFirefox\//i.test(ua), 'firefox'],
    [
      () => /\bSafari\//i.test(ua) && !/Chrome|Chromium|HeadlessChrome/i.test(ua),
      'safari',
    ],
    [() => /(?:\b|Headless)Chrom(?:e|ium)\//i.test(ua), 'chrome'],
  ];

  const hit = checks.find(([matches]) => matches());
  return hit ? hit[1] : 'other';
}
@@ -294,16 +428,21 @@ function extractMajorVersion(ua) {
294
428
 
/**
 * Bucket a major version number into a range token.
 * Uses math-based 20-version spans starting at 80, capped at 420+.
 * Legacy range: 0-79. Then 80-99, 100-119, …, 400-419, 420+.
 *
 * Unknown input (null, undefined, NaN, or a value that does not convert to a
 * number) is reported as the legacy bucket '0-79' rather than leaking a
 * malformed token such as "NaN-NaN".
 *
 * @param {number|null} ver
 * @returns {string}
 */
function bucketVersion(ver) {
  const LEGACY_BUCKET = '0-79';
  const base = 80;
  const span = 20;
  const cap = 420;

  if (ver == null) return LEGACY_BUCKET;

  // Number() keeps numeric strings working as before; NaN means "unknown".
  const major = Number(ver);
  if (Number.isNaN(major) || major < base) return LEGACY_BUCKET;
  if (major >= cap) return '420+';

  // floor((major - base) / span) is the zero-based index of the 20-wide span.
  const lo = base + Math.floor((major - base) / span) * span;
  const hi = lo + span - 1;
  return `${lo}-${hi}`;
}
308
447
 
309
448
  // ── §6.3.1 extractUAFeatures ──────────────────────────────────────────────
@@ -2,7 +2,7 @@
2
2
  * Unit tests for signal extraction functions (Tier 2 + Tier 3)
3
3
  *
4
4
  * Spec: specs/vai-privacy-v2.spec.md §6.2–§6.4
5
- * Issue: paywalls-site-drk
5
+ * Issue: paywalls-site-drk, paywalls-site-fc4
6
6
  */
7
7
  import {
8
8
  extractAcceptFeatures,
@@ -13,6 +13,8 @@ import {
13
13
  extractUAFeatures,
14
14
  computeUAHMAC,
15
15
  computeConfidenceToken,
16
+ loadVAIMetadata,
17
+ _resetVAIMetadata,
16
18
  } from '../src/signal-extraction.js';
17
19
 
18
20
  // ═══════════════════════════════════════════════════════════════════════════
@@ -209,6 +211,21 @@ describe('extractNetFeatures', () => {
209
211
  expect(extractNetFeatures('7922')).toBe('asn=consumer');
210
212
  });
211
213
 
214
+ // ── Full DC_ASN_SET coverage ───────────────────────────────────────────
215
+ test.each([
216
+ [16509, 'AWS primary'], [14618, 'AWS secondary'],
217
+ [396982, 'GCP'], [36492, 'GCP secondary'], [15169, 'Google infra'],
218
+ [8075, 'Azure'], [8069, 'Azure secondary'], [8068, 'Azure tertiary'],
219
+ [31898, 'Oracle Cloud'], [36351, 'IBM/SoftLayer'],
220
+ [45102, 'Alibaba Cloud'], [132203, 'Tencent Cloud'],
221
+ [14061, 'DigitalOcean'], [24940, 'Hetzner'], [213230, 'Hetzner Cloud'],
222
+ [16276, 'OVH'], [63949, 'Linode/Akamai'], [20473, 'Vultr'],
223
+ [12876, 'Scaleway'], [51167, 'Contabo'],
224
+ [60781, 'Leaseweb NL'], [28753, 'Leaseweb global'],
225
+ ])('DC ASN %i (%s) → asn=cloud', (asn) => {
226
+ expect(extractNetFeatures(asn)).toBe('asn=cloud');
227
+ });
228
+
212
229
  // ── Input type handling ────────────────────────────────────────────────
213
230
  test('numeric input (number, not string) → works', () => {
214
231
  expect(extractNetFeatures(16509)).toBe('asn=cloud');
@@ -360,6 +377,135 @@ describe('extractUAFeatures', () => {
360
377
  expect(extractUAFeatures(GOOGLEBOT)).toMatch(/dpf=server\/other\/bot/);
361
378
  });
362
379
 
380
+ // ── new device types ───────────────────────────────────────────────────
381
+ test('Smart TV (Tizen) → smarttv device', () => {
382
+ const ua = 'Mozilla/5.0 (SMART-TV; Linux; Tizen 5.0) AppleWebKit/537.36 Chrome/69.0.3497.106 TV Safari/537.36';
383
+ expect(extractUAFeatures(ua)).toMatch(/^dpf=smarttv\//);
384
+ });
385
+
386
+ test('Smart TV (WebOS) → smarttv device', () => {
387
+ const ua = 'Mozilla/5.0 (Web0S; Linux/SmartTV) AppleWebKit/537.36 WebOS TV/5.0';
388
+ expect(extractUAFeatures(ua)).toMatch(/^dpf=smarttv\//);
389
+ });
390
+
391
+ test('Smart TV (Fire TV) → smarttv device', () => {
392
+ const ua = 'Mozilla/5.0 (Linux; Android 9; AFTS Build/PS7233) AppleWebKit/537.36 (KHTML, like Gecko) Silk/120.4.1 like Chrome/120.0.0.0 Mobile Safari/537.36 Fire TV';
393
+ expect(extractUAFeatures(ua)).toMatch(/^dpf=smarttv\//);
394
+ });
395
+
396
+ test('PlayStation → console device', () => {
397
+ const ua = 'Mozilla/5.0 (PlayStation 5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15';
398
+ expect(extractUAFeatures(ua)).toMatch(/^dpf=console\//);
399
+ });
400
+
401
+ test('Xbox → console device', () => {
402
+ const ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; Xbox; Xbox One) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edge/44.18363.8131';
403
+ expect(extractUAFeatures(ua)).toMatch(/^dpf=console\//);
404
+ });
405
+
406
+ test('Nintendo → console device', () => {
407
+ const ua = 'Mozilla/5.0 (Nintendo Switch; WifiWebAuthApplet) AppleWebKit/606.4 (KHTML, like Gecko) NF/6.0.1.16.10 NintendoBrowser/5.1.0.22474';
408
+ expect(extractUAFeatures(ua)).toMatch(/^dpf=console\//);
409
+ });
410
+
411
+ test('Meta Quest (Oculus) → vr device', () => {
412
+ const ua = 'Mozilla/5.0 (Linux; Android 12; Quest 3) AppleWebKit/537.36 (KHTML, like Gecko) OculusBrowser/33.0 Chrome/126.0.6478.122 Mobile VR Safari/537.36';
413
+ expect(extractUAFeatures(ua)).toMatch(/^dpf=vr\//);
414
+ });
415
+
416
+ test('Apple Watch → wearable device', () => {
417
+ const ua = 'Mozilla/5.0 (Watch; CPU Watch OS 10_0 like Mac OS X) AppleWebKit/605.1.15';
418
+ expect(extractUAFeatures(ua)).toMatch(/^dpf=wearable\//);
419
+ });
420
+
421
+ test('Tesla → car device', () => {
422
+ const ua = 'Mozilla/5.0 (X11; GNU/Linux; Tesla) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
423
+ expect(extractUAFeatures(ua)).toMatch(/^dpf=car\//);
424
+ });
425
+
426
+ // ── new platforms ──────────────────────────────────────────────────────
427
+ test('ChromeOS → chromeos platform', () => {
428
+ const ua = 'Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36';
429
+ expect(extractUAFeatures(ua)).toMatch(/dpf=desktop\/chromeos\/chrome/);
430
+ });
431
+
432
+ test('FreeBSD → freebsd platform', () => {
433
+ const ua = 'Mozilla/5.0 (X11; FreeBSD amd64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
434
+ expect(extractUAFeatures(ua)).toMatch(/dpf=desktop\/freebsd\/chrome/);
435
+ });
436
+
437
+ // ── new browser families ───────────────────────────────────────────────
438
+ test('UC Browser → ucbrowser family', () => {
439
+ const ua = 'Mozilla/5.0 (Linux; Android 10; SM-A505F) AppleWebKit/537.36 (KHTML, like Gecko) UCBrowser/16.0.1.3715 Mobile Safari/537.36';
440
+ expect(extractUAFeatures(ua)).toMatch(/dpf=mobile\/android\/ucbrowser/);
441
+ });
442
+
443
+ test('UCWEB variant → ucbrowser family', () => {
444
+ const ua = 'UCWEB/2.0 (Java; U; MIDP-2.0)';
445
+ expect(extractUAFeatures(ua)).toMatch(/\/ucbrowser/);
446
+ });
447
+
448
+ test('Opera (OPR/) → chrome family (Chromium-based)', () => {
449
+ const ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 OPR/120.0.0.0';
450
+ expect(extractUAFeatures(ua)).toMatch(/\/chrome/);
451
+ });
452
+
453
+ test('Brave → chrome family (indistinguishable UA)', () => {
454
+ // Brave UA is intentionally identical to Chrome
455
+ const ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36';
456
+ expect(extractUAFeatures(ua)).toMatch(/\/chrome/);
457
+ });
458
+
459
+ // ── AI/SEO bot detection ───────────────────────────────────────────────
460
+ test('GPTBot → bot family', () => {
461
+ expect(extractUAFeatures('Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)')).toMatch(/\/bot/);
462
+ });
463
+
464
+ test('ClaudeBot → bot family', () => {
465
+ expect(extractUAFeatures('Mozilla/5.0 (compatible; ClaudeBot/1.0; +https://anthropic.com)')).toMatch(/\/bot/);
466
+ });
467
+
468
+ test('CCBot → bot family', () => {
469
+ expect(extractUAFeatures('CCBot/2.0 (https://commoncrawl.org/faq/)')).toMatch(/\/bot/);
470
+ });
471
+
472
+ test('Bytespider → bot family', () => {
473
+ expect(extractUAFeatures('Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 (compatible; Bytespider; spider-feedback@bytedance.com)')).toMatch(/\/bot/);
474
+ });
475
+
476
+ test('Applebot → bot family', () => {
477
+ expect(extractUAFeatures('Mozilla/5.0 (compatible; Applebot/0.1; +http://www.apple.com/go/applebot)')).toMatch(/\/bot/);
478
+ });
479
+
480
+ test('SemrushBot → bot family', () => {
481
+ expect(extractUAFeatures('Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot.html)')).toMatch(/\/bot/);
482
+ });
483
+
484
+ test('AhrefsBot → bot family', () => {
485
+ expect(extractUAFeatures('Mozilla/5.0 (compatible; AhrefsBot/7.0; +http://ahrefs.com/robot/)')).toMatch(/\/bot/);
486
+ });
487
+
488
+ // ── new automation markers ─────────────────────────────────────────────
489
+ test('Scrapy → automation', () => {
490
+ expect(extractUAFeatures('Scrapy/2.11.0 (+https://scrapy.org)')).toMatch(/\bautomation\b/);
491
+ });
492
+
493
+ test('Java HttpURLConnection → automation', () => {
494
+ expect(extractUAFeatures('Java/17.0.2')).toMatch(/\bautomation\b/);
495
+ });
496
+
497
+ test('PostmanRuntime → automation', () => {
498
+ expect(extractUAFeatures('PostmanRuntime/7.36.1')).toMatch(/\bautomation\b/);
499
+ });
500
+
501
+ test('Deno → automation', () => {
502
+ expect(extractUAFeatures('Deno/1.40.0')).toMatch(/\bautomation\b/);
503
+ });
504
+
505
+ test('httpx → automation', () => {
506
+ expect(extractUAFeatures('python-httpx/0.27.0')).toMatch(/\bautomation\b/);
507
+ });
508
+
363
509
  test('iPad → tablet/ios/safari', () => {
364
510
  expect(extractUAFeatures(IPAD)).toMatch(/^dpf=tablet\/ios\/safari/);
365
511
  });
@@ -386,6 +532,7 @@ describe('extractUAFeatures', () => {
386
532
  });
387
533
 
388
534
  // Bucket boundary tests: verify version numbers at edges of each range
535
+ // Math-based 20-version spans starting at 80, capped at 420+.
389
536
  test.each([
390
537
  ['Chrome/79.0.0.0', '0-79'],
391
538
  ['Chrome/80.0.0.0', '80-99'],
@@ -394,8 +541,14 @@ describe('extractUAFeatures', () => {
394
541
  ['Chrome/119.0.0.0', '100-119'],
395
542
  ['Chrome/120.0.0.0', '120-139'],
396
543
  ['Chrome/139.0.0.0', '120-139'],
397
- ['Chrome/140.0.0.0', '140+'],
398
- ['Chrome/999.0.0.0', '140+'],
544
+ ['Chrome/140.0.0.0', '140-159'],
545
+ ['Chrome/159.0.0.0', '140-159'],
546
+ ['Chrome/160.0.0.0', '160-179'],
547
+ ['Chrome/200.0.0.0', '200-219'],
548
+ ['Chrome/400.0.0.0', '400-419'],
549
+ ['Chrome/419.0.0.0', '400-419'],
550
+ ['Chrome/420.0.0.0', '420+'],
551
+ ['Chrome/999.0.0.0', '420+'],
399
552
  ])('version bucket boundary: %s → ver=%s', (chromeToken, expected) => {
400
553
  // Wrap in a minimal browser-like UA so detectDevice/detectPlatform work
401
554
  const ua = `Mozilla/5.0 (X11; Linux x86_64) ${chromeToken} Safari/537.36`;
@@ -622,3 +775,228 @@ describe('computeConfidenceToken', () => {
622
775
  expect(a).not.toBe(b);
623
776
  });
624
777
  });
778
+
779
+ // ═══════════════════════════════════════════════════════════════════════════
780
+ // loadVAIMetadata — Dynamic metadata loading (paywalls-site-fc4)
781
+ // ═══════════════════════════════════════════════════════════════════════════
782
+
/**
 * Build a fetch stand-in that always resolves to `response` and records how
 * it was used: `callCount()` returns the number of invocations, and
 * `_lastUrl` / `_lastOpts` hold the arguments of the most recent call
 * (both null until the first call).
 */
function mockFetchWith(response) {
  let invocations = 0;

  async function fakeFetch(url, opts) {
    invocations += 1;
    fakeFetch._lastUrl = url;
    fakeFetch._lastOpts = opts;
    return response;
  }

  Object.assign(fakeFetch, {
    callCount: () => invocations,
    _lastUrl: null,
    _lastOpts: null,
  });
  return fakeFetch;
}
797
+
/** Build a fetch stand-in whose returned promise always rejects with `error`. */
function mockFetchReject(error) {
  return function rejectingFetch() {
    return Promise.reject(error);
  };
}
801
+
describe('loadVAIMetadata', () => {
  const cfg = { paywallsAPIHost: 'https://cloud-api.example.com' };
  const savedFetch = globalThis.fetch;
  const savedConsoleError = console.error;

  /**
   * Successful fetch response whose JSON body is a baseline metadata
   * document; `overrides` replaces individual fields for a given test.
   */
  const okMetadata = (overrides = {}) => ({
    ok: true,
    json: async () => ({
      version: 1,
      dc_asns: [16509],
      automation_patterns: ['Puppeteer'],
      headless_patterns: ['HeadlessChrome'],
      bot_patterns: ['Googlebot'],
      ...overrides,
    }),
  });

  /** Silence the error log that the failure-path tests expect. */
  const muteConsoleError = () => {
    console.error = () => {};
  };

  // Undo global patches and restore the module defaults after every test.
  afterEach(() => {
    globalThis.fetch = savedFetch;
    console.error = savedConsoleError;
    _resetVAIMetadata();
  });

  test('updates DC_ASN_SET from fetched metadata', async () => {
    // Hardcoded defaults: 16509 (AWS) is cloud, 99999 is not.
    expect(extractNetFeatures(16509)).toBe('asn=cloud');
    expect(extractNetFeatures(99999)).toBe('asn=consumer');

    // Serve metadata in which only 99999 is a cloud ASN.
    const mock = mockFetchWith(okMetadata({ dc_asns: [99999] }));
    globalThis.fetch = mock;

    await loadVAIMetadata(cfg);

    // The classification has flipped.
    expect(extractNetFeatures(99999)).toBe('asn=cloud');
    expect(extractNetFeatures(16509)).toBe('asn=consumer');
    expect(mock.callCount()).toBe(1);
    expect(mock._lastUrl).toBe('https://cloud-api.example.com/pw/vai/metadata');
    expect(mock._lastOpts.method).toBe('GET');
  });

  test('updates automation markers from fetched metadata', async () => {
    // Hardcoded defaults recognise Puppeteer.
    const before = extractUAFeatures('Mozilla/5.0 Puppeteer/1.0');
    expect(before).toContain('automation');

    // Serve an automation list without Puppeteer.
    globalThis.fetch = mockFetchWith(okMetadata({ automation_patterns: ['CustomBot'] }));

    await loadVAIMetadata(cfg);

    // Puppeteer is no longer flagged...
    const after = extractUAFeatures('Mozilla/5.0 Puppeteer/1.0');
    expect(after).not.toContain('automation');

    // ...while the newly listed tool is.
    const custom = extractUAFeatures('Mozilla/5.0 CustomBot/2.0');
    expect(custom).toContain('automation');
  });

  test('updates bot patterns from fetched metadata', async () => {
    // Hardcoded defaults put Googlebot in the bot family.
    const before = extractUAFeatures('Googlebot/2.1');
    expect(before).toContain('bot');

    // Serve a different bot list.
    globalThis.fetch = mockFetchWith(okMetadata({ bot_patterns: ['NewAIBot'] }));

    await loadVAIMetadata(cfg);

    // Googlebot is no longer classified as bot family.
    const afterGoogle = extractUAFeatures('Googlebot/2.1');
    expect(afterGoogle).not.toContain('dpf=server/other/bot');

    // The new bot is.
    const afterNew = extractUAFeatures('NewAIBot/1.0');
    expect(afterNew).toContain('bot');
  });

  test('falls back to defaults when fetch fails (network error)', async () => {
    globalThis.fetch = mockFetchReject(new Error('Network error'));
    muteConsoleError();

    await loadVAIMetadata(cfg);

    // Defaults remain in effect for both ASN and automation detection.
    expect(extractNetFeatures(16509)).toBe('asn=cloud');
    const ua = extractUAFeatures('Mozilla/5.0 Puppeteer/1.0');
    expect(ua).toContain('automation');
  });

  test('falls back to defaults when fetch returns non-OK', async () => {
    globalThis.fetch = mockFetchWith({
      ok: false,
      status: 500,
      statusText: 'Internal Server Error',
    });
    muteConsoleError();

    await loadVAIMetadata(cfg);

    expect(extractNetFeatures(16509)).toBe('asn=cloud');
  });

  test('falls back to defaults when response has invalid schema', async () => {
    // The body lacks the required numeric `version` field.
    globalThis.fetch = mockFetchWith({
      ok: true,
      json: async () => ({ invalid: true }),
    });
    muteConsoleError();

    await loadVAIMetadata(cfg);

    expect(extractNetFeatures(16509)).toBe('asn=cloud');
  });

  test('caches metadata and does not re-fetch within TTL', async () => {
    const mock = mockFetchWith(okMetadata());
    globalThis.fetch = mock;

    await loadVAIMetadata(cfg);
    expect(mock.callCount()).toBe(1);

    // A second call inside the 1-hour window must be served from cache.
    await loadVAIMetadata(cfg);
    expect(mock.callCount()).toBe(1);
  });

  test('_resetVAIMetadata restores hardcoded defaults', async () => {
    // Replace every field with custom values.
    globalThis.fetch = mockFetchWith(okMetadata({
      dc_asns: [99999],
      automation_patterns: ['OnlyThisOne'],
      headless_patterns: ['OnlyHeadless'],
      bot_patterns: ['OnlyBot'],
    }));

    await loadVAIMetadata(cfg);
    expect(extractNetFeatures(99999)).toBe('asn=cloud');
    expect(extractNetFeatures(16509)).toBe('asn=consumer');

    _resetVAIMetadata();

    // The defaults are back.
    expect(extractNetFeatures(16509)).toBe('asn=cloud');
    expect(extractNetFeatures(99999)).toBe('asn=consumer');
  });

  test('ignores empty arrays in metadata (keeps defaults)', async () => {
    globalThis.fetch = mockFetchWith(okMetadata({
      dc_asns: [],
      automation_patterns: [],
      headless_patterns: [],
      bot_patterns: [],
    }));

    await loadVAIMetadata(cfg);

    // Empty arrays are ignored, so the defaults still apply.
    expect(extractNetFeatures(16509)).toBe('asn=cloud');
    const ua = extractUAFeatures('Mozilla/5.0 Puppeteer/1.0');
    expect(ua).toContain('automation');
  });
});