@paywalls-net/filter 1.3.11 → 1.3.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/index.js +4 -0
- package/src/signal-extraction.js +45 -0
- package/tests/bot-signal-extraction.test.js +204 -0
package/package.json
CHANGED
package/src/index.js
CHANGED
|
@@ -180,6 +180,10 @@ async function proxyVAIRequest(cfg, request) {
|
|
|
180
180
|
setIfPresent(forwardHeaders, 'X-PW-Net', extractNetFeatures(cf.asn));
|
|
181
181
|
setIfPresent(forwardHeaders, 'X-PW-CH', extractCHFeatures(headers['sec-ch-ua'], headers['user-agent']));
|
|
182
182
|
|
|
183
|
+
// Geo context: forward CF edge geo data for logging/investigation (paywalls-site-60rp)
|
|
184
|
+
setIfPresent(forwardHeaders, 'X-PW-Geo',
|
|
185
|
+
cf.country ? `co=${cf.country}, re=${cf.region || ''}, ci=${cf.city || ''}, asn=${cf.asn || ''}` : null);
|
|
186
|
+
|
|
183
187
|
// Tier 3: UA features + HMAC (§6.3)
|
|
184
188
|
setIfPresent(forwardHeaders, 'X-PW-UA', extractUAFeatures(headers['user-agent']));
|
|
185
189
|
setIfPresent(forwardHeaders, 'X-PW-UA-HMAC', await computeUAHMAC(headers['user-agent'], cfg.vaiUAHmacKey));
|
package/src/signal-extraction.js
CHANGED
|
@@ -446,6 +446,50 @@ function bucketVersion(ver) {
|
|
|
446
446
|
}
|
|
447
447
|
|
|
448
448
|
// ── §6.3.1 extractUAFeatures ──────────────────────────────────────────────
|
|
449
|
+
|
|
450
|
+
/**
 * Detect structurally impossible or fabricated browser version strings.
 *
 * Chrome frozen UA policy:
 *   A reduced Chrome UA reports Chrome/[major].0.0.0 — minor, build, and patch
 *   are all zero. Any major >= 107 with a non-zero minor, build, or patch is
 *   treated as fabricated.
 *
 * Over-long Chrome patch (e.g. Chrome/48.0.1025.1402):
 *   In a Chrome version (major.minor.build.patch) the build component is the
 *   one that reaches four digits; the patch component stays at 1-3 digits.
 *   A patch of 4+ digits is treated as fabricated on any major version.
 *
 * Legacy Edge (EdgeHTML era, Edge/<=18):
 *   The component after the dot is the Windows 10 build number, so five
 *   digits is the normal shape (e.g. Edge/18.17763) and must NOT be flagged.
 *   Two things are flagged instead:
 *     - a minor component of 6+ digits, which no Windows 10 build produces;
 *     - a legacy Edge token paired with a Chrome token newer than EdgeHTML
 *       ever claimed (e.g. "Chrome/146.0.0.0 ... Edge/18.19582").
 *   NOTE(review): the Chrome/70 ceiling is taken from published EdgeHTML 18
 *   UA strings — confirm against production data before tightening it.
 *
 * @param {string} ua - Raw User-Agent header value.
 * @returns {boolean} true when a version token in `ua` cannot be genuine.
 */
function isFabricatedVersion(ua) {
  const FROZEN_UA_MIN_MAJOR = 107;
  const LEGACY_EDGE_MAX_MAJOR = 18;
  const LEGACY_EDGE_MAX_CHROME_MAJOR = 70;

  // Chrome / HeadlessChrome / Chromium: full four-part version parse.
  // "HeadlessChrome" has no word boundary before "Chrome", hence the alternation.
  const chromeMatch = ua.match(/(?:\b|Headless)Chrom(?:e|ium)\/(\d+)\.(\d+)\.(\d+)\.(\d+)/);
  if (chromeMatch) {
    const [, majorText, minorText, buildText, patchText] = chromeMatch;
    const major = parseInt(majorText, 10);
    const minor = parseInt(minorText, 10);
    const build = parseInt(buildText, 10);
    const patch = parseInt(patchText, 10);

    // Frozen UA policy: Chrome >= 107 must be exactly major.0.0.0.
    if (major >= FROZEN_UA_MIN_MAJOR && (minor !== 0 || build !== 0 || patch !== 0)) {
      return true;
    }

    // A patch component of 4+ digits is structurally impossible on any major.
    if (patchText.length >= 4) return true;
  }

  // EdgeHTML-era token ("Edge/", not the Chromium-based "Edg/").
  const edgeMatch = ua.match(/\bEdge\/(\d+)\.(\d+)/);
  if (edgeMatch && parseInt(edgeMatch[1], 10) <= LEGACY_EDGE_MAX_MAJOR) {
    // Windows 10 build numbers are five digits; anything longer is invented.
    if (edgeMatch[2].length >= 6) return true;

    // A legacy Edge token cannot coexist with a modern Chrome major. A looser
    // pattern is used here so short forms such as "Chrome/146" also count.
    const claimedChrome = ua.match(/(?:\b|Headless)Chrom(?:e|ium)\/(\d+)/);
    if (claimedChrome && parseInt(claimedChrome[1], 10) > LEGACY_EDGE_MAX_CHROME_MAJOR) {
      return true;
    }
  }

  return false;
}
|
|
492
|
+
|
|
449
493
|
/**
|
|
450
494
|
* Parse a User-Agent string into an SF-Dictionary of derived features.
|
|
451
495
|
*
|
|
@@ -468,6 +512,7 @@ export function extractUAFeatures(userAgent) {
|
|
|
468
512
|
|
|
469
513
|
if (HEADLESS_MARKERS.some(re => re.test(ua))) parts.push('headless');
|
|
470
514
|
if (AUTOMATION_MARKERS.some(re => re.test(ua))) parts.push('automation');
|
|
515
|
+
if (isFabricatedVersion(ua)) parts.push('fabricated');
|
|
471
516
|
|
|
472
517
|
parts.push(`entropy=${computeUAEntropy(ua)}`);
|
|
473
518
|
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Unit tests for bot/automation signal extraction via extractUAFeatures
|
|
3
|
+
*
|
|
4
|
+
* Tests the Tier 3 UA feature extractor against real-world bot signals
|
|
5
|
+
* discovered from Cloudflare production logs (2026-03-14).
|
|
6
|
+
*
|
|
7
|
+
* Fixture: paywalls-site/tests/fixtures/cloudflare-prod-paywalls-2026-03-14.csv
|
|
8
|
+
* Issue: (to be assigned)
|
|
9
|
+
*
|
|
10
|
+
* These tests verify that extractUAFeatures correctly identifies:
|
|
11
|
+
* - headless markers (HeadlessChrome)
|
|
12
|
+
* - automation markers (Puppeteer, Selenium, etc.)
|
|
13
|
+
* - bot family detection (Googlebot, Applebot, Bytespider, etc.)
|
|
14
|
+
* - device/platform/family parsing for suspicious UAs
|
|
15
|
+
* - fabricated version patterns
|
|
16
|
+
*/
|
|
17
|
+
import {
|
|
18
|
+
extractUAFeatures,
|
|
19
|
+
_resetVAIMetadata,
|
|
20
|
+
} from '../src/signal-extraction.js';
|
|
21
|
+
|
|
22
|
+
// Call _resetVAIMetadata() ahead of every test case in this file.
beforeEach(function resetVAIMetadataBeforeEachTest() {
  _resetVAIMetadata();
});
|
|
25
|
+
|
|
26
|
+
// ── 1. HeadlessChrome detection ────────────────────────────────────────────
|
|
27
|
+
|
|
28
|
+
describe('HeadlessChrome signal extraction', () => {
  // Frozen-format HeadlessChrome UA (major.0.0.0).
  const HEADLESS_145_UA =
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/145.0.0.0 Safari/537.36';
  // HeadlessChrome UA that carries a non-zero build and patch.
  const HEADLESS_143_UA =
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/143.0.7499.4 Safari/537.36';

  test('HeadlessChrome/145 should have headless marker', () => {
    const features = extractUAFeatures(HEADLESS_145_UA);
    const expectedPatterns = [/\bheadless\b/, /dpf=desktop\/linux\/chrome/, /browser/];
    for (const pattern of expectedPatterns) {
      expect(features).toMatch(pattern);
    }
  });

  test('HeadlessChrome/143 should have headless and fabricated markers', () => {
    const features = extractUAFeatures(HEADLESS_143_UA);
    const expectedPatterns = [/\bheadless\b/, /dpf=desktop\/linux\/chrome/, /\bfabricated\b/];
    for (const pattern of expectedPatterns) {
      expect(features).toMatch(pattern);
    }
  });
});
|
|
47
|
+
|
|
48
|
+
// ── 2. Self-identified bots ────────────────────────────────────────────────
|
|
49
|
+
|
|
50
|
+
describe('Self-identified bot signal extraction', () => {
  // [label used in the test title, User-Agent string]
  const selfIdentifiedBots = [
    [
      'Applebot',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15 (Applebot/0.1; +http://www.apple.com/go/applebot)',
    ],
    [
      'Googlebot mobile',
      'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.7632.116 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    ],
    [
      'Googlebot desktop',
      'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/145.0.7632.116 Safari/537.36',
    ],
    [
      'Bytespider',
      'Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; Bytespider; https://zhanzhang.toutiao.com/)',
    ],
  ];

  // Each declared crawler must end up with the "/bot" family in its features.
  test.each(selfIdentifiedBots)('%s should be detected as bot family', (_label, ua) => {
    const features = extractUAFeatures(ua);
    expect(features).toMatch(/\/bot/);
  });

  test('amazon-Quick non-browser UA should have low entropy', () => {
    const features = extractUAFeatures('amazon-Quick-on-behalf-of-20e61c5a');
    expect(features).not.toMatch(/browser/);
    expect(features).toMatch(/entropy=low/);
  });
});
|
|
85
|
+
|
|
86
|
+
// ── 3. Fabricated Chrome versions (4-digit patch) ──────────────────────────
|
|
87
|
+
|
|
88
|
+
describe('Fabricated Chrome version UAs — signal extraction', () => {
  // Every UA below carries a 4-digit Chrome patch component. For each one the
  // test asserts that extractUAFeatures:
  //   - buckets the old major version into `ver` (0-79 for all current rows)
  //   - still classifies the UA as a browser
  //   - emits the `fabricated` marker

  const fabricatedUAs = [
    { ua: 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.1025.1402 Mobile Safari/537.36', ver: '0-79' },
    { ua: 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.5596.1136 Mobile Safari/537.36', ver: '0-79' },
    { ua: 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2714.1709 Mobile Safari/537.36', ver: '0-79' },
    { ua: 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.5974.1013 Mobile Safari/537.36', ver: '0-79' },
    { ua: 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.1957.1646 Mobile Safari/537.36', ver: '0-79' },
    { ua: 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.4130.1795 Mobile Safari/537.36', ver: '0-79' },
    { ua: 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.7842.1119 Mobile Safari/537.36', ver: '0-79' },
  ];

  // Add the Chrome version to each row so every parametrised case gets a
  // distinct title; otherwise all seven would be reported under the same name.
  const labelledCases = fabricatedUAs.map(({ ua, ver }) => ({
    ua,
    ver,
    chrome: ua.match(/Chrome\/([\d.]+)/)[1],
  }));

  test.each(labelledCases)('Chrome/$chrome (bucket $ver) with 4-digit patch → fabricated marker', ({ ua, ver }) => {
    const result = extractUAFeatures(ua);
    expect(result).toMatch(new RegExp(`ver=${ver}`));
    expect(result).toMatch(/browser/);
    expect(result).toMatch(/\bfabricated\b/);
  });
});
|
|
111
|
+
|
|
112
|
+
// ── 4. Fabricated Edge version ─────────────────────────────────────────────
|
|
113
|
+
|
|
114
|
+
describe('Fabricated Edge version — signal extraction', () => {
  // Note: this UA also contains Chrome/146.0.7680.71, a non-zero build/patch
  // on a major >= 107, so the `fabricated` assertion does not isolate the
  // Edge-specific check.
  const LEGACY_EDGE_WITH_MODERN_CHROME_UA =
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.7680.71 Safari/537.36 Edge/18.19582';

  test('Edge/18.19582 should parse as edge family and be fabricated', () => {
    const features = extractUAFeatures(LEGACY_EDGE_WITH_MODERN_CHROME_UA);
    const expectedPatterns = [/\/edge/, /dpf=desktop\/windows\/edge/, /\bfabricated\b/];
    for (const pattern of expectedPatterns) {
      expect(features).toMatch(pattern);
    }
  });
});
|
|
124
|
+
|
|
125
|
+
// ── 5. Outdated browser UAs from bot farm ──────────────────────────────────
|
|
126
|
+
|
|
127
|
+
describe('Outdated browser UAs from bot farm — version bucketing', () => {
  // Runs the extractor once and checks every pattern, in order, against its output.
  const expectAllPresent = (ua, patterns) => {
    const features = extractUAFeatures(ua);
    patterns.forEach((pattern) => {
      expect(features).toMatch(pattern);
    });
  };

  test('Chrome/59 (2017) should bucket to 0-79', () => {
    expectAllPresent(
      'Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36',
      [/ver=0-79/, /dpf=mobile\/android\/chrome/],
    );
  });

  test('Chrome/117 with non-zero build (frozen UA violation) → fabricated', () => {
    expectAllPresent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.132 Safari/537.36',
      [/ver=100-119/, /\bfabricated\b/],
    );
  });

  test('Chrome/83 should bucket to 80-99', () => {
    expectAllPresent(
      'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
      [/ver=80-99/],
    );
  });

  test('Chrome/79 should bucket to 0-79', () => {
    expectAllPresent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
      [/ver=0-79/],
    );
  });
});
|
|
158
|
+
|
|
159
|
+
// ── 6. Legitimate browser UAs ──────────────────────────────────────────────
|
|
160
|
+
|
|
161
|
+
describe('Legitimate browser UAs — correct feature extraction', () => {
  // Extracts features once, then asserts the `present` patterns match and the
  // `absent` patterns do not (present first, then absent, each in order).
  const expectFeatures = (ua, { present, absent }) => {
    const features = extractUAFeatures(ua);
    present.forEach((pattern) => {
      expect(features).toMatch(pattern);
    });
    absent.forEach((pattern) => {
      expect(features).not.toMatch(pattern);
    });
  };

  test('Chrome/145 on macOS → desktop/mac/chrome, current version bucket', () => {
    expectFeatures(
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36',
      {
        present: [/dpf=desktop\/mac\/chrome/, /ver=140-159/, /browser/],
        absent: [/headless/, /automation/, /fabricated/],
      },
    );
  });

  test('Chrome/146 on Windows → desktop/windows/chrome, current version bucket', () => {
    expectFeatures(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36',
      {
        present: [/dpf=desktop\/windows\/chrome/, /ver=140-159/],
        absent: [/headless/, /automation/, /fabricated/],
      },
    );
  });

  test('Safari/17.4.1 on macOS → desktop/mac/safari', () => {
    expectFeatures(
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Safari/605.1.15',
      {
        present: [/dpf=desktop\/mac\/safari/],
        absent: [/headless/, /automation/, /fabricated/],
      },
    );
  });

  test('Edge/122 on Windows → desktop/windows/edge', () => {
    expectFeatures(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
      {
        present: [/dpf=desktop\/windows\/edge/, /ver=120-139/],
        absent: [/headless/, /fabricated/],
      },
    );
  });
});
|