@paywalls-net/filter 1.3.0 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/index.js +117 -3
- package/src/user-agent-classification.js +53 -2
package/package.json
CHANGED
package/src/index.js
CHANGED
|
@@ -51,6 +51,71 @@ function getAllHeaders(request) {
|
|
|
51
51
|
return headers;
|
|
52
52
|
}
|
|
53
53
|
|
|
54
|
+
/**
 * Check if the request is for a VAI endpoint (vai.json or vai.js).
 *
 * The path prefix is normalized before comparison: trailing slashes are
 * removed and a leading slash is added when missing, so '/pw', '/pw/' and
 * 'pw' all match '/pw/vai.json' and '/pw/vai.js'. A prefix of '/' or ''
 * matches the endpoints at the site root.
 *
 * @param {Request} request - The incoming request; `url` is used when set,
 *   otherwise `uri` is resolved against a placeholder host
 * @param {string} vaiPath - The path prefix for VAI endpoints (default: '/pw');
 *   non-string values fall back to the default
 * @returns {boolean} - True if this is a VAI endpoint request; false when it
 *   is not or when the URL cannot be parsed
 */
function isVAIRequest(request, vaiPath = '/pw') {
    try {
        const url = new URL(request.url || `http://host${request.uri || ''}`);

        // Normalize the configured prefix so a stray trailing or missing
        // leading slash in configuration does not silently disable matching.
        let prefix = typeof vaiPath === 'string' ? vaiPath : '/pw';
        while (prefix.endsWith('/')) {
            prefix = prefix.slice(0, -1);
        }
        if (prefix !== '' && !prefix.startsWith('/')) {
            prefix = `/${prefix}`;
        }

        const pathname = url.pathname;
        return pathname === `${prefix}/vai.json` || pathname === `${prefix}/vai.js`;
    } catch (err) {
        // Unparseable URL or missing request object: treat as "not a VAI request".
        return false;
    }
}
|
|
69
|
+
|
|
70
|
+
/**
 * Forward a VAI endpoint request to the cloud-api service.
 *
 * The upstream path is fixed to '/pw/vai.json' or '/pw/vai.js', selected by
 * whether the incoming pathname ends with '/vai.json'. The caller's
 * user-agent (or the SDK user-agent when absent), the API key as a bearer
 * token, the client address and the original host are sent upstream.
 *
 * @param {Object} cfg - Configuration object; `paywallsAPIHost` and `paywallsAPIKey` are read
 * @param {Request} request - The incoming request
 * @returns {Promise<Response>} - The upstream response, returned as-is even
 *   when it is not ok (the failure is only logged); a 500 response if
 *   anything throws along the way
 */
async function proxyVAIRequest(cfg, request) {
    try {
        const { pathname } = new URL(request.url || `http://host${request.uri || ''}`);
        const cloudApiPath = pathname.endsWith('/vai.json') ? '/pw/vai.json' : '/pw/vai.js';

        const incoming = getAllHeaders(request);

        const outgoing = {
            'User-Agent': incoming['user-agent'] || sdkUserAgent,
            'Authorization': `Bearer ${cfg.paywallsAPIKey}`
        };

        // Prefer an existing X-Forwarded-For; otherwise fall back to the
        // cf-connecting-ip header when it is present.
        const clientAddress = incoming['x-forwarded-for'] || incoming['cf-connecting-ip'];
        if (clientAddress) {
            outgoing['X-Forwarded-For'] = clientAddress;
        }

        const originalHost = incoming['host'];
        if (originalHost) {
            outgoing['X-Original-Host'] = originalHost;
        }

        const target = `${cfg.paywallsAPIHost}${cloudApiPath}`;
        const upstream = await fetch(target, { method: 'GET', headers: outgoing });

        if (!upstream.ok) {
            console.error(`VAI proxy error: ${upstream.status} ${upstream.statusText}`);
        }
        return upstream;
    } catch (err) {
        console.error(`Error proxying VAI request: ${err.message}`);
        return new Response('Internal Server Error', { status: 500 });
    }
}
|
|
118
|
+
|
|
54
119
|
async function logAccess(cfg, request, access) {
|
|
55
120
|
// Separate html from the status in the access object.
|
|
56
121
|
const { response, ...status } = access;
|
|
@@ -263,8 +328,15 @@ async function cloudflare(config = null) {
|
|
|
263
328
|
const paywallsConfig = {
|
|
264
329
|
paywallsAPIHost: env.PAYWALLS_CLOUD_API_HOST || PAYWALLS_CLOUD_API_HOST,
|
|
265
330
|
paywallsAPIKey: env.PAYWALLS_CLOUD_API_KEY,
|
|
266
|
-
paywallsPublisherId: env.PAYWALLS_PUBLISHER_ID
|
|
331
|
+
paywallsPublisherId: env.PAYWALLS_PUBLISHER_ID,
|
|
332
|
+
vaiPath: env.PAYWALLS_VAI_PATH || '/pw'
|
|
267
333
|
};
|
|
334
|
+
|
|
335
|
+
// Check if this is a VAI endpoint request and proxy it
|
|
336
|
+
if (isVAIRequest(request, paywallsConfig.vaiPath)) {
|
|
337
|
+
return await proxyVAIRequest(paywallsConfig, request);
|
|
338
|
+
}
|
|
339
|
+
|
|
268
340
|
await loadAgentPatterns(paywallsConfig);
|
|
269
341
|
|
|
270
342
|
if (await isRecognizedBot(paywallsConfig, request)) {
|
|
@@ -288,8 +360,14 @@ async function fastly() {
|
|
|
288
360
|
const paywallsConfig = {
|
|
289
361
|
paywallsAPIHost: config.get('PAYWALLS_CLOUD_API_HOST') || PAYWALLS_CLOUD_API_HOST,
|
|
290
362
|
paywallsAPIKey: config.get('PAYWALLS_API_KEY'),
|
|
291
|
-
paywallsPublisherId: config.get('PAYWALLS_PUBLISHER_ID')
|
|
363
|
+
paywallsPublisherId: config.get('PAYWALLS_PUBLISHER_ID'),
|
|
364
|
+
vaiPath: config.get('PAYWALLS_VAI_PATH') || '/pw'
|
|
292
365
|
};
|
|
366
|
+
|
|
367
|
+
// Check if this is a VAI endpoint request and proxy it
|
|
368
|
+
if (isVAIRequest(request, paywallsConfig.vaiPath)) {
|
|
369
|
+
return await proxyVAIRequest(paywallsConfig, request);
|
|
370
|
+
}
|
|
293
371
|
|
|
294
372
|
await loadAgentPatterns(paywallsConfig);
|
|
295
373
|
|
|
@@ -304,6 +382,34 @@ async function fastly() {
|
|
|
304
382
|
}
|
|
305
383
|
};
|
|
306
384
|
}
|
|
385
|
+
/**
 * Convert a standard Response to CloudFront format.
 *
 * Headers are keyed by lowercase name, each mapping to an array of
 * `{ key, value }` entries. Repeated headers (for example several
 * Set-Cookie lines) are appended to the same array instead of overwriting
 * one another.
 *
 * The body is read with `response.text()`, i.e. as decoded text, so headers
 * that describe the upstream wire framing (content-encoding,
 * content-length, transfer-encoding) are not copied: they would no longer
 * describe the body being returned.
 *
 * @param {Response} response - Standard fetch Response object
 * @returns {Promise<Object>} - CloudFront-formatted response with `status`,
 *   `statusDescription`, `headers` and `body`
 */
async function responseToCloudFront(response) {
    const framingHeaders = new Set(['content-encoding', 'content-length', 'transfer-encoding']);
    const headers = {};

    // Convert response headers to CloudFront format
    for (const [key, value] of response.headers.entries()) {
        const name = key.toLowerCase();
        if (framingHeaders.has(name)) {
            continue;
        }
        // Own-property check so names such as "constructor" are not confused
        // with inherited Object members.
        if (!Object.prototype.hasOwnProperty.call(headers, name)) {
            headers[name] = [];
        }
        headers[name].push({ key: key, value: value });
    }

    const body = await response.text();

    return {
        status: response.status,
        statusDescription: response.statusText || 'OK',
        headers: headers,
        body: body
    };
}
|
|
412
|
+
|
|
307
413
|
/**
|
|
308
414
|
* Adapt to CloudFront format
|
|
309
415
|
* Lambda@Edge events see https://docs.aws.amazon.com/AmazonCloudFront/latest/DeveloperGuide/lambda-event-structure.html#lambda-event-structure-request
|
|
@@ -336,13 +442,21 @@ async function cloudfront(config) {
|
|
|
336
442
|
const paywallsConfig = {
|
|
337
443
|
paywallsAPIHost: config.PAYWALLS_CLOUD_API_HOST || PAYWALLS_CLOUD_API_HOST,
|
|
338
444
|
paywallsAPIKey: config.PAYWALLS_API_KEY,
|
|
339
|
-
paywallsPublisherId: config.PAYWALLS_PUBLISHER_ID
|
|
445
|
+
paywallsPublisherId: config.PAYWALLS_PUBLISHER_ID,
|
|
446
|
+
vaiPath: config.PAYWALLS_VAI_PATH || '/pw'
|
|
340
447
|
};
|
|
341
448
|
await loadAgentPatterns(paywallsConfig);
|
|
342
449
|
|
|
343
450
|
return async function handle(event, ctx) {
|
|
344
451
|
let request = event.Records[0].cf.request;
|
|
345
452
|
request = requestShim(request);
|
|
453
|
+
|
|
454
|
+
// Check if this is a VAI endpoint request and proxy it
|
|
455
|
+
if (isVAIRequest(request, paywallsConfig.vaiPath)) {
|
|
456
|
+
const response = await proxyVAIRequest(paywallsConfig, request);
|
|
457
|
+
return await responseToCloudFront(response);
|
|
458
|
+
}
|
|
459
|
+
|
|
346
460
|
if (await isRecognizedBot(paywallsConfig, request)) {
|
|
347
461
|
const authz = await checkAgentStatus(paywallsConfig, request);
|
|
348
462
|
|
|
@@ -5,6 +5,39 @@ let cachedUserAgentPatterns = null;
|
|
|
5
5
|
let cacheTimestamp = null;
|
|
6
6
|
const CACHE_DURATION = 60 * 60 * 1000; // 1 hour
|
|
7
7
|
|
|
8
|
+
// Cache for user agent classifications
|
|
9
|
+
//
|
|
10
|
+
// CACHE STRATEGY CONSIDERATIONS:
|
|
11
|
+
//
|
|
12
|
+
// Current approach: Raw user-agent string as cache key
|
|
13
|
+
// - Pro: No parsing overhead before cache lookup
|
|
14
|
+
// - Pro: Exact matches are very fast
|
|
15
|
+
// - Con: User-agents with minor version differences create separate cache entries
|
|
16
|
+
// - Con: Cache could grow large with many unique UAs (especially browser traffic)
|
|
17
|
+
//
|
|
18
|
+
// Alternative approaches to consider:
|
|
19
|
+
// 1. Normalized keys (e.g., browser name + major version + OS)
|
|
20
|
+
// - Would improve hit rate and reduce memory
|
|
21
|
+
// - But adds parsing cost before every cache check
|
|
22
|
+
// - Risk: Might miss pattern-specific matches if patterns are version-sensitive
|
|
23
|
+
//
|
|
24
|
+
// 2. LRU cache with size limit
|
|
25
|
+
// - Bounds memory usage
|
|
26
|
+
// - Evicts least-recently-used entries
|
|
27
|
+
// - Good if traffic patterns are consistent
|
|
28
|
+
//
|
|
29
|
+
// 3. Separate caches for bots vs browsers
|
|
30
|
+
// - Bot UAs are typically more stable (better cache hit rate)
|
|
31
|
+
// - Browser UAs change frequently with versions (lower hit rate)
|
|
32
|
+
// - Could optimize each differently
|
|
33
|
+
//
|
|
34
|
+
// Decision: Start with raw UA keys until we have production metrics showing:
|
|
35
|
+
// - Actual cache size growth
|
|
36
|
+
// - Cache hit rates
|
|
37
|
+
// - Memory pressure
|
|
38
|
+
// Then optimize based on data rather than speculation.
|
|
39
|
+
let classificationCache = new Map();
|
|
40
|
+
|
|
8
41
|
/**
|
|
9
42
|
* Fetch user agent patterns from the API and cache them.
|
|
10
43
|
* @returns {Promise<Array>} The user agent patterns.
|
|
@@ -39,6 +72,10 @@ export async function loadAgentPatterns(cfg) {
|
|
|
39
72
|
}));
|
|
40
73
|
|
|
41
74
|
cacheTimestamp = now;
|
|
75
|
+
|
|
76
|
+
// Clear classification cache when patterns are refreshed
|
|
77
|
+
classificationCache.clear();
|
|
78
|
+
|
|
42
79
|
return cachedUserAgentPatterns;
|
|
43
80
|
} catch (error) {
|
|
44
81
|
console.error('Error loading agent patterns:', error);
|
|
@@ -53,6 +90,14 @@ export async function loadAgentPatterns(cfg) {
|
|
|
53
90
|
* @returns {Promise<Object>} An object containing the browser, OS, operator, usage, and user_initiated status.
|
|
54
91
|
*/
|
|
55
92
|
export async function classifyUserAgent(cfg, userAgent) {
|
|
93
|
+
// Check classification cache first (single lookup is more efficient than has + get)
|
|
94
|
+
const cached = classificationCache.get(userAgent);
|
|
95
|
+
if (cached !== undefined) {
|
|
96
|
+
console.log(`User agent classification cache hit for: ${userAgent}`);
|
|
97
|
+
return cached;
|
|
98
|
+
}
|
|
99
|
+
console.log(`User agent classification cache miss for: ${userAgent}`);
|
|
100
|
+
|
|
56
101
|
const parsedUA = new UAParser(userAgent).getResult();
|
|
57
102
|
|
|
58
103
|
const browser = parsedUA.browser.name || 'Unknown';
|
|
@@ -64,7 +109,7 @@ export async function classifyUserAgent(cfg, userAgent) {
|
|
|
64
109
|
if (!config.patterns) continue;
|
|
65
110
|
for (const pattern of config.patterns) {
|
|
66
111
|
if (new RegExp(pattern).test(userAgent)) {
|
|
67
|
-
|
|
112
|
+
const result = {
|
|
68
113
|
operator: config.operator,
|
|
69
114
|
agent: config.agent || browser,
|
|
70
115
|
usage: config.usage,
|
|
@@ -72,12 +117,18 @@ export async function classifyUserAgent(cfg, userAgent) {
|
|
|
72
117
|
browser,
|
|
73
118
|
os,
|
|
74
119
|
};
|
|
120
|
+
// Cache the classification result
|
|
121
|
+
classificationCache.set(userAgent, result);
|
|
122
|
+
return result;
|
|
75
123
|
}
|
|
76
124
|
}
|
|
77
125
|
}
|
|
78
126
|
|
|
79
|
-
|
|
127
|
+
const result = {
|
|
80
128
|
browser,
|
|
81
129
|
os
|
|
82
130
|
};
|
|
131
|
+
// Cache the default classification
|
|
132
|
+
classificationCache.set(userAgent, result);
|
|
133
|
+
return result;
|
|
83
134
|
}
|