@paywalls-net/filter 1.0.6 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/index.js +10 -7
- package/src/user-agent-classification.js +46 -176
package/package.json
CHANGED
package/src/index.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
* filters bot-like requests by using paywalls.net authorization services.
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
|
-
import { classifyUserAgent } from './user-agent-classification.js';
|
|
6
|
+
import { classifyUserAgent, loadAgentPatterns } from './user-agent-classification.js';
|
|
7
7
|
|
|
8
8
|
async function logAccess(cfg, request, access) {
|
|
9
9
|
// Separate html from the status in the access object.
|
|
@@ -69,7 +69,7 @@ async function checkAgentStatus(cfg, request) {
|
|
|
69
69
|
};
|
|
70
70
|
}
|
|
71
71
|
|
|
72
|
-
const agentInfo = classifyUserAgent(userAgent);
|
|
72
|
+
const agentInfo = classifyUserAgent(cfg, userAgent);
|
|
73
73
|
|
|
74
74
|
const body = JSON.stringify({
|
|
75
75
|
account_id: cfg.paywallsPublisherId,
|
|
@@ -129,14 +129,14 @@ function isTestBot(request) {
|
|
|
129
129
|
const uaParam = url.searchParams.get("user-agent");
|
|
130
130
|
return uaParam && uaParam.includes("bot");
|
|
131
131
|
}
|
|
132
|
-
function isPaywallsKnownBot(request) {
|
|
132
|
+
function isPaywallsKnownBot(cfg,request) {
|
|
133
133
|
const userAgent = request.headers.get("User-Agent");
|
|
134
134
|
const uaClassification = classifyUserAgent(userAgent);
|
|
135
135
|
return uaClassification.operator && uaClassification.agent;
|
|
136
136
|
}
|
|
137
137
|
|
|
138
|
-
function isRecognizedBot(request) {
|
|
139
|
-
return isFastlyKnownBot(request) || isCloudflareKnownBot(request) || isTestBot(request) || isPaywallsKnownBot(request);
|
|
138
|
+
function isRecognizedBot(cfg,request) {
|
|
139
|
+
return isFastlyKnownBot(request) || isCloudflareKnownBot(request) || isTestBot(request) || isPaywallsKnownBot(cfg,request);
|
|
140
140
|
}
|
|
141
141
|
|
|
142
142
|
|
|
@@ -172,8 +172,9 @@ async function cloudflare(config = null) {
|
|
|
172
172
|
paywallsAPIKey: env.PAYWALLS_CLOUD_API_KEY,
|
|
173
173
|
paywallsPublisherId: env.PAYWALLS_PUBLISHER_ID
|
|
174
174
|
};
|
|
175
|
+
await loadAgentPatterns(paywallsConfig);
|
|
175
176
|
|
|
176
|
-
if (isRecognizedBot(request)) {
|
|
177
|
+
if (isRecognizedBot(paywallsConfig, request)) {
|
|
177
178
|
const authz = await checkAgentStatus(paywallsConfig, request);
|
|
178
179
|
|
|
179
180
|
ctx.waitUntil(logAccess(paywallsConfig, request, authz));
|
|
@@ -196,9 +197,10 @@ async function fastly(config) {
|
|
|
196
197
|
paywallsAPIKey: config.get('PAYWALLS_API_KEY'),
|
|
197
198
|
paywallsPublisherId: config.get('PAYWALLS_PUBLISHER_ID')
|
|
198
199
|
};
|
|
200
|
+
await loadAgentPatterns(paywallsConfig);
|
|
199
201
|
|
|
200
202
|
return async function handle(request) {
|
|
201
|
-
if (isRecognizedBot(request)) {
|
|
203
|
+
if (isRecognizedBot(paywallsConfig,request)) {
|
|
202
204
|
const authz = await checkAgentStatus(paywallsConfig, request);
|
|
203
205
|
|
|
204
206
|
await logAccess(paywallsConfig, request, authz);
|
|
@@ -220,6 +222,7 @@ async function fastly(config) {
|
|
|
220
222
|
* @returns {Function} - The handler function for the specified CDN.
|
|
221
223
|
*/
|
|
222
224
|
export async function init(cdn, config = {}) {
|
|
225
|
+
|
|
223
226
|
switch (cdn.toLowerCase()) {
|
|
224
227
|
case 'cloudflare':
|
|
225
228
|
return await cloudflare(config);
|
|
@@ -1,200 +1,70 @@
|
|
|
1
1
|
import { UAParser } from 'ua-parser-js';
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
usage: ['ai_training'],
|
|
8
|
-
user_initiated: 'no',
|
|
9
|
-
patterns: [/ClaudeBot/, /anthropic-ai/]
|
|
10
|
-
},
|
|
11
|
-
{
|
|
12
|
-
operator: 'Anthropic',
|
|
13
|
-
agent: 'Claude-User',
|
|
14
|
-
usage: ['ai_chat'],
|
|
15
|
-
user_initiated: 'yes',
|
|
16
|
-
patterns: [/Claude-User/]
|
|
17
|
-
},
|
|
18
|
-
{
|
|
19
|
-
operator: 'Anthropic',
|
|
20
|
-
agent: 'Claude-SearchBot',
|
|
21
|
-
usage: ['ai_indexing'],
|
|
22
|
-
user_initiated: 'maybe',
|
|
23
|
-
patterns: [/Claude-SearchBot/]
|
|
24
|
-
},
|
|
3
|
+
// Cache for user agent patterns
|
|
4
|
+
let cachedUserAgentPatterns = null;
|
|
5
|
+
let cacheTimestamp = null;
|
|
6
|
+
const CACHE_DURATION = 60 * 60 * 1000; // 1 hour
|
|
25
7
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
},
|
|
33
|
-
|
|
34
|
-
{
|
|
35
|
-
operator: 'Google',
|
|
36
|
-
agent: 'Googlebot',
|
|
37
|
-
usage: ['search_indexing','ai_training'],
|
|
38
|
-
user_initiated: 'maybe',
|
|
39
|
-
patterns: [/Googlebot/]
|
|
40
|
-
},
|
|
41
|
-
{
|
|
42
|
-
operator: 'Google',
|
|
43
|
-
agent: 'Gemini-Deep-Research',
|
|
44
|
-
usage: ['ai_agents'],
|
|
45
|
-
user_initiated: 'maybe',
|
|
46
|
-
patterns: [/Gemini-Deep-Research/]
|
|
47
|
-
},
|
|
48
|
-
{
|
|
49
|
-
operator: 'Google',
|
|
50
|
-
agent: 'Google-Extended',
|
|
51
|
-
usage: ['ai_training'],
|
|
52
|
-
usage_prefs_only: true,
|
|
53
|
-
user_initiated: 'no',
|
|
54
|
-
},
|
|
55
|
-
{
|
|
56
|
-
operator: 'Google',
|
|
57
|
-
agent: 'Googlebot-News',
|
|
58
|
-
usage: ['Google News'],
|
|
59
|
-
usage_prefs_only: true,
|
|
60
|
-
user_initiated: 'no'
|
|
61
|
-
},
|
|
62
|
-
{
|
|
63
|
-
operator: 'Google',
|
|
64
|
-
agent: 'Googlebot-Image',
|
|
65
|
-
usage: ['image indexing'],
|
|
66
|
-
user_initiated: 'no',
|
|
67
|
-
patterns: [/Googlebot-Image/]
|
|
68
|
-
},
|
|
69
|
-
{
|
|
70
|
-
operator: 'Google',
|
|
71
|
-
agent: 'Google-Site-Verification',
|
|
72
|
-
usage: ['site verification'],
|
|
73
|
-
user_initiated: 'no',
|
|
74
|
-
patterns: [/Google-Site-Verification/]
|
|
75
|
-
},
|
|
76
|
-
{
|
|
77
|
-
operator: 'Google',
|
|
78
|
-
agent: 'Google Web Preview',
|
|
79
|
-
usage: ['web preview'],
|
|
80
|
-
user_initiated: 'no',
|
|
81
|
-
patterns: [/Google Web Preview/]
|
|
82
|
-
},
|
|
83
|
-
{
|
|
84
|
-
operator: 'Google',
|
|
85
|
-
agent: 'Googlebot-Video',
|
|
86
|
-
usage: ['video indexing'],
|
|
87
|
-
user_initiated: 'no',
|
|
88
|
-
patterns: [/Googlebot-Video/]
|
|
89
|
-
},
|
|
90
|
-
{
|
|
91
|
-
operator: 'Google',
|
|
92
|
-
agent: 'FeedFetcher-Google',
|
|
93
|
-
usage: ['Feed crawling'],
|
|
94
|
-
user_initiated: 'yes',
|
|
95
|
-
patterns: [/FeedFetcher-Google/]
|
|
96
|
-
},
|
|
8
|
+
/**
|
|
9
|
+
* Fetch user agent patterns from the API and cache them.
|
|
10
|
+
* @returns {Promise<Array>} The user agent patterns.
|
|
11
|
+
*/
|
|
12
|
+
export async function loadAgentPatterns(cfg) {
|
|
13
|
+
const now = Date.now();
|
|
97
14
|
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
user_initiated: 'no',
|
|
103
|
-
patterns: [/GPTBot/]
|
|
104
|
-
},
|
|
105
|
-
{
|
|
106
|
-
operator: 'OpenAI',
|
|
107
|
-
agent: 'OAI-SearchBot',
|
|
108
|
-
usage: ['ai_indexing'],
|
|
109
|
-
user_initiated: 'no',
|
|
110
|
-
patterns: [/OAI-SearchBot/]
|
|
111
|
-
},
|
|
112
|
-
{
|
|
113
|
-
operator: 'OpenAI',
|
|
114
|
-
agent: 'ChatGPT-User',
|
|
115
|
-
usage: ['ai_chat'],
|
|
116
|
-
user_initiated: 'yes',
|
|
117
|
-
patterns: [/ChatGPT-User/]
|
|
118
|
-
},
|
|
15
|
+
// Return cached patterns if still valid
|
|
16
|
+
if (cachedUserAgentPatterns && (now - cacheTimestamp < CACHE_DURATION)) {
|
|
17
|
+
return cachedUserAgentPatterns;
|
|
18
|
+
}
|
|
119
19
|
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
{
|
|
129
|
-
operator: 'Meta',
|
|
130
|
-
agent: 'meta-externalagent',
|
|
131
|
-
usage: ['ai_training'],
|
|
132
|
-
user_initiated: 'no',
|
|
133
|
-
patterns: [/meta-externalagent/]
|
|
134
|
-
},
|
|
135
|
-
{
|
|
136
|
-
operator: 'Meta',
|
|
137
|
-
agent: 'meta-externalfetcher',
|
|
138
|
-
usage: ['web preview'],
|
|
139
|
-
user_initiated: 'no',
|
|
140
|
-
patterns: [/meta-externalfetcher/]
|
|
141
|
-
},
|
|
20
|
+
try {
|
|
21
|
+
const response = await fetch(`${cfg.paywallsAPIHost}/api/filter/agents/metadata`, {
|
|
22
|
+
method: 'POST',
|
|
23
|
+
headers: {
|
|
24
|
+
'Content-Type': 'application/json',
|
|
25
|
+
Authorization: `Bearer ${cfg.paywallsAPIKey}`
|
|
26
|
+
}
|
|
27
|
+
});
|
|
142
28
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
usage: ['ai_chat'],
|
|
147
|
-
user_initiated: 'yes',
|
|
148
|
-
patterns: [/Perplexity-User/]
|
|
149
|
-
},
|
|
150
|
-
{
|
|
151
|
-
operator: 'Perplexity',
|
|
152
|
-
agent: 'PerplexityBot',
|
|
153
|
-
usage: ['ai_indexing'],
|
|
154
|
-
user_initiated: 'maybe',
|
|
155
|
-
patterns: [/PerplexityBot/]
|
|
156
|
-
},
|
|
157
|
-
|
|
158
|
-
{
|
|
159
|
-
operator: 'Cohere',
|
|
160
|
-
agent: 'cohere-ai',
|
|
161
|
-
usage: ['ai_training'],
|
|
162
|
-
user_initiated: 'no',
|
|
163
|
-
patterns: [/cohere-ai/i]
|
|
164
|
-
},
|
|
29
|
+
if (!response.ok) {
|
|
30
|
+
throw new Error(`Failed to fetch user agent patterns: ${response.status} ${response.statusText}`);
|
|
31
|
+
}
|
|
165
32
|
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
}
|
|
33
|
+
cachedUserAgentPatterns = await response.json();
|
|
34
|
+
cacheTimestamp = now;
|
|
35
|
+
return cachedUserAgentPatterns;
|
|
36
|
+
} catch (error) {
|
|
37
|
+
console.error('Failed to fetch user agent patterns:', error);
|
|
38
|
+
throw new Error('Could not load user agent patterns');
|
|
39
|
+
}
|
|
40
|
+
}
|
|
173
41
|
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
patterns: [/BF-DirectLine/]
|
|
42
|
+
function getAgentPatterns() {
|
|
43
|
+
if (cachedUserAgentPatterns) {
|
|
44
|
+
return cachedUserAgentPatterns;
|
|
45
|
+
} else {
|
|
46
|
+
throw new Error('User agent patterns not loaded. Call loadAgentPatterns first.');
|
|
180
47
|
}
|
|
181
|
-
|
|
48
|
+
}
|
|
182
49
|
|
|
183
50
|
/**
|
|
184
|
-
* Classifies the user agent string based on
|
|
51
|
+
* Classifies the user agent string based on fetched patterns.
|
|
52
|
+
* @param {Object} cfg - Configuration object containing API host details.
|
|
185
53
|
* @param {string} userAgent - The user agent string to classify.
|
|
186
|
-
* @returns {Object} An object containing the browser, OS, operator, usage, and user_initiated status.
|
|
54
|
+
* @returns {Object} An object containing the browser, OS, operator, usage, and user_initiated status.
|
|
187
55
|
*/
|
|
188
|
-
export function classifyUserAgent(userAgent) {
|
|
56
|
+
export function classifyUserAgent(cfg, userAgent) {
|
|
189
57
|
const parsedUA = new UAParser(userAgent).getResult();
|
|
190
58
|
|
|
191
59
|
const browser = parsedUA.browser.name || 'Unknown';
|
|
192
60
|
const os = parsedUA.os.name || 'Unknown';
|
|
193
61
|
|
|
62
|
+
const userAgentPatterns = getAgentPatterns();
|
|
63
|
+
|
|
194
64
|
for (const config of userAgentPatterns) {
|
|
195
65
|
if (!config.patterns) continue;
|
|
196
66
|
for (const pattern of config.patterns) {
|
|
197
|
-
if (pattern.test(userAgent)) {
|
|
67
|
+
if (new RegExp(pattern).test(userAgent)) {
|
|
198
68
|
return {
|
|
199
69
|
operator: config.operator,
|
|
200
70
|
agent: config.agent || browser,
|