@paywalls-net/filter 1.0.6 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/index.js +11 -8
- package/src/user-agent-classification.js +39 -177
package/package.json
CHANGED
package/src/index.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
* filters bot-like requests by using paywalls.net authorization services.
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
|
-
import { classifyUserAgent } from './user-agent-classification.js';
|
|
6
|
+
import { classifyUserAgent, loadAgentPatterns } from './user-agent-classification.js';
|
|
7
7
|
|
|
8
8
|
async function logAccess(cfg, request, access) {
|
|
9
9
|
// Separate html from the status in the access object.
|
|
@@ -69,7 +69,7 @@ async function checkAgentStatus(cfg, request) {
|
|
|
69
69
|
};
|
|
70
70
|
}
|
|
71
71
|
|
|
72
|
-
const agentInfo = classifyUserAgent(userAgent);
|
|
72
|
+
const agentInfo = await classifyUserAgent(cfg, userAgent);
|
|
73
73
|
|
|
74
74
|
const body = JSON.stringify({
|
|
75
75
|
account_id: cfg.paywallsPublisherId,
|
|
@@ -129,14 +129,14 @@ function isTestBot(request) {
|
|
|
129
129
|
const uaParam = url.searchParams.get("user-agent");
|
|
130
130
|
return uaParam && uaParam.includes("bot");
|
|
131
131
|
}
|
|
132
|
-
function isPaywallsKnownBot(request) {
|
|
132
|
+
/**
 * Reports whether the request's User-Agent matches an agent known to the
 * paywalls pattern list, i.e. classification yields both an operator and an agent.
 *
 * @param {Object} cfg - Configuration object, passed through to classifyUserAgent.
 * @param {Request} request - Incoming request; only its "User-Agent" header is read.
 * @returns {Promise<boolean>} true when both operator and agent were identified.
 */
async function isPaywallsKnownBot(cfg, request) {
    const userAgent = request.headers.get("User-Agent");

    // headers.get() yields null when the header is absent; there is nothing to
    // classify, so skip the pattern lookup entirely.
    if (!userAgent) {
        return false;
    }

    const uaClassification = await classifyUserAgent(cfg, userAgent);

    // Coerce to a strict boolean so callers never receive the agent name string.
    return Boolean(uaClassification.operator && uaClassification.agent);
}
|
|
137
137
|
|
|
138
|
-
function isRecognizedBot(request) {
|
|
139
|
-
return isFastlyKnownBot(request) || isCloudflareKnownBot(request) || isTestBot(request) || isPaywallsKnownBot(request);
|
|
138
|
+
/**
 * Decides whether a request comes from a recognized bot.
 *
 * The Fastly, Cloudflare and test-bot checks run first, in that order; the first
 * truthy result is returned as-is. Only when all three are falsy is the paywalls
 * classification awaited and its result returned.
 *
 * This function is async, so callers must await it: the returned promise
 * itself is always truthy.
 *
 * @param {Object} cfg - Configuration object, forwarded to isPaywallsKnownBot.
 * @param {Request} request - Incoming request to inspect.
 * @returns {Promise<*>} Truthy when any of the checks recognizes the request.
 */
async function isRecognizedBot(cfg, request) {
    const synchronousVerdict =
        isFastlyKnownBot(request) ||
        isCloudflareKnownBot(request) ||
        isTestBot(request);

    if (synchronousVerdict) {
        return synchronousVerdict;
    }

    return await isPaywallsKnownBot(cfg, request);
}
|
|
141
141
|
|
|
142
142
|
|
|
@@ -172,8 +172,9 @@ async function cloudflare(config = null) {
|
|
|
172
172
|
paywallsAPIKey: env.PAYWALLS_CLOUD_API_KEY,
|
|
173
173
|
paywallsPublisherId: env.PAYWALLS_PUBLISHER_ID
|
|
174
174
|
};
|
|
175
|
+
await loadAgentPatterns(paywallsConfig);
|
|
175
176
|
|
|
176
|
-
if (isRecognizedBot(request)) {
|
|
177
|
+
if (isRecognizedBot(paywallsConfig, request)) {
|
|
177
178
|
const authz = await checkAgentStatus(paywallsConfig, request);
|
|
178
179
|
|
|
179
180
|
ctx.waitUntil(logAccess(paywallsConfig, request, authz));
|
|
@@ -196,9 +197,10 @@ async function fastly(config) {
|
|
|
196
197
|
paywallsAPIKey: config.get('PAYWALLS_API_KEY'),
|
|
197
198
|
paywallsPublisherId: config.get('PAYWALLS_PUBLISHER_ID')
|
|
198
199
|
};
|
|
200
|
+
await loadAgentPatterns(paywallsConfig);
|
|
199
201
|
|
|
200
202
|
return async function handle(request) {
|
|
201
|
-
if (isRecognizedBot(request)) {
|
|
203
|
+
if (isRecognizedBot(paywallsConfig,request)) {
|
|
202
204
|
const authz = await checkAgentStatus(paywallsConfig, request);
|
|
203
205
|
|
|
204
206
|
await logAccess(paywallsConfig, request, authz);
|
|
@@ -220,6 +222,7 @@ async function fastly(config) {
|
|
|
220
222
|
* @returns {Function} - The handler function for the specified CDN.
|
|
221
223
|
*/
|
|
222
224
|
export async function init(cdn, config = {}) {
|
|
225
|
+
|
|
223
226
|
switch (cdn.toLowerCase()) {
|
|
224
227
|
case 'cloudflare':
|
|
225
228
|
return await cloudflare(config);
|
|
@@ -1,200 +1,62 @@
|
|
|
1
1
|
import { UAParser } from 'ua-parser-js';
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
usage: ['ai_training'],
|
|
8
|
-
user_initiated: 'no',
|
|
9
|
-
patterns: [/ClaudeBot/, /anthropic-ai/]
|
|
10
|
-
},
|
|
11
|
-
{
|
|
12
|
-
operator: 'Anthropic',
|
|
13
|
-
agent: 'Claude-User',
|
|
14
|
-
usage: ['ai_chat'],
|
|
15
|
-
user_initiated: 'yes',
|
|
16
|
-
patterns: [/Claude-User/]
|
|
17
|
-
},
|
|
18
|
-
{
|
|
19
|
-
operator: 'Anthropic',
|
|
20
|
-
agent: 'Claude-SearchBot',
|
|
21
|
-
usage: ['ai_indexing'],
|
|
22
|
-
user_initiated: 'maybe',
|
|
23
|
-
patterns: [/Claude-SearchBot/]
|
|
24
|
-
},
|
|
3
|
+
// Cache for user agent patterns
|
|
4
|
+
let cachedUserAgentPatterns = null;
|
|
5
|
+
let cacheTimestamp = null;
|
|
6
|
+
const CACHE_DURATION = 60 * 60 * 1000; // 1 hour
|
|
25
7
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
},
|
|
33
|
-
|
|
34
|
-
{
|
|
35
|
-
operator: 'Google',
|
|
36
|
-
agent: 'Googlebot',
|
|
37
|
-
usage: ['search_indexing','ai_training'],
|
|
38
|
-
user_initiated: 'maybe',
|
|
39
|
-
patterns: [/Googlebot/]
|
|
40
|
-
},
|
|
41
|
-
{
|
|
42
|
-
operator: 'Google',
|
|
43
|
-
agent: 'Gemini-Deep-Research',
|
|
44
|
-
usage: ['ai_agents'],
|
|
45
|
-
user_initiated: 'maybe',
|
|
46
|
-
patterns: [/Gemini-Deep-Research/]
|
|
47
|
-
},
|
|
48
|
-
{
|
|
49
|
-
operator: 'Google',
|
|
50
|
-
agent: 'Google-Extended',
|
|
51
|
-
usage: ['ai_training'],
|
|
52
|
-
usage_prefs_only: true,
|
|
53
|
-
user_initiated: 'no',
|
|
54
|
-
},
|
|
55
|
-
{
|
|
56
|
-
operator: 'Google',
|
|
57
|
-
agent: 'Googlebot-News',
|
|
58
|
-
usage: ['Google News'],
|
|
59
|
-
usage_prefs_only: true,
|
|
60
|
-
user_initiated: 'no'
|
|
61
|
-
},
|
|
62
|
-
{
|
|
63
|
-
operator: 'Google',
|
|
64
|
-
agent: 'Googlebot-Image',
|
|
65
|
-
usage: ['image indexing'],
|
|
66
|
-
user_initiated: 'no',
|
|
67
|
-
patterns: [/Googlebot-Image/]
|
|
68
|
-
},
|
|
69
|
-
{
|
|
70
|
-
operator: 'Google',
|
|
71
|
-
agent: 'Google-Site-Verification',
|
|
72
|
-
usage: ['site verification'],
|
|
73
|
-
user_initiated: 'no',
|
|
74
|
-
patterns: [/Google-Site-Verification/]
|
|
75
|
-
},
|
|
76
|
-
{
|
|
77
|
-
operator: 'Google',
|
|
78
|
-
agent: 'Google Web Preview',
|
|
79
|
-
usage: ['web preview'],
|
|
80
|
-
user_initiated: 'no',
|
|
81
|
-
patterns: [/Google Web Preview/]
|
|
82
|
-
},
|
|
83
|
-
{
|
|
84
|
-
operator: 'Google',
|
|
85
|
-
agent: 'Googlebot-Video',
|
|
86
|
-
usage: ['video indexing'],
|
|
87
|
-
user_initiated: 'no',
|
|
88
|
-
patterns: [/Googlebot-Video/]
|
|
89
|
-
},
|
|
90
|
-
{
|
|
91
|
-
operator: 'Google',
|
|
92
|
-
agent: 'FeedFetcher-Google',
|
|
93
|
-
usage: ['Feed crawling'],
|
|
94
|
-
user_initiated: 'yes',
|
|
95
|
-
patterns: [/FeedFetcher-Google/]
|
|
96
|
-
},
|
|
97
|
-
|
|
98
|
-
{
|
|
99
|
-
operator: 'OpenAI',
|
|
100
|
-
agent: 'GPTBot',
|
|
101
|
-
usage: ['ai_training'],
|
|
102
|
-
user_initiated: 'no',
|
|
103
|
-
patterns: [/GPTBot/]
|
|
104
|
-
},
|
|
105
|
-
{
|
|
106
|
-
operator: 'OpenAI',
|
|
107
|
-
agent: 'OAI-SearchBot',
|
|
108
|
-
usage: ['ai_indexing'],
|
|
109
|
-
user_initiated: 'no',
|
|
110
|
-
patterns: [/OAI-SearchBot/]
|
|
111
|
-
},
|
|
112
|
-
{
|
|
113
|
-
operator: 'OpenAI',
|
|
114
|
-
agent: 'ChatGPT-User',
|
|
115
|
-
usage: ['ai_chat'],
|
|
116
|
-
user_initiated: 'yes',
|
|
117
|
-
patterns: [/ChatGPT-User/]
|
|
118
|
-
},
|
|
8
|
+
/**
 * Fetch user agent patterns from the API and cache them.
 *
 * A cached copy younger than CACHE_DURATION is returned without a network call.
 * Otherwise the list is requested with a POST to
 * `${cfg.paywallsAPIHost}/api/filter/agents/metadata`, authorized with
 * cfg.paywallsAPIKey as a Bearer token. If that refresh fails while an older
 * copy is still held, the failure is logged and the stale copy is returned, so
 * a transient API error does not stop classification.
 *
 * @param {Object} cfg - Configuration; reads paywallsAPIHost and paywallsAPIKey.
 * @returns {Promise<Array>} The user agent patterns.
 * @throws {Error} 'Could not load user agent patterns' when the fetch fails and
 *   nothing is cached; the underlying failure is attached as `cause`.
 */
export async function loadAgentPatterns(cfg) {
    const now = Date.now();

    // Return cached patterns if still valid
    if (cachedUserAgentPatterns && (now - cacheTimestamp < CACHE_DURATION)) {
        return cachedUserAgentPatterns;
    }

    try {
        const response = await fetch(`${cfg.paywallsAPIHost}/api/filter/agents/metadata`, {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
                Authorization: `Bearer ${cfg.paywallsAPIKey}`
            }
        });

        if (!response.ok) {
            throw new Error(`Failed to fetch user agent patterns: ${response.status} ${response.statusText}`);
        }

        // Validate before caching: a non-array payload (e.g. an error object)
        // would otherwise be cached as truthy and break iteration in callers
        // until the cache expires.
        const patterns = await response.json();
        if (!Array.isArray(patterns)) {
            throw new TypeError('User agent patterns response is not an array');
        }

        cachedUserAgentPatterns = patterns;
        cacheTimestamp = now;
        return cachedUserAgentPatterns;
    } catch (error) {
        console.error('Failed to fetch user agent patterns:', error);

        // Prefer an expired pattern list over failing the caller. The timestamp
        // is left untouched, so the next call attempts a refresh again.
        if (cachedUserAgentPatterns) {
            return cachedUserAgentPatterns;
        }

        throw new Error('Could not load user agent patterns', { cause: error });
    }
}
|
|
182
41
|
|
|
183
42
|
/**
|
|
184
|
-
* Classifies the user agent string based on
|
|
43
|
+
* Classifies the user agent string based on fetched patterns.
|
|
44
|
+
* @param {Object} cfg - Configuration object containing API host details.
|
|
185
45
|
* @param {string} userAgent - The user agent string to classify.
|
|
186
|
-
* @returns {Object} An object containing the browser, OS, operator, usage, and user_initiated status.
|
|
46
|
+
* @returns {Promise<Object>} An object containing the browser, OS, operator, usage, and user_initiated status.
|
|
187
47
|
*/
|
|
188
|
-
export function classifyUserAgent(userAgent) {
|
|
48
|
+
export async function classifyUserAgent(cfg, userAgent) {
|
|
189
49
|
const parsedUA = new UAParser(userAgent).getResult();
|
|
190
50
|
|
|
191
51
|
const browser = parsedUA.browser.name || 'Unknown';
|
|
192
52
|
const os = parsedUA.os.name || 'Unknown';
|
|
193
53
|
|
|
54
|
+
const userAgentPatterns = await loadAgentPatterns(cfg);
|
|
55
|
+
|
|
194
56
|
for (const config of userAgentPatterns) {
|
|
195
57
|
if (!config.patterns) continue;
|
|
196
58
|
for (const pattern of config.patterns) {
|
|
197
|
-
if (pattern.test(userAgent)) {
|
|
59
|
+
if (new RegExp(pattern).test(userAgent)) {
|
|
198
60
|
return {
|
|
199
61
|
operator: config.operator,
|
|
200
62
|
agent: config.agent || browser,
|