@paywalls-net/filter 1.0.6 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -3,7 +3,7 @@
3
3
  "description": "Client SDK for integrating paywalls.net bot filtering and authorization services into your server or CDN.",
4
4
  "author": "paywalls.net",
5
5
  "license": "MIT",
6
- "version": "1.0.6",
6
+ "version": "1.1.1",
7
7
  "publishConfig": {
8
8
  "access": "public"
9
9
  },
package/src/index.js CHANGED
@@ -3,7 +3,7 @@
3
3
  * filters bot-like requests by using paywalls.net authorization services.
4
4
  */
5
5
 
6
- import { classifyUserAgent } from './user-agent-classification.js';
6
+ import { classifyUserAgent, loadAgentPatterns } from './user-agent-classification.js';
7
7
 
8
8
  async function logAccess(cfg, request, access) {
9
9
  // Separate html from the status in the access object.
@@ -69,7 +69,7 @@ async function checkAgentStatus(cfg, request) {
69
69
  };
70
70
  }
71
71
 
72
- const agentInfo = classifyUserAgent(userAgent);
72
+ const agentInfo = await classifyUserAgent(cfg, userAgent);
73
73
 
74
74
  const body = JSON.stringify({
75
75
  account_id: cfg.paywallsPublisherId,
@@ -129,14 +129,14 @@ function isTestBot(request) {
129
129
  const uaParam = url.searchParams.get("user-agent");
130
130
  return uaParam && uaParam.includes("bot");
131
131
  }
132
- function isPaywallsKnownBot(request) {
132
+ async function isPaywallsKnownBot(cfg, request) {
133
133
  const userAgent = request.headers.get("User-Agent");
134
- const uaClassification = classifyUserAgent(userAgent);
134
+ const uaClassification = await classifyUserAgent(cfg, userAgent);
135
135
  return uaClassification.operator && uaClassification.agent;
136
136
  }
137
137
 
138
- function isRecognizedBot(request) {
139
- return isFastlyKnownBot(request) || isCloudflareKnownBot(request) || isTestBot(request) || isPaywallsKnownBot(request);
138
+ async function isRecognizedBot(cfg, request) {
139
+ return isFastlyKnownBot(request) || isCloudflareKnownBot(request) || isTestBot(request) || await isPaywallsKnownBot(cfg, request);
140
140
  }
141
141
 
142
142
 
@@ -172,8 +172,9 @@ async function cloudflare(config = null) {
172
172
  paywallsAPIKey: env.PAYWALLS_CLOUD_API_KEY,
173
173
  paywallsPublisherId: env.PAYWALLS_PUBLISHER_ID
174
174
  };
175
+ await loadAgentPatterns(paywallsConfig);
175
176
 
176
- if (isRecognizedBot(request)) {
177
+ if (isRecognizedBot(paywallsConfig, request)) {
177
178
  const authz = await checkAgentStatus(paywallsConfig, request);
178
179
 
179
180
  ctx.waitUntil(logAccess(paywallsConfig, request, authz));
@@ -196,9 +197,10 @@ async function fastly(config) {
196
197
  paywallsAPIKey: config.get('PAYWALLS_API_KEY'),
197
198
  paywallsPublisherId: config.get('PAYWALLS_PUBLISHER_ID')
198
199
  };
200
+ await loadAgentPatterns(paywallsConfig);
199
201
 
200
202
  return async function handle(request) {
201
- if (isRecognizedBot(request)) {
203
+ if (isRecognizedBot(paywallsConfig,request)) {
202
204
  const authz = await checkAgentStatus(paywallsConfig, request);
203
205
 
204
206
  await logAccess(paywallsConfig, request, authz);
@@ -220,6 +222,7 @@ async function fastly(config) {
220
222
  * @returns {Function} - The handler function for the specified CDN.
221
223
  */
222
224
  export async function init(cdn, config = {}) {
225
+
223
226
  switch (cdn.toLowerCase()) {
224
227
  case 'cloudflare':
225
228
  return await cloudflare(config);
@@ -1,200 +1,62 @@
1
1
  import { UAParser } from 'ua-parser-js';
2
2
 
3
- const userAgentPatterns = [
4
- {
5
- operator: 'Anthropic',
6
- agent: 'ClaudeBot',
7
- usage: ['ai_training'],
8
- user_initiated: 'no',
9
- patterns: [/ClaudeBot/, /anthropic-ai/]
10
- },
11
- {
12
- operator: 'Anthropic',
13
- agent: 'Claude-User',
14
- usage: ['ai_chat'],
15
- user_initiated: 'yes',
16
- patterns: [/Claude-User/]
17
- },
18
- {
19
- operator: 'Anthropic',
20
- agent: 'Claude-SearchBot',
21
- usage: ['ai_indexing'],
22
- user_initiated: 'maybe',
23
- patterns: [/Claude-SearchBot/]
24
- },
3
+ // Cache for user agent patterns
4
+ let cachedUserAgentPatterns = null;
5
+ let cacheTimestamp = null;
6
+ const CACHE_DURATION = 60 * 60 * 1000; // 1 hour
25
7
 
26
- {
27
- operator: 'Lumar',
28
- agent: 'DeepCrawl',
29
- usage: ['webmaster tools'],
30
- user_initiated: 'no',
31
- patterns: [/deepcrawl.com/]
32
- },
33
-
34
- {
35
- operator: 'Google',
36
- agent: 'Googlebot',
37
- usage: ['search_indexing','ai_training'],
38
- user_initiated: 'maybe',
39
- patterns: [/Googlebot/]
40
- },
41
- {
42
- operator: 'Google',
43
- agent: 'Gemini-Deep-Research',
44
- usage: ['ai_agents'],
45
- user_initiated: 'maybe',
46
- patterns: [/Gemini-Deep-Research/]
47
- },
48
- {
49
- operator: 'Google',
50
- agent: 'Google-Extended',
51
- usage: ['ai_training'],
52
- usage_prefs_only: true,
53
- user_initiated: 'no',
54
- },
55
- {
56
- operator: 'Google',
57
- agent: 'Googlebot-News',
58
- usage: ['Google News'],
59
- usage_prefs_only: true,
60
- user_initiated: 'no'
61
- },
62
- {
63
- operator: 'Google',
64
- agent: 'Googlebot-Image',
65
- usage: ['image indexing'],
66
- user_initiated: 'no',
67
- patterns: [/Googlebot-Image/]
68
- },
69
- {
70
- operator: 'Google',
71
- agent: 'Google-Site-Verification',
72
- usage: ['site verification'],
73
- user_initiated: 'no',
74
- patterns: [/Google-Site-Verification/]
75
- },
76
- {
77
- operator: 'Google',
78
- agent: 'Google Web Preview',
79
- usage: ['web preview'],
80
- user_initiated: 'no',
81
- patterns: [/Google Web Preview/]
82
- },
83
- {
84
- operator: 'Google',
85
- agent: 'Googlebot-Video',
86
- usage: ['video indexing'],
87
- user_initiated: 'no',
88
- patterns: [/Googlebot-Video/]
89
- },
90
- {
91
- operator: 'Google',
92
- agent: 'FeedFetcher-Google',
93
- usage: ['Feed crawling'],
94
- user_initiated: 'yes',
95
- patterns: [/FeedFetcher-Google/]
96
- },
97
-
98
- {
99
- operator: 'OpenAI',
100
- agent: 'GPTBot',
101
- usage: ['ai_training'],
102
- user_initiated: 'no',
103
- patterns: [/GPTBot/]
104
- },
105
- {
106
- operator: 'OpenAI',
107
- agent: 'OAI-SearchBot',
108
- usage: ['ai_indexing'],
109
- user_initiated: 'no',
110
- patterns: [/OAI-SearchBot/]
111
- },
112
- {
113
- operator: 'OpenAI',
114
- agent: 'ChatGPT-User',
115
- usage: ['ai_chat'],
116
- user_initiated: 'yes',
117
- patterns: [/ChatGPT-User/]
118
- },
8
+ /**
9
+ * Fetch user agent patterns from the API and cache them.
10
+ * @returns {Promise<Array>} The user agent patterns.
11
+ */
12
+ export async function loadAgentPatterns(cfg) {
13
+ const now = Date.now();
119
14
 
120
-
121
- {
122
- operator: 'Meta',
123
- agent: 'facebookexternalhit',
124
- usage: ['content sharing'],
125
- user_initiated: 'no',
126
- patterns: [/facebookexternalhit/]
127
- },
128
- {
129
- operator: 'Meta',
130
- agent: 'meta-externalagent',
131
- usage: ['ai_training'],
132
- user_initiated: 'no',
133
- patterns: [/meta-externalagent/]
134
- },
135
- {
136
- operator: 'Meta',
137
- agent: 'meta-externalfetcher',
138
- usage: ['web preview'],
139
- user_initiated: 'no',
140
- patterns: [/meta-externalfetcher/]
141
- },
15
+ // Return cached patterns if still valid
16
+ if (cachedUserAgentPatterns && (now - cacheTimestamp < CACHE_DURATION)) {
17
+ return cachedUserAgentPatterns;
18
+ }
142
19
 
143
- {
144
- operator: 'Perplexity',
145
- agent: 'Perplexity-User',
146
- usage: ['ai_chat'],
147
- user_initiated: 'yes',
148
- patterns: [/Perplexity-User/]
149
- },
150
- {
151
- operator: 'Perplexity',
152
- agent: 'PerplexityBot',
153
- usage: ['ai_indexing'],
154
- user_initiated: 'maybe',
155
- patterns: [/PerplexityBot/]
156
- },
157
-
158
- {
159
- operator: 'Cohere',
160
- agent: 'cohere-ai',
161
- usage: ['ai_training'],
162
- user_initiated: 'no',
163
- patterns: [/cohere-ai/i]
164
- },
20
+ try {
21
+ const response = await fetch(`${cfg.paywallsAPIHost}/api/filter/agents/metadata`, {
22
+ method: 'POST',
23
+ headers: {
24
+ 'Content-Type': 'application/json',
25
+ Authorization: `Bearer ${cfg.paywallsAPIKey}`
26
+ }
27
+ });
165
28
 
166
- {
167
- operator: 'Bing',
168
- agent: 'BingBot',
169
- usage: ['search_indexing','ai_indexing'],
170
- user_initiated: 'maybe',
171
- patterns: [/bingbot/i, /BingPreview/]
172
- },
29
+ if (!response.ok) {
30
+ throw new Error(`Failed to fetch user agent patterns: ${response.status} ${response.statusText}`);
31
+ }
173
32
 
174
- {
175
- operator: 'Microsoft',
176
- agent: 'BF-DirectLine',
177
- usage: ['Bot Framework SDK'],
178
- user_initiated: 'no',
179
- patterns: [/BF-DirectLine/]
33
+ cachedUserAgentPatterns = await response.json();
34
+ cacheTimestamp = now;
35
+ return cachedUserAgentPatterns;
36
+ } catch (error) {
37
+ console.error('Failed to fetch user agent patterns:', error);
38
+ throw new Error('Could not load user agent patterns');
180
39
  }
181
- ];
40
+ }
182
41
 
183
42
  /**
184
- * Classifies the user agent string based on predefined patterns.
43
+ * Classifies the user agent string based on fetched patterns.
44
+ * @param {Object} cfg - Configuration object containing API host details.
185
45
  * @param {string} userAgent - The user agent string to classify.
186
- * @returns {Object} An object containing the browser, OS, operator, usage, and user_initiated status.
46
+ * @returns {Promise<Object>} An object containing the browser, OS, operator, usage, and user_initiated status.
187
47
  */
188
- export function classifyUserAgent(userAgent) {
48
+ export async function classifyUserAgent(cfg, userAgent) {
189
49
  const parsedUA = new UAParser(userAgent).getResult();
190
50
 
191
51
  const browser = parsedUA.browser.name || 'Unknown';
192
52
  const os = parsedUA.os.name || 'Unknown';
193
53
 
54
+ const userAgentPatterns = await loadAgentPatterns(cfg);
55
+
194
56
  for (const config of userAgentPatterns) {
195
57
  if (!config.patterns) continue;
196
58
  for (const pattern of config.patterns) {
197
- if (pattern.test(userAgent)) {
59
+ if (new RegExp(pattern).test(userAgent)) {
198
60
  return {
199
61
  operator: config.operator,
200
62
  agent: config.agent || browser,