@paywalls-net/filter 1.0.5 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,11 +1,11 @@
1
- # paywalls-net/client
1
+ # paywalls-net/filter
2
2
 
3
3
  SDK for integrating paywalls.net authorization services with CDN or edge environments.
4
4
 
5
5
  ## Install
6
6
 
7
7
  ```bash
8
- npm install @paywalls-net/client
8
+ npm install @paywalls-net/filter
9
9
  ```
10
10
 
11
11
  ## Environment Variables
@@ -15,7 +15,7 @@ npm install @paywalls-net/client
15
15
 
16
16
  ## Usage
17
17
  ```javascript
18
- import { init } from '@paywalls-net/client';
18
+ import { init } from '@paywalls-net/filter';
19
19
 
20
20
  const handleRequest = await init('cloudflare');
21
21
 
package/package.json CHANGED
@@ -3,7 +3,7 @@
3
3
  "description": "Client SDK for integrating paywalls.net bot filtering and authorization services into your server or CDN.",
4
4
  "author": "paywalls.net",
5
5
  "license": "MIT",
6
- "version": "1.0.5",
6
+ "version": "1.1.0",
7
7
  "publishConfig": {
8
8
  "access": "public"
9
9
  },
package/src/index.js CHANGED
@@ -3,7 +3,7 @@
3
3
  * filters bot-like requests by using paywalls.net authorization services.
4
4
  */
5
5
 
6
- import { classifyUserAgent } from './user-agent-classification.js';
6
+ import { classifyUserAgent, loadAgentPatterns } from './user-agent-classification.js';
7
7
 
8
8
  async function logAccess(cfg, request, access) {
9
9
  // Separate html from the status in the access object.
@@ -69,7 +69,7 @@ async function checkAgentStatus(cfg, request) {
69
69
  };
70
70
  }
71
71
 
72
- const agentInfo = classifyUserAgent(userAgent);
72
+ const agentInfo = classifyUserAgent(cfg, userAgent);
73
73
 
74
74
  const body = JSON.stringify({
75
75
  account_id: cfg.paywallsPublisherId,
@@ -129,14 +129,14 @@ function isTestBot(request) {
129
129
  const uaParam = url.searchParams.get("user-agent");
130
130
  return uaParam && uaParam.includes("bot");
131
131
  }
132
- function isPaywallsKnownBot(request) {
132
+ function isPaywallsKnownBot(cfg,request) {
133
133
  const userAgent = request.headers.get("User-Agent");
134
134
  const uaClassification = classifyUserAgent(userAgent);
135
135
  return uaClassification.operator && uaClassification.agent;
136
136
  }
137
137
 
138
- function isRecognizedBot(request) {
139
- return isFastlyKnownBot(request) || isCloudflareKnownBot(request) || isTestBot(request) || isPaywallsKnownBot(request);
138
+ function isRecognizedBot(cfg,request) {
139
+ return isFastlyKnownBot(request) || isCloudflareKnownBot(request) || isTestBot(request) || isPaywallsKnownBot(cfg,request);
140
140
  }
141
141
 
142
142
 
@@ -172,8 +172,9 @@ async function cloudflare(config = null) {
172
172
  paywallsAPIKey: env.PAYWALLS_CLOUD_API_KEY,
173
173
  paywallsPublisherId: env.PAYWALLS_PUBLISHER_ID
174
174
  };
175
+ await loadAgentPatterns(paywallsConfig);
175
176
 
176
- if (isRecognizedBot(request)) {
177
+ if (isRecognizedBot(paywallsConfig, request)) {
177
178
  const authz = await checkAgentStatus(paywallsConfig, request);
178
179
 
179
180
  ctx.waitUntil(logAccess(paywallsConfig, request, authz));
@@ -196,9 +197,10 @@ async function fastly(config) {
196
197
  paywallsAPIKey: config.get('PAYWALLS_API_KEY'),
197
198
  paywallsPublisherId: config.get('PAYWALLS_PUBLISHER_ID')
198
199
  };
200
+ await loadAgentPatterns(paywallsConfig);
199
201
 
200
202
  return async function handle(request) {
201
- if (isRecognizedBot(request)) {
203
+ if (isRecognizedBot(paywallsConfig,request)) {
202
204
  const authz = await checkAgentStatus(paywallsConfig, request);
203
205
 
204
206
  await logAccess(paywallsConfig, request, authz);
@@ -220,6 +222,7 @@ async function fastly(config) {
220
222
  * @returns {Function} - The handler function for the specified CDN.
221
223
  */
222
224
  export async function init(cdn, config = {}) {
225
+
223
226
  switch (cdn.toLowerCase()) {
224
227
  case 'cloudflare':
225
228
  return await cloudflare(config);
@@ -1,200 +1,70 @@
1
1
  import { UAParser } from 'ua-parser-js';
2
2
 
3
- const userAgentPatterns = [
4
- {
5
- operator: 'Anthropic',
6
- agent: 'ClaudeBot',
7
- usage: ['ai_training'],
8
- user_initiated: 'no',
9
- patterns: [/ClaudeBot/, /anthropic-ai/]
10
- },
11
- {
12
- operator: 'Anthropic',
13
- agent: 'Claude-User',
14
- usage: ['ai_chat'],
15
- user_initiated: 'yes',
16
- patterns: [/Claude-User/]
17
- },
18
- {
19
- operator: 'Anthropic',
20
- agent: 'Claude-SearchBot',
21
- usage: ['ai_indexing'],
22
- user_initiated: 'maybe',
23
- patterns: [/Claude-SearchBot/]
24
- },
3
+ // Cache for user agent patterns
4
+ let cachedUserAgentPatterns = null;
5
+ let cacheTimestamp = null;
6
+ const CACHE_DURATION = 60 * 60 * 1000; // 1 hour
25
7
 
26
- {
27
- operator: 'Lumar',
28
- agent: 'DeepCrawl',
29
- usage: ['webmaster tools'],
30
- user_initiated: 'no',
31
- patterns: [/deepcrawl.com/]
32
- },
33
-
34
- {
35
- operator: 'Google',
36
- agent: 'Googlebot',
37
- usage: ['search_indexing','ai_training'],
38
- user_initiated: 'maybe',
39
- patterns: [/Googlebot/]
40
- },
41
- {
42
- operator: 'Google',
43
- agent: 'Gemini-Deep-Research',
44
- usage: ['ai_agents'],
45
- user_initiated: 'maybe',
46
- patterns: [/Gemini-Deep-Research/]
47
- },
48
- {
49
- operator: 'Google',
50
- agent: 'Google-Extended',
51
- usage: ['ai_training'],
52
- usage_prefs_only: true,
53
- user_initiated: 'no',
54
- },
55
- {
56
- operator: 'Google',
57
- agent: 'Googlebot-News',
58
- usage: ['Google News'],
59
- usage_prefs_only: true,
60
- user_initiated: 'no'
61
- },
62
- {
63
- operator: 'Google',
64
- agent: 'Googlebot-Image',
65
- usage: ['image indexing'],
66
- user_initiated: 'no',
67
- patterns: [/Googlebot-Image/]
68
- },
69
- {
70
- operator: 'Google',
71
- agent: 'Google-Site-Verification',
72
- usage: ['site verification'],
73
- user_initiated: 'no',
74
- patterns: [/Google-Site-Verification/]
75
- },
76
- {
77
- operator: 'Google',
78
- agent: 'Google Web Preview',
79
- usage: ['web preview'],
80
- user_initiated: 'no',
81
- patterns: [/Google Web Preview/]
82
- },
83
- {
84
- operator: 'Google',
85
- agent: 'Googlebot-Video',
86
- usage: ['video indexing'],
87
- user_initiated: 'no',
88
- patterns: [/Googlebot-Video/]
89
- },
90
- {
91
- operator: 'Google',
92
- agent: 'FeedFetcher-Google',
93
- usage: ['Feed crawling'],
94
- user_initiated: 'yes',
95
- patterns: [/FeedFetcher-Google/]
96
- },
8
+ /**
9
+ * Fetch user agent patterns from the API and cache them.
10
+ * @returns {Promise<Array>} The user agent patterns.
11
+ */
12
+ export async function loadAgentPatterns(cfg) {
13
+ const now = Date.now();
97
14
 
98
- {
99
- operator: 'OpenAI',
100
- agent: 'GPTBot',
101
- usage: ['ai_training'],
102
- user_initiated: 'no',
103
- patterns: [/GPTBot/]
104
- },
105
- {
106
- operator: 'OpenAI',
107
- agent: 'OAI-SearchBot',
108
- usage: ['ai_indexing'],
109
- user_initiated: 'no',
110
- patterns: [/OAI-SearchBot/]
111
- },
112
- {
113
- operator: 'OpenAI',
114
- agent: 'ChatGPT-User',
115
- usage: ['ai_chat'],
116
- user_initiated: 'yes',
117
- patterns: [/ChatGPT-User/]
118
- },
15
+ // Return cached patterns if still valid
16
+ if (cachedUserAgentPatterns && (now - cacheTimestamp < CACHE_DURATION)) {
17
+ return cachedUserAgentPatterns;
18
+ }
119
19
 
120
-
121
- {
122
- operator: 'Meta',
123
- agent: 'facebookexternalhit',
124
- usage: ['content sharing'],
125
- user_initiated: 'no',
126
- patterns: [/facebookexternalhit/]
127
- },
128
- {
129
- operator: 'Meta',
130
- agent: 'meta-externalagent',
131
- usage: ['ai_training'],
132
- user_initiated: 'no',
133
- patterns: [/meta-externalagent/]
134
- },
135
- {
136
- operator: 'Meta',
137
- agent: 'meta-externalfetcher',
138
- usage: ['web preview'],
139
- user_initiated: 'no',
140
- patterns: [/meta-externalfetcher/]
141
- },
20
+ try {
21
+ const response = await fetch(`${cfg.paywallsAPIHost}/api/filter/agents/metadata`, {
22
+ method: 'POST',
23
+ headers: {
24
+ 'Content-Type': 'application/json',
25
+ Authorization: `Bearer ${cfg.paywallsAPIKey}`
26
+ }
27
+ });
142
28
 
143
- {
144
- operator: 'Perplexity',
145
- agent: 'Perplexity-User',
146
- usage: ['ai_chat'],
147
- user_initiated: 'yes',
148
- patterns: [/Perplexity-User/]
149
- },
150
- {
151
- operator: 'Perplexity',
152
- agent: 'PerplexityBot',
153
- usage: ['ai_indexing'],
154
- user_initiated: 'maybe',
155
- patterns: [/PerplexityBot/]
156
- },
157
-
158
- {
159
- operator: 'Cohere',
160
- agent: 'cohere-ai',
161
- usage: ['ai_training'],
162
- user_initiated: 'no',
163
- patterns: [/cohere-ai/i]
164
- },
29
+ if (!response.ok) {
30
+ throw new Error(`Failed to fetch user agent patterns: ${response.status} ${response.statusText}`);
31
+ }
165
32
 
166
- {
167
- operator: 'Bing',
168
- agent: 'BingBot',
169
- usage: ['search_indexing','ai_indexing'],
170
- user_initiated: 'maybe',
171
- patterns: [/bingbot/i, /BingPreview/]
172
- },
33
+ cachedUserAgentPatterns = await response.json();
34
+ cacheTimestamp = now;
35
+ return cachedUserAgentPatterns;
36
+ } catch (error) {
37
+ console.error('Failed to fetch user agent patterns:', error);
38
+ throw new Error('Could not load user agent patterns');
39
+ }
40
+ }
173
41
 
174
- {
175
- operator: 'Microsoft',
176
- agent: 'BF-DirectLine',
177
- usage: ['Bot Framework SDK'],
178
- user_initiated: 'no',
179
- patterns: [/BF-DirectLine/]
42
+ function getAgentPatterns() {
43
+ if (cachedUserAgentPatterns) {
44
+ return cachedUserAgentPatterns;
45
+ } else {
46
+ throw new Error('User agent patterns not loaded. Call loadAgentPatterns first.');
180
47
  }
181
- ];
48
+ }
182
49
 
183
50
  /**
184
- * Classifies the user agent string based on predefined patterns.
51
+ * Classifies the user agent string based on fetched patterns.
52
+ * @param {Object} cfg - Configuration object containing API host details.
185
53
  * @param {string} userAgent - The user agent string to classify.
186
- * @returns {Object} An object containing the browser, OS, operator, usage, and user_initiated status.
54
+ * @returns {Object} An object containing the browser, OS, operator, usage, and user_initiated status.
187
55
  */
188
- export function classifyUserAgent(userAgent) {
56
+ export function classifyUserAgent(cfg, userAgent) {
189
57
  const parsedUA = new UAParser(userAgent).getResult();
190
58
 
191
59
  const browser = parsedUA.browser.name || 'Unknown';
192
60
  const os = parsedUA.os.name || 'Unknown';
193
61
 
62
+ const userAgentPatterns = getAgentPatterns();
63
+
194
64
  for (const config of userAgentPatterns) {
195
65
  if (!config.patterns) continue;
196
66
  for (const pattern of config.patterns) {
197
- if (pattern.test(userAgent)) {
67
+ if (new RegExp(pattern).test(userAgent)) {
198
68
  return {
199
69
  operator: config.operator,
200
70
  agent: config.agent || browser,