@adobe/spacecat-shared-utils 1.89.0 → 1.89.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/package.json +1 -1
- package/src/bot-blocker-detect/bot-blocker-detect.js +154 -2
- package/src/index.js +7 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
# [@adobe/spacecat-shared-utils-v1.89.1](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.89.0...@adobe/spacecat-shared-utils-v1.89.1) (2026-01-26)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Bug Fixes
|
|
5
|
+
|
|
6
|
+
* Additional checks and methods on bot protection ([#1250](https://github.com/adobe/spacecat-shared/issues/1250)) ([0c34a8d](https://github.com/adobe/spacecat-shared/commit/0c34a8d850abef6e2a024132bc1c61d10865c1a0))
|
|
7
|
+
|
|
1
8
|
# [@adobe/spacecat-shared-utils-v1.89.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.88.0...@adobe/spacecat-shared-utils-v1.89.0) (2026-01-22)
|
|
2
9
|
|
|
3
10
|
|
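package/package.json
CHANGED
[Diff body not captured in this extract; the file summary above lists one changed line for this file.]
package/src/bot-blocker-detect/bot-blocker-detect.js
CHANGED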
|
@@ -30,7 +30,85 @@ const CONFIDENCE_MEDIUM = 0.95;
|
|
|
30
30
|
const CONFIDENCE_ABSOLUTE = 1.0;
|
|
31
31
|
const DEFAULT_TIMEOUT = 5000;
|
|
32
32
|
|
|
33
|
-
|
|
33
|
+
/**
 * User-agent string that identifies the SpaceCat bot.
 * It is reported back to site owners as part of the allowlist message.
 */
export const SPACECAT_BOT_USER_AGENT = 'Spacecat/1.0';
|
|
37
|
+
|
|
38
|
+
/**
 * Parses the SpaceCat bot IP list from its comma-separated string form.
 *
 * Each entry is trimmed and blank entries (e.g. from a trailing or doubled comma)
 * are dropped. Because the list is required, a value that yields no IPs at all is
 * rejected instead of being returned as an empty array.
 *
 * @param {string} ipsString - Comma-separated IPs (from env/secrets) - REQUIRED
 * @returns {Array<string>} Array of IP addresses (never empty)
 * @throws {Error} If ipsString is not provided or contains no IP addresses
 * @throws {TypeError} If ipsString is provided but is not a string
 */
export function getSpacecatBotIps(ipsString) {
  if (!ipsString) {
    throw new Error('SPACECAT_BOT_IPS environment variable is required but not set');
  }

  // Fail with an actionable message instead of "ipsString.split is not a function".
  if (typeof ipsString !== 'string') {
    throw new TypeError('SPACECAT_BOT_IPS must be a comma-separated string of IP addresses');
  }

  const ips = ipsString
    .split(',')
    .map((ip) => ip.trim())
    .filter((ip) => ip);

  // A value such as " " or "," is set but useless; surface it rather than
  // silently handing callers an empty allowlist.
  if (ips.length === 0) {
    throw new Error('SPACECAT_BOT_IPS environment variable contains no IP addresses');
  }

  return ips;
}
|
|
51
|
+
|
|
52
|
+
/**
 * Builds the allowlist instructions for the SpaceCat bot.
 *
 * The IP string is parsed by getSpacecatBotIps, so any error it raises for a
 * missing value propagates to the caller.
 *
 * @param {string} botIps - Comma-separated IPs from secrets - REQUIRED
 * @returns {object} Message object with `title`, `ips` and `userAgent`
 * @throws {Error} If botIps is not provided
 */
export function formatAllowlistMessage(botIps) {
  const message = { title: 'To allowlist SpaceCat bot:' };
  message.ips = getSpacecatBotIps(botIps);
  message.userAgent = SPACECAT_BOT_USER_AGENT;
  return message;
}
|
|
67
|
+
|
|
68
|
+
/**
 * Regular expressions that recognise challenge / interstitial pages in HTML.
 *
 * Keys name the protection provider; `general` holds provider-agnostic markers.
 * Every pattern is case-insensitive and none uses the `g` or `y` flag, so calling
 * `.test()` repeatedly on the same pattern is stateless.
 */
const CHALLENGE_PATTERNS = {
  cloudflare: [
    /Checking your browser/i,
    /Just a moment\.\.\./i,
    /Verifying you are human/i,
    /Please wait.*CloudFlare/i,
    /cf-turnstile/i,
    /challenge-platform/i,
    // Cloudflare challenge widget
    /cf-chl-widget/i,
    // Cloudflare Ray ID in error pages
    /ray\s*id.*cloudflare/i,
    // Cloudflare challenge token
    /__cf_chl_tk/i,
    /cloudflare.*security/i,
    /attention required.*cloudflare/i,
  ],

  imperva: [
    /_Incapsula_Resource/i,
    /Incapsula incident ID/i,
    // Imperva session cookie
    /incap_ses/i,
    // Imperva visitor ID
    /visid_incap/i,
  ],

  akamai: [
    /Access Denied.*Akamai/i,
    /Reference.*Akamai/i,
  ],

  general: [
    /captcha/i,
    /human verification/i,
    /recaptcha/i,
    /hcaptcha/i,
    /datadome/i,
    /dd-request-id/i,
  ],
};
|
|
104
|
+
|
|
105
|
+
/**
|
|
106
|
+
* Analyzes response for bot protection indicators
|
|
107
|
+
* @param {Object} response - Response object with status and headers
|
|
108
|
+
* @param {string} [html] - Optional HTML content for deeper analysis
|
|
109
|
+
* @returns {Object} Detection result
|
|
110
|
+
*/
|
|
111
|
+
function analyzeResponse(response, html = null) {
|
|
34
112
|
const { status, headers } = response;
|
|
35
113
|
|
|
36
114
|
// Check for CDN/blocker infrastructure presence (lazy evaluation for performance)
|
|
@@ -45,6 +123,12 @@ function analyzeResponse(response) {
|
|
|
45
123
|
|| headers.get('x-amz-cf-pop')
|
|
46
124
|
|| headers.get('via')?.includes('CloudFront');
|
|
47
125
|
|
|
126
|
+
// Check HTML content for challenge page patterns (if HTML provided)
|
|
127
|
+
const htmlHasChallenge = (patterns) => {
|
|
128
|
+
if (!html) return false;
|
|
129
|
+
return patterns.some((pattern) => pattern.test(html));
|
|
130
|
+
};
|
|
131
|
+
|
|
48
132
|
// Active blocking (403 status with known blocker)
|
|
49
133
|
if (status === 403 && hasCloudflare()) {
|
|
50
134
|
return {
|
|
@@ -88,6 +172,16 @@ function analyzeResponse(response) {
|
|
|
88
172
|
|
|
89
173
|
// Success with known infrastructure present (infrastructure detected but allowing requests)
|
|
90
174
|
if (status === 200 && hasCloudflare()) {
|
|
175
|
+
// Check if HTML contains challenge page (even though status is 200)
|
|
176
|
+
if (htmlHasChallenge(CHALLENGE_PATTERNS.cloudflare)) {
|
|
177
|
+
return {
|
|
178
|
+
crawlable: false,
|
|
179
|
+
type: 'cloudflare',
|
|
180
|
+
confidence: CONFIDENCE_HIGH,
|
|
181
|
+
reason: 'Challenge page detected despite 200 status',
|
|
182
|
+
};
|
|
183
|
+
}
|
|
184
|
+
|
|
91
185
|
return {
|
|
92
186
|
crawlable: true,
|
|
93
187
|
type: 'cloudflare-allowed',
|
|
@@ -96,6 +190,14 @@ function analyzeResponse(response) {
|
|
|
96
190
|
}
|
|
97
191
|
|
|
98
192
|
if (status === 200 && hasImperva()) {
|
|
193
|
+
if (htmlHasChallenge(CHALLENGE_PATTERNS.imperva)) {
|
|
194
|
+
return {
|
|
195
|
+
crawlable: false,
|
|
196
|
+
type: 'imperva',
|
|
197
|
+
confidence: CONFIDENCE_HIGH,
|
|
198
|
+
reason: 'Challenge page detected despite 200 status',
|
|
199
|
+
};
|
|
200
|
+
}
|
|
99
201
|
return {
|
|
100
202
|
crawlable: true,
|
|
101
203
|
type: 'imperva-allowed',
|
|
@@ -104,6 +206,14 @@ function analyzeResponse(response) {
|
|
|
104
206
|
}
|
|
105
207
|
|
|
106
208
|
if (status === 200 && hasAkamai()) {
|
|
209
|
+
if (htmlHasChallenge(CHALLENGE_PATTERNS.akamai)) {
|
|
210
|
+
return {
|
|
211
|
+
crawlable: false,
|
|
212
|
+
type: 'akamai',
|
|
213
|
+
confidence: CONFIDENCE_HIGH,
|
|
214
|
+
reason: 'Challenge page detected despite 200 status',
|
|
215
|
+
};
|
|
216
|
+
}
|
|
107
217
|
return {
|
|
108
218
|
crawlable: true,
|
|
109
219
|
type: 'akamai-allowed',
|
|
@@ -129,6 +239,15 @@ function analyzeResponse(response) {
|
|
|
129
239
|
|
|
130
240
|
// Success with no known infrastructure
|
|
131
241
|
if (status === 200) {
|
|
242
|
+
// Still check for generic challenge patterns
|
|
243
|
+
if (htmlHasChallenge(CHALLENGE_PATTERNS.general)) {
|
|
244
|
+
return {
|
|
245
|
+
crawlable: false,
|
|
246
|
+
type: 'unknown',
|
|
247
|
+
confidence: 0.7,
|
|
248
|
+
reason: 'Generic challenge patterns detected',
|
|
249
|
+
};
|
|
250
|
+
}
|
|
132
251
|
return {
|
|
133
252
|
crawlable: true,
|
|
134
253
|
type: 'none',
|
|
@@ -136,7 +255,16 @@ function analyzeResponse(response) {
|
|
|
136
255
|
};
|
|
137
256
|
}
|
|
138
257
|
|
|
139
|
-
//
|
|
258
|
+
// Potential CDN/protection blocked the request
|
|
259
|
+
if (status === 403) {
|
|
260
|
+
return {
|
|
261
|
+
crawlable: false,
|
|
262
|
+
type: 'unknown',
|
|
263
|
+
confidence: 0.7,
|
|
264
|
+
reason: 'HTTP 403 Forbidden - access denied',
|
|
265
|
+
};
|
|
266
|
+
}
|
|
267
|
+
|
|
140
268
|
return {
|
|
141
269
|
crawlable: true,
|
|
142
270
|
type: 'unknown',
|
|
@@ -207,3 +335,27 @@ export async function detectBotBlocker({ baseUrl, timeout = DEFAULT_TIMEOUT }) {
|
|
|
207
335
|
return analyzeError(error);
|
|
208
336
|
}
|
|
209
337
|
}
|
|
338
|
+
|
|
339
|
+
/**
 * Analyzes already-fetched response data for bot protection.
 * Used by content scraper to analyze Puppeteer results without making another request.
 *
 * Header handling:
 * - Anything exposing a `get()` method (a `Headers` instance from any fetch
 *   implementation) is passed through untouched.
 * - A plain object is copied into a new `Headers` instance. Entries whose value is
 *   `null`/`undefined` are skipped so they do not turn into the truthy strings
 *   "null"/"undefined", and entries that are not valid HTTP headers are ignored
 *   instead of aborting the whole analysis.
 *
 * @param {Object} data - Response data to analyze
 * @param {number} data.status - HTTP status code
 * @param {Object} data.headers - Response headers (plain object or Headers object)
 * @param {string} [data.html] - Optional HTML content for challenge page detection
 * @returns {Object} Detection result (same format as detectBotBlocker)
 */
export function analyzeBotProtection({ status, headers, html }) {
  let headersObj;

  if (typeof headers?.get === 'function') {
    // Duck-typed: analyzeResponse only ever calls headers.get().
    headersObj = headers;
  } else {
    headersObj = new Headers();
    Object.entries(headers ?? {}).forEach(([name, value]) => {
      if (value === null || value === undefined) {
        return;
      }
      try {
        headersObj.append(name, String(value));
      } catch {
        // Best effort: a name/value that is not a valid HTTP header (for example an
        // HTTP/2 pseudo-header such as ":status") cannot be stored; skip it.
      }
    });
  }

  return analyzeResponse({ status, headers: headersObj }, html);
}
|
package/src/index.js
CHANGED
|
@@ -111,7 +111,13 @@ export * as llmoStrategy from './llmo-strategy.js';
|
|
|
111
111
|
export * as schemas from './schemas.js';
|
|
112
112
|
|
|
113
113
|
export { detectLocale } from './locale-detect/locale-detect.js';
|
|
114
|
-
export {
|
|
114
|
+
export {
|
|
115
|
+
detectBotBlocker,
|
|
116
|
+
analyzeBotProtection,
|
|
117
|
+
SPACECAT_BOT_USER_AGENT,
|
|
118
|
+
getSpacecatBotIps,
|
|
119
|
+
formatAllowlistMessage,
|
|
120
|
+
} from './bot-blocker-detect/bot-blocker-detect.js';
|
|
115
121
|
export { prettifyLogForwardingConfig } from './cdn-helpers.js';
|
|
116
122
|
|
|
117
123
|
export {
|