@adobe/spacecat-shared-utils 1.89.0 → 1.90.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,17 @@
1
+ # [@adobe/spacecat-shared-utils-v1.90.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.89.1...@adobe/spacecat-shared-utils-v1.90.0) (2026-01-29)
2
+
3
+
4
+ ### Features
5
+
6
+ * LLMO-1534 Pretty format old and new log forwarding credentials ([#1290](https://github.com/adobe/spacecat-shared/issues/1290)) ([df84629](https://github.com/adobe/spacecat-shared/commit/df84629c1a49e81b43e352eb4b8b542b9eb3085e))
7
+
8
+ # [@adobe/spacecat-shared-utils-v1.89.1](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.89.0...@adobe/spacecat-shared-utils-v1.89.1) (2026-01-26)
9
+
10
+
11
+ ### Bug Fixes
12
+
13
+ * Additional checks and methods on bot protection ([#1250](https://github.com/adobe/spacecat-shared/issues/1250)) ([0c34a8d](https://github.com/adobe/spacecat-shared/commit/0c34a8d850abef6e2a024132bc1c61d10865c1a0))
14
+
1
15
  # [@adobe/spacecat-shared-utils-v1.89.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.88.0...@adobe/spacecat-shared-utils-v1.89.0) (2026-01-22)
2
16
 
3
17
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adobe/spacecat-shared-utils",
3
- "version": "1.89.0",
3
+ "version": "1.90.0",
4
4
  "description": "Shared modules of the Spacecat Services - utils",
5
5
  "type": "module",
6
6
  "exports": {
@@ -30,7 +30,85 @@ const CONFIDENCE_MEDIUM = 0.95;
30
30
  const CONFIDENCE_ABSOLUTE = 1.0;
31
31
  const DEFAULT_TIMEOUT = 5000;
32
32
 
33
- function analyzeResponse(response) {
33
+ /**
34
+ * User-Agent string identifying the SpaceCat bot; formatAllowlistMessage returns it next to the bot IPs
35
+ */
36
+ export const SPACECAT_BOT_USER_AGENT = 'Spacecat/1.0';
37
+
38
/**
 * Parses the comma-separated list of SpaceCat bot IPs.
 *
 * The function does not read the environment itself: the caller passes the raw
 * value in (for example the content of SPACECAT_BOT_IPS or a secret).
 * Entries are trimmed and empty entries are dropped.
 *
 * @param {string} ipsString - Comma-separated IPs (from env/secrets) - REQUIRED
 * @returns {Array<string>} Non-empty array of trimmed IP addresses
 * @throws {TypeError} If ipsString is provided but is not a string
 * @throws {Error} If ipsString is missing or does not contain any IP address
 */
export function getSpacecatBotIps(ipsString) {
  const missingMessage = 'SPACECAT_BOT_IPS environment variable is required but not set';

  if (!ipsString) {
    throw new Error(missingMessage);
  }

  // Fail with an explicit message instead of "ipsString.split is not a function".
  if (typeof ipsString !== 'string') {
    throw new TypeError(`SPACECAT_BOT_IPS must be a comma-separated string, got ${typeof ipsString}`);
  }

  const ips = ipsString
    .split(',')
    .map((ip) => ip.trim())
    .filter(Boolean);

  // A value such as " , " carries no address at all; treat it like a missing value
  // rather than handing callers an empty allowlist.
  if (ips.length === 0) {
    throw new Error(missingMessage);
  }

  return ips;
}
51
+
52
/**
 * Builds the allowlist instructions for the SpaceCat bot.
 * @param {string} botIps - Comma-separated IPs from secrets - REQUIRED
 * @returns {object} Message object holding a title, the parsed IP list and the bot user-agent
 * @throws {Error} If botIps is not provided (raised by getSpacecatBotIps)
 */
export function formatAllowlistMessage(botIps) {
  const message = { title: 'To allowlist SpaceCat bot:' };

  // Parsing happens first so that a missing IP list aborts before anything else is filled in.
  message.ips = getSpacecatBotIps(botIps);
  message.userAgent = SPACECAT_BOT_USER_AGENT;

  return message;
}
67
+
68
+ /**
69
+ * HTML patterns for detecting challenge pages.
+ * Keyed by provider (cloudflare, imperva, akamai) plus a provider-independent `general` list.
+ * Every pattern is case-insensitive. analyzeResponse tests one list at a time against the
+ * response HTML with Array.prototype.some, and only when HTML was supplied.
70
+ */
71
+ const CHALLENGE_PATTERNS = {
72
+ cloudflare: [
73
+ /Checking your browser/i,
74
+ /Just a moment\.\.\./i,
75
+ /Verifying you are human/i,
76
+ /Please wait.*CloudFlare/i, // "." does not match line breaks: both phrases must be on the same line
77
+ /cf-turnstile/i,
78
+ /challenge-platform/i,
79
+ /cf-chl-widget/i, // Cloudflare challenge widget
80
+ /ray\s*id.*cloudflare/i, // Cloudflare Ray ID in error pages
81
+ /__cf_chl_tk/i, // Cloudflare challenge token
82
+ /cloudflare.*security/i,
83
+ /attention required.*cloudflare/i,
84
+ ],
85
+ imperva: [
86
+ /_Incapsula_Resource/i,
87
+ /Incapsula incident ID/i,
88
+ /incap_ses/i, // Imperva session cookie
89
+ /visid_incap/i, // Imperva visitor ID
90
+ ],
91
+ akamai: [
92
+ /Access Denied.*Akamai/i, // same-line match only, see the note on "." above
93
+ /Reference.*Akamai/i,
94
+ ],
95
+ general: [
96
+ /captcha/i,
97
+ /human verification/i,
98
+ /recaptcha/i, // already covered by /captcha/i; kept as an explicit marker
99
+ /hcaptcha/i, // already covered by /captcha/i; kept as an explicit marker
100
+ /datadome/i,
101
+ /dd-request-id/i,
102
+ ],
103
+ };
104
+
105
+ /**
106
+ * Analyzes response for bot protection indicators
107
+ * @param {Object} response - Response object with status and headers
108
+ * @param {string} [html] - Optional HTML content for deeper analysis
109
+ * @returns {Object} Detection result
110
+ */
111
+ function analyzeResponse(response, html = null) {
34
112
  const { status, headers } = response;
35
113
 
36
114
  // Check for CDN/blocker infrastructure presence (lazy evaluation for performance)
@@ -45,6 +123,12 @@ function analyzeResponse(response) {
45
123
  || headers.get('x-amz-cf-pop')
46
124
  || headers.get('via')?.includes('CloudFront');
47
125
 
126
+ // Check HTML content for challenge page patterns (if HTML provided)
127
+ const htmlHasChallenge = (patterns) => {
128
+ if (!html) return false;
129
+ return patterns.some((pattern) => pattern.test(html));
130
+ };
131
+
48
132
  // Active blocking (403 status with known blocker)
49
133
  if (status === 403 && hasCloudflare()) {
50
134
  return {
@@ -88,6 +172,16 @@ function analyzeResponse(response) {
88
172
 
89
173
  // Success with known infrastructure present (infrastructure detected but allowing requests)
90
174
  if (status === 200 && hasCloudflare()) {
175
+ // Check if HTML contains challenge page (even though status is 200)
176
+ if (htmlHasChallenge(CHALLENGE_PATTERNS.cloudflare)) {
177
+ return {
178
+ crawlable: false,
179
+ type: 'cloudflare',
180
+ confidence: CONFIDENCE_HIGH,
181
+ reason: 'Challenge page detected despite 200 status',
182
+ };
183
+ }
184
+
91
185
  return {
92
186
  crawlable: true,
93
187
  type: 'cloudflare-allowed',
@@ -96,6 +190,14 @@ function analyzeResponse(response) {
96
190
  }
97
191
 
98
192
  if (status === 200 && hasImperva()) {
193
+ if (htmlHasChallenge(CHALLENGE_PATTERNS.imperva)) {
194
+ return {
195
+ crawlable: false,
196
+ type: 'imperva',
197
+ confidence: CONFIDENCE_HIGH,
198
+ reason: 'Challenge page detected despite 200 status',
199
+ };
200
+ }
99
201
  return {
100
202
  crawlable: true,
101
203
  type: 'imperva-allowed',
@@ -104,6 +206,14 @@ function analyzeResponse(response) {
104
206
  }
105
207
 
106
208
  if (status === 200 && hasAkamai()) {
209
+ if (htmlHasChallenge(CHALLENGE_PATTERNS.akamai)) {
210
+ return {
211
+ crawlable: false,
212
+ type: 'akamai',
213
+ confidence: CONFIDENCE_HIGH,
214
+ reason: 'Challenge page detected despite 200 status',
215
+ };
216
+ }
107
217
  return {
108
218
  crawlable: true,
109
219
  type: 'akamai-allowed',
@@ -129,6 +239,15 @@ function analyzeResponse(response) {
129
239
 
130
240
  // Success with no known infrastructure
131
241
  if (status === 200) {
242
+ // Still check for generic challenge patterns
243
+ if (htmlHasChallenge(CHALLENGE_PATTERNS.general)) {
244
+ return {
245
+ crawlable: false,
246
+ type: 'unknown',
247
+ confidence: 0.7,
248
+ reason: 'Generic challenge patterns detected',
249
+ };
250
+ }
132
251
  return {
133
252
  crawlable: true,
134
253
  type: 'none',
@@ -136,7 +255,16 @@ function analyzeResponse(response) {
136
255
  };
137
256
  }
138
257
 
139
- // Unknown status without known blocker signature
258
+ // Potential CDN/protection blocked the request
259
+ if (status === 403) {
260
+ return {
261
+ crawlable: false,
262
+ type: 'unknown',
263
+ confidence: 0.7,
264
+ reason: 'HTTP 403 Forbidden - access denied',
265
+ };
266
+ }
267
+
140
268
  return {
141
269
  crawlable: true,
142
270
  type: 'unknown',
@@ -207,3 +335,27 @@ export async function detectBotBlocker({ baseUrl, timeout = DEFAULT_TIMEOUT }) {
207
335
  return analyzeError(error);
208
336
  }
209
337
  }
338
+
339
/**
 * Converts response headers into a Headers object without throwing on values that
 * the Headers constructor rejects.
 *
 * - A Headers instance is returned unchanged.
 * - `undefined` / `null` values are skipped instead of becoming the strings
 *   "undefined" / "null".
 * - Array values and newline-joined values are appended one by one. CDP-based
 *   scrapers can report repeated headers (typically `set-cookie`) as one string
 *   joined with "\n", which `new Headers()` rejects with a TypeError.
 * - Entries whose name or value is still invalid (e.g. HTTP/2 pseudo-headers such
 *   as ":status") are skipped: detection is heuristic and should not abort on them.
 *
 * @param {Object|Headers} [headers] - Plain header map or Headers instance
 * @returns {Headers} Headers instance usable with `.get()`
 */
function normalizeResponseHeaders(headers) {
  if (headers instanceof Headers) {
    return headers;
  }

  const normalized = new Headers();

  for (const [name, rawValue] of Object.entries(headers ?? {})) {
    if (rawValue === undefined || rawValue === null) {
      continue;
    }

    const values = Array.isArray(rawValue) ? rawValue : String(rawValue).split(/\r?\n/);

    for (const value of values) {
      try {
        normalized.append(name, String(value));
      } catch {
        // Invalid header name/value: ignore this entry, keep the remaining headers.
      }
    }
  }

  return normalized;
}

/**
 * Analyzes already-fetched response data for bot protection.
 * Used by content scraper to analyze Puppeteer results without making another request.
 *
 * @param {Object} data - Response data to analyze
 * @param {number} data.status - HTTP status code
 * @param {Object} data.headers - Response headers (plain object or Headers object)
 * @param {string} [data.html] - Optional HTML content for challenge page detection
 * @returns {Object} Detection result (same format as detectBotBlocker)
 */
export function analyzeBotProtection({ status, headers, html }) {
  const response = {
    status,
    headers: normalizeResponseHeaders(headers),
  };

  return analyzeResponse(response, html);
}
@@ -11,9 +11,44 @@
11
11
  */
12
12
 
13
13
  /**
14
- * CDN-specific transformations for log forwarding configuration preparation
14
+ * Transforms credential fields object with backwards compatibility
15
+ * @param {Object} payload - The payload containing credential information
16
+ * @returns {Object} - Object with credential fields
15
17
  */
18
+ const transformCredentialFields = (payload) => {
19
+ const fields = {};
20
+
21
+ if (payload.currentAccessKey) {
22
+ fields['Access Key (current)'] = payload.currentAccessKey;
23
+ } else if (payload.accessKey) {
24
+ fields['Access Key'] = payload.accessKey;
25
+ }
26
+
27
+ if (payload.currentSecretKey) {
28
+ fields['Secret Key (current)'] = payload.currentSecretKey;
29
+ } else if (payload.secretKey) {
30
+ fields['Secret Key'] = payload.secretKey;
31
+ }
32
+
33
+ if (payload.oldAccessKey) {
34
+ fields['Access Key (to be retired)'] = payload.oldAccessKey;
35
+ }
16
36
 
37
+ if (payload.oldSecretKey) {
38
+ fields['Secret Key (to be retired)'] = payload.oldSecretKey;
39
+ }
40
+
41
+ fields.currentCredentialsCreatedAt = payload.currentCredentialsCreatedAt;
42
+ fields.currentCredentialsLastUsed = payload.currentCredentialsLastUsed;
43
+ fields.oldCredentialsCreatedAt = payload.oldCredentialsCreatedAt;
44
+ fields.oldCredentialsLastUsed = payload.oldCredentialsLastUsed;
45
+
46
+ return fields;
47
+ };
48
+
49
+ /**
50
+ * CDN-specific transformations for log forwarding configuration preparation
51
+ */
17
52
  const FASTLY_LOG_FORMAT = `{
18
53
  "timestamp": "%{strftime(\\{"%Y-%m-%dT%H:%M:%S%z"\\}, time.start)}V",
19
54
  "host": "%{if(req.http.Fastly-Orig-Host, req.http.Fastly-Orig-Host, req.http.Host)}V",
@@ -35,8 +70,7 @@ const CDN_TRANSFORMATIONS = {
35
70
  Placement: 'Format Version Default',
36
71
  'Log format': FASTLY_LOG_FORMAT,
37
72
  'Access method': 'User credentials',
38
- 'Access key': payload.accessKey,
39
- 'Secret key': payload.secretKey,
73
+ ...transformCredentialFields(payload),
40
74
  Period: 300,
41
75
  'Log line format': 'Blank',
42
76
  Compression: 'Gzip',
@@ -67,8 +101,7 @@ const CDN_TRANSFORMATIONS = {
67
101
  'Log file prefix': '{%Y}-{%m}-{%d}T{%H}:{%M}:{%S}.000',
68
102
  'Log file suffix': '.log',
69
103
  'Log interval': '60 seconds',
70
- 'Access key': payload.accessKey,
71
- 'Secret key': payload.secretKey,
104
+ ...transformCredentialFields(payload),
72
105
  HelpUrl: 'https://techdocs.akamai.com/datastream2/docs/stream-amazon-s3',
73
106
  }),
74
107
  'byocdn-cloudflare': (payload) => ({
@@ -156,8 +189,7 @@ const CDN_TRANSFORMATIONS = {
156
189
  'Bucket name': payload.bucketName,
157
190
  Region: payload.region,
158
191
  Path: `${payload.allowedPaths?.[0] || ''}<year>/<month>/<day>`,
159
- 'Access Key': payload.accessKey,
160
- 'Secret Key': payload.secretKey,
192
+ ...transformCredentialFields(payload),
161
193
  'Timestamp format': 'RFC3339',
162
194
  'Log format': 'JSON lines (one log per line)',
163
195
  Compression: 'Optional, but prefered. Please use Gzip compression if you decide to compress the log files.',
@@ -206,11 +238,11 @@ const prettifyLogForwardingConfig = (payload) => {
206
238
  }
207
239
 
208
240
  if (payload.logSource === 'byocdn-fastly' || payload.logSource === 'byocdn-akamai' || payload.logSource === 'byocdn-other') {
209
- if (!payload.accessKey) {
210
- throw new Error('accessKey is required in payload');
241
+ if (!payload.accessKey && !payload.currentAccessKey) {
242
+ throw new Error('accessKey or currentAccessKey is required in payload');
211
243
  }
212
- if (!payload.secretKey) {
213
- throw new Error('secretKey is required in payload');
244
+ if (!payload.secretKey && !payload.currentSecretKey) {
245
+ throw new Error('secretKey or currentSecretKey is required in payload');
214
246
  }
215
247
  }
216
248
 
package/src/index.js CHANGED
@@ -111,7 +111,13 @@ export * as llmoStrategy from './llmo-strategy.js';
111
111
  export * as schemas from './schemas.js';
112
112
 
113
113
  export { detectLocale } from './locale-detect/locale-detect.js';
114
- export { detectBotBlocker } from './bot-blocker-detect/bot-blocker-detect.js';
114
+ export {
115
+ detectBotBlocker,
116
+ analyzeBotProtection,
117
+ SPACECAT_BOT_USER_AGENT,
118
+ getSpacecatBotIps,
119
+ formatAllowlistMessage,
120
+ } from './bot-blocker-detect/bot-blocker-detect.js';
115
121
  export { prettifyLogForwardingConfig } from './cdn-helpers.js';
116
122
 
117
123
  export {