@adobe/spacecat-shared-utils 1.88.0 → 1.89.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,17 @@
1
+ # [@adobe/spacecat-shared-utils-v1.89.1](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.89.0...@adobe/spacecat-shared-utils-v1.89.1) (2026-01-26)
2
+
3
+
4
+ ### Bug Fixes
5
+
6
+ * Additional checks and methods on bot protection ([#1250](https://github.com/adobe/spacecat-shared/issues/1250)) ([0c34a8d](https://github.com/adobe/spacecat-shared/commit/0c34a8d850abef6e2a024132bc1c61d10865c1a0))
7
+
8
+ # [@adobe/spacecat-shared-utils-v1.89.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.88.0...@adobe/spacecat-shared-utils-v1.89.0) (2026-01-22)
9
+
10
+
11
+ ### Features
12
+
13
+ * moved `calculateCPCValue` function to spacecat-shared-utils ([#1048](https://github.com/adobe/spacecat-shared/issues/1048)) ([5006e5b](https://github.com/adobe/spacecat-shared/commit/5006e5be51c8959f2e6d72664251e1e0264d44a1))
14
+
1
15
  # [@adobe/spacecat-shared-utils-v1.88.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.87.1...@adobe/spacecat-shared-utils-v1.88.0) (2026-01-22)
2
16
 
3
17
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adobe/spacecat-shared-utils",
3
- "version": "1.88.0",
3
+ "version": "1.89.1",
4
4
  "description": "Shared modules of the Spacecat Services - utils",
5
5
  "type": "module",
6
6
  "exports": {
@@ -30,7 +30,85 @@ const CONFIDENCE_MEDIUM = 0.95;
30
30
  const CONFIDENCE_ABSOLUTE = 1.0;
31
31
  const DEFAULT_TIMEOUT = 5000;
32
32
 
33
- function analyzeResponse(response) {
33
/**
 * SpaceCat bot identification constant: the User-Agent string of the SpaceCat bot.
 * `formatAllowlistMessage` returns it as the `userAgent` field of the allowlist message.
 */
export const SPACECAT_BOT_USER_AGENT = 'Spacecat/1.0';
37
+
38
/**
 * Parses the comma-separated list of SpaceCat bot IPs (as stored in env/secrets).
 *
 * Each entry is trimmed and empty entries are dropped, so a value made only of
 * separators/whitespace (e.g. `' , '`) yields an empty array rather than an error.
 *
 * @param {string} ipsString - Comma-separated IPs (from env/secrets) - REQUIRED
 * @returns {Array<string>} Array of IP addresses
 * @throws {Error} If ipsString is not provided (empty string, null or undefined)
 * @throws {TypeError} If ipsString is provided but is not a string
 */
export function getSpacecatBotIps(ipsString) {
  if (!ipsString) {
    throw new Error('SPACECAT_BOT_IPS environment variable is required but not set');
  }

  // Reject arrays, numbers, etc. explicitly instead of failing with an
  // incidental "ipsString.split is not a function".
  if (typeof ipsString !== 'string') {
    throw new TypeError('SPACECAT_BOT_IPS must be a comma-separated string');
  }

  return ipsString
    .split(',')
    .map((ip) => ip.trim())
    .filter((ip) => ip);
}
51
+
52
/**
 * Builds the allowlist message for the SpaceCat bot.
 *
 * The IP list is parsed by `getSpacecatBotIps`, so any error it throws for a
 * missing value propagates to the caller.
 *
 * @param {string} botIps - Comma-separated IPs from secrets - REQUIRED
 * @returns {object} Message with `title`, the parsed `ips` array and `userAgent`
 * @throws {Error} If botIps is not provided
 */
export function formatAllowlistMessage(botIps) {
  return {
    title: 'To allowlist SpaceCat bot:',
    ips: getSpacecatBotIps(botIps),
    userAgent: SPACECAT_BOT_USER_AGENT,
  };
}
67
+
68
/**
 * HTML patterns for detecting challenge pages, grouped by protection vendor.
 *
 * `analyzeResponse` tests the `cloudflare`, `imperva` and `akamai` lists against the
 * HTML of HTTP 200 responses whose headers indicate that vendor, and tests the
 * `general` list in its plain HTTP 200 branch.
 *
 * Notes on matching:
 * - every pattern is case-insensitive (`i`) and none is global (`g`), so repeated
 *   `.test()` calls carry no `lastIndex` state;
 * - no pattern uses the `s` flag, so `.` does not match line breaks: patterns of the
 *   form `a.*b` only match when both parts appear on the same line of the HTML.
 */
const CHALLENGE_PATTERNS = {
  cloudflare: [
    /Checking your browser/i,
    /Just a moment\.\.\./i,
    /Verifying you are human/i,
    /Please wait.*CloudFlare/i,
    /cf-turnstile/i,
    /challenge-platform/i,
    /cf-chl-widget/i, // Cloudflare challenge widget
    /ray\s*id.*cloudflare/i, // Cloudflare Ray ID in error pages
    /__cf_chl_tk/i, // Cloudflare challenge token
    /cloudflare.*security/i,
    /attention required.*cloudflare/i,
  ],
  imperva: [
    /_Incapsula_Resource/i,
    /Incapsula incident ID/i,
    /incap_ses/i, // Imperva session cookie
    /visid_incap/i, // Imperva visitor ID
  ],
  akamai: [
    /Access Denied.*Akamai/i,
    /Reference.*Akamai/i,
  ],
  general: [
    /captcha/i, // also matches "recaptcha" and "hcaptcha" below
    /human verification/i,
    /recaptcha/i,
    /hcaptcha/i,
    /datadome/i,
    /dd-request-id/i,
  ],
};
104
+
105
+ /**
106
+ * Analyzes response for bot protection indicators
107
+ * @param {Object} response - Response object with status and headers
108
+ * @param {string} [html] - Optional HTML content for deeper analysis
109
+ * @returns {Object} Detection result
110
+ */
111
+ function analyzeResponse(response, html = null) {
34
112
  const { status, headers } = response;
35
113
 
36
114
  // Check for CDN/blocker infrastructure presence (lazy evaluation for performance)
@@ -45,6 +123,12 @@ function analyzeResponse(response) {
45
123
  || headers.get('x-amz-cf-pop')
46
124
  || headers.get('via')?.includes('CloudFront');
47
125
 
126
+ // Check HTML content for challenge page patterns (if HTML provided)
127
+ const htmlHasChallenge = (patterns) => {
128
+ if (!html) return false;
129
+ return patterns.some((pattern) => pattern.test(html));
130
+ };
131
+
48
132
  // Active blocking (403 status with known blocker)
49
133
  if (status === 403 && hasCloudflare()) {
50
134
  return {
@@ -88,6 +172,16 @@ function analyzeResponse(response) {
88
172
 
89
173
  // Success with known infrastructure present (infrastructure detected but allowing requests)
90
174
  if (status === 200 && hasCloudflare()) {
175
+ // Check if HTML contains challenge page (even though status is 200)
176
+ if (htmlHasChallenge(CHALLENGE_PATTERNS.cloudflare)) {
177
+ return {
178
+ crawlable: false,
179
+ type: 'cloudflare',
180
+ confidence: CONFIDENCE_HIGH,
181
+ reason: 'Challenge page detected despite 200 status',
182
+ };
183
+ }
184
+
91
185
  return {
92
186
  crawlable: true,
93
187
  type: 'cloudflare-allowed',
@@ -96,6 +190,14 @@ function analyzeResponse(response) {
96
190
  }
97
191
 
98
192
  if (status === 200 && hasImperva()) {
193
+ if (htmlHasChallenge(CHALLENGE_PATTERNS.imperva)) {
194
+ return {
195
+ crawlable: false,
196
+ type: 'imperva',
197
+ confidence: CONFIDENCE_HIGH,
198
+ reason: 'Challenge page detected despite 200 status',
199
+ };
200
+ }
99
201
  return {
100
202
  crawlable: true,
101
203
  type: 'imperva-allowed',
@@ -104,6 +206,14 @@ function analyzeResponse(response) {
104
206
  }
105
207
 
106
208
  if (status === 200 && hasAkamai()) {
209
+ if (htmlHasChallenge(CHALLENGE_PATTERNS.akamai)) {
210
+ return {
211
+ crawlable: false,
212
+ type: 'akamai',
213
+ confidence: CONFIDENCE_HIGH,
214
+ reason: 'Challenge page detected despite 200 status',
215
+ };
216
+ }
107
217
  return {
108
218
  crawlable: true,
109
219
  type: 'akamai-allowed',
@@ -129,6 +239,15 @@ function analyzeResponse(response) {
129
239
 
130
240
  // Success with no known infrastructure
131
241
  if (status === 200) {
242
+ // Still check for generic challenge patterns
243
+ if (htmlHasChallenge(CHALLENGE_PATTERNS.general)) {
244
+ return {
245
+ crawlable: false,
246
+ type: 'unknown',
247
+ confidence: 0.7,
248
+ reason: 'Generic challenge patterns detected',
249
+ };
250
+ }
132
251
  return {
133
252
  crawlable: true,
134
253
  type: 'none',
@@ -136,7 +255,16 @@ function analyzeResponse(response) {
136
255
  };
137
256
  }
138
257
 
139
- // Unknown status without known blocker signature
258
+ // Potential CDN/protection blocked the request
259
+ if (status === 403) {
260
+ return {
261
+ crawlable: false,
262
+ type: 'unknown',
263
+ confidence: 0.7,
264
+ reason: 'HTTP 403 Forbidden - access denied',
265
+ };
266
+ }
267
+
140
268
  return {
141
269
  crawlable: true,
142
270
  type: 'unknown',
@@ -207,3 +335,27 @@ export async function detectBotBlocker({ baseUrl, timeout = DEFAULT_TIMEOUT }) {
207
335
  return analyzeError(error);
208
336
  }
209
337
  }
338
+
339
/**
 * Converts the supported header containers into a global `Headers` instance.
 *
 * @param {Headers|Iterable<[string, string]>|Object|null|undefined} headers - Raw headers
 * @returns {Headers} Headers object exposing `get()`
 */
function normalizeHeaders(headers) {
  if (headers instanceof Headers) {
    return headers;
  }
  if (headers == null) {
    return new Headers();
  }
  // Maps and Headers classes from other fetch implementations are iterable but have
  // no own enumerable properties: Object.entries() would silently drop every header.
  if (typeof headers === 'object' && typeof headers[Symbol.iterator] === 'function') {
    return new Headers([...headers]);
  }
  // Skip null/undefined values: Headers would stringify them to "undefined"/"null",
  // which is truthy and would make a header look present.
  return new Headers(
    Object.entries(headers).filter(([, value]) => value != null),
  );
}

/**
 * Analyzes already-fetched response data for bot protection.
 * Used by content scraper to analyze Puppeteer results without making another request.
 *
 * @param {Object} data - Response data to analyze
 * @param {number} data.status - HTTP status code
 * @param {Object} data.headers - Response headers: a Headers object, a plain object,
 *   or an iterable of `[name, value]` pairs (e.g. a Map); may be omitted
 * @param {string} [data.html] - Optional HTML content for challenge page detection
 * @returns {Object} Detection result (same format as detectBotBlocker)
 */
export function analyzeBotProtection({ status, headers, html }) {
  const response = {
    status,
    headers: normalizeHeaders(headers),
  };

  return analyzeResponse(response, html);
}
package/src/constants.js CHANGED
@@ -68,3 +68,5 @@ export const OPPORTUNITY_TYPES = /** @type {const} */ ({
68
68
  // Wikipedia Analysis (LLMO)
69
69
  WIKIPEDIA_ANALYSIS: 'wikipedia-analysis',
70
70
  });
71
+
72
/**
 * Fallback CPC value. `calculateCPCValue` reports it (with `success: false`) when the
 * organic traffic data is unavailable, invalid, or cannot be processed.
 * NOTE(review): presumably in dollars, like the computed CPC - confirm.
 */
export const DEFAULT_CPC_VALUE = 1.5;
package/src/index.d.ts CHANGED
@@ -17,6 +17,8 @@ export { AUTHORING_TYPES, DELIVERY_TYPES } from './aem.js';
17
17
 
18
18
  export { OPPORTUNITY_TYPES } from './constants.js';
19
19
 
20
/**
 * Fallback CPC value reported by `calculateCPCValue` when no valid organic traffic
 * data is available for a site.
 */
export const DEFAULT_CPC_VALUE: number;
21
+
20
22
  /** UTILITY FUNCTIONS */
21
23
  export function arrayEquals<T>(a: T[], b: T[]): boolean;
22
24
 
@@ -287,6 +289,35 @@ export function getStoredMetrics(config: object, context: object):
287
289
  */
288
290
  export function storeMetrics(content: object, config: object, context: object): Promise<string>;
289
291
 
292
/**
 * Retrieves an object from S3 by its key.
 * If the object's content type is JSON, returns its parsed content;
 * otherwise returns the raw body as a string.
 * Returns null (after logging via `log.error`) when a required argument is missing,
 * when the S3 request fails for any reason (including a missing object), or when a
 * JSON body cannot be parsed.
 * @param s3Client - The S3 client
 * @param bucketName - The name of the S3 bucket
 * @param key - The key of the S3 object
 * @param log - A logger instance
 * @returns The content of the S3 object, or null if it could not be retrieved
 */
export function getObjectFromKey(
  s3Client: any,
  bucketName: string,
  key: string,
  log: any
): Promise<any | null>;
308
+
309
/**
 * Fetches the organic traffic data for a site from S3 and calculates the CPC value.
 *
 * The promise resolves to a result object, not a bare number: on success
 * `{ success: true, value }`; when the data is missing, invalid or cannot be read,
 * `{ success: false, reason, value }` where `value` is `DEFAULT_CPC_VALUE`.
 * It rejects only when a required argument is missing.
 * @param context - Context object
 * @param context.env - Environment variables
 * @param context.env.S3_IMPORTER_BUCKET_NAME - S3 importer bucket name
 * @param context.s3Client - S3 client
 * @param context.log - Logger
 * @param siteId - The site ID
 * @returns Result object whose `value` is the CPC value in dollars
 */
export function calculateCPCValue(
  context: object,
  siteId: string
): Promise<{ success: boolean; value: number; reason?: string }>;
320
+
290
321
  export function s3Wrapper(fn: (request: object, context: object) => Promise<Response>):
291
322
  (request: object, context: object) => Promise<Response>;
292
323
 
package/src/index.js CHANGED
@@ -76,11 +76,11 @@ export {
76
76
  extractUrlsFromSuggestion,
77
77
  } from './url-extractors.js';
78
78
 
79
- export { getStoredMetrics, storeMetrics } from './metrics-store.js';
79
+ export { getStoredMetrics, storeMetrics, calculateCPCValue } from './metrics-store.js';
80
80
 
81
- export { s3Wrapper } from './s3.js';
81
+ export { s3Wrapper, getObjectFromKey } from './s3.js';
82
82
 
83
- export { OPPORTUNITY_TYPES } from './constants.js';
83
+ export { OPPORTUNITY_TYPES, DEFAULT_CPC_VALUE } from './constants.js';
84
84
 
85
85
  export { fetch } from './adobe-fetch.js';
86
86
  export { tracingFetch, SPACECAT_USER_AGENT } from './tracing-fetch.js';
@@ -111,7 +111,13 @@ export * as llmoStrategy from './llmo-strategy.js';
111
111
  export * as schemas from './schemas.js';
112
112
 
113
113
  export { detectLocale } from './locale-detect/locale-detect.js';
114
- export { detectBotBlocker } from './bot-blocker-detect/bot-blocker-detect.js';
114
+ export {
115
+ detectBotBlocker,
116
+ analyzeBotProtection,
117
+ SPACECAT_BOT_USER_AGENT,
118
+ getSpacecatBotIps,
119
+ formatAllowlistMessage,
120
+ } from './bot-blocker-detect/bot-blocker-detect.js';
115
121
  export { prettifyLogForwardingConfig } from './cdn-helpers.js';
116
122
 
117
123
  export {
@@ -10,6 +10,8 @@
10
10
  * governing permissions and limitations under the License.
11
11
  */
12
12
  import { GetObjectCommand, PutObjectCommand } from '@aws-sdk/client-s3';
13
+ import { getObjectFromKey } from './s3.js';
14
+ import { DEFAULT_CPC_VALUE } from './constants.js';
13
15
 
14
16
  function createFilePath({ siteId, source, metric }) {
15
17
  if (!siteId) {
@@ -80,3 +82,61 @@ export async function storeMetrics(content, config, context) {
80
82
  throw new Error(`Failed to upload metrics to ${filePath}, error: ${e.message}`);
81
83
  }
82
84
  }
85
+
86
/**
 * Fetches the organic traffic data for a site from S3 and calculates the CPC value as per
 * https://wiki.corp.adobe.com/pages/viewpage.action?spaceKey=AEMSites&title=Success+Studio+Projected+Business+Impact+Metrics#SuccessStudioProjectedBusinessImpactMetrics-IdentifyingCPCvalueforadomain
 *
 * The object `metrics/<siteId>/ahrefs/organic-traffic.json` is read from the importer
 * bucket and only its last entry is used: CPC = cost / value / 100.
 * Data problems never reject; they are logged and reported with `DEFAULT_CPC_VALUE`.
 *
 * @param {object} context - Context object
 * @param {object} context.env - Environment variables
 * @param {string} context.env.S3_IMPORTER_BUCKET_NAME - S3 importer bucket name
 * @param {object} context.s3Client - S3 client
 * @param {object} context.log - Logger (`warn` and `error` are used)
 * @param {string} siteId - The site ID
 * @returns {Promise<{success: boolean, value: number, reason?: string}>}
 * `{ success: true, value }` on success,
 * or `{ success: false, reason, value: DEFAULT_CPC_VALUE }` on failure
 * @throws {Error} If the bucket name, S3 client, logger or siteId is missing
 */
export async function calculateCPCValue(context, siteId) {
  if (!context?.env?.S3_IMPORTER_BUCKET_NAME) {
    throw new Error('S3 importer bucket name is required');
  }
  if (!context.s3Client) {
    throw new Error('S3 client is required');
  }
  if (!context.log) {
    throw new Error('Logger is required');
  }
  if (!siteId) {
    throw new Error('SiteId is required');
  }
  const { s3Client, log } = context;
  const bucketName = context.env.S3_IMPORTER_BUCKET_NAME;
  const key = `metrics/${siteId}/ahrefs/organic-traffic.json`;
  const fallback = (reason) => ({ success: false, reason, value: DEFAULT_CPC_VALUE });

  try {
    const organicTrafficData = await getObjectFromKey(s3Client, bucketName, key, log);
    if (!Array.isArray(organicTrafficData) || organicTrafficData.length === 0) {
      log.warn(`Organic traffic data not available for ${siteId}. Using Default CPC value.`);
      return fallback('Organic traffic data not available');
    }

    // A null/non-object last entry is invalid data, not a fetch error.
    const lastTraffic = organicTrafficData.at(-1);
    const cost = lastTraffic?.cost;
    const value = lastTraffic?.value;

    // dividing by 100 for cents to dollar conversion
    const cpc = cost && value ? cost / value / 100 : Number.NaN;

    // Non-numeric cost/value would otherwise surface as `success: true` with NaN.
    if (!Number.isFinite(cpc)) {
      log.warn(`Invalid organic traffic data present for ${siteId} - cost:${cost} value:${value}, Using Default CPC value.`);
      return fallback('Invalid organic traffic data');
    }

    return {
      success: true,
      value: cpc,
    };
  } catch (err) {
    log.error(`Error fetching organic traffic data for site ${siteId}. Using Default CPC value.`, err);
    return fallback('Error fetching organic traffic data');
  }
}
package/src/s3.js CHANGED
@@ -10,9 +10,55 @@
10
10
  * governing permissions and limitations under the License.
11
11
  */
12
12
 
13
- import { S3Client } from '@aws-sdk/client-s3';
13
+ import { GetObjectCommand, S3Client } from '@aws-sdk/client-s3';
14
14
  import { instrumentAWSClient } from './xray.js';
15
15
 
16
/**
 * Retrieves an object from S3 by its key and returns its content.
 *
 * - If the object's `ContentType` contains `application/json` (compared
 *   case-insensitively, since media types are case-insensitive), the body is
 *   JSON-parsed and the parsed value is returned.
 * - For any other (or missing) content type the raw body string is returned.
 * - Returns null, after logging through `log.error`, when a required argument is
 *   missing, when the S3 request or body read fails for any reason (including a
 *   missing object), or when a JSON body cannot be parsed.
 *
 * @param {import('@aws-sdk/client-s3').S3Client} s3Client - an S3 client
 * @param {string} bucketName - the name of the S3 bucket
 * @param {string} key - the key of the S3 object
 * @param {object} log - a logger instance (only `error` is called)
 * @returns {Promise<any|string|null>} the parsed JSON content, the raw body string,
 * or null if the object could not be retrieved
 */
export async function getObjectFromKey(s3Client, bucketName, key, log) {
  if (!s3Client || !bucketName || !key) {
    log.error(
      'Invalid input parameters in getObjectFromKey: ensure s3Client, bucketName, and key are provided.',
    );
    return null;
  }
  const command = new GetObjectCommand({
    Bucket: bucketName,
    Key: key,
  });
  try {
    const response = await s3Client.send(command);
    const contentType = response.ContentType;
    const body = await response.Body.transformToString();

    const isJson = typeof contentType === 'string'
      && contentType.toLowerCase().includes('application/json');
    if (!isJson) {
      // Always return body for non-JSON content types
      return body;
    }

    try {
      return JSON.parse(body);
    } catch (parseError) {
      log.error(`Unable to parse content for key ${key}`, parseError);
      return null;
    }
  } catch (err) {
    log.error(
      `Error while fetching S3 object from bucket ${bucketName} using key ${key}`,
      err,
    );
    return null;
  }
}
61
+
16
62
  /**
17
63
  * Adds an S3Client instance and bucket to the context.
18
64
  *