@adobe/spacecat-shared-utils 1.85.1 → 1.86.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,17 @@
|
|
|
1
|
+
# [@adobe/spacecat-shared-utils-v1.86.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.85.2...@adobe/spacecat-shared-utils-v1.86.0) (2025-12-12)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Features
|
|
5
|
+
|
|
6
|
+
* add detection for Akamai, Fastly, and CloudFront ([#1238](https://github.com/adobe/spacecat-shared/issues/1238)) ([3f7aad9](https://github.com/adobe/spacecat-shared/commit/3f7aad96fbc823b2e9d59541a71ba3b4e6d315e8))
|
|
7
|
+
|
|
8
|
+
# [@adobe/spacecat-shared-utils-v1.85.2](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.85.1...@adobe/spacecat-shared-utils-v1.85.2) (2025-12-11)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Bug Fixes
|
|
12
|
+
|
|
13
|
+
* Implement Structured (JSON) Logging for Spacecat Audits - rollback ([#1239](https://github.com/adobe/spacecat-shared/issues/1239)) ([1f174d7](https://github.com/adobe/spacecat-shared/commit/1f174d7dd188dbdc610b75bf58644992925755b1))
|
|
14
|
+
|
|
1
15
|
# [@adobe/spacecat-shared-utils-v1.85.1](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.85.0...@adobe/spacecat-shared-utils-v1.85.1) (2025-12-11)
|
|
2
16
|
|
|
3
17
|
|
package/package.json
CHANGED
|
@@ -16,7 +16,8 @@ import { isValidUrl } from '../functions.js';
|
|
|
16
16
|
/**
|
|
17
17
|
* Confidence levels used in bot blocker detection:
|
|
18
18
|
* - 1.0 (ABSOLUTE): Site responds successfully with 200 OK - definitively crawlable
|
|
19
|
-
* - 0.99 (HIGH): Known bot blocker signature detected
|
|
19
|
+
* - 0.99 (HIGH): Known bot blocker signature detected
|
|
20
|
+
* (Cloudflare, Imperva, Akamai, Fastly, CloudFront)
|
|
20
21
|
* - 0.95 (MEDIUM): HTTP/2 protocol errors indicating potential blocking
|
|
21
22
|
* - 0.5: Unknown status code without known blocker signature (e.g., 403 without headers)
|
|
22
23
|
* - 0.3: Unknown error occurred during request
|
|
@@ -32,7 +33,20 @@ const DEFAULT_TIMEOUT = 5000;
|
|
|
32
33
|
function analyzeResponse(response) {
|
|
33
34
|
const { status, headers } = response;
|
|
34
35
|
|
|
35
|
-
|
|
36
|
+
// Check for CDN/blocker infrastructure presence (lazy evaluation for performance)
|
|
37
|
+
const hasCloudflare = () => headers.get('cf-ray') || headers.get('server') === 'cloudflare';
|
|
38
|
+
const hasImperva = () => headers.get('x-iinfo') || headers.get('x-cdn') === 'Incapsula';
|
|
39
|
+
const hasAkamai = () => headers.get('x-akamai-request-id')
|
|
40
|
+
|| headers.get('x-akamai-session-id')
|
|
41
|
+
|| headers.get('server')?.includes('AkamaiGHost');
|
|
42
|
+
const hasFastly = () => headers.get('x-served-by')?.startsWith('cache-')
|
|
43
|
+
|| headers.get('fastly-io-info');
|
|
44
|
+
const hasCloudFront = () => headers.get('x-amz-cf-id')
|
|
45
|
+
|| headers.get('x-amz-cf-pop')
|
|
46
|
+
|| headers.get('via')?.includes('CloudFront');
|
|
47
|
+
|
|
48
|
+
// Active blocking (403 status with known blocker)
|
|
49
|
+
if (status === 403 && hasCloudflare()) {
|
|
36
50
|
return {
|
|
37
51
|
crawlable: false,
|
|
38
52
|
type: 'cloudflare',
|
|
@@ -40,7 +54,7 @@ function analyzeResponse(response) {
|
|
|
40
54
|
};
|
|
41
55
|
}
|
|
42
56
|
|
|
43
|
-
if (status === 403 && (
|
|
57
|
+
if (status === 403 && hasImperva()) {
|
|
44
58
|
return {
|
|
45
59
|
crawlable: false,
|
|
46
60
|
type: 'imperva',
|
|
@@ -48,6 +62,72 @@ function analyzeResponse(response) {
|
|
|
48
62
|
};
|
|
49
63
|
}
|
|
50
64
|
|
|
65
|
+
if (status === 403 && hasAkamai()) {
|
|
66
|
+
return {
|
|
67
|
+
crawlable: false,
|
|
68
|
+
type: 'akamai',
|
|
69
|
+
confidence: CONFIDENCE_HIGH,
|
|
70
|
+
};
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
if (status === 403 && hasFastly()) {
|
|
74
|
+
return {
|
|
75
|
+
crawlable: false,
|
|
76
|
+
type: 'fastly',
|
|
77
|
+
confidence: CONFIDENCE_HIGH,
|
|
78
|
+
};
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
if (status === 403 && hasCloudFront()) {
|
|
82
|
+
return {
|
|
83
|
+
crawlable: false,
|
|
84
|
+
type: 'cloudfront',
|
|
85
|
+
confidence: CONFIDENCE_HIGH,
|
|
86
|
+
};
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
// Success with known infrastructure present (infrastructure detected but allowing requests)
|
|
90
|
+
if (status === 200 && hasCloudflare()) {
|
|
91
|
+
return {
|
|
92
|
+
crawlable: true,
|
|
93
|
+
type: 'cloudflare-allowed',
|
|
94
|
+
confidence: CONFIDENCE_ABSOLUTE,
|
|
95
|
+
};
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
if (status === 200 && hasImperva()) {
|
|
99
|
+
return {
|
|
100
|
+
crawlable: true,
|
|
101
|
+
type: 'imperva-allowed',
|
|
102
|
+
confidence: CONFIDENCE_ABSOLUTE,
|
|
103
|
+
};
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
if (status === 200 && hasAkamai()) {
|
|
107
|
+
return {
|
|
108
|
+
crawlable: true,
|
|
109
|
+
type: 'akamai-allowed',
|
|
110
|
+
confidence: CONFIDENCE_ABSOLUTE,
|
|
111
|
+
};
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
if (status === 200 && hasFastly()) {
|
|
115
|
+
return {
|
|
116
|
+
crawlable: true,
|
|
117
|
+
type: 'fastly-allowed',
|
|
118
|
+
confidence: CONFIDENCE_ABSOLUTE,
|
|
119
|
+
};
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
if (status === 200 && hasCloudFront()) {
|
|
123
|
+
return {
|
|
124
|
+
crawlable: true,
|
|
125
|
+
type: 'cloudfront-allowed',
|
|
126
|
+
confidence: CONFIDENCE_ABSOLUTE,
|
|
127
|
+
};
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
// Success with no known infrastructure
|
|
51
131
|
if (status === 200) {
|
|
52
132
|
return {
|
|
53
133
|
crawlable: true,
|
|
@@ -56,6 +136,7 @@ function analyzeResponse(response) {
|
|
|
56
136
|
};
|
|
57
137
|
}
|
|
58
138
|
|
|
139
|
+
// Unknown status without known blocker signature
|
|
59
140
|
return {
|
|
60
141
|
crawlable: true,
|
|
61
142
|
type: 'unknown',
|
|
@@ -86,14 +167,24 @@ function analyzeError(error) {
|
|
|
86
167
|
* Currently detects:
|
|
87
168
|
* - Cloudflare bot blocking (403 + cf-ray header)
|
|
88
169
|
* - Imperva/Incapsula (403 + x-iinfo or x-cdn: Incapsula header)
|
|
170
|
+
* - Akamai (403 + x-akamai-request-id or related headers)
|
|
171
|
+
* - Fastly (403 + x-served-by or fastly-io-info headers)
|
|
172
|
+
* - AWS CloudFront (403 + x-amz-cf-id or via: CloudFront header)
|
|
89
173
|
* - HTTP/2 stream errors (NGHTTP2_INTERNAL_ERROR, ERR_HTTP2_STREAM_ERROR)
|
|
90
174
|
*
|
|
175
|
+
* Also detects infrastructure presence on successful requests (200 OK):
|
|
176
|
+
* - Returns 'cloudflare-allowed', 'imperva-allowed', 'akamai-allowed',
|
|
177
|
+
* 'fastly-allowed', or 'cloudfront-allowed' when infrastructure is present
|
|
178
|
+
* but allowing the request through
|
|
179
|
+
*
|
|
91
180
|
* @param {Object} config - Configuration object
|
|
92
181
|
* @param {string} config.baseUrl - The base URL to check
|
|
93
182
|
* @param {number} [config.timeout=5000] - Request timeout in milliseconds
|
|
94
183
|
* @returns {Promise<Object>} Detection result with:
|
|
95
184
|
* - crawlable {boolean}: Whether the site can be crawled by bots
|
|
96
|
-
* - type {string}: Blocker type ('cloudflare', 'imperva', 'http2-block', 'none', 'unknown')
|
|
185
|
+
* - type {string}: Blocker type ('cloudflare', 'imperva', 'akamai', 'fastly',
|
|
186
|
+
* 'cloudfront', 'http2-block', 'cloudflare-allowed', 'imperva-allowed',
|
|
187
|
+
* 'akamai-allowed', 'fastly-allowed', 'cloudfront-allowed', 'none', 'unknown')
|
|
97
188
|
* - confidence {number}: Confidence level (0.0-1.0, see confidence level constants)
|
|
98
189
|
* @throws {Error} If baseUrl is invalid
|
|
99
190
|
*/
|
|
@@ -17,7 +17,10 @@ export interface BotBlockerConfig {
|
|
|
17
17
|
|
|
18
18
|
export interface BotBlockerResult {
|
|
19
19
|
crawlable: boolean;
|
|
20
|
-
type: 'cloudflare' | 'imperva' | 'http2-block' | 'none' | 'unknown';
|
|
20
|
+
type: 'cloudflare' | 'imperva' | 'akamai' | 'fastly' | 'cloudfront'
|
|
21
|
+
| 'cloudflare-allowed' | 'imperva-allowed' | 'akamai-allowed'
|
|
22
|
+
| 'fastly-allowed' | 'cloudfront-allowed'
|
|
23
|
+
| 'http2-block' | 'none' | 'unknown';
|
|
21
24
|
confidence: number;
|
|
22
25
|
}
|
|
23
26
|
|
package/src/log-wrapper.js
CHANGED
|
@@ -13,92 +13,64 @@
|
|
|
13
13
|
import { getTraceId } from './xray.js';
|
|
14
14
|
|
|
15
15
|
/**
|
|
16
|
-
*
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
&& !(value instanceof Error)
|
|
25
|
-
&& value.constructor === Object;
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
/**
|
|
29
|
-
* A higher-order function that wraps a given function and enhances logging by converting
|
|
30
|
-
* all logs to JSON format and appending `severity`, `jobId`and `traceId`
|
|
31
|
-
* to log messages when available.
|
|
16
|
+
* A higher-order function that wraps a given function and enhances logging by appending
|
|
17
|
+
* a `jobId` and `traceId` to log messages when available. This improves traceability of logs
|
|
18
|
+
* associated with specific jobs or processes.
|
|
19
|
+
*
|
|
20
|
+
* The wrapper checks if a `log` object exists in the `context` and whether the `message`
|
|
21
|
+
* contains a `jobId`. It also extracts the AWS X-Ray trace ID if available. If found, log
|
|
22
|
+
* methods (e.g., `info`, `error`, etc.) will prepend the `jobId` and/or `traceId` to all log
|
|
23
|
+
* statements. All existing code using `context.log` will automatically include these markers.
|
|
32
24
|
*
|
|
33
|
-
*
|
|
34
|
-
*
|
|
35
|
-
* -
|
|
36
|
-
*
|
|
25
|
+
* @param {function} fn - The original function to be wrapped, called with the provided
|
|
26
|
+
* message and context after logging enhancement.
|
|
27
|
+
* @returns {function(object, object): Promise<Response>} - A wrapped function that enhances
|
|
28
|
+
* logging and returns the result of the original function.
|
|
37
29
|
*
|
|
38
|
-
*
|
|
39
|
-
*
|
|
30
|
+
* `context.log` will be enhanced in place to include `jobId` and/or `traceId` prefixed to all
|
|
31
|
+
* log messages. No code changes needed - existing `context.log` calls work automatically.
|
|
40
32
|
*/
|
|
41
33
|
export function logWrapper(fn) {
|
|
42
34
|
return async (message, context) => {
|
|
43
35
|
const { log } = context;
|
|
44
36
|
|
|
45
37
|
if (log && !context.contextualLog) {
|
|
46
|
-
const markers =
|
|
38
|
+
const markers = [];
|
|
47
39
|
|
|
48
40
|
// Extract jobId from message if available
|
|
49
41
|
if (typeof message === 'object' && message !== null && 'jobId' in message) {
|
|
50
|
-
|
|
42
|
+
const { jobId } = message;
|
|
43
|
+
markers.push(`[jobId=${jobId}]`);
|
|
51
44
|
}
|
|
52
45
|
|
|
53
46
|
// Extract traceId from AWS X-Ray
|
|
54
47
|
const traceId = getTraceId();
|
|
55
48
|
if (traceId) {
|
|
56
|
-
markers.traceId
|
|
49
|
+
markers.push(`[traceId=${traceId}]`);
|
|
57
50
|
}
|
|
58
51
|
|
|
59
|
-
//
|
|
60
|
-
|
|
52
|
+
// If we have markers, enhance the log object directly
|
|
53
|
+
if (markers.length > 0) {
|
|
54
|
+
const markerString = markers.join(' ');
|
|
61
55
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
if (typeof log[level] === 'function') {
|
|
65
|
-
accumulator[level] = (...args) => {
|
|
66
|
-
// If first argument is a plain object, merge with markers
|
|
67
|
-
if (args.length > 0 && isPlainObject(args[0])) {
|
|
68
|
-
return log[level](JSON.stringify({ severity: level, ...markers, ...args[0] }));
|
|
69
|
-
}
|
|
56
|
+
// Define log levels
|
|
57
|
+
const logLevels = ['info', 'error', 'debug', 'warn', 'trace', 'verbose', 'silly', 'fatal'];
|
|
70
58
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
// If second argument is a plain object, merge it into the log object
|
|
80
|
-
if (args.length > 1 && isPlainObject(args[1])) {
|
|
81
|
-
Object.assign(logObject, args[1]);
|
|
82
|
-
|
|
83
|
-
// If there are more arguments after the object, add them as 'data'
|
|
84
|
-
if (args.length > 2) {
|
|
85
|
-
logObject.data = args.slice(2);
|
|
86
|
-
}
|
|
87
|
-
} else if (args.length > 1) {
|
|
88
|
-
// If there are additional arguments but second is not a plain object,
|
|
89
|
-
// add all additional args as 'data'
|
|
90
|
-
logObject.data = args.slice(1);
|
|
59
|
+
// Enhance context.log directly to include markers in all log statements
|
|
60
|
+
context.log = logLevels.reduce((accumulator, level) => {
|
|
61
|
+
if (typeof log[level] === 'function') {
|
|
62
|
+
accumulator[level] = (...args) => {
|
|
63
|
+
// If first argument is a string (format string), prepend the marker to it
|
|
64
|
+
if (args.length > 0 && typeof args[0] === 'string') {
|
|
65
|
+
const enhancedArgs = [`${markerString} ${args[0]}`, ...args.slice(1)];
|
|
66
|
+
return log[level](...enhancedArgs);
|
|
91
67
|
}
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
};
|
|
99
|
-
}
|
|
100
|
-
return accumulator;
|
|
101
|
-
}, {});
|
|
68
|
+
return log[level](...args);
|
|
69
|
+
};
|
|
70
|
+
}
|
|
71
|
+
return accumulator;
|
|
72
|
+
}, {});
|
|
73
|
+
}
|
|
102
74
|
|
|
103
75
|
// Mark that we've processed this context
|
|
104
76
|
context.contextualLog = context.log;
|