@adobe/spacecat-shared-utils 1.85.2 → 1.87.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,17 @@
|
|
|
1
|
+
# [@adobe/spacecat-shared-utils-v1.87.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.86.0...@adobe/spacecat-shared-utils-v1.87.0) (2026-01-08)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Features
|
|
5
|
+
|
|
6
|
+
* propagate traceId across workers for cross-worker tracing ([#1247](https://github.com/adobe/spacecat-shared/issues/1247)) ([f7d194d](https://github.com/adobe/spacecat-shared/commit/f7d194d2a8d23426c9dd66aae33f1118d226e827))
|
|
7
|
+
|
|
8
|
+
# [@adobe/spacecat-shared-utils-v1.86.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.85.2...@adobe/spacecat-shared-utils-v1.86.0) (2025-12-12)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Features
|
|
12
|
+
|
|
13
|
+
* add detection for Akamai, Fastly, and CloudFront ([#1238](https://github.com/adobe/spacecat-shared/issues/1238)) ([3f7aad9](https://github.com/adobe/spacecat-shared/commit/3f7aad96fbc823b2e9d59541a71ba3b4e6d315e8))
|
|
14
|
+
|
|
1
15
|
# [@adobe/spacecat-shared-utils-v1.85.2](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.85.1...@adobe/spacecat-shared-utils-v1.85.2) (2025-12-11)
|
|
2
16
|
|
|
3
17
|
|
package/package.json
CHANGED
|
@@ -16,7 +16,8 @@ import { isValidUrl } from '../functions.js';
|
|
|
16
16
|
/**
|
|
17
17
|
* Confidence levels used in bot blocker detection:
|
|
18
18
|
* - 1.0 (ABSOLUTE): Site responds successfully with 200 OK - definitively crawlable
|
|
19
|
-
* - 0.99 (HIGH): Known bot blocker signature detected
|
|
19
|
+
* - 0.99 (HIGH): Known bot blocker signature detected
|
|
20
|
+
* (Cloudflare, Imperva, Akamai, Fastly, CloudFront)
|
|
20
21
|
* - 0.95 (MEDIUM): HTTP/2 protocol errors indicating potential blocking
|
|
21
22
|
* - 0.5: Unknown status code without known blocker signature (e.g., 403 without headers)
|
|
22
23
|
* - 0.3: Unknown error occurred during request
|
|
@@ -32,7 +33,20 @@ const DEFAULT_TIMEOUT = 5000;
|
|
|
32
33
|
function analyzeResponse(response) {
|
|
33
34
|
const { status, headers } = response;
|
|
34
35
|
|
|
35
|
-
|
|
36
|
+
// Check for CDN/blocker infrastructure presence (lazy evaluation for performance)
|
|
37
|
+
const hasCloudflare = () => headers.get('cf-ray') || headers.get('server') === 'cloudflare';
|
|
38
|
+
const hasImperva = () => headers.get('x-iinfo') || headers.get('x-cdn') === 'Incapsula';
|
|
39
|
+
const hasAkamai = () => headers.get('x-akamai-request-id')
|
|
40
|
+
|| headers.get('x-akamai-session-id')
|
|
41
|
+
|| headers.get('server')?.includes('AkamaiGHost');
|
|
42
|
+
const hasFastly = () => headers.get('x-served-by')?.startsWith('cache-')
|
|
43
|
+
|| headers.get('fastly-io-info');
|
|
44
|
+
const hasCloudFront = () => headers.get('x-amz-cf-id')
|
|
45
|
+
|| headers.get('x-amz-cf-pop')
|
|
46
|
+
|| headers.get('via')?.includes('CloudFront');
|
|
47
|
+
|
|
48
|
+
// Active blocking (403 status with known blocker)
|
|
49
|
+
if (status === 403 && hasCloudflare()) {
|
|
36
50
|
return {
|
|
37
51
|
crawlable: false,
|
|
38
52
|
type: 'cloudflare',
|
|
@@ -40,7 +54,7 @@ function analyzeResponse(response) {
|
|
|
40
54
|
};
|
|
41
55
|
}
|
|
42
56
|
|
|
43
|
-
if (status === 403 && (headers.get('x-iinfo') || headers.get('x-cdn') === 'Incapsula')) {
|
|
57
|
+
if (status === 403 && hasImperva()) {
|
|
44
58
|
return {
|
|
45
59
|
crawlable: false,
|
|
46
60
|
type: 'imperva',
|
|
@@ -48,6 +62,72 @@ function analyzeResponse(response) {
|
|
|
48
62
|
};
|
|
49
63
|
}
|
|
50
64
|
|
|
65
|
+
if (status === 403 && hasAkamai()) {
|
|
66
|
+
return {
|
|
67
|
+
crawlable: false,
|
|
68
|
+
type: 'akamai',
|
|
69
|
+
confidence: CONFIDENCE_HIGH,
|
|
70
|
+
};
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
if (status === 403 && hasFastly()) {
|
|
74
|
+
return {
|
|
75
|
+
crawlable: false,
|
|
76
|
+
type: 'fastly',
|
|
77
|
+
confidence: CONFIDENCE_HIGH,
|
|
78
|
+
};
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
if (status === 403 && hasCloudFront()) {
|
|
82
|
+
return {
|
|
83
|
+
crawlable: false,
|
|
84
|
+
type: 'cloudfront',
|
|
85
|
+
confidence: CONFIDENCE_HIGH,
|
|
86
|
+
};
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
// Success with known infrastructure present (infrastructure detected but allowing requests)
|
|
90
|
+
if (status === 200 && hasCloudflare()) {
|
|
91
|
+
return {
|
|
92
|
+
crawlable: true,
|
|
93
|
+
type: 'cloudflare-allowed',
|
|
94
|
+
confidence: CONFIDENCE_ABSOLUTE,
|
|
95
|
+
};
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
if (status === 200 && hasImperva()) {
|
|
99
|
+
return {
|
|
100
|
+
crawlable: true,
|
|
101
|
+
type: 'imperva-allowed',
|
|
102
|
+
confidence: CONFIDENCE_ABSOLUTE,
|
|
103
|
+
};
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
if (status === 200 && hasAkamai()) {
|
|
107
|
+
return {
|
|
108
|
+
crawlable: true,
|
|
109
|
+
type: 'akamai-allowed',
|
|
110
|
+
confidence: CONFIDENCE_ABSOLUTE,
|
|
111
|
+
};
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
if (status === 200 && hasFastly()) {
|
|
115
|
+
return {
|
|
116
|
+
crawlable: true,
|
|
117
|
+
type: 'fastly-allowed',
|
|
118
|
+
confidence: CONFIDENCE_ABSOLUTE,
|
|
119
|
+
};
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
if (status === 200 && hasCloudFront()) {
|
|
123
|
+
return {
|
|
124
|
+
crawlable: true,
|
|
125
|
+
type: 'cloudfront-allowed',
|
|
126
|
+
confidence: CONFIDENCE_ABSOLUTE,
|
|
127
|
+
};
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
// Success with no known infrastructure
|
|
51
131
|
if (status === 200) {
|
|
52
132
|
return {
|
|
53
133
|
crawlable: true,
|
|
@@ -56,6 +136,7 @@ function analyzeResponse(response) {
|
|
|
56
136
|
};
|
|
57
137
|
}
|
|
58
138
|
|
|
139
|
+
// Unknown status without known blocker signature
|
|
59
140
|
return {
|
|
60
141
|
crawlable: true,
|
|
61
142
|
type: 'unknown',
|
|
@@ -86,14 +167,24 @@ function analyzeError(error) {
|
|
|
86
167
|
* Currently detects:
|
|
87
168
|
* - Cloudflare bot blocking (403 + cf-ray header)
|
|
88
169
|
* - Imperva/Incapsula (403 + x-iinfo or x-cdn: Incapsula header)
|
|
170
|
+
* - Akamai (403 + x-akamai-request-id or related headers)
|
|
171
|
+
* - Fastly (403 + x-served-by or fastly-io-info headers)
|
|
172
|
+
* - AWS CloudFront (403 + x-amz-cf-id or via: CloudFront header)
|
|
89
173
|
* - HTTP/2 stream errors (NGHTTP2_INTERNAL_ERROR, ERR_HTTP2_STREAM_ERROR)
|
|
90
174
|
*
|
|
175
|
+
* Also detects infrastructure presence on successful requests (200 OK):
|
|
176
|
+
* - Returns 'cloudflare-allowed', 'imperva-allowed', 'akamai-allowed',
|
|
177
|
+
* 'fastly-allowed', or 'cloudfront-allowed' when infrastructure is present
|
|
178
|
+
* but allowing the request through
|
|
179
|
+
*
|
|
91
180
|
* @param {Object} config - Configuration object
|
|
92
181
|
* @param {string} config.baseUrl - The base URL to check
|
|
93
182
|
* @param {number} [config.timeout=5000] - Request timeout in milliseconds
|
|
94
183
|
* @returns {Promise<Object>} Detection result with:
|
|
95
184
|
* - crawlable {boolean}: Whether the site can be crawled by bots
|
|
96
|
-
* - type {string}: Blocker type ('cloudflare', 'imperva', 'http2-block', 'none', 'unknown')
|
|
185
|
+
* - type {string}: Blocker type ('cloudflare', 'imperva', 'akamai', 'fastly',
|
|
186
|
+
* 'cloudfront', 'http2-block', 'cloudflare-allowed', 'imperva-allowed',
|
|
187
|
+
* 'akamai-allowed', 'fastly-allowed', 'cloudfront-allowed', 'none', 'unknown')
|
|
97
188
|
* - confidence {number}: Confidence level (0.0-1.0, see confidence level constants)
|
|
98
189
|
* @throws {Error} If baseUrl is invalid
|
|
99
190
|
*/
|
|
@@ -17,7 +17,10 @@ export interface BotBlockerConfig {
|
|
|
17
17
|
|
|
18
18
|
export interface BotBlockerResult {
|
|
19
19
|
crawlable: boolean;
|
|
20
|
-
type: 'cloudflare' | 'imperva' | 'http2-block' | 'none' | 'unknown';
|
|
20
|
+
type: 'cloudflare' | 'imperva' | 'akamai' | 'fastly' | 'cloudfront'
|
|
21
|
+
| 'cloudflare-allowed' | 'imperva-allowed' | 'akamai-allowed'
|
|
22
|
+
| 'fastly-allowed' | 'cloudfront-allowed'
|
|
23
|
+
| 'http2-block' | 'none' | 'unknown';
|
|
21
24
|
confidence: number;
|
|
22
25
|
}
|
|
23
26
|
|
package/src/cdn-helpers.js
CHANGED
|
@@ -152,6 +152,17 @@ const CDN_TRANSFORMATIONS = {
|
|
|
152
152
|
'Compress logs': 'Yes',
|
|
153
153
|
HelpUrl: 'https://docs-cybersec.thalesgroup.com/bundle/cloud-application-security/page/siem-log-configuration.htm',
|
|
154
154
|
}),
|
|
155
|
+
'byocdn-other': (payload) => ({
|
|
156
|
+
'Bucket name': payload.bucketName,
|
|
157
|
+
Region: payload.region,
|
|
158
|
+
Path: `${payload.allowedPaths?.[0] || ''}<year>/<month>/<day>`,
|
|
159
|
+
'Access Key': payload.accessKey,
|
|
160
|
+
'Secret Key': payload.secretKey,
|
|
161
|
+
'Timestamp format': 'RFC3339',
|
|
162
|
+
'Log format': 'JSON lines (one log per line)',
|
|
163
|
+
Compression: 'Optional, but prefered. Please use Gzip compression if you decide to compress the log files.',
|
|
164
|
+
'Example of valid log line': '{"timestamp":"2025-12-01T13:00:05Z","host":"www.example.com","url":"/docs/getting-started","request_method":"GET","request_user_agent":"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)","response_status":200,"request_referer":"https://www.chatgpt.com/","response_content_type":"text/html; charset=utf-8","time_to_first_byte":123}',
|
|
165
|
+
}),
|
|
155
166
|
};
|
|
156
167
|
|
|
157
168
|
/**
|
|
@@ -164,7 +175,8 @@ const CDN_TRANSFORMATIONS = {
|
|
|
164
175
|
*
|
|
165
176
|
* @param {Object} payload - The result from CDN-Logs-Infrastructure-Provisioning API
|
|
166
177
|
* @param {string} payload.logSource - The CDN type ('byocdn-fastly' | 'byocdn-akamai'
|
|
167
|
-
* | 'byocdn-cloudflare' | 'byocdn-cloudfront' | 'ams-cloudfront' | 'byocdn-imperva')
|
|
178
|
+
* | 'byocdn-cloudflare' | 'byocdn-cloudfront' | 'ams-cloudfront' | 'byocdn-imperva'
|
|
179
|
+
* | 'byocdn-other')
|
|
168
180
|
* @returns {Object} - The prepared log forwarding configuration parameters
|
|
169
181
|
* @throws {Error} - If logSource is not supported or missing
|
|
170
182
|
*/
|
|
@@ -193,7 +205,7 @@ const prettifyLogForwardingConfig = (payload) => {
|
|
|
193
205
|
throw new Error('allowedPaths is required in payload');
|
|
194
206
|
}
|
|
195
207
|
|
|
196
|
-
if (payload.logSource === 'byocdn-fastly' || payload.logSource === 'byocdn-akamai') {
|
|
208
|
+
if (payload.logSource === 'byocdn-fastly' || payload.logSource === 'byocdn-akamai' || payload.logSource === 'byocdn-other') {
|
|
197
209
|
if (!payload.accessKey) {
|
|
198
210
|
throw new Error('accessKey is required in payload');
|
|
199
211
|
}
|
package/src/log-wrapper.js
CHANGED
|
@@ -43,8 +43,9 @@ export function logWrapper(fn) {
|
|
|
43
43
|
markers.push(`[jobId=${jobId}]`);
|
|
44
44
|
}
|
|
45
45
|
|
|
46
|
-
// Extract traceId from
|
|
47
|
-
|
|
46
|
+
// Extract traceId: prioritize context.traceId (from SQS message propagation)
|
|
47
|
+
// over X-Ray segment (which is new for each Lambda invocation)
|
|
48
|
+
const traceId = context.traceId || getTraceId();
|
|
48
49
|
if (traceId) {
|
|
49
50
|
markers.push(`[traceId=${traceId}]`);
|
|
50
51
|
}
|