@adobe/spacecat-shared-utils 1.115.3 → 1.115.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/package.json +3 -2
- package/src/bot-blocker-detect/bot-blocker-detect.js +92 -14
- package/src/network-policy.js +71 -0
- package/src/url-helpers.js +57 -14
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,9 @@
|
|
|
1
|
+
## [@adobe/spacecat-shared-utils-v1.115.4](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.115.3...@adobe/spacecat-shared-utils-v1.115.4) (2026-05-14)
|
|
2
|
+
|
|
3
|
+
### Bug Fixes
|
|
4
|
+
|
|
5
|
+
* onboard flow fixes around bot detection and resolve canonical urls for some sites ([#1556](https://github.com/adobe/spacecat-shared/issues/1556)) ([6209834](https://github.com/adobe/spacecat-shared/commit/620983412260634b3ac2651d03ddb3b9ac079c01))
|
|
6
|
+
|
|
1
7
|
## [@adobe/spacecat-shared-utils-v1.115.3](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.115.2...@adobe/spacecat-shared-utils-v1.115.3) (2026-05-14)
|
|
2
8
|
|
|
3
9
|
### Bug Fixes
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@adobe/spacecat-shared-utils",
|
|
3
|
-
"version": "1.115.3",
|
|
3
|
+
"version": "1.115.4",
|
|
4
4
|
"description": "Shared modules of the Spacecat Services - utils",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"exports": {
|
|
@@ -73,9 +73,9 @@
|
|
|
73
73
|
"devDependencies": {
|
|
74
74
|
"@adobe/helix-shared-wrap": "2.0.2",
|
|
75
75
|
"@types/validator": "^13.15.2",
|
|
76
|
-
"esbuild": "0.28.0",
|
|
77
76
|
"chai": "6.2.2",
|
|
78
77
|
"chai-as-promised": "8.0.2",
|
|
78
|
+
"esbuild": "0.28.0",
|
|
79
79
|
"esmock": "2.7.5",
|
|
80
80
|
"husky": "9.1.7",
|
|
81
81
|
"nock": "14.0.15",
|
|
@@ -91,6 +91,7 @@
|
|
|
91
91
|
"cheerio": "1.2.0",
|
|
92
92
|
"date-fns": "4.1.0",
|
|
93
93
|
"franc-min": "6.2.0",
|
|
94
|
+
"ipaddr.js": "^2.2.0",
|
|
94
95
|
"iso-639-3": "3.0.1",
|
|
95
96
|
"urijs": "1.19.11",
|
|
96
97
|
"validator": "^13.15.15",
|
|
package/src/bot-blocker-detect/bot-blocker-detect.js
CHANGED
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
|
|
13
13
|
import { tracingFetch, SPACECAT_USER_AGENT } from '../tracing-fetch.js';
|
|
14
14
|
import { isValidUrl } from '../functions.js';
|
|
15
|
+
import { isNonPublicHostname } from '../network-policy.js';
|
|
15
16
|
|
|
16
17
|
/**
|
|
17
18
|
* Confidence levels used in bot blocker detection:
|
|
@@ -29,6 +30,8 @@ const CONFIDENCE_HIGH = 0.99;
|
|
|
29
30
|
const CONFIDENCE_MEDIUM = 0.95;
|
|
30
31
|
const CONFIDENCE_ABSOLUTE = 1.0;
|
|
31
32
|
const DEFAULT_TIMEOUT = 5000;
|
|
33
|
+
export const BODY_READ_TIMEOUT = 3000;
|
|
34
|
+
const BODY_READ_MAX_BYTES = 65536; // 64 KB — challenge markers appear in the first KB
|
|
32
35
|
|
|
33
36
|
/**
|
|
34
37
|
* SpaceCat bot identification constants
|
|
@@ -303,7 +306,8 @@ function analyzeError(error) {
|
|
|
303
306
|
|
|
304
307
|
/**
|
|
305
308
|
* Detects bot blocker technology on a website.
|
|
306
|
-
* Makes a
|
|
309
|
+
* Makes a GET request (following up to 10 redirects manually) and analyzes the response.
|
|
310
|
+
* Each redirect hop is checked against the SSRF guard before connecting.
|
|
307
311
|
*
|
|
308
312
|
* Currently detects:
|
|
309
313
|
* - Cloudflare bot blocking (403 + cf-ray header)
|
|
@@ -312,6 +316,8 @@ function analyzeError(error) {
|
|
|
312
316
|
* - Fastly (403 + x-served-by or fastly-io-info headers)
|
|
313
317
|
* - AWS CloudFront (403 + x-amz-cf-id or via: CloudFront header)
|
|
314
318
|
* - HTTP/2 stream errors (NGHTTP2_INTERNAL_ERROR, ERR_HTTP2_STREAM_ERROR)
|
|
319
|
+
* - Redirect chains exceeding MAX_REDIRECTS ('redirect-limit-exceeded')
|
|
320
|
+
* - SSRF: private/non-public hostnames in initial URL or redirect targets ('ssrf-redirect-blocked')
|
|
315
321
|
*
|
|
316
322
|
* Also detects infrastructure presence on successful requests (200 OK):
|
|
317
323
|
* - Returns 'cloudflare-allowed', 'imperva-allowed', 'akamai-allowed',
|
|
@@ -321,29 +327,101 @@ function analyzeError(error) {
|
|
|
321
327
|
* @param {Object} config - Configuration object
|
|
322
328
|
* @param {string} config.baseUrl - The base URL to check
|
|
323
329
|
* @param {number} [config.timeout=5000] - Request timeout in milliseconds
|
|
330
|
+
* @param {Object} [config.log=console] - Logger with warn/debug methods
|
|
324
331
|
* @returns {Promise<Object>} Detection result with:
|
|
325
332
|
* - crawlable {boolean}: Whether the site can be crawled by bots
|
|
326
333
|
* - type {string}: Blocker type ('cloudflare', 'imperva', 'akamai', 'fastly',
|
|
327
|
-
* 'cloudfront', 'http2-block', '
|
|
328
|
-
* '
|
|
334
|
+
* 'cloudfront', 'http2-block', 'redirect-limit-exceeded', 'ssrf-redirect-blocked',
|
|
335
|
+
* 'cloudflare-allowed', 'imperva-allowed', 'akamai-allowed', 'fastly-allowed',
|
|
336
|
+
* 'cloudfront-allowed', 'none', 'unknown')
|
|
329
337
|
* - confidence {number}: Confidence level (0.0-1.0, see confidence level constants)
|
|
330
|
-
* @throws {Error} If baseUrl is
|
|
338
|
+
* @throws {Error} If baseUrl is not a valid URL
|
|
331
339
|
*/
|
|
332
|
-
export async function detectBotBlocker({ baseUrl, timeout = DEFAULT_TIMEOUT }) {
|
|
340
|
+
export async function detectBotBlocker({ baseUrl, timeout = DEFAULT_TIMEOUT, log = console }) {
|
|
333
341
|
if (!baseUrl || !isValidUrl(baseUrl)) {
|
|
334
342
|
throw new Error('Invalid baseUrl');
|
|
335
343
|
}
|
|
336
344
|
|
|
345
|
+
let hostname;
|
|
337
346
|
try {
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
}
|
|
345
|
-
|
|
346
|
-
|
|
347
|
+
({ hostname } = new URL(baseUrl));
|
|
348
|
+
/* c8 ignore next 3 */
|
|
349
|
+
} catch {
|
|
350
|
+
throw new Error('Invalid baseUrl');
|
|
351
|
+
}
|
|
352
|
+
if (isNonPublicHostname(hostname)) {
|
|
353
|
+
return { crawlable: false, type: 'ssrf-redirect-blocked', confidence: CONFIDENCE_ABSOLUTE };
|
|
354
|
+
}
|
|
355
|
+
|
|
356
|
+
try {
|
|
357
|
+
// Follow redirects manually so the SSRF guard runs on every hop before connecting.
|
|
358
|
+
const MAX_REDIRECTS = 10;
|
|
359
|
+
let currentUrl = baseUrl;
|
|
360
|
+
let response;
|
|
361
|
+
let exitedViaLimit = true;
|
|
362
|
+
for (let hop = 0; hop <= MAX_REDIRECTS; hop += 1) {
|
|
363
|
+
response = await tracingFetch(currentUrl, { // eslint-disable-line no-await-in-loop
|
|
364
|
+
method: 'GET',
|
|
365
|
+
headers: { 'User-Agent': SPACECAT_USER_AGENT },
|
|
366
|
+
redirect: 'manual',
|
|
367
|
+
timeout,
|
|
368
|
+
});
|
|
369
|
+
|
|
370
|
+
if (response.status < 300 || response.status >= 400) {
|
|
371
|
+
exitedViaLimit = false;
|
|
372
|
+
break;
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
const location = response.headers.get('location');
|
|
376
|
+
if (!location) {
|
|
377
|
+
exitedViaLimit = false;
|
|
378
|
+
break;
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
let redirectUrl;
|
|
382
|
+
try {
|
|
383
|
+
redirectUrl = new URL(location, currentUrl).toString();
|
|
384
|
+
} catch {
|
|
385
|
+
exitedViaLimit = false;
|
|
386
|
+
break;
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
const { hostname: rHost } = new URL(redirectUrl);
|
|
390
|
+
if (isNonPublicHostname(rHost)) {
|
|
391
|
+
log.warn('detectBotBlocker: redirect to private hostname blocked', { fn: 'detectBotBlocker', url: redirectUrl });
|
|
392
|
+
return { crawlable: false, type: 'ssrf-redirect-blocked', confidence: CONFIDENCE_ABSOLUTE };
|
|
393
|
+
}
|
|
394
|
+
currentUrl = redirectUrl;
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
if (exitedViaLimit && response.status >= 300 && response.status < 400) {
|
|
398
|
+
log.warn('detectBotBlocker: redirect limit exceeded', { fn: 'detectBotBlocker', url: baseUrl, limit: MAX_REDIRECTS });
|
|
399
|
+
return { crawlable: false, type: 'redirect-limit-exceeded', confidence: CONFIDENCE_HIGH };
|
|
400
|
+
}
|
|
401
|
+
|
|
402
|
+
let html = null;
|
|
403
|
+
const contentLength = parseInt(response.headers.get('content-length') || '0', 10);
|
|
404
|
+
// Content-Length check is best-effort; chunked responses (no Content-Length header) are
|
|
405
|
+
// bounded by BODY_READ_TIMEOUT only.
|
|
406
|
+
if (contentLength > 0 && contentLength > BODY_READ_MAX_BYTES) {
|
|
407
|
+
log.warn('detectBotBlocker: body too large, skipping body read', { fn: 'detectBotBlocker', url: baseUrl, contentLength });
|
|
408
|
+
} else {
|
|
409
|
+
try {
|
|
410
|
+
// Promise.race guards against servers that stream body slowly after headers arrive.
|
|
411
|
+
// tracingFetch clears its AbortSignal in finally{} before returning, so response.text()
|
|
412
|
+
// has no built-in timeout. clearTimeout prevents the timer handle from leaking when
|
|
413
|
+
// response.text() resolves before the deadline.
|
|
414
|
+
let timer;
|
|
415
|
+
html = await Promise.race([
|
|
416
|
+
response.text().finally(() => clearTimeout(timer)),
|
|
417
|
+
new Promise((_, reject) => { timer = setTimeout(() => reject(new Error('body-read-timeout')), BODY_READ_TIMEOUT); }),
|
|
418
|
+
]);
|
|
419
|
+
} catch (e) {
|
|
420
|
+
log.warn('detectBotBlocker: body read failed, using header-only analysis', { fn: 'detectBotBlocker', url: baseUrl, cause: e?.message });
|
|
421
|
+
}
|
|
422
|
+
}
|
|
423
|
+
|
|
424
|
+
return analyzeResponse(response, html);
|
|
347
425
|
} catch (error) {
|
|
348
426
|
return analyzeError(error);
|
|
349
427
|
}
|
|
package/src/network-policy.js
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2025 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import ipaddr from 'ipaddr.js';
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* IP ranges that should never be fetched from a Lambda (SSRF guard).
|
|
17
|
+
* ipaddr.js range() returns these strings for non-public addresses.
|
|
18
|
+
*/
|
|
19
|
+
const BLOCKED_RANGES = new Set([
|
|
20
|
+
'loopback', // 127.0.0.0/8, ::1
|
|
21
|
+
'private', // 10/8, 172.16/12, 192.168/16
|
|
22
|
+
'linkLocal', // 169.254/16, fe80::/10
|
|
23
|
+
'uniqueLocal', // fc00::/7 (IPv6 ULA)
|
|
24
|
+
'unspecified', // 0.0.0.0, ::
|
|
25
|
+
'carrierGradeNat', // 100.64.0.0/10
|
|
26
|
+
'broadcast', // 255.255.255.255/32
|
|
27
|
+
'multicast', // 224.0.0.0/4, ff00::/8
|
|
28
|
+
'reserved', // 240.0.0.0/4
|
|
29
|
+
'6to4', // 2002::/16
|
|
30
|
+
'teredo', // 2001::/32
|
|
31
|
+
'rfc6052', // 64:ff9b::/96
|
|
32
|
+
]);
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* Returns true if the hostname is a non-public address that must not be fetched.
|
|
36
|
+
* Covers: loopback, private ranges, link-local, IPv6 ULA, INADDR_ANY, localhost,
|
|
37
|
+
* IPv4-mapped IPv6, and trailing-dot variants. DNS-based rebinding is out of scope.
|
|
38
|
+
*
|
|
39
|
+
* Used by detectBotBlocker and resolveCanonicalUrl to guard against SSRF on
|
|
40
|
+
* attacker-supplied URLs. Both functions import from here so any fix is applied once.
|
|
41
|
+
*
|
|
42
|
+
* @param {string} hostname - Parsed hostname from new URL(). May include brackets for IPv6.
|
|
43
|
+
* @returns {boolean} True if the hostname must be blocked.
|
|
44
|
+
*/
|
|
45
|
+
export function isNonPublicHostname(hostname) {
|
|
46
|
+
// Strip trailing dot (e.g. "localhost." -> "localhost")
|
|
47
|
+
const h = hostname.replace(/\.$/, '');
|
|
48
|
+
|
|
49
|
+
// Strip IPv6 brackets (e.g. "[::1]" -> "::1")
|
|
50
|
+
const bare = h.startsWith('[') && h.endsWith(']') ? h.slice(1, -1) : h;
|
|
51
|
+
|
|
52
|
+
if (bare.toLowerCase() === 'localhost') {
|
|
53
|
+
return true;
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
if (!ipaddr.isValid(bare)) {
|
|
57
|
+
return false; // domain names (not IP literals) are allowed through
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
try {
|
|
61
|
+
const addr = ipaddr.parse(bare);
|
|
62
|
+
// IPv4-mapped IPv6 (e.g. ::ffff:127.0.0.1): evaluate the embedded IPv4 range
|
|
63
|
+
if (addr.kind() === 'ipv6' && addr.isIPv4MappedAddress()) {
|
|
64
|
+
return BLOCKED_RANGES.has(addr.toIPv4Address().range());
|
|
65
|
+
}
|
|
66
|
+
return BLOCKED_RANGES.has(addr.range());
|
|
67
|
+
/* c8 ignore next 3 */
|
|
68
|
+
} catch {
|
|
69
|
+
return false;
|
|
70
|
+
}
|
|
71
|
+
}
|
package/src/url-helpers.js
CHANGED
|
@@ -14,6 +14,7 @@ import { context as h2, h1 } from '@adobe/fetch';
|
|
|
14
14
|
import URI from 'urijs';
|
|
15
15
|
import { hasText, isValidUrl } from './functions.js';
|
|
16
16
|
import { SPACECAT_USER_AGENT } from './tracing-fetch.js';
|
|
17
|
+
import { isNonPublicHostname } from './network-policy.js';
|
|
17
18
|
|
|
18
19
|
/* c8 ignore next 3 */
|
|
19
20
|
export const { fetch } = process.env.HELIX_FETCH_FORCE_HTTP1
|
|
@@ -134,46 +135,88 @@ function getSpacecatRequestHeaders() {
|
|
|
134
135
|
};
|
|
135
136
|
}
|
|
136
137
|
|
|
138
|
+
const RESOLVE_CANONICAL_URL_TOTAL_TIMEOUT = 7000;
|
|
139
|
+
|
|
137
140
|
/**
|
|
138
141
|
* Resolve canonical URL for a given URL string by following redirect chain.
|
|
142
|
+
*
|
|
143
|
+
* The `deadline` is a shared absolute timestamp across all attempts — HEAD, GET, and every
|
|
144
|
+
* redirect hop all draw from the same budget. HEAD is tried first; on network error or non-2xx
|
|
145
|
+
* the request is retried once with GET. GET is never retried — if it fails there is no further
|
|
146
|
+
* fallback method.
|
|
147
|
+
*
|
|
148
|
+
* Redirects are followed manually (redirect: 'manual') so the SSRF guard runs on every hop
|
|
149
|
+
* before the network connection is made. Auto-follow would connect first, guard second.
|
|
150
|
+
*
|
|
151
|
+
* Non-public hostnames (private IPs, loopback, link-local, localhost, IPv6 ULA, INADDR_ANY)
|
|
152
|
+
* are rejected on every hop including redirect targets to prevent SSRF.
|
|
153
|
+
* See network-policy.js for the full list of blocked ranges.
|
|
154
|
+
*
|
|
139
155
|
* @param {string} urlString - The URL string to normalize.
|
|
140
156
|
* @param {string} method - HTTP method to use ('HEAD' or 'GET').
|
|
157
|
+
* @param {number} deadline - Absolute timestamp (ms) by which all attempts must finish.
|
|
158
|
+
* @param {object} [log=console] - Logger with a warn() method for observability.
|
|
141
159
|
* @returns {Promise<string|null>} A Promise that resolves to the canonical URL or null if failed.
|
|
142
160
|
*/
|
|
143
|
-
async function resolveCanonicalUrl(
|
|
161
|
+
async function resolveCanonicalUrl(
|
|
162
|
+
urlString,
|
|
163
|
+
method = 'HEAD',
|
|
164
|
+
deadline = Date.now() + RESOLVE_CANONICAL_URL_TOTAL_TIMEOUT,
|
|
165
|
+
log = console,
|
|
166
|
+
) {
|
|
167
|
+
try {
|
|
168
|
+
const { hostname } = new URL(urlString);
|
|
169
|
+
if (isNonPublicHostname(hostname)) {
|
|
170
|
+
log.warn('[resolveCanonicalUrl] private hostname rejected', { fn: 'resolveCanonicalUrl', url: urlString });
|
|
171
|
+
return null;
|
|
172
|
+
}
|
|
173
|
+
} catch (e) {
|
|
174
|
+
log.warn('[resolveCanonicalUrl] invalid URL', { fn: 'resolveCanonicalUrl', url: urlString, cause: e?.message });
|
|
175
|
+
return null;
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
const remaining = deadline - Date.now();
|
|
179
|
+
if (remaining <= 0) {
|
|
180
|
+
log.warn('[resolveCanonicalUrl] deadline expired', { fn: 'resolveCanonicalUrl', url: urlString, method });
|
|
181
|
+
return null;
|
|
182
|
+
}
|
|
183
|
+
|
|
144
184
|
const headers = getSpacecatRequestHeaders();
|
|
145
|
-
let resp;
|
|
146
185
|
|
|
147
186
|
try {
|
|
148
|
-
const
|
|
149
|
-
resp = await fetch(urlString, {
|
|
187
|
+
const resp = await fetch(urlString, {
|
|
150
188
|
headers,
|
|
151
189
|
method,
|
|
152
|
-
|
|
190
|
+
redirect: 'manual',
|
|
191
|
+
signal: AbortSignal.timeout(remaining),
|
|
192
|
+
decode: false,
|
|
153
193
|
});
|
|
154
194
|
|
|
155
195
|
if (resp.ok) {
|
|
156
196
|
return ensureHttps(resp.url);
|
|
157
197
|
}
|
|
158
198
|
|
|
159
|
-
//
|
|
160
|
-
if (
|
|
161
|
-
|
|
199
|
+
// Manual redirect: extract Location and recurse so the guard runs on each hop
|
|
200
|
+
if (resp.status >= 300 && resp.status < 400) {
|
|
201
|
+
const location = resp.headers.get('location');
|
|
202
|
+
if (location) {
|
|
203
|
+
const redirectUrl = new URL(location, urlString).toString();
|
|
204
|
+
return resolveCanonicalUrl(redirectUrl, method, deadline, log);
|
|
205
|
+
}
|
|
162
206
|
}
|
|
163
207
|
|
|
164
208
|
if (method === 'HEAD') {
|
|
165
|
-
return resolveCanonicalUrl(urlString, 'GET');
|
|
209
|
+
return resolveCanonicalUrl(urlString, 'GET', deadline, log);
|
|
166
210
|
}
|
|
167
211
|
|
|
168
|
-
// If the URL is not found and we've tried both HEAD and GET, return null
|
|
169
212
|
return null;
|
|
170
|
-
} catch {
|
|
171
|
-
//
|
|
213
|
+
} catch (e) {
|
|
214
|
+
// HEAD retries with GET on any error; GET does not retry — there is no further fallback method.
|
|
172
215
|
if (method === 'HEAD') {
|
|
173
|
-
return resolveCanonicalUrl(urlString, 'GET');
|
|
216
|
+
return resolveCanonicalUrl(urlString, 'GET', deadline, log);
|
|
174
217
|
}
|
|
175
218
|
|
|
176
|
-
|
|
219
|
+
log.warn('[resolveCanonicalUrl] GET request failed', { fn: 'resolveCanonicalUrl', url: urlString, cause: e?.message });
|
|
177
220
|
return null;
|
|
178
221
|
}
|
|
179
222
|
}
|