npm - @adobe/spacecat-shared-utils - Versions diffs - 1.115.3 → 1.115.4 - Mend

@adobe/spacecat-shared-utils 1.115.3 → 1.115.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/CHANGELOG.md +6 -0
package/package.json +3 -2
package/src/bot-blocker-detect/bot-blocker-detect.js +92 -14
package/src/network-policy.js +71 -0
package/src/url-helpers.js +57 -14

package/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,9 @@
+## [@adobe/spacecat-shared-utils-v1.115.4](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.115.3...@adobe/spacecat-shared-utils-v1.115.4) (2026-05-14)
+### Bug Fixes
+* onboard flow fixes around bot detection and resolve canonical urls for some sites ([#1556](https://github.com/adobe/spacecat-shared/issues/1556)) ([6209834](https://github.com/adobe/spacecat-shared/commit/620983412260634b3ac2651d03ddb3b9ac079c01))
 ## [@adobe/spacecat-shared-utils-v1.115.3](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.115.2...@adobe/spacecat-shared-utils-v1.115.3) (2026-05-14)
 ### Bug Fixes

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@adobe/spacecat-shared-utils",
-  "version": "1.115.3",
+  "version": "1.115.4",
   "description": "Shared modules of the Spacecat Services - utils",
   "type": "module",
   "exports": {
@@ -73,9 +73,9 @@
   "devDependencies": {
     "@adobe/helix-shared-wrap": "2.0.2",
     "@types/validator": "^13.15.2",
-    "esbuild": "0.28.0",
     "chai": "6.2.2",
     "chai-as-promised": "8.0.2",
+    "esbuild": "0.28.0",
     "esmock": "2.7.5",
     "husky": "9.1.7",
     "nock": "14.0.15",
@@ -91,6 +91,7 @@
     "cheerio": "1.2.0",
     "date-fns": "4.1.0",
     "franc-min": "6.2.0",
+    "ipaddr.js": "^2.2.0",
     "iso-639-3": "3.0.1",
     "urijs": "1.19.11",
     "validator": "^13.15.15",

package/src/bot-blocker-detect/bot-blocker-detect.js CHANGED Viewed

@@ -12,6 +12,7 @@
 import { tracingFetch, SPACECAT_USER_AGENT } from '../tracing-fetch.js';
 import { isValidUrl } from '../functions.js';
+import { isNonPublicHostname } from '../network-policy.js';
 /**
  * Confidence levels used in bot blocker detection:
@@ -29,6 +30,8 @@ const CONFIDENCE_HIGH = 0.99;
 const CONFIDENCE_MEDIUM = 0.95;
 const CONFIDENCE_ABSOLUTE = 1.0;
 const DEFAULT_TIMEOUT = 5000;
+export const BODY_READ_TIMEOUT = 3000;
+const BODY_READ_MAX_BYTES = 65536; // 64 KB — challenge markers appear in the first KB
 /**
  * SpaceCat bot identification constants
@@ -303,7 +306,8 @@ function analyzeError(error) {
 /**
  * Detects bot blocker technology on a website.
- * Makes a single HEAD request and analyzes the response for blocking patterns.
+ * Makes a GET request (following up to 10 redirects manually) and analyzes the response.
+ * Each redirect hop is checked against the SSRF guard before connecting.
  *
  * Currently detects:
  * - Cloudflare bot blocking (403 + cf-ray header)
@@ -312,6 +316,8 @@ function analyzeError(error) {
  * - Fastly (403 + x-served-by or fastly-io-info headers)
  * - AWS CloudFront (403 + x-amz-cf-id or via: CloudFront header)
  * - HTTP/2 stream errors (NGHTTP2_INTERNAL_ERROR, ERR_HTTP2_STREAM_ERROR)
+ * - Redirect chains exceeding MAX_REDIRECTS ('redirect-limit-exceeded')
+ * - SSRF: private/non-public hostnames in initial URL or redirect targets ('ssrf-redirect-blocked')
  *
  * Also detects infrastructure presence on successful requests (200 OK):
  * - Returns 'cloudflare-allowed', 'imperva-allowed', 'akamai-allowed',
@@ -321,29 +327,101 @@ function analyzeError(error) {
  * @param {Object} config - Configuration object
  * @param {string} config.baseUrl - The base URL to check
  * @param {number} [config.timeout=5000] - Request timeout in milliseconds
+ * @param {Object} [config.log=console] - Logger with warn/debug methods
  * @returns {Promise<Object>} Detection result with:
  *   - crawlable {boolean}: Whether the site can be crawled by bots
  *   - type {string}: Blocker type ('cloudflare', 'imperva', 'akamai', 'fastly',
- *     'cloudfront', 'http2-block', 'cloudflare-allowed', 'imperva-allowed',
- *     'akamai-allowed', 'fastly-allowed', 'cloudfront-allowed', 'none', 'unknown')
+ *     'cloudfront', 'http2-block', 'redirect-limit-exceeded', 'ssrf-redirect-blocked',
+ *     'cloudflare-allowed', 'imperva-allowed', 'akamai-allowed', 'fastly-allowed',
+ *     'cloudfront-allowed', 'none', 'unknown')
  *   - confidence {number}: Confidence level (0.0-1.0, see confidence level constants)
- * @throws {Error} If baseUrl is invalid
+ * @throws {Error} If baseUrl is not a valid URL
  */
-export async function detectBotBlocker({ baseUrl, timeout = DEFAULT_TIMEOUT }) {
+export async function detectBotBlocker({ baseUrl, timeout = DEFAULT_TIMEOUT, log = console }) {
   if (!baseUrl || !isValidUrl(baseUrl)) {
     throw new Error('Invalid baseUrl');
   }
+  let hostname;
   try {
-    const response = await tracingFetch(baseUrl, {
-      method: 'HEAD',
-      headers: {
-        'User-Agent': SPACECAT_USER_AGENT,
-      },
-      signal: AbortSignal.timeout(timeout),
-    });
-    return analyzeResponse(response);
+    ({ hostname } = new URL(baseUrl));
+  /* c8 ignore next 3 */
+  } catch {
+    throw new Error('Invalid baseUrl');
+  }
+  if (isNonPublicHostname(hostname)) {
+    return { crawlable: false, type: 'ssrf-redirect-blocked', confidence: CONFIDENCE_ABSOLUTE };
+  }
+  try {
+    // Follow redirects manually so the SSRF guard runs on every hop before connecting.
+    const MAX_REDIRECTS = 10;
+    let currentUrl = baseUrl;
+    let response;
+    let exitedViaLimit = true;
+    for (let hop = 0; hop <= MAX_REDIRECTS; hop += 1) {
+      response = await tracingFetch(currentUrl, { // eslint-disable-line no-await-in-loop
+        method: 'GET',
+        headers: { 'User-Agent': SPACECAT_USER_AGENT },
+        redirect: 'manual',
+        timeout,
+      });
+      if (response.status < 300 || response.status >= 400) {
+        exitedViaLimit = false;
+        break;
+      }
+      const location = response.headers.get('location');
+      if (!location) {
+        exitedViaLimit = false;
+        break;
+      }
+      let redirectUrl;
+      try {
+        redirectUrl = new URL(location, currentUrl).toString();
+      } catch {
+        exitedViaLimit = false;
+        break;
+      }
+      const { hostname: rHost } = new URL(redirectUrl);
+      if (isNonPublicHostname(rHost)) {
+        log.warn('detectBotBlocker: redirect to private hostname blocked', { fn: 'detectBotBlocker', url: redirectUrl });
+        return { crawlable: false, type: 'ssrf-redirect-blocked', confidence: CONFIDENCE_ABSOLUTE };
+      }
+      currentUrl = redirectUrl;
+    }
+    if (exitedViaLimit && response.status >= 300 && response.status < 400) {
+      log.warn('detectBotBlocker: redirect limit exceeded', { fn: 'detectBotBlocker', url: baseUrl, limit: MAX_REDIRECTS });
+      return { crawlable: false, type: 'redirect-limit-exceeded', confidence: CONFIDENCE_HIGH };
+    }
+    let html = null;
+    const contentLength = parseInt(response.headers.get('content-length') || '0', 10);
+    // Content-Length check is best-effort; chunked responses (no Content-Length header) are
+    // bounded by BODY_READ_TIMEOUT only.
+    if (contentLength > 0 && contentLength > BODY_READ_MAX_BYTES) {
+      log.warn('detectBotBlocker: body too large, skipping body read', { fn: 'detectBotBlocker', url: baseUrl, contentLength });
+    } else {
+      try {
+        // Promise.race guards against servers that stream body slowly after headers arrive.
+        // tracingFetch clears its AbortSignal in finally{} before returning, so response.text()
+        // has no built-in timeout. clearTimeout prevents the timer handle from leaking when
+        // response.text() resolves before the deadline.
+        let timer;
+        html = await Promise.race([
+          response.text().finally(() => clearTimeout(timer)),
+          new Promise((_, reject) => { timer = setTimeout(() => reject(new Error('body-read-timeout')), BODY_READ_TIMEOUT); }),
+        ]);
+      } catch (e) {
+        log.warn('detectBotBlocker: body read failed, using header-only analysis', { fn: 'detectBotBlocker', url: baseUrl, cause: e?.message });
+      }
+    }
+    return analyzeResponse(response, html);
   } catch (error) {
     return analyzeError(error);
   }

package/src/network-policy.js ADDED Viewed

@@ -0,0 +1,71 @@
+/*
+ * Copyright 2025 Adobe. All rights reserved.
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License. You may obtain a copy
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
+ * OF ANY KIND, either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+import ipaddr from 'ipaddr.js';
+/**
+ * ipaddr.js range() labels that isNonPublicHostname treats as non-public (SSRF guard for
+ * fetches made from a Lambda). An IP literal whose label is not listed here is let through.
+ */
+const BLOCKED_RANGES = new Set([
+  'loopback', // 127.0.0.0/8, ::1
+  'private', // 10/8, 172.16/12, 192.168/16
+  'linkLocal', // 169.254/16, fe80::/10
+  'uniqueLocal', // fc00::/7 (IPv6 ULA)
+  'unspecified', // 0.0.0.0, ::
+  'carrierGradeNat', // 100.64.0.0/10
+  'broadcast', // 255.255.255.255/32
+  'multicast', // 224.0.0.0/4, ff00::/8
+  'reserved', // 240.0.0.0/4; NOTE(review): ipaddr.js may file more blocks under this label
+  '6to4', // 2002::/16
+  'teredo', // 2001::/32
+  'rfc6052', // 64:ff9b::/96 (NAT64 prefix defined by RFC 6052)
+]);
+/**
+ * Returns true for "localhost" (any case) and for IP literals whose ipaddr.js range is in
+ * BLOCKED_RANGES, after stripping one trailing dot and IPv6 brackets; IPv4-mapped IPv6 is judged
+ * by its embedded IPv4. Other hostnames are allowed unresolved: DNS rebinding is out of scope.
+ *
+ * Used by detectBotBlocker and resolveCanonicalUrl to guard against SSRF on
+ * attacker-supplied URLs. Both functions import from here so any fix is applied once.
+ *
+ * @param {string} hostname - Hostname string as parsed by new URL(); IPv6 may be bracketed.
+ * @returns {boolean} True if the hostname must be blocked; false for anything not recognised.
+ */
+export function isNonPublicHostname(hostname) {
+  // Strip a single trailing dot (e.g. "localhost." -> "localhost"); the regex matches only one
+  const h = hostname.replace(/\.$/, '');
+  // Strip IPv6 brackets (e.g. "[::1]" -> "::1") so ipaddr.js sees the bare address
+  const bare = h.startsWith('[') && h.endsWith(']') ? h.slice(1, -1) : h;
+  if (bare.toLowerCase() === 'localhost') {
+    return true;
+  }
+  if (!ipaddr.isValid(bare)) {
+    return false; // not an IP literal (e.g. a domain name): allowed through, no DNS lookup
+  }
+  try {
+    const addr = ipaddr.parse(bare);
+    // IPv4-mapped IPv6 (e.g. ::ffff:127.0.0.1): classify by the embedded IPv4 address
+    if (addr.kind() === 'ipv6' && addr.isIPv4MappedAddress()) {
+      return BLOCKED_RANGES.has(addr.toIPv4Address().range());
+    }
+    return BLOCKED_RANGES.has(addr.range());
+  /* c8 ignore next 3 */
+  } catch {
+    return false; // fails open: a parse error is reported as "not blocked"
+  }
+}

package/src/url-helpers.js CHANGED Viewed

@@ -14,6 +14,7 @@ import { context as h2, h1 } from '@adobe/fetch';
 import URI from 'urijs';
 import { hasText, isValidUrl } from './functions.js';
 import { SPACECAT_USER_AGENT } from './tracing-fetch.js';
+import { isNonPublicHostname } from './network-policy.js';
 /* c8 ignore next 3 */
 export const { fetch } = process.env.HELIX_FETCH_FORCE_HTTP1
@@ -134,46 +135,88 @@ function getSpacecatRequestHeaders() {
   };
 }
+const RESOLVE_CANONICAL_URL_TOTAL_TIMEOUT = 7000;
 /**
  * Resolve canonical URL for a given URL string by following redirect chain.
+ *
+ * The `deadline` is a shared absolute timestamp across all attempts — HEAD, GET, and every
+ * redirect hop all draw from the same budget. HEAD is tried first; on a network error, or on a
+ * non-2xx response that is not a redirect carrying a Location header, the same URL is retried
+ * once with GET. GET is never retried — if it fails there is no further fallback method.
+ *
+ * Redirects are followed manually (redirect: 'manual') so the SSRF guard runs on every hop
+ * before the network connection is made. Auto-follow would connect first, guard second.
+ *
+ * Non-public hostnames (private IPs, loopback, link-local, localhost, IPv6 ULA, INADDR_ANY)
+ * are rejected on every hop including redirect targets to prevent SSRF.
+ * See network-policy.js for the full list of blocked ranges.
+ *
  * @param {string} urlString - The URL string to normalize.
  * @param {string} method - HTTP method to use ('HEAD' or 'GET').
+ * @param {number} [deadline] - Absolute ms timestamp all attempts must finish by (default: now + 7 s).
+ * @param {object} [log=console] - Logger with a warn() method for observability.
  * @returns {Promise<string|null>} A Promise that resolves to the canonical URL or null if failed.
  */
-async function resolveCanonicalUrl(urlString, method = 'HEAD') {
+async function resolveCanonicalUrl(
+  urlString,
+  method = 'HEAD',
+  deadline = Date.now() + RESOLVE_CANONICAL_URL_TOTAL_TIMEOUT,
+  log = console,
+) {
+  try {
+    const { hostname } = new URL(urlString);
+    if (isNonPublicHostname(hostname)) {
+      log.warn('[resolveCanonicalUrl] private hostname rejected', { fn: 'resolveCanonicalUrl', url: urlString });
+      return null;
+    }
+  } catch (e) {
+    log.warn('[resolveCanonicalUrl] invalid URL', { fn: 'resolveCanonicalUrl', url: urlString, cause: e?.message });
+    return null;
+  }
+  const remaining = deadline - Date.now();
+  if (remaining <= 0) {
+    log.warn('[resolveCanonicalUrl] deadline expired', { fn: 'resolveCanonicalUrl', url: urlString, method });
+    return null;
+  }
   const headers = getSpacecatRequestHeaders();
-  let resp;
   try {
-    const timeout = method === 'HEAD' ? 10000 : 20000; // 10s for HEAD, 20s for GET
-    resp = await fetch(urlString, {
+    const resp = await fetch(urlString, {
       headers,
       method,
-      signal: AbortSignal.timeout(timeout),
+      redirect: 'manual',
+      signal: AbortSignal.timeout(remaining),
+      decode: false,
     });
     if (resp.ok) {
       return ensureHttps(resp.url);
     }
-    // Handle redirect chains
-    if (urlString !== resp.url) {
-      return resolveCanonicalUrl(resp.url, method);
+    // Manual redirect: extract Location and recurse so the guard runs on each hop
+    if (resp.status >= 300 && resp.status < 400) {
+      const location = resp.headers.get('location');
+      if (location) {
+        const redirectUrl = new URL(location, urlString).toString();
+        return resolveCanonicalUrl(redirectUrl, method, deadline, log);
+      }
     }
     if (method === 'HEAD') {
-      return resolveCanonicalUrl(urlString, 'GET');
+      return resolveCanonicalUrl(urlString, 'GET', deadline, log);
     }
-    // If the URL is not found and we've tried both HEAD and GET, return null
     return null;
-  } catch {
-    // If HEAD failed with network error and we haven't tried GET yet, retry with GET
+  } catch (e) {
+    // HEAD retries with GET on any error; GET does not retry — there is no further fallback method.
     if (method === 'HEAD') {
-      return resolveCanonicalUrl(urlString, 'GET');
+      return resolveCanonicalUrl(urlString, 'GET', deadline, log);
     }
-    // For all errors (both HTTP status and network), return null
+    log.warn('[resolveCanonicalUrl] GET request failed', { fn: 'resolveCanonicalUrl', url: urlString, cause: e?.message });
     return null;
   }
 }