npm - @fanboynz/network-scanner - Versions diffs - 3.0.3 → 3.1.2 - Mend

@fanboynz/network-scanner 3.0.3 → 3.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/lib/searchstring.js CHANGED Viewed

@@ -3,12 +3,12 @@
 const fs = require('fs');
 const { formatLogMessage, messageColors } = require('./colorize');
-const CURL_TAG = messageColors.processing('[curl]');
-// responseHandler is a separate code path (Puppeteer response listener,
-// not curl) — its debug output gets its own subsystem prefix so it's
-// distinguishable from curl-handler logs.
+// Subsystem tag for the Puppeteer response-listener path. createCurlHandler
+// + its CURL_TAG (and the downloadWithCurl/downloadWithRetry helpers) used
+// to live here but were dead — nwss.js imports the curl-based handler from
+// lib/curl.js instead. Removed in the same cleanup that drops those
+// functions.
 const SEARCHSTRING_TAG = messageColors.processing('[searchstring]');
-const { runProcess } = require('./spawn-async');
 const { grepContent } = require('./grep');
 // Configuration constants for search logic
@@ -51,83 +51,6 @@ function parseSearchStrings(searchstring, searchstringAnd) {
   };
 }
-/**
- * Downloads content using curl with appropriate headers and timeout
- * @param {string} url - The URL to download
- * @param {string} userAgent - User agent string to use
- * @param {number} timeout - Timeout in seconds (default: 30)
- * @returns {Promise<string>} The downloaded content
- */
-async function downloadWithCurl(url, userAgent = '', timeout = 30) {
-  const MAX_STDOUT_BYTES = 52428800; // 50MB, matches --max-filesize below
-  const curlArgs = [
-    '-s',
-    '-L',
-    '--max-time', timeout.toString(),
-    '--max-redirs', '5',
-    '--fail-with-body',
-    '--max-filesize', '52428800',
-    '--range', '0-52428799',
-    '--compressed'
-  ];
-  if (userAgent) curlArgs.push('-H', `User-Agent: ${userAgent}`);
-  curlArgs.push(
-    '-H', 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-    '-H', 'Accept-Language: en-US,en;q=0.5',
-    '-H', 'Accept-Encoding: gzip, deflate',
-    '-H', 'Connection: keep-alive',
-    '-H', 'Upgrade-Insecure-Requests: 1'
-  );
-  curlArgs.push(url);
-  // Shared async-spawn helper — same streaming/cap/timeout/kill plumbing
-  // that used to be ~80 lines of inline boilerplate here.
-  const result = await runProcess('curl', curlArgs, {
-    timeout: timeout * 1000,
-    maxStdout: MAX_STDOUT_BYTES
-  });
-  if (result.error) throw new Error(`Curl failed for ${url}: ${result.error}`);
-  if (result.truncated) throw new Error(`Curl output exceeded ${MAX_STDOUT_BYTES} bytes for ${url}`);
-  if (result.signal) throw new Error(`Curl killed by signal ${result.signal} for ${url}`);
-  if (result.code !== 0) {
-    throw new Error(`Curl exited with status ${result.code}: ${result.stderr.toString('utf8')}`);
-  }
-  return result.stdout.toString('utf8');
-}
-/**
- * Downloads content with retry logic for transient failures
- * @param {string} url - The URL to download
- * @param {string} userAgent - User agent string to use
- * @param {number} timeout - Timeout in seconds
- * @param {number} retries - Number of retry attempts (default: 2)
- * @returns {Promise<string>} The downloaded content
- */
-async function downloadWithRetry(url, userAgent = '', timeout = 30, retries = 2) {
-  for (let attempt = 0; attempt <= retries; attempt++) {
-    try {
-      return await downloadWithCurl(url, userAgent, timeout);
-    } catch (err) {
-      // Don't retry on final attempt
-      if (attempt === retries) throw err;
-      // Only retry on specific transient errors
-      const shouldRetry = err.message.includes('timeout') ||
-                         err.message.includes('Connection refused') ||
-                         err.message.includes('502') ||
-                         err.message.includes('503') ||
-                         err.message.includes('Connection reset');
-      if (!shouldRetry) throw err;
-      // Exponential backoff: 1s, 2s, 4s...
-      await new Promise(resolve => setTimeout(resolve, 1000 * Math.pow(2, attempt)));
-    }
-  }
-}
 // Lookup table for the 6 named entities the previous chained-replace
 // handled. Hoisted out of safeDecodeXmlEntities so the object isn't
 // reallocated per call.
@@ -337,157 +260,6 @@ function shouldAnalyzeContentType(contentType) {
   return textTypes.some(type => normalizedType.startsWith(type));
 }
-/**
- * Creates a curl-based URL handler for downloading and optionally searching content
- * @param {object} config - Configuration object containing all necessary parameters
- * @returns {Function} URL handler function for curl-based content analysis
- */
-function createCurlHandler(config) {
-  const {
-    searchStrings,
-    searchStringsAnd,
-    hasSearchStringAnd,
-    regexes,
-    matchedDomains,
-    addMatchedDomain, // Helper function for adding domains
-    currentUrl,
-    perSiteSubDomains,
-    ignoreDomains,
-    matchesIgnoreDomain,
-    getRootDomain,
-    siteConfig,
-    dumpUrls,
-    matchedUrlsLogFile,
-    forceDebug,
-    userAgent,
-    resourceType, // Resource type from request
-    hasSearchString
-  } = config;
-  // Hoisted: currentUrl doesn't change for this handler's lifetime, so
-  // parsing its hostname once at handler-creation eliminates the
-  // per-request URL allocation.
-  let currentUrlHostname = '';
-  try { currentUrlHostname = new URL(currentUrl).hostname; } catch (_) {}
-  return async function curlHandler(requestUrl) {
-    // Regex check FIRST — cheap filter that skips ~99% of requests.
-    // Previously this ran AFTER a URL parse + domain-cache lookup;
-    // the parse is the expensive bit, so doing it after the cheap
-    // gate moves the cost off the hot path.
-    const matchesRegex = regexes.some(re => re.test(requestUrl));
-    if (!matchesRegex) return;
-    // Parse requestUrl ONCE and reuse. Was parsed 2-3 times.
-    let requestHostname;
-    try { requestHostname = new URL(requestUrl).hostname; } catch (_) { return; }
-    const reqDomain = perSiteSubDomains ? requestHostname : getRootDomain(requestUrl);
-    if (typeof config.isDomainAlreadyDetected === 'function' && config.isDomainAlreadyDetected(reqDomain)) {
-      if (forceDebug) {
-        console.log(formatLogMessage('debug', `${CURL_TAG} Skipping already detected domain: ${reqDomain}`));
-      }
-      return;
-    }
-    const isFirstParty = currentUrlHostname === requestHostname;
-    // Apply first-party/third-party filtering
-    if (isFirstParty && siteConfig.firstParty === false) {
-      if (forceDebug) {
-        console.log(formatLogMessage('debug', `${CURL_TAG} Skipping first-party request (firstParty=false): ${requestUrl}`));
-      }
-      return;
-    }
-    if (!isFirstParty && siteConfig.thirdParty === false) {
-      if (forceDebug) {
-        console.log(formatLogMessage('debug', `${CURL_TAG} Skipping third-party request (thirdParty=false): ${requestUrl}`));
-      }
-      return;
-    }
-    try {
-      if (forceDebug) {
-        console.log(formatLogMessage('debug', `${CURL_TAG} Downloading content from: ${requestUrl}`));
-      }
-      // If NO searchstring is defined, match immediately (like browser behavior)
-      if (!hasSearchString && !hasSearchStringAnd) {
-        if (!reqDomain || matchesIgnoreDomain(reqDomain, ignoreDomains)) {
-          return;
-        }
-        addMatchedDomain(reqDomain, resourceType);
-        const simplifiedUrl = getRootDomain(currentUrl);
-        if (siteConfig.verbose === 1) {
-          const partyType = isFirstParty ? 'first-party' : 'third-party';
-          const resourceInfo = resourceType ? ` (${resourceType})` : '';
-          console.log(`[match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl) matched regex${resourceInfo}`);
-        }
-        if (dumpUrls) {
-          const timestamp = new Date().toISOString();
-          const partyType = isFirstParty ? 'first-party' : 'third-party';
-          const resourceInfo = resourceType ? ` (${resourceType})` : '';
-          try {
-            fs.appendFileSync(matchedUrlsLogFile,
-              `${timestamp} [match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl)${resourceInfo}\n`);
-          } catch (logErr) {
-            console.warn(formatLogMessage('warn', `Failed to write to matched URLs log: ${logErr.message}`));
-          }
-        }
-        return;
-      }
-      // If searchstring IS defined, download and search content
-      const content = await downloadWithRetry(requestUrl, userAgent, 30);
-      // Check if content contains search strings (OR or AND logic)
-      const { found, matchedString, logicType, error } = searchContent(content, searchStrings, searchStringsAnd, '', requestUrl);
-      if (found) {
-        if (!reqDomain || matchesIgnoreDomain(reqDomain, ignoreDomains)) {
-          return;
-        }
-        addMatchedDomain(reqDomain, resourceType);
-        const simplifiedUrl = getRootDomain(currentUrl);
-        if (siteConfig.verbose === 1) {
-          const partyType = isFirstParty ? 'first-party' : 'third-party';
-          const resourceInfo = resourceType ? ` (${resourceType})` : '';
-          console.log(`[match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl) contains searchstring (${logicType}): "${matchedString}"${resourceInfo}`);
-        }
-        if (dumpUrls) {
-          const timestamp = new Date().toISOString();
-          const partyType = isFirstParty ? 'first-party' : 'third-party';
-          const resourceInfo = resourceType ? ` (${resourceType})` : '';
-          try {
-            fs.appendFileSync(matchedUrlsLogFile,
-              `${timestamp} [match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl, searchstring (${logicType}): "${matchedString}")${resourceInfo}\n`);
-          } catch (logErr) {
-            console.warn(formatLogMessage('warn', `Failed to write to matched URLs log: ${logErr.message}`));
-          }
-        }
-      } else if (forceDebug) {
-        const partyType = isFirstParty ? 'first-party' : 'third-party';
-        console.log(formatLogMessage('debug', `${CURL_TAG} ${requestUrl} (${partyType}) matched regex but no searchstring found`));
-        if (error) {
-          console.log(formatLogMessage('debug', `${CURL_TAG} Search error: ${error}`));
-        }
-      }
-    } catch (err) {
-      if (forceDebug) {
-        console.log(formatLogMessage('debug', `${CURL_TAG} Failed to download content for ${requestUrl}: ${err.message}`));
-      }
-    }
-  };
-}
 /**
  * Creates a response handler function for the given configuration
  * @param {object} config - Configuration object containing all necessary parameters
@@ -758,14 +530,20 @@ function validateSearchString(searchstring, searchstringAnd) {
   return { isValid: true, error: null };
 }
+// Public surface used by nwss.js (parseSearchStrings, createResponseHandler)
+// and lib/validate_rules.js (validateSearchString). searchContent,
+// safeDecodeXmlEntities, and shouldAnalyzeContentType stay exported as
+// reasonable internal-helper API surface even though current external
+// consumers don't import them. createCurlHandler + downloadWithCurl +
+// downloadWithRetry were removed entirely — createCurlHandler had no
+// external invocations (nwss.js imported the name but never called it,
+// using lib/curl.js's version instead), and the download helpers were
+// only consumed by createCurlHandler.
 module.exports = {
   parseSearchStrings,
   searchContent,
   safeDecodeXmlEntities,
   shouldAnalyzeContentType,
   createResponseHandler,
-  createCurlHandler,
-  downloadWithCurl,
-  validateSearchString,
-  downloadWithRetry
+  validateSearchString
 };

package/lib/smart-cache.js CHANGED Viewed

@@ -93,10 +93,16 @@ class SmartCache {
       this._setupAutoSave();
     }
-    // Set up memory monitoring
+    // Set up memory monitoring. unref'd so this always-on housekeeping timer
+    // can never hold the event loop open past scan completion — destroy()
+    // clears it promptly on the normal path, but unref guarantees a clean
+    // exit on any path that skips destroy() (e.g. an unhandled throw before
+    // nwss reaches its cleanup). Matches the unref convention applied to
+    // every other Node-side timer in the codebase.
     this.memoryCheckInterval = setInterval(() => {
       this._checkMemoryPressure();
     }, this.options.memoryCheckInterval);
+    if (typeof this.memoryCheckInterval.unref === 'function') this.memoryCheckInterval.unref();
   }
   /**
@@ -1137,9 +1143,11 @@ class SmartCache {
    * @private
    */
   _setupAutoSave() {
+    // unref'd for the same reason as memoryCheckInterval — never block exit.
     this.autoSaveInterval = setInterval(() => {
       this.savePersistentCache();
     }, this.options.autoSaveInterval);
+    if (typeof this.autoSaveInterval.unref === 'function') this.autoSaveInterval.unref();
   }
   /**

package/lib/socks-relay.js CHANGED Viewed

@@ -227,13 +227,11 @@ function handleClient(client, upstream, forceDebug, relay) {
         upstreamSock = info.socket;
         // Safety net: if cleanup() ran while we were awaiting the upstream
-        // connect (some path other than the handshake watchdog — e.g. a
-        // 'close' event on the client during pause), settled is true and
-        // cleanup's settled guard would short-circuit a future call,
-        // orphaning this freshly-connected upstream socket. Destroy it
-        // here directly. With Fix #1a moving the watchdog clearTimeout to
-        // the 'connecting' transition this is currently unreachable, but
-        // cheap to keep as defense-in-depth against future code paths.
+        // connect, settled is true and cleanup's settled guard would
+        // short-circuit a future call, orphaning this freshly-connected
+        // upstream socket — so destroy it here directly. Reachable when the
+        // client emits 'error' or 'close' during the await (both wired to
+        // cleanup at handler setup), e.g. Chromium disconnects mid-connect.
         if (settled) {
           try { upstreamSock.destroy(); } catch (_) {}
           return;
@@ -250,8 +248,8 @@ function handleClient(client, upstream, forceDebug, relay) {
         try { upstreamSock.setKeepAlive(true, 60000); } catch (_) {}
         upstreamSock.on('error', cleanup);
         upstreamSock.on('close', cleanup);
-        client.on('error', cleanup);
-        client.on('close', cleanup);
+        // client 'error' and 'close' are wired once at handler setup (bottom
+        // of handleClient) and cover all phases — not re-attached here.
         // SOCKS5 success (BND.ADDR 0.0.0.0:0 — Chromium ignores it for CONNECT)
         client.write(Buffer.from([0x05, 0x00, 0x00, 0x01, 0, 0, 0, 0, 0, 0]));
@@ -273,6 +271,13 @@ function handleClient(client, upstream, forceDebug, relay) {
   client.on('data', onData);
   client.on('error', cleanup);
+  // Attach 'close' HERE (not after piping starts) so it covers the whole
+  // lifetime, including the up-to-20s upstream-connect await. A client that
+  // disconnects cleanly mid-connect now sets settled=true, letting the
+  // post-connect `if (settled)` net destroy the freshly-opened upstream
+  // socket instead of piping into a dead client; and a close mid-handshake
+  // clears the watchdog immediately rather than leaving it to fire later.
+  client.on('close', cleanup);
 }
 // SOCKS5 failure reply (valid only before piping starts).

package/lib/validate_rules.js CHANGED Viewed

@@ -583,7 +583,11 @@ function validateRulesetFile(filePath, options = {}) {
       errors.push(`Line ${lineNumber}: ${validation.error} - ${line}`);
       if (errors.length >= maxErrors) {
-        errors.push(`... (stopping after ${maxErrors} errors, ${stats.total - i - 1} lines remaining)`);
+        // Lines remaining in the file = total lines − current index − 1.
+        // (Previously `stats.total - i - 1`, which mixed "non-empty lines
+        // processed" with "file line index" and went negative when empties
+        // were interleaved.)
+        errors.push(`... (stopping after ${maxErrors} errors, ${lines.length - i - 1} lines remaining)`);
         break;
       }
     }
@@ -1075,9 +1079,286 @@ function testDomainValidation() {
   return allPassed;
 }
+// ─── Per-site config normalization (runs on every scan, not just --validate-config) ───
+//
+// Catches the silent-failure class that bit a user across multiple scan iterations:
+//   1. Typo'd siteConfig keys (whois_terms vs whois) silently ignored.
+//   2. Boolean fields given truthy/falsy non-boolean values (interact: 1 vs interact: true)
+//      silently disabled by strict `=== true` checks downstream.
+//   3. Misleading downstream warnings that blame the wrong field.
+//
+// normalizeSiteConfig() mutates siteConfig in place (coercing 1→true, etc) and returns
+// warnings the caller surfaces. Designed to run at scan startup, ALWAYS, not gated on
+// --validate-config (which most users never run).
+// Whitelist of every siteConfig.X key read across nwss.js + lib/*.js.
+// Regenerate via BOTH:
+//   grep -hoE "siteConfig\.[a-zA-Z_][a-zA-Z0-9_]*" nwss.js lib/*.js | sort -u
+//   grep -hoE "siteConfig\[['\"][^'\"]+['\"]\]" nwss.js lib/*.js | sort -u
+// The second pattern catches bracket-notation access required for keys with
+// hyphens (e.g. 'dig-or', 'whois-or'). Dot-notation grep alone missed these
+// and produced false 'unknown siteConfig key' warnings for valid config.
+// Also grep for destructured siteConfig keys (master destructure block in
+// processUrl) — those don't show up in either pattern.
+const KNOWN_SITE_CONFIG_KEYS = new Set([
+  'adblock_rules', 'blocked', 'bypass_cache', 'capture_popups',
+  'capture_popups_max_depth', 'capture_popups_window_ms', 'cdp', 'cdp_specific',
+  'clear_sitedata', 'clear_sitedata_full_on_reload',
+  'cloudflare_bypass', 'cloudflare_max_retries', 'comments',
+  'cloudflare_parallel_detection', 'cloudflare_phish', 'cloudflare_retry_on_error',
+  'css_blocked', 'curl', 'cursor_mode', 'custom_headers', 'delay',
+  'delay_uncapped', 'detect_js_patterns', 'dig', 'dig-or', 'digRecordType', 'dig_subdomain',
+  'disable_adblock', 'dnsmasq', 'dnsmasq_old', 'evaluateOnNewDocument',
+  'even_blocked',
+  'filterRegex', 'fingerprint_protection', 'firstParty', 'flowproxy_additional_delay',
+  'flowproxy_delay', 'flowproxy_detection', 'flowproxy_js_timeout', 'flowproxy_nav_timeout',
+  'flowproxy_page_timeout', 'forcereload', 'ghost_cursor_duration',
+  'ghost_cursor_hesitate', 'ghost_cursor_overshoot', 'ghost_cursor_speed',
+  'goto_options', 'grep', 'headful', 'ignore_similar', 'ignore_similar_ignored_domains',
+  'ignore_similar_threshold', 'interact', 'interact_click_count', 'interact_clicks',
+  'interact_duration', 'interact_intensity', 'interact_scrolling', 'isBrave',
+  'js_redirect_timeout', 'localhost', 'max_redirects', 'openvpn', 'pihole',
+  'plain', 'privoxy', 'proxy', 'proxy_bypass', 'proxy_debug', 'proxy_remote_dns',
+  'realistic_click', 'referrer_disable', 'referrer_headers', 'regex_and',
+  'reload', 'resourceTypes', 'screenshot', 'searchstring', 'searchstring_and',
+  'socks5_bypass', 'socks5_debug', 'socks5_proxy', 'socks5_remote_dns',
+  'subDomains',
+  'thirdParty', 'timeout', 'unbound', 'url', 'userAgent', 'verbose', 'vpn',
+  'whois', 'whois-or', 'whois_delay', 'whois_max_retries', 'whois_retry_on_error',
+  'whois_retry_on_timeout', 'whois_server', 'whois_server_mode',
+  'whois_timeout_multiplier', 'whois_use_fallback', 'window_cleanup',
+  'window_cleanup_threshold',
+  // Internal sentinel added by nwss.js when fanning array URLs into tasks.
+  '_originalUrl',
+]);
+// Boolean siteConfig fields where strict `=== true` is used downstream.
+// Listed only for fields with UNAMBIGUOUS boolean semantics — fields with
+// multi-type overloads stay out:
+//   forcereload         : true | string[]
+//   cloudflare_bypass   : true | 'debug'
+//   cloudflare_phish    : true | 'debug'
+//   window_cleanup      : true | 'all' | 'realtime'
+//   cursor_mode         : string ('ghost')
+// Update both this set AND the strict-equality call sites if a new boolean
+// siteConfig field is added.
+const BOOLEAN_SITE_CONFIG_FIELDS = new Set([
+  'adblock_rules', 'bypass_cache', 'capture_popups', 'cdp', 'clear_sitedata',
+  'clear_sitedata_full_on_reload', 'curl', 'delay_uncapped',
+  'detect_js_patterns', 'dig_subdomain',
+  'disable_adblock', 'dnsmasq', 'dnsmasq_old', 'evaluateOnNewDocument',
+  'even_blocked', 'firstParty', 'flowproxy_detection',
+  'grep', 'headful', 'ignore_similar', 'ignore_similar_ignored_domains',
+  'interact', 'interact_clicks', 'interact_scrolling', 'isBrave', 'localhost',
+  'pihole', 'plain', 'privoxy', 'proxy_debug', 'proxy_remote_dns',
+  'realistic_click', 'referrer_disable', 'regex_and', 'screenshot',
+  'searchstring_and', 'socks5_debug', 'socks5_remote_dns', 'thirdParty',
+  'unbound', 'whois_retry_on_error', 'whois_retry_on_timeout', 'whois_use_fallback',
+]);
+// Fields that accept BOTH `"x"` (single term) and `["x", "y"]` (multi-term).
+// Downstream consumers (nwss.js line ~2824, lib/nettools.js line ~1149-1152)
+// do `Array.isArray(val) && val.length > 0` checks, so a string value
+// previously caused silent feature-disable. normalizeSiteConfig() now wraps
+// any string value in a single-element array so both forms are first-class.
+// Non-string non-array values still warn (and stay as-is, since we don't
+// know how to coerce them).
+const STRING_TO_ARRAY_FIELDS = new Set([
+  'dig', 'dig-or', 'whois', 'whois-or',
+]);
+// Truthy-but-not-true → true. Falsy-but-not-false → false. Otherwise leave alone.
+// Strings are lower-cased before matching so "True"/"TRUE"/"Yes"/etc all match.
+function _coerceBooleanLike(val) {
+  if (val === true || val === false) return { coerced: false, value: val };
+  const s = typeof val === 'string' ? val.toLowerCase() : val;
+  if (s === 1 || s === '1' || s === 'true' || s === 'yes' || s === 'on') {
+    return { coerced: true, value: true };
+  }
+  if (s === 0 || s === '0' || s === 'false' || s === 'no' || s === 'off') {
+    return { coerced: true, value: false };
+  }
+  return { coerced: false, value: val };
+}
+// Tiny Levenshtein for "did you mean?" suggestions. Inlined rather than
+// imported from lib/ignore_similar (which has its own dependency tree we
+// don't want to drag into validation) -- 18 lines of well-known algorithm.
+function _editDistance(a, b) {
+  if (a === b) return 0;
+  if (!a) return b.length;
+  if (!b) return a.length;
+  const m = a.length, n = b.length;
+  let prev = new Array(n + 1);
+  let curr = new Array(n + 1);
+  for (let j = 0; j <= n; j++) prev[j] = j;
+  for (let i = 1; i <= m; i++) {
+    curr[0] = i;
+    for (let j = 1; j <= n; j++) {
+      curr[j] = a[i - 1] === b[j - 1]
+        ? prev[j - 1]
+        : 1 + Math.min(prev[j - 1], prev[j], curr[j - 1]);
+    }
+    [prev, curr] = [curr, prev];
+  }
+  return prev[n];
+}
+// Suggest a known key for an unknown one. Two parallel candidate searches,
+// then pick the better signal:
+//
+//   1. EDIT-DISTANCE candidate — classic typo case ('intract' → 'interact').
+//      Threshold scales with the unknown key's length (40%, min 2) so short
+//      typos stay matchable.
+//
+//   2. PREFIX candidate — "user added a suffix to a known root" case.
+//      'whois_terms' starts with 'whois' (known key) → suggest 'whois'.
+//      Requires the prefix to be at least 3 chars to avoid spurious matches
+//      on accidental 1-2 letter prefixes. Among multiple prefix candidates,
+//      we take the LONGEST (most specific category boundary).
+//
+// Ranking: if there's a very close edit-distance match (≤2 edits), prefer
+// it — almost certainly a misspelling of that specific key (e.g.
+// 'whois_max_retri' → 'whois_max_retries' at distance 2 beats the prefix
+// match 'whois'). Otherwise prefer the prefix match when present, since
+// "extra suffix on a known root" is a stronger signal than a 4+-edit
+// distance to an unrelated key.
+function _suggestKey(unknownKey, knownKeys) {
+  const threshold = Math.max(2, Math.floor(unknownKey.length * 0.4));
+  let distBest = null, distBestVal = Infinity;
+  let prefixBest = null, prefixBestLen = 0;
+  for (const k of knownKeys) {
+    const d = _editDistance(unknownKey, k);
+    if (d < distBestVal && d <= threshold) {
+      distBestVal = d;
+      distBest = k;
+    }
+    if (k.length >= 3 && unknownKey !== k &&
+        unknownKey.startsWith(k) && k.length > prefixBestLen) {
+      prefixBest = k;
+      prefixBestLen = k.length;
+    }
+  }
+  if (distBest && distBestVal <= 2) return distBest;
+  return prefixBest || distBest;
+}
+/**
+ * Per-site validation + boolean coercion run at scan startup (always, not
+ * gated on --validate-config).
+ *
+ * Mutates siteConfig in place to coerce boolean-like values (1, 0, "true",
+ * "false", "yes", "no", "on", "off") to true/false for fields in
+ * BOOLEAN_SITE_CONFIG_FIELDS. Returns warnings the caller surfaces via the
+ * usual logging path.
+ *
+ * Catches the failure classes:
+ *   1. Unknown siteConfig keys → typo warning + "did you mean?" suggestion.
+ *      Example: 'whois_terms' → "did you mean 'whois'?"
+ *   2. Boolean field with a recognized boolean-like value (1, 0, "yes", "off", …) → coerce + warn.
+ *      Example: 'interact: 1' → coerced to 'interact: true', warning emitted.
+ *   3. Boolean field with any other non-boolean value → warn only, no coerce.
+ *      Example: 'interact: "maybe"' → warned, left alone.
+ *
+ * @param {object} siteConfig - mutated in place
+ * @param {number} siteIndex - for warning messages
+ * @returns {{warnings: string[], errors: string[]}}
+ */
+function normalizeSiteConfig(siteConfig, siteIndex = 0) {
+  const warnings = [];
+  const errors = [];
+  if (!siteConfig || typeof siteConfig !== 'object') {
+    errors.push(`Site ${siteIndex}: not an object`);
+    return { warnings, errors };
+  }
+  const tag = siteConfig.url ? `Site ${siteIndex} (${siteConfig.url})` : `Site ${siteIndex}`;
+  // 1. Unknown-key detection. Scan every top-level key; report with a
+  // "did you mean?" suggestion (edit-distance or known-key prefix match).
+  for (const key of Object.keys(siteConfig)) {
+    if (KNOWN_SITE_CONFIG_KEYS.has(key)) continue;
+    const suggestion = _suggestKey(key, KNOWN_SITE_CONFIG_KEYS);
+    warnings.push(
+      `${tag}: unknown siteConfig key '${key}'` +
+      (suggestion ? ` — did you mean '${suggestion}'?` : '') +
+      ' — value will be ignored at runtime'
+    );
+  }
+  // 2. Boolean coercion for known boolean fields. Mutates siteConfig.
+  for (const field of BOOLEAN_SITE_CONFIG_FIELDS) {
+    if (!(field in siteConfig)) continue;
+    const original = siteConfig[field];
+    if (original === undefined || original === null) continue;
+    const { coerced, value } = _coerceBooleanLike(original);
+    if (coerced) {
+      siteConfig[field] = value;
+      warnings.push(
+        `${tag}: '${field}' value ${JSON.stringify(original)} should be ${value} ` +
+        `(boolean) — coerced for compatibility; please update config to use ${value}`
+      );
+    } else if (typeof original !== 'boolean') {
+      warnings.push(
+        `${tag}: '${field}' should be boolean (true/false), got ${JSON.stringify(original)} ` +
+        `— may not work as expected (downstream strict-equality check will treat as disabled)`
+      );
+    }
+  }
+  // 3. String → single-element array coercion for fields that accept both
+  // forms (dig, dig-or, whois, whois-or). Downstream consumers all gate on
+  // Array.isArray(), so a bare string value previously silently disabled
+  // the feature. Wrapping in [val] is the canonical "user gave one term"
+  // outcome and matches user intent. Both forms are first-class — no
+  // warning is emitted on the string path, just the in-place mutation.
+  //
+  // Empty string is left alone: the downstream `siteConfig.dig && ...`
+  // check sees the empty string as falsy and disables the feature. If we
+  // coerced "" to [""], nettools' array.length>0 check would PASS and then
+  // every dig/whois output would match (`"".includes(anything)` is true),
+  // turning a clearly-empty config into a match-everything one.
+  //
+  // Non-string non-array values DO warn since we can't sensibly coerce.
+  for (const field of STRING_TO_ARRAY_FIELDS) {
+    if (!(field in siteConfig)) continue;
+    const val = siteConfig[field];
+    if (val === undefined || val === null) continue;
+    if (typeof val === 'string') {
+      if (val.length > 0) siteConfig[field] = [val];
+      // empty string: leave as-is (preserves disable-on-falsy semantics)
+    } else if (!Array.isArray(val)) {
+      warnings.push(
+        `${tag}: '${field}' should be a string or array of strings, got ${typeof val} ` +
+        `(${JSON.stringify(val).slice(0, 60)}) — feature will be disabled at runtime`
+      );
+    }
+  }
+  // 4. Dependent-flag implication: clear_sitedata_full_on_reload only takes
+  // effect inside the `if (clear_sitedata === true)` guard at nwss.js:4627
+  // — setting it WITHOUT clear_sitedata: true silently does nothing. That's
+  // the same silent-failure pattern this validator was created to prevent,
+  // so auto-enable clear_sitedata and warn the user. They almost certainly
+  // intended both to be true; opt-in to heavy-storage clearing without
+  // opt-in to clearing-at-all doesn't make sense as a configuration.
+  if (siteConfig.clear_sitedata_full_on_reload === true &&
+      siteConfig.clear_sitedata !== true) {
+    siteConfig.clear_sitedata = true;
+    warnings.push(
+      `${tag}: 'clear_sitedata_full_on_reload: true' requires 'clear_sitedata: true' ` +
+      `— auto-enabled clear_sitedata for compatibility; please add 'clear_sitedata: true' ` +
+      `to your config explicitly`
+    );
+  }
+  return { warnings, errors };
+}
 // Public surface used by nwss.js (validateRulesetFile, validateFullConfig,
-// testDomainValidation, cleanRulesetFile). The rest (isValidDomain,
-// isValidDomainLabel, isValidTLD, isIPAddress, isIPv4, isIPv6,
+// testDomainValidation, cleanRulesetFile, normalizeSiteConfig). The rest
+// (isValidDomain, isValidDomainLabel, isValidTLD, isIPAddress, isIPv4, isIPv6,
 // validateRegexPattern, validateAdblockModifiers, validateAdblockRule,
 // validateSiteConfig) stay internal-helper-but-exported for now since
 // downstream callers MAY import them via the dotted path even if grep
@@ -1100,5 +1381,6 @@ module.exports = {
   cleanRulesetFile,
   validateSiteConfig,
   validateFullConfig,
+  normalizeSiteConfig,
   testDomainValidation
 };