@fanboynz/network-scanner 3.0.3 → 3.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,12 +3,12 @@
3
3
 
4
4
  const fs = require('fs');
5
5
  const { formatLogMessage, messageColors } = require('./colorize');
6
- const CURL_TAG = messageColors.processing('[curl]');
7
- // responseHandler is a separate code path (Puppeteer response listener,
8
- // not curl) — its debug output gets its own subsystem prefix so it's
9
- // distinguishable from curl-handler logs.
6
+ // Subsystem tag for the Puppeteer response-listener path. createCurlHandler
7
+ // + its CURL_TAG (and the downloadWithCurl/downloadWithRetry helpers) used
8
+ // to live here but were dead — nwss.js imports the curl-based handler from
9
+ // lib/curl.js instead. Removed in the same cleanup that drops those
10
+ // functions.
10
11
  const SEARCHSTRING_TAG = messageColors.processing('[searchstring]');
11
- const { runProcess } = require('./spawn-async');
12
12
  const { grepContent } = require('./grep');
13
13
 
14
14
  // Configuration constants for search logic
@@ -51,83 +51,6 @@ function parseSearchStrings(searchstring, searchstringAnd) {
51
51
  };
52
52
  }
53
53
 
54
- /**
55
- * Downloads content using curl with appropriate headers and timeout
56
- * @param {string} url - The URL to download
57
- * @param {string} userAgent - User agent string to use
58
- * @param {number} timeout - Timeout in seconds (default: 30)
59
- * @returns {Promise<string>} The downloaded content
60
- */
61
- async function downloadWithCurl(url, userAgent = '', timeout = 30) {
62
- const MAX_STDOUT_BYTES = 52428800; // 50MB, matches --max-filesize below
63
-
64
- const curlArgs = [
65
- '-s',
66
- '-L',
67
- '--max-time', timeout.toString(),
68
- '--max-redirs', '5',
69
- '--fail-with-body',
70
- '--max-filesize', '52428800',
71
- '--range', '0-52428799',
72
- '--compressed'
73
- ];
74
- if (userAgent) curlArgs.push('-H', `User-Agent: ${userAgent}`);
75
- curlArgs.push(
76
- '-H', 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
77
- '-H', 'Accept-Language: en-US,en;q=0.5',
78
- '-H', 'Accept-Encoding: gzip, deflate',
79
- '-H', 'Connection: keep-alive',
80
- '-H', 'Upgrade-Insecure-Requests: 1'
81
- );
82
- curlArgs.push(url);
83
-
84
- // Shared async-spawn helper — same streaming/cap/timeout/kill plumbing
85
- // that used to be ~80 lines of inline boilerplate here.
86
- const result = await runProcess('curl', curlArgs, {
87
- timeout: timeout * 1000,
88
- maxStdout: MAX_STDOUT_BYTES
89
- });
90
-
91
- if (result.error) throw new Error(`Curl failed for ${url}: ${result.error}`);
92
- if (result.truncated) throw new Error(`Curl output exceeded ${MAX_STDOUT_BYTES} bytes for ${url}`);
93
- if (result.signal) throw new Error(`Curl killed by signal ${result.signal} for ${url}`);
94
- if (result.code !== 0) {
95
- throw new Error(`Curl exited with status ${result.code}: ${result.stderr.toString('utf8')}`);
96
- }
97
- return result.stdout.toString('utf8');
98
- }
99
-
100
- /**
101
- * Downloads content with retry logic for transient failures
102
- * @param {string} url - The URL to download
103
- * @param {string} userAgent - User agent string to use
104
- * @param {number} timeout - Timeout in seconds
105
- * @param {number} retries - Number of retry attempts (default: 2)
106
- * @returns {Promise<string>} The downloaded content
107
- */
108
- async function downloadWithRetry(url, userAgent = '', timeout = 30, retries = 2) {
109
- for (let attempt = 0; attempt <= retries; attempt++) {
110
- try {
111
- return await downloadWithCurl(url, userAgent, timeout);
112
- } catch (err) {
113
- // Don't retry on final attempt
114
- if (attempt === retries) throw err;
115
-
116
- // Only retry on specific transient errors
117
- const shouldRetry = err.message.includes('timeout') ||
118
- err.message.includes('Connection refused') ||
119
- err.message.includes('502') ||
120
- err.message.includes('503') ||
121
- err.message.includes('Connection reset');
122
-
123
- if (!shouldRetry) throw err;
124
-
125
- // Exponential backoff: 1s, 2s, 4s...
126
- await new Promise(resolve => setTimeout(resolve, 1000 * Math.pow(2, attempt)));
127
- }
128
- }
129
- }
130
-
131
54
  // Lookup table for the 6 named entities the previous chained-replace
132
55
  // handled. Hoisted out of safeDecodeXmlEntities so the object isn't
133
56
  // reallocated per call.
@@ -337,157 +260,6 @@ function shouldAnalyzeContentType(contentType) {
337
260
  return textTypes.some(type => normalizedType.startsWith(type));
338
261
  }
339
262
 
340
- /**
341
- * Creates a curl-based URL handler for downloading and optionally searching content
342
- * @param {object} config - Configuration object containing all necessary parameters
343
- * @returns {Function} URL handler function for curl-based content analysis
344
- */
345
- function createCurlHandler(config) {
346
- const {
347
- searchStrings,
348
- searchStringsAnd,
349
- hasSearchStringAnd,
350
- regexes,
351
- matchedDomains,
352
- addMatchedDomain, // Helper function for adding domains
353
- currentUrl,
354
- perSiteSubDomains,
355
- ignoreDomains,
356
- matchesIgnoreDomain,
357
- getRootDomain,
358
- siteConfig,
359
- dumpUrls,
360
- matchedUrlsLogFile,
361
- forceDebug,
362
- userAgent,
363
- resourceType, // Resource type from request
364
- hasSearchString
365
- } = config;
366
-
367
- // Hoisted: currentUrl doesn't change for this handler's lifetime, so
368
- // parsing its hostname once at handler-creation eliminates the
369
- // per-request URL allocation.
370
- let currentUrlHostname = '';
371
- try { currentUrlHostname = new URL(currentUrl).hostname; } catch (_) {}
372
-
373
- return async function curlHandler(requestUrl) {
374
- // Regex check FIRST — cheap filter that skips ~99% of requests.
375
- // Previously this ran AFTER a URL parse + domain-cache lookup;
376
- // the parse is the expensive bit, so doing it after the cheap
377
- // gate moves the cost off the hot path.
378
- const matchesRegex = regexes.some(re => re.test(requestUrl));
379
- if (!matchesRegex) return;
380
-
381
- // Parse requestUrl ONCE and reuse. Was parsed 2-3 times.
382
- let requestHostname;
383
- try { requestHostname = new URL(requestUrl).hostname; } catch (_) { return; }
384
- const reqDomain = perSiteSubDomains ? requestHostname : getRootDomain(requestUrl);
385
-
386
- if (typeof config.isDomainAlreadyDetected === 'function' && config.isDomainAlreadyDetected(reqDomain)) {
387
- if (forceDebug) {
388
- console.log(formatLogMessage('debug', `${CURL_TAG} Skipping already detected domain: ${reqDomain}`));
389
- }
390
- return;
391
- }
392
-
393
- const isFirstParty = currentUrlHostname === requestHostname;
394
-
395
- // Apply first-party/third-party filtering
396
- if (isFirstParty && siteConfig.firstParty === false) {
397
- if (forceDebug) {
398
- console.log(formatLogMessage('debug', `${CURL_TAG} Skipping first-party request (firstParty=false): ${requestUrl}`));
399
- }
400
- return;
401
- }
402
-
403
- if (!isFirstParty && siteConfig.thirdParty === false) {
404
- if (forceDebug) {
405
- console.log(formatLogMessage('debug', `${CURL_TAG} Skipping third-party request (thirdParty=false): ${requestUrl}`));
406
- }
407
- return;
408
- }
409
-
410
- try {
411
- if (forceDebug) {
412
- console.log(formatLogMessage('debug', `${CURL_TAG} Downloading content from: ${requestUrl}`));
413
- }
414
-
415
- // If NO searchstring is defined, match immediately (like browser behavior)
416
- if (!hasSearchString && !hasSearchStringAnd) {
417
- if (!reqDomain || matchesIgnoreDomain(reqDomain, ignoreDomains)) {
418
- return;
419
- }
420
-
421
- addMatchedDomain(reqDomain, resourceType);
422
- const simplifiedUrl = getRootDomain(currentUrl);
423
-
424
- if (siteConfig.verbose === 1) {
425
- const partyType = isFirstParty ? 'first-party' : 'third-party';
426
- const resourceInfo = resourceType ? ` (${resourceType})` : '';
427
- console.log(`[match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl) matched regex${resourceInfo}`);
428
- }
429
-
430
- if (dumpUrls) {
431
- const timestamp = new Date().toISOString();
432
- const partyType = isFirstParty ? 'first-party' : 'third-party';
433
- const resourceInfo = resourceType ? ` (${resourceType})` : '';
434
- try {
435
- fs.appendFileSync(matchedUrlsLogFile,
436
- `${timestamp} [match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl)${resourceInfo}\n`);
437
- } catch (logErr) {
438
- console.warn(formatLogMessage('warn', `Failed to write to matched URLs log: ${logErr.message}`));
439
- }
440
- }
441
- return;
442
- }
443
-
444
- // If searchstring IS defined, download and search content
445
- const content = await downloadWithRetry(requestUrl, userAgent, 30);
446
-
447
- // Check if content contains search strings (OR or AND logic)
448
- const { found, matchedString, logicType, error } = searchContent(content, searchStrings, searchStringsAnd, '', requestUrl);
449
-
450
- if (found) {
451
- if (!reqDomain || matchesIgnoreDomain(reqDomain, ignoreDomains)) {
452
- return;
453
- }
454
-
455
- addMatchedDomain(reqDomain, resourceType);
456
- const simplifiedUrl = getRootDomain(currentUrl);
457
-
458
- if (siteConfig.verbose === 1) {
459
- const partyType = isFirstParty ? 'first-party' : 'third-party';
460
- const resourceInfo = resourceType ? ` (${resourceType})` : '';
461
- console.log(`[match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl) contains searchstring (${logicType}): "${matchedString}"${resourceInfo}`);
462
- }
463
-
464
- if (dumpUrls) {
465
- const timestamp = new Date().toISOString();
466
- const partyType = isFirstParty ? 'first-party' : 'third-party';
467
- const resourceInfo = resourceType ? ` (${resourceType})` : '';
468
- try {
469
- fs.appendFileSync(matchedUrlsLogFile,
470
- `${timestamp} [match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl, searchstring (${logicType}): "${matchedString}")${resourceInfo}\n`);
471
- } catch (logErr) {
472
- console.warn(formatLogMessage('warn', `Failed to write to matched URLs log: ${logErr.message}`));
473
- }
474
- }
475
- } else if (forceDebug) {
476
- const partyType = isFirstParty ? 'first-party' : 'third-party';
477
- console.log(formatLogMessage('debug', `${CURL_TAG} ${requestUrl} (${partyType}) matched regex but no searchstring found`));
478
- if (error) {
479
- console.log(formatLogMessage('debug', `${CURL_TAG} Search error: ${error}`));
480
- }
481
- }
482
-
483
- } catch (err) {
484
- if (forceDebug) {
485
- console.log(formatLogMessage('debug', `${CURL_TAG} Failed to download content for ${requestUrl}: ${err.message}`));
486
- }
487
- }
488
- };
489
- }
490
-
491
263
  /**
492
264
  * Creates a response handler function for the given configuration
493
265
  * @param {object} config - Configuration object containing all necessary parameters
@@ -758,14 +530,20 @@ function validateSearchString(searchstring, searchstringAnd) {
758
530
  return { isValid: true, error: null };
759
531
  }
760
532
 
533
+ // Public surface used by nwss.js (parseSearchStrings, createResponseHandler)
534
+ // and lib/validate_rules.js (validateSearchString). searchContent,
535
+ // safeDecodeXmlEntities, and shouldAnalyzeContentType stay exported as
536
+ // reasonable internal-helper API surface even though current external
537
+ // consumers don't import them. createCurlHandler + downloadWithCurl +
538
+ // downloadWithRetry were removed entirely — createCurlHandler had no
539
+ // external invocations (nwss.js imported the name but never called it,
540
+ // using lib/curl.js's version instead), and the download helpers were
541
+ // only consumed by createCurlHandler.
761
542
  module.exports = {
762
543
  parseSearchStrings,
763
544
  searchContent,
764
545
  safeDecodeXmlEntities,
765
546
  shouldAnalyzeContentType,
766
547
  createResponseHandler,
767
- createCurlHandler,
768
- downloadWithCurl,
769
- validateSearchString,
770
- downloadWithRetry
548
+ validateSearchString
771
549
  };
@@ -93,10 +93,16 @@ class SmartCache {
93
93
  this._setupAutoSave();
94
94
  }
95
95
 
96
- // Set up memory monitoring
96
+ // Set up memory monitoring. unref'd so this always-on housekeeping timer
97
+ // can never hold the event loop open past scan completion — destroy()
98
+ // clears it promptly on the normal path, but unref guarantees a clean
99
+ // exit on any path that skips destroy() (e.g. an unhandled throw before
100
+ // nwss reaches its cleanup). Matches the unref convention applied to
101
+ // every other Node-side timer in the codebase.
97
102
  this.memoryCheckInterval = setInterval(() => {
98
103
  this._checkMemoryPressure();
99
104
  }, this.options.memoryCheckInterval);
105
+ if (typeof this.memoryCheckInterval.unref === 'function') this.memoryCheckInterval.unref();
100
106
  }
101
107
 
102
108
  /**
@@ -1137,9 +1143,11 @@ class SmartCache {
1137
1143
  * @private
1138
1144
  */
1139
1145
  _setupAutoSave() {
1146
+ // unref'd for the same reason as memoryCheckInterval — never block exit.
1140
1147
  this.autoSaveInterval = setInterval(() => {
1141
1148
  this.savePersistentCache();
1142
1149
  }, this.options.autoSaveInterval);
1150
+ if (typeof this.autoSaveInterval.unref === 'function') this.autoSaveInterval.unref();
1143
1151
  }
1144
1152
 
1145
1153
  /**
@@ -227,13 +227,11 @@ function handleClient(client, upstream, forceDebug, relay) {
227
227
 
228
228
  upstreamSock = info.socket;
229
229
  // Safety net: if cleanup() ran while we were awaiting the upstream
230
- // connect (some path other than the handshake watchdog — e.g. a
231
- // 'close' event on the client during pause), settled is true and
232
- // cleanup's settled guard would short-circuit a future call,
233
- // orphaning this freshly-connected upstream socket. Destroy it
234
- // here directly. With Fix #1a moving the watchdog clearTimeout to
235
- // the 'connecting' transition this is currently unreachable, but
236
- // cheap to keep as defense-in-depth against future code paths.
230
+ // connect, settled is true and cleanup's settled guard would
231
+ // short-circuit a future call, orphaning this freshly-connected
232
+ // upstream socket — so destroy it here directly. Reachable when the
233
+ // client emits 'error' or 'close' during the await (both wired to
234
+ // cleanup at handler setup), e.g. Chromium disconnects mid-connect.
237
235
  if (settled) {
238
236
  try { upstreamSock.destroy(); } catch (_) {}
239
237
  return;
@@ -250,8 +248,8 @@ function handleClient(client, upstream, forceDebug, relay) {
250
248
  try { upstreamSock.setKeepAlive(true, 60000); } catch (_) {}
251
249
  upstreamSock.on('error', cleanup);
252
250
  upstreamSock.on('close', cleanup);
253
- client.on('error', cleanup);
254
- client.on('close', cleanup);
251
+ // client 'error' and 'close' are wired once at handler setup (bottom
252
+ // of handleClient) and cover all phases — not re-attached here.
255
253
 
256
254
  // SOCKS5 success (BND.ADDR 0.0.0.0:0 — Chromium ignores it for CONNECT)
257
255
  client.write(Buffer.from([0x05, 0x00, 0x00, 0x01, 0, 0, 0, 0, 0, 0]));
@@ -273,6 +271,13 @@ function handleClient(client, upstream, forceDebug, relay) {
273
271
 
274
272
  client.on('data', onData);
275
273
  client.on('error', cleanup);
274
+ // Attach 'close' HERE (not after piping starts) so it covers the whole
275
+ // lifetime, including the up-to-20s upstream-connect await. A client that
276
+ // disconnects cleanly mid-connect now sets settled=true, letting the
277
+ // post-connect `if (settled)` net destroy the freshly-opened upstream
278
+ // socket instead of piping into a dead client; and a close mid-handshake
279
+ // clears the watchdog immediately rather than leaving it to fire later.
280
+ client.on('close', cleanup);
276
281
  }
277
282
 
278
283
  // SOCKS5 failure reply (valid only before piping starts).
@@ -583,7 +583,11 @@ function validateRulesetFile(filePath, options = {}) {
583
583
  errors.push(`Line ${lineNumber}: ${validation.error} - ${line}`);
584
584
 
585
585
  if (errors.length >= maxErrors) {
586
- errors.push(`... (stopping after ${maxErrors} errors, ${stats.total - i - 1} lines remaining)`);
586
+ // Lines remaining in the file = total lines - current index - 1.
587
+ // (Previously `stats.total - i - 1`, which mixed "non-empty lines
588
+ // processed" with "file line index" and went negative when empties
589
+ // were interleaved.)
590
+ errors.push(`... (stopping after ${maxErrors} errors, ${lines.length - i - 1} lines remaining)`);
587
591
  break;
588
592
  }
589
593
  }
@@ -1075,9 +1079,286 @@ function testDomainValidation() {
1075
1079
  return allPassed;
1076
1080
  }
1077
1081
 
1082
+ // ─── Per-site config normalization (runs on every scan, not just --validate-config) ───
1083
+ //
1084
+ // Catches the silent-failure class that bit a user across multiple scan iterations:
1085
+ // 1. Typo'd siteConfig keys (whois_terms vs whois) silently ignored.
1086
+ // 2. Boolean fields given truthy/falsy non-boolean values (interact: 1 vs interact: true)
1087
+ // silently disabled by strict `=== true` checks downstream.
1088
+ // 3. Misleading downstream warnings that blame the wrong field.
1089
+ //
1090
+ // normalizeSiteConfig() mutates siteConfig in place (coercing 1→true, etc) and returns
1091
+ // warnings the caller surfaces. Designed to run at scan startup, ALWAYS, not gated on
1092
+ // --validate-config (which most users never run).
1093
+
1094
+ // Whitelist of every siteConfig.X key read across nwss.js + lib/*.js.
1095
+ // Regenerate via BOTH:
1096
+ // grep -hoE "siteConfig\.[a-zA-Z_][a-zA-Z0-9_]*" nwss.js lib/*.js | sort -u
1097
+ // grep -hoE "siteConfig\[['\"][^'\"]+['\"]\]" nwss.js lib/*.js | sort -u
1098
+ // The second pattern catches bracket-notation access required for keys with
1099
+ // hyphens (e.g. 'dig-or', 'whois-or'). Dot-notation grep alone missed these
1100
+ // and produced false 'unknown siteConfig key' warnings for valid config.
1101
+ // Also grep for destructured siteConfig keys (master destructure block in
1102
+ // processUrl) — those don't show up in either pattern.
1103
+ const KNOWN_SITE_CONFIG_KEYS = new Set([
1104
+ 'adblock_rules', 'blocked', 'bypass_cache', 'capture_popups',
1105
+ 'capture_popups_max_depth', 'capture_popups_window_ms', 'cdp', 'cdp_specific',
1106
+ 'clear_sitedata', 'clear_sitedata_full_on_reload',
1107
+ 'cloudflare_bypass', 'cloudflare_max_retries', 'comments',
1108
+ 'cloudflare_parallel_detection', 'cloudflare_phish', 'cloudflare_retry_on_error',
1109
+ 'css_blocked', 'curl', 'cursor_mode', 'custom_headers', 'delay',
1110
+ 'delay_uncapped', 'detect_js_patterns', 'dig', 'dig-or', 'digRecordType', 'dig_subdomain',
1111
+ 'disable_adblock', 'dnsmasq', 'dnsmasq_old', 'evaluateOnNewDocument',
1112
+ 'even_blocked',
1113
+ 'filterRegex', 'fingerprint_protection', 'firstParty', 'flowproxy_additional_delay',
1114
+ 'flowproxy_delay', 'flowproxy_detection', 'flowproxy_js_timeout', 'flowproxy_nav_timeout',
1115
+ 'flowproxy_page_timeout', 'forcereload', 'ghost_cursor_duration',
1116
+ 'ghost_cursor_hesitate', 'ghost_cursor_overshoot', 'ghost_cursor_speed',
1117
+ 'goto_options', 'grep', 'headful', 'ignore_similar', 'ignore_similar_ignored_domains',
1118
+ 'ignore_similar_threshold', 'interact', 'interact_click_count', 'interact_clicks',
1119
+ 'interact_duration', 'interact_intensity', 'interact_scrolling', 'isBrave',
1120
+ 'js_redirect_timeout', 'localhost', 'max_redirects', 'openvpn', 'pihole',
1121
+ 'plain', 'privoxy', 'proxy', 'proxy_bypass', 'proxy_debug', 'proxy_remote_dns',
1122
+ 'realistic_click', 'referrer_disable', 'referrer_headers', 'regex_and',
1123
+ 'reload', 'resourceTypes', 'screenshot', 'searchstring', 'searchstring_and',
1124
+ 'socks5_bypass', 'socks5_debug', 'socks5_proxy', 'socks5_remote_dns',
1125
+ 'subDomains',
1126
+ 'thirdParty', 'timeout', 'unbound', 'url', 'userAgent', 'verbose', 'vpn',
1127
+ 'whois', 'whois-or', 'whois_delay', 'whois_max_retries', 'whois_retry_on_error',
1128
+ 'whois_retry_on_timeout', 'whois_server', 'whois_server_mode',
1129
+ 'whois_timeout_multiplier', 'whois_use_fallback', 'window_cleanup',
1130
+ 'window_cleanup_threshold',
1131
+ // Internal sentinel added by nwss.js when fanning array URLs into tasks.
1132
+ '_originalUrl',
1133
+ ]);
1134
+
1135
+ // Boolean siteConfig fields where strict `=== true` is used downstream.
1136
+ // Listed only for fields with UNAMBIGUOUS boolean semantics — fields with
1137
+ // multi-type overloads stay out:
1138
+ // forcereload : true | string[]
1139
+ // cloudflare_bypass : true | 'debug'
1140
+ // cloudflare_phish : true | 'debug'
1141
+ // window_cleanup : true | 'all' | 'realtime'
1142
+ // cursor_mode : string ('ghost')
1143
+ // Update both this set AND the strict-equality call sites if a new boolean
1144
+ // siteConfig field is added.
1145
+ const BOOLEAN_SITE_CONFIG_FIELDS = new Set([
1146
+ 'adblock_rules', 'bypass_cache', 'capture_popups', 'cdp', 'clear_sitedata',
1147
+ 'clear_sitedata_full_on_reload', 'curl', 'delay_uncapped',
1148
+ 'detect_js_patterns', 'dig_subdomain',
1149
+ 'disable_adblock', 'dnsmasq', 'dnsmasq_old', 'evaluateOnNewDocument',
1150
+ 'even_blocked', 'firstParty', 'flowproxy_detection',
1151
+ 'grep', 'headful', 'ignore_similar', 'ignore_similar_ignored_domains',
1152
+ 'interact', 'interact_clicks', 'interact_scrolling', 'isBrave', 'localhost',
1153
+ 'pihole', 'plain', 'privoxy', 'proxy_debug', 'proxy_remote_dns',
1154
+ 'realistic_click', 'referrer_disable', 'regex_and', 'screenshot',
1155
+ 'searchstring_and', 'socks5_debug', 'socks5_remote_dns', 'thirdParty',
1156
+ 'unbound', 'whois_retry_on_error', 'whois_retry_on_timeout', 'whois_use_fallback',
1157
+ ]);
1158
+
1159
+ // Fields that accept BOTH `"x"` (single term) and `["x", "y"]` (multi-term).
1160
+ // Downstream consumers (nwss.js line ~2824, lib/nettools.js line ~1149-1152)
1161
+ // do `Array.isArray(val) && val.length > 0` checks, so a string value
1162
+ // previously caused silent feature-disable. normalizeSiteConfig() now wraps
1163
+ // any string value in a single-element array so both forms are first-class.
1164
+ // Non-string non-array values still warn (and stay as-is, since we don't
1165
+ // know how to coerce them).
1166
+ const STRING_TO_ARRAY_FIELDS = new Set([
1167
+ 'dig', 'dig-or', 'whois', 'whois-or',
1168
+ ]);
1169
+
1170
+ // Truthy-but-not-true → true. Falsy-but-not-false → false. Otherwise leave alone.
1171
+ // Strings are lower-cased before matching so "True"/"TRUE"/"Yes"/etc all match.
1172
+ function _coerceBooleanLike(val) {
1173
+ if (val === true || val === false) return { coerced: false, value: val };
1174
+ const s = typeof val === 'string' ? val.toLowerCase() : val;
1175
+ if (s === 1 || s === '1' || s === 'true' || s === 'yes' || s === 'on') {
1176
+ return { coerced: true, value: true };
1177
+ }
1178
+ if (s === 0 || s === '0' || s === 'false' || s === 'no' || s === 'off') {
1179
+ return { coerced: true, value: false };
1180
+ }
1181
+ return { coerced: false, value: val };
1182
+ }
1183
+
1184
+ // Tiny Levenshtein for "did you mean?" suggestions. Inlined rather than
1185
+ // imported from lib/ignore_similar (which has its own dependency tree we
1186
+ // don't want to drag into validation) -- 18 lines of well-known algorithm.
1187
+ function _editDistance(a, b) {
1188
+ if (a === b) return 0;
1189
+ if (!a) return b.length;
1190
+ if (!b) return a.length;
1191
+ const m = a.length, n = b.length;
1192
+ let prev = new Array(n + 1);
1193
+ let curr = new Array(n + 1);
1194
+ for (let j = 0; j <= n; j++) prev[j] = j;
1195
+ for (let i = 1; i <= m; i++) {
1196
+ curr[0] = i;
1197
+ for (let j = 1; j <= n; j++) {
1198
+ curr[j] = a[i - 1] === b[j - 1]
1199
+ ? prev[j - 1]
1200
+ : 1 + Math.min(prev[j - 1], prev[j], curr[j - 1]);
1201
+ }
1202
+ [prev, curr] = [curr, prev];
1203
+ }
1204
+ return prev[n];
1205
+ }
1206
+
1207
+ // Suggest a known key for an unknown one. Two parallel candidate searches,
1208
+ // then pick the better signal:
1209
+ //
1210
+ // 1. EDIT-DISTANCE candidate — classic typo case ('intract' → 'interact').
1211
+ // Threshold scales with the unknown key's length (40%, min 2) so short
1212
+ // typos stay matchable.
1213
+ //
1214
+ // 2. PREFIX candidate — "user added a suffix to a known root" case.
1215
+ // 'whois_terms' starts with 'whois' (known key) → suggest 'whois'.
1216
+ // Requires the prefix to be at least 3 chars to avoid spurious matches
1217
+ // on accidental 1-2 letter prefixes. Among multiple prefix candidates,
1218
+ // we take the LONGEST (most specific category boundary).
1219
+ //
1220
+ // Ranking: if there's a very close edit-distance match (≤2 edits), prefer
1221
+ // it — almost certainly a misspelling of that specific key (e.g.
1222
+ // 'whois_max_retri' → 'whois_max_retries' at distance 2 beats the prefix
1223
+ // match 'whois'). Otherwise prefer the prefix match when present, since
1224
+ // "extra suffix on a known root" is a stronger signal than a 4+-edit
1225
+ // distance to an unrelated key.
1226
+ function _suggestKey(unknownKey, knownKeys) {
1227
+ const threshold = Math.max(2, Math.floor(unknownKey.length * 0.4));
1228
+ let distBest = null, distBestVal = Infinity;
1229
+ let prefixBest = null, prefixBestLen = 0;
1230
+
1231
+ for (const k of knownKeys) {
1232
+ const d = _editDistance(unknownKey, k);
1233
+ if (d < distBestVal && d <= threshold) {
1234
+ distBestVal = d;
1235
+ distBest = k;
1236
+ }
1237
+ if (k.length >= 3 && unknownKey !== k &&
1238
+ unknownKey.startsWith(k) && k.length > prefixBestLen) {
1239
+ prefixBest = k;
1240
+ prefixBestLen = k.length;
1241
+ }
1242
+ }
1243
+
1244
+ if (distBest && distBestVal <= 2) return distBest;
1245
+ return prefixBest || distBest;
1246
+ }
1247
+
1248
+ /**
1249
+ * Per-site validation + boolean coercion run at scan startup (always, not
1250
+ * gated on --validate-config).
1251
+ *
1252
+ * Mutates siteConfig in place to coerce boolean-like values (1, 0, "true",
1253
+ * "false", "yes", "no", "on", "off") to true/false for fields in
1254
+ * BOOLEAN_SITE_CONFIG_FIELDS. Returns warnings the caller surfaces via the
1255
+ * usual logging path.
1256
+ *
1257
+ * Catches the failure classes:
1258
+ * 1. Unknown siteConfig keys → typo warning + "did you mean?" suggestion.
1259
+ * Example: 'whois_terms' → "did you mean 'whois'?"
1260
+ * 2. Boolean field with a recognized boolean-like value (1, 0, "true", "false", "yes", "no", "on", "off") → coerce + warn.
1261
+ * Example: 'interact: 1' → coerced to 'interact: true', warning emitted.
1262
+ * 3. Boolean field with an unrecognized non-boolean value → warn only, no coerce.
1263
+ * Example: 'interact: "maybe"' → warned, left alone.
1264
+ *
1265
+ * @param {object} siteConfig - mutated in place
1266
+ * @param {number} siteIndex - for warning messages
1267
+ * @returns {{warnings: string[], errors: string[]}}
1268
+ */
1269
+ function normalizeSiteConfig(siteConfig, siteIndex = 0) {
1270
+ const warnings = [];
1271
+ const errors = [];
1272
+ if (!siteConfig || typeof siteConfig !== 'object') {
1273
+ errors.push(`Site ${siteIndex}: not an object`);
1274
+ return { warnings, errors };
1275
+ }
1276
+ const tag = siteConfig.url ? `Site ${siteIndex} (${siteConfig.url})` : `Site ${siteIndex}`;
1277
+
1278
+ // 1. Unknown-key detection. Scan every top-level key; report with
1279
+ // Levenshtein-based suggestion when close to a known key.
1280
+ for (const key of Object.keys(siteConfig)) {
1281
+ if (KNOWN_SITE_CONFIG_KEYS.has(key)) continue;
1282
+ const suggestion = _suggestKey(key, KNOWN_SITE_CONFIG_KEYS);
1283
+ warnings.push(
1284
+ `${tag}: unknown siteConfig key '${key}'` +
1285
+ (suggestion ? ` — did you mean '${suggestion}'?` : '') +
1286
+ ' — value will be ignored at runtime'
1287
+ );
1288
+ }
1289
+
1290
+ // 2. Boolean coercion for known boolean fields. Mutates siteConfig.
1291
+ for (const field of BOOLEAN_SITE_CONFIG_FIELDS) {
1292
+ if (!(field in siteConfig)) continue;
1293
+ const original = siteConfig[field];
1294
+ if (original === undefined || original === null) continue;
1295
+ const { coerced, value } = _coerceBooleanLike(original);
1296
+ if (coerced) {
1297
+ siteConfig[field] = value;
1298
+ warnings.push(
1299
+ `${tag}: '${field}' value ${JSON.stringify(original)} should be ${value} ` +
1300
+ `(boolean) — coerced for compatibility; please update config to use ${value}`
1301
+ );
1302
+ } else if (typeof original !== 'boolean') {
1303
+ warnings.push(
1304
+ `${tag}: '${field}' should be boolean (true/false), got ${JSON.stringify(original)} ` +
1305
+ `— may not work as expected (downstream strict-equality check will treat as disabled)`
1306
+ );
1307
+ }
1308
+ }
1309
+
1310
+ // 3. String → single-element array coercion for fields that accept both
1311
+ // forms (dig, dig-or, whois, whois-or). Downstream consumers all gate on
1312
+ // Array.isArray(), so a bare string value previously silently disabled
1313
+ // the feature. Wrapping in [val] is the canonical "user gave one term"
1314
+ // outcome and matches user intent. Both forms are first-class — no
1315
+ // warning is emitted on the string path, just the in-place mutation.
1316
+ //
1317
+ // Empty string is left alone: the downstream `siteConfig.dig && ...`
1318
+ // check sees the empty string as falsy and disables the feature. If we
1319
+ // coerced "" to [""], nettools' array.length>0 check would PASS and then
1320
+ // every dig/whois output would match (`"".includes(anything)` is true),
1321
+ // turning a clearly-empty config into a match-everything one.
1322
+ //
1323
+ // Non-string non-array values DO warn since we can't sensibly coerce.
1324
+ for (const field of STRING_TO_ARRAY_FIELDS) {
1325
+ if (!(field in siteConfig)) continue;
1326
+ const val = siteConfig[field];
1327
+ if (val === undefined || val === null) continue;
1328
+ if (typeof val === 'string') {
1329
+ if (val.length > 0) siteConfig[field] = [val];
1330
+ // empty string: leave as-is (preserves disable-on-falsy semantics)
1331
+ } else if (!Array.isArray(val)) {
1332
+ warnings.push(
1333
+ `${tag}: '${field}' should be a string or array of strings, got ${typeof val} ` +
1334
+ `(${JSON.stringify(val).slice(0, 60)}) — feature will be disabled at runtime`
1335
+ );
1336
+ }
1337
+ }
1338
+
1339
+ // 4. Dependent-flag implication: clear_sitedata_full_on_reload only takes
1340
+ // effect inside the `if (clear_sitedata === true)` guard at nwss.js:4627
1341
+ // — setting it WITHOUT clear_sitedata: true silently does nothing. That's
1342
+ // the same silent-failure pattern this validator was created to prevent,
1343
+ // so auto-enable clear_sitedata and warn the user. They almost certainly
1344
+ // intended both to be true; opt-in to heavy-storage clearing without
1345
+ // opt-in to clearing-at-all doesn't make sense as a configuration.
1346
+ if (siteConfig.clear_sitedata_full_on_reload === true &&
1347
+ siteConfig.clear_sitedata !== true) {
1348
+ siteConfig.clear_sitedata = true;
1349
+ warnings.push(
1350
+ `${tag}: 'clear_sitedata_full_on_reload: true' requires 'clear_sitedata: true' ` +
1351
+ `— auto-enabled clear_sitedata for compatibility; please add 'clear_sitedata: true' ` +
1352
+ `to your config explicitly`
1353
+ );
1354
+ }
1355
+
1356
+ return { warnings, errors };
1357
+ }
1358
+
1078
1359
  // Public surface used by nwss.js (validateRulesetFile, validateFullConfig,
1079
- // testDomainValidation, cleanRulesetFile). The rest (isValidDomain,
1080
- // isValidDomainLabel, isValidTLD, isIPAddress, isIPv4, isIPv6,
1360
+ // testDomainValidation, cleanRulesetFile, normalizeSiteConfig). The rest
1361
+ // (isValidDomain, isValidDomainLabel, isValidTLD, isIPAddress, isIPv4, isIPv6,
1081
1362
  // validateRegexPattern, validateAdblockModifiers, validateAdblockRule,
1082
1363
  // validateSiteConfig) stay internal-helper-but-exported for now since
1083
1364
  // downstream callers MAY import them via the dotted path even if grep
@@ -1100,5 +1381,6 @@ module.exports = {
1100
1381
  cleanRulesetFile,
1101
1382
  validateSiteConfig,
1102
1383
  validateFullConfig,
1384
+ normalizeSiteConfig,
1103
1385
  testDomainValidation
1104
1386
  };