npm - spamscanner - Versions diffs - 5.1.0 → 5.1.3 - Mend

spamscanner 5.1.0 → 5.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/README.md CHANGED Viewed

@@ -54,7 +54,7 @@
 ## Foreword
-Spam Scanner is a tool and service built by [@niftylettuce][niftylettuce] after hitting countless roadblocks with existing spam-detection solutions.  In other words, it's our current [plan][plan-for-spam] for [spam][better-plan-for-spam].
+Spam Scanner is a tool and service created after hitting countless roadblocks with existing spam-detection solutions.  In other words, it's our current [plan][plan-for-spam] for [spam][better-plan-for-spam].
 Our goal is to build and utilize a scalable, performant, simple, easy to maintain, and powerful API for use in our service at [Forward Email][forward-email] to limit spam and provide other measures to prevent attacks on our users.
@@ -566,7 +566,7 @@ Note that in [Forward Email][forward-email] we use the `client` approach as we h
 ## Debugging
-Spam Scanner has built-in debug output via `util.debuglog('spamscanner')`.
+Spam Scanner has built-in debug output via `util.debuglog('spamscanner')`.  You can also pass `debug: true` to your instance to get more verbose output.
 This means you can run your app with `NODE_DEBUG=spamscanner node app.js` to get useful debug output to your console.
@@ -606,8 +606,6 @@ This means you can run your app with `NODE_DEBUG=spamscanner node app.js` to get
 [homograph-attack]: https://en.wikipedia.org/wiki/IDN_homograph_attack
-[niftylettuce]: https://github.com/niftylettuce
 [forward-email]: https://forwardemail.net
 [rspamd]: https://rspamd.com/

package/get-classifier.js CHANGED Viewed

@@ -1,5 +1,8 @@
+const { debuglog } = require('util');
 const NaiveBayes = require('@ladjs/naivebayes');
-const debug = require('debug')('spamscanner');
+const debug = debuglog('spamscanner');
 let classifier = new NaiveBayes().toJsonObject();

package/index.js CHANGED Viewed

@@ -11,6 +11,7 @@ const ClamScan = require('clamscan');
 const FileType = require('file-type');
 const NaiveBayes = require('@ladjs/naivebayes');
 const RE2 = require('re2');
+const arrayJoinConjunction = require('array-join-conjunction');
 const bitcoinRegex = require('bitcoin-regex');
 const contractions = require('expand-contractions');
 const creditCardRegex = require('credit-card-regex');
@@ -458,19 +459,13 @@ class SpamScanner {
       ttlMs: ms('1h'),
       // franc
       franc: {
-        // NOTE: if locale was passed and was valid
-        //       then we need to compare it against english
-        //       and if it was english detected (and not und)
-        //       then switch the detected locale to english
         minLength: 5,
-        // we can only support languages available
-        // in stopwords and natural's tokenizer methods
-        // and if it was detected to be english, compare against all languages
-        // otherwise if not, then compare only against english
-        // (namely we need to check against JP/ZH, but perhaps _all_ in future)
-        // (the edge case is that someone could spoof a language and it go undetected and tokenization bugs occur)
         only: ISO_CODE_MAPPING_KEYS
       },
+      // franc may report several candidate languages; candidates scoring below
+      // this probability threshold are ignored, and the passed locale overrides
+      // the other matches only if it is a candidate scoring at least this high
+      detectedLocaleOverrideProbability: 0.9,
       ...config
     };
@@ -501,7 +496,7 @@ class SpamScanner {
       return tokens;
     };
-    this.clamscan = new ClamScan();
+    this.clamscan = this.config.clamscan === false ? false : new ClamScan();
     this.getTokensAndMailFromSource = universalify.fromPromise(
       this.getTokensAndMailFromSource.bind(this)
@@ -659,7 +654,13 @@ class SpamScanner {
   async getVirusResults(mail) {
     const messages = [];
-    if (!Array.isArray(mail.attachments)) return messages;
+    if (!this.clamscan) {
+      debug('clamscan disabled');
+      return messages;
+    }
+    if (!Array.isArray(mail.attachments) || mail.attachments.length === 0)
+      return messages;
     try {
       // if it was already loaded, clamscan won't reload itself
@@ -677,7 +678,11 @@ class SpamScanner {
               ? `"${attachment.filename}"`
               : `#${i + 1}`;
             if (isInfected)
-              messages.push(`Attachment ${name} was infected with ${viruses}.`);
+              messages.push(
+                `Attachment ${name} was infected with ${arrayJoinConjunction(
+                  viruses
+                )}.`
+              );
           } catch (err) {
             this.config.logger.error(err);
           }
@@ -976,13 +981,41 @@ class SpamScanner {
     // <https://github.com/wooorm/franc/issues/86> (accurate with min length)
     // <https://github.com/FGRibreau/node-language-detect> (not too accurate)
     //
-    const detectedLanguage = franc(string, this.config.franc);
+    const detectedLanguages = franc.all(string, this.config.franc);
+    if (Array.isArray(detectedLanguages) && detectedLanguages.length > 0) {
+      let detected = this.config.locale;
+      let probability = 0;
+      for (const lang of detectedLanguages) {
+        // if it was undetermined then break out and revert to default (English)
+        if (lang[0] && lang[0] === 'und') break;
+        //
+        // otherwise only use detected languages whose probability is at least
+        // `detectedLocaleOverrideProbability` (0.9 by default), and if no matches
+        // were found, then revert to use English as it's most likely spam
+        // (we can assume that users would understand a different language sent to them is spam)
+        // (so we can assume that language is spoofed to bypass English, the most widely spoken)
+        //
+        if (lang[0] && ISO_CODE_MAPPING[lang[0]] && lang[1]) {
+          // we don't want to check anything lower than our threshold
+          if (lang[1] < this.config.detectedLocaleOverrideProbability) break;
+          if (probability >= lang[1]) {
+            // exit early since we found a match that matched the passed locale
+            // eslint-disable-next-line max-depth
+            if (locale && locale === ISO_CODE_MAPPING[lang[0]]) {
+              detected = locale;
+              probability = lang[1];
+              break;
+            }
+          } else {
+            detected = ISO_CODE_MAPPING[lang[0]];
+            probability = lang[1];
+          }
+        }
+      }
-    if (
-      detectedLanguage !== 'und' &&
-      isSANB(ISO_CODE_MAPPING[detectedLanguage])
-    )
-      locale = ISO_CODE_MAPPING[detectedLanguage];
+      // override the locale based off detected
+      locale = detected;
+    }
     locale = this.parseLocale(isSANB(locale) ? locale : this.config.locale);
@@ -1230,7 +1263,6 @@ class SpamScanner {
     // <https://github.com/NaturalNode/natural/issues/533>
     //
     // NOTE: we're doing this for all languages now, not just en
-    // if (locale === 'en')
     //
     string = contractions.expand(string);

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "spamscanner",
   "description": "Spam Scanner - The Best Anti-Spam Scanning Service and Anti-Spam API",
-  "version": "5.1.0",
+  "version": "5.1.3",
   "author": "Niftylettuce, LLC. <niftylettuce@gmail.com> (https://niftylettuce.com/)",
   "bugs": {
     "url": "https://github.com/spamscanner/spamscanner/issues",
@@ -13,6 +13,7 @@
   ],
   "dependencies": {
     "@ladjs/naivebayes": "^0.1.0",
+    "array-join-conjunction": "^1.0.0",
     "bitcoin-regex": "^2.0.0",
     "chinese-tokenizer": "^2.4.0",
     "clamscan": "^2.1.2",
@@ -67,14 +68,16 @@
   "devDependencies": {
     "@commitlint/cli": "^17.0.2",
     "@commitlint/config-conventional": "^17.0.2",
-    "@ladjs/redis": "^1.0.7",
     "ava": "^4.3.0",
+    "chance": "^1.1.8",
     "cross-env": "^7.0.3",
     "delay": "^5.0.0",
     "eslint": "^8.17.0",
     "eslint-config-xo-lass": "^2.0.1",
     "fixpack": "^4.0.0",
     "husky": "^8.0.1",
+    "ioredis": "^5.0.6",
+    "ioredis-mock": "^8.2.2",
     "is-ci": "^3.0.1",
     "lint-staged": "^13.0.1",
     "lookpath": "^1.2.2",
@@ -83,6 +86,7 @@
     "numeral": "^2.0.6",
     "nyc": "^15.1.0",
     "p-map": "4",
+    "p-queue": "6",
     "read-dir-deep": "^7.0.1",
     "remark-cli": "^10.0.1",
     "remark-preset-github": "^4.0.4",

package/replacements.js CHANGED Viewed

@@ -1,5 +1,8 @@
+const { debuglog } = require('util');
 const cryptoRandomString = require('crypto-random-string');
-const debug = require('debug')('spamscanner');
+const debug = debuglog('spamscanner');
 const REPLACEMENT_WORDS = require('./replacement-words.json');