npm - spamscanner - Versions diffs - 5.0.0 → 5.1.2 - Mend

spamscanner 5.0.0 → 5.1.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (4) hide show

package/README.md +3 -4
package/cedict_1_0_ts_utf-8_mdbg.txt +120853 -0
package/index.js +75 -16
package/package.json +6 -2

package/index.js CHANGED Viewed

@@ -1,6 +1,7 @@
-const process = require('process');
 const dns = require('dns');
 const fs = require('fs');
+const path = require('path');
+const process = require('process');
 const { debuglog } = require('util');
 // eslint-disable-next-line n/no-deprecated-api
@@ -10,6 +11,7 @@ const ClamScan = require('clamscan');
 const FileType = require('file-type');
 const NaiveBayes = require('@ladjs/naivebayes');
 const RE2 = require('re2');
+const arrayJoinConjunction = require('array-join-conjunction');
 const bitcoinRegex = require('bitcoin-regex');
 const contractions = require('expand-contractions');
 const creditCardRegex = require('credit-card-regex');
@@ -55,6 +57,19 @@ const { simpleParser } = require('mailparser');
 const debug = debuglog('spamscanner');
+//
+// NOTE: we periodically need to update this
+//
+// Source from: CC-CEDICT
+// Licensed under Creative Commons Attribution-ShareAlike 4.0 International License
+// <https://www.mdbg.net/chinese/dictionary?page=cc-cedict>
+//
+// <https://github.com/yishn/chinese-tokenizer>
+//
+const chineseTokenizer = require('chinese-tokenizer').loadFile(
+  path.join(__dirname, 'cedict_1_0_ts_utf-8_mdbg.txt')
+);
 const aggressiveTokenizer = new natural.AggressiveTokenizer();
 const orthographyTokenizer = new natural.OrthographyTokenizer({
   language: 'fi'
@@ -195,6 +210,8 @@ const VOCABULARY_LIMIT = require('./vocabulary-limit.js');
 // TODO: convert this into a Map
 const ISO_CODE_MAPPING = require('./iso-code-mapping.json');
+const ISO_CODE_MAPPING_KEYS = Object.keys(ISO_CODE_MAPPING);
 // <https://kb.smarshmail.com/Article/23567>
 const EXECUTABLES = new Set(require('./executables.json'));
@@ -426,12 +443,6 @@ class SpamScanner {
             : '/var/run/clamav/clamd.ctl'
         }
       },
-      franc: {
-        minLength: 100,
-        // we can only support languages available
-        // in stopwords and natural's tokenizer methods
-        only: Object.keys(ISO_CODE_MAPPING)
-      },
       hasha: {
         algorithm: 'sha256'
       },
@@ -446,6 +457,15 @@ class SpamScanner {
       client: false,
       cachePrefix: 'spamscanner',
       ttlMs: ms('1h'),
+      // franc
+      franc: {
+        minLength: 5,
+        only: ISO_CODE_MAPPING_KEYS
+      },
+      // minimum probability a language detected by franc must have to be considered;
+      // if the locale passed in is among the languages at or above this threshold,
+      // then it overrides all the other matches
+      detectedLocaleOverrideProbability: 0.9,
       ...config
     };
@@ -652,7 +672,11 @@ class SpamScanner {
               ? `"${attachment.filename}"`
               : `#${i + 1}`;
             if (isInfected)
-              messages.push(`Attachment ${name} was infected with ${viruses}.`);
+              messages.push(
+                `Attachment ${name} was infected with ${arrayJoinConjunction(
+                  viruses
+                )}.`
+              );
           } catch (err) {
             this.config.logger.error(err);
           }
@@ -951,12 +975,41 @@ class SpamScanner {
     // <https://github.com/wooorm/franc/issues/86> (accurate with min length)
     // <https://github.com/FGRibreau/node-language-detect> (not too accurate)
     //
-    const detectedLanguage = franc(string, this.config.franc);
-    if (
-      detectedLanguage !== 'und' &&
-      isSANB(ISO_CODE_MAPPING[detectedLanguage])
-    )
-      locale = ISO_CODE_MAPPING[detectedLanguage];
+    const detectedLanguages = franc.all(string, this.config.franc);
+    if (Array.isArray(detectedLanguages) && detectedLanguages.length > 0) {
+      let detected = this.config.locale;
+      let probability = 0;
+      for (const lang of detectedLanguages) {
+        // if it was undetermined then break out and revert to default (English)
+        if (lang[0] && lang[0] === 'und') break;
+        //
+        // otherwise only use detected languages that have >= 90% accuracy
+        // and if no matches were found, then revert to use English as it's most likely spam
+        // (we can assume that users would understand a different language sent to them is spam)
+        // (so we can assume that language is spoofed to bypass English, the most widely spoken)
+        //
+        if (lang[0] && ISO_CODE_MAPPING[lang[0]] && lang[1]) {
+          // we don't want to check anything lower than our threshold
+          if (lang[1] < this.config.detectedLocaleOverrideProbability) break;
+          if (probability >= lang[1]) {
+            // exit early since we found a match that matched the passed locale
+            // eslint-disable-next-line max-depth
+            if (locale && locale === ISO_CODE_MAPPING[lang[0]]) {
+              detected = locale;
+              probability = lang[1];
+              break;
+            }
+          } else {
+            detected = ISO_CODE_MAPPING[lang[0]];
+            probability = lang[1];
+          }
+        }
+      }
+      // override the locale based off detected
+      locale = detected;
+    }
     locale = this.parseLocale(isSANB(locale) ? locale : this.config.locale);
@@ -1110,8 +1163,12 @@ class SpamScanner {
         break;
       case 'zh':
         // cmn
-        // TODO: use this instead https://github.com/yishn/chinese-tokenizer
         // ISO 639-3 = zho (Chinese, Macrolanguage)
+        // https://github.com/yishn/chinese-tokenizer
+        tokenizer = {
+          tokenize: (str) =>
+            chineseTokenizer(str).map((results) => results.text)
+        };
         language = 'chinese';
         stopwords = stopwordsZh;
         stemword = false;
@@ -1200,7 +1257,6 @@ class SpamScanner {
     // <https://github.com/NaturalNode/natural/issues/533>
     //
     // NOTE: we're doing this for all languages now, not just en
-    // if (locale === 'en')
     //
     string = contractions.expand(string);
@@ -1214,6 +1270,9 @@ class SpamScanner {
     //
     const tokens = [];
     for (const token of tokenizer.tokenize(string.toLowerCase())) {
+      // zh tokenizer yields empty strings
+      if (token === '' || token === ' ') continue;
       // whitelist words from being stemmed (safeguard)
       if (
         this.WHITELISTED_WORDS.has(token) ||

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "spamscanner",
   "description": "Spam Scanner - The Best Anti-Spam Scanning Service and Anti-Spam API",
-  "version": "5.0.0",
+  "version": "5.1.2",
   "author": "Niftylettuce, LLC. <niftylettuce@gmail.com> (https://niftylettuce.com/)",
   "bugs": {
     "url": "https://github.com/spamscanner/spamscanner/issues",
@@ -13,7 +13,9 @@
   ],
   "dependencies": {
     "@ladjs/naivebayes": "^0.1.0",
+    "array-join-conjunction": "^1.0.0",
     "bitcoin-regex": "^2.0.0",
+    "chinese-tokenizer": "^2.4.0",
     "clamscan": "^2.1.2",
     "credit-card-regex": "^3.0.0",
     "crypto-random-string": "3",
@@ -66,7 +68,6 @@
   "devDependencies": {
     "@commitlint/cli": "^17.0.2",
     "@commitlint/config-conventional": "^17.0.2",
-    "@ladjs/redis": "^1.0.7",
     "ava": "^4.3.0",
     "cross-env": "^7.0.3",
     "delay": "^5.0.0",
@@ -74,6 +75,8 @@
     "eslint-config-xo-lass": "^2.0.1",
     "fixpack": "^4.0.0",
     "husky": "^8.0.1",
+    "ioredis": "^5.0.6",
+    "ioredis-mock": "^8.2.2",
     "is-ci": "^3.0.1",
     "lint-staged": "^13.0.1",
     "lookpath": "^1.2.2",
@@ -91,6 +94,7 @@
     "node": ">=14"
   },
   "files": [
+    "cedict_1_0_ts_utf-8_mdbg.txt",
     "package.json",
     "index.js",
     "vocabulary-limit.js",