spamscanner 5.0.0 → 5.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.js CHANGED
@@ -1,6 +1,7 @@
1
- const process = require('process');
2
1
  const dns = require('dns');
3
2
  const fs = require('fs');
3
+ const path = require('path');
4
+ const process = require('process');
4
5
  const { debuglog } = require('util');
5
6
 
6
7
  // eslint-disable-next-line n/no-deprecated-api
@@ -10,6 +11,7 @@ const ClamScan = require('clamscan');
10
11
  const FileType = require('file-type');
11
12
  const NaiveBayes = require('@ladjs/naivebayes');
12
13
  const RE2 = require('re2');
14
+ const arrayJoinConjunction = require('array-join-conjunction');
13
15
  const bitcoinRegex = require('bitcoin-regex');
14
16
  const contractions = require('expand-contractions');
15
17
  const creditCardRegex = require('credit-card-regex');
@@ -55,6 +57,19 @@ const { simpleParser } = require('mailparser');
55
57
 
56
58
  const debug = debuglog('spamscanner');
57
59
 
60
+ //
61
+ // NOTE: we periodically need to update this
62
+ //
63
+ // Source from: CC-CEDICT
64
+ // Licensed under Creative Commons Attribution-ShareAlike 4.0 International License
65
+ // <https://www.mdbg.net/chinese/dictionary?page=cc-cedict>
66
+ //
67
+ // <https://github.com/yishn/chinese-tokenizer>
68
+ //
69
+ const chineseTokenizer = require('chinese-tokenizer').loadFile(
70
+ path.join(__dirname, 'cedict_1_0_ts_utf-8_mdbg.txt')
71
+ );
72
+
58
73
  const aggressiveTokenizer = new natural.AggressiveTokenizer();
59
74
  const orthographyTokenizer = new natural.OrthographyTokenizer({
60
75
  language: 'fi'
@@ -195,6 +210,8 @@ const VOCABULARY_LIMIT = require('./vocabulary-limit.js');
195
210
  // TODO: convert this into a Map
196
211
  const ISO_CODE_MAPPING = require('./iso-code-mapping.json');
197
212
 
213
+ const ISO_CODE_MAPPING_KEYS = Object.keys(ISO_CODE_MAPPING);
214
+
198
215
  // <https://kb.smarshmail.com/Article/23567>
199
216
  const EXECUTABLES = new Set(require('./executables.json'));
200
217
 
@@ -426,12 +443,6 @@ class SpamScanner {
426
443
  : '/var/run/clamav/clamd.ctl'
427
444
  }
428
445
  },
429
- franc: {
430
- minLength: 100,
431
- // we can only support languages available
432
- // in stopwords and natural's tokenizer methods
433
- only: Object.keys(ISO_CODE_MAPPING)
434
- },
435
446
  hasha: {
436
447
  algorithm: 'sha256'
437
448
  },
@@ -446,6 +457,15 @@ class SpamScanner {
446
457
  client: false,
447
458
  cachePrefix: 'spamscanner',
448
459
  ttlMs: ms('1h'),
460
+ // franc
461
+ franc: {
462
+ minLength: 5,
463
+ only: ISO_CODE_MAPPING_KEYS
464
+ },
465
+ // if franc detects multiple languages that have >= % threshold
466
+ // then if the locale detected was one of them, what is the probability
467
+ // it must have in order to override all the other matches
468
+ detectedLocaleOverrideProbability: 0.9,
449
469
  ...config
450
470
  };
451
471
 
@@ -652,7 +672,11 @@ class SpamScanner {
652
672
  ? `"${attachment.filename}"`
653
673
  : `#${i + 1}`;
654
674
  if (isInfected)
655
- messages.push(`Attachment ${name} was infected with ${viruses}.`);
675
+ messages.push(
676
+ `Attachment ${name} was infected with ${arrayJoinConjunction(
677
+ viruses
678
+ )}.`
679
+ );
656
680
  } catch (err) {
657
681
  this.config.logger.error(err);
658
682
  }
@@ -951,12 +975,41 @@ class SpamScanner {
951
975
  // <https://github.com/wooorm/franc/issues/86> (accurate with min length)
952
976
  // <https://github.com/FGRibreau/node-language-detect> (not too accurate)
953
977
  //
954
- const detectedLanguage = franc(string, this.config.franc);
955
- if (
956
- detectedLanguage !== 'und' &&
957
- isSANB(ISO_CODE_MAPPING[detectedLanguage])
958
- )
959
- locale = ISO_CODE_MAPPING[detectedLanguage];
978
+ const detectedLanguages = franc.all(string, this.config.franc);
979
+ if (Array.isArray(detectedLanguages) && detectedLanguages.length > 0) {
980
+ let detected = this.config.locale;
981
+ let probability = 0;
982
+ for (const lang of detectedLanguages) {
983
+ // if it was undetermined then break out and revert to default (English)
984
+ if (lang[0] && lang[0] === 'und') break;
985
+
986
+ //
987
+ // otherwise only use detected languages that have >= 90% accuracy
988
+ // and if no matches were found, then revert to English as it's most likely spam
989
+ // (we can assume that users would understand a different language sent to them is spam)
990
+ // (so we can assume that language is spoofed to bypass English, the most widely spoken)
991
+ //
992
+ if (lang[0] && ISO_CODE_MAPPING[lang[0]] && lang[1]) {
993
+ // we don't want to check anything lower than our threshold
994
+ if (lang[1] < this.config.detectedLocaleOverrideProbability) break;
995
+ if (probability >= lang[1]) {
996
+ // exit early since we found a match that matched the passed locale
997
+ // eslint-disable-next-line max-depth
998
+ if (locale && locale === ISO_CODE_MAPPING[lang[0]]) {
999
+ detected = locale;
1000
+ probability = lang[1];
1001
+ break;
1002
+ }
1003
+ } else {
1004
+ detected = ISO_CODE_MAPPING[lang[0]];
1005
+ probability = lang[1];
1006
+ }
1007
+ }
1008
+ }
1009
+
1010
+ // override the locale based off detected
1011
+ locale = detected;
1012
+ }
960
1013
 
961
1014
  locale = this.parseLocale(isSANB(locale) ? locale : this.config.locale);
962
1015
 
@@ -1110,8 +1163,12 @@ class SpamScanner {
1110
1163
  break;
1111
1164
  case 'zh':
1112
1165
  // cmn
1113
- // TODO: use this instead https://github.com/yishn/chinese-tokenizer
1114
1166
  // ISO 639-3 = zho (Chinese, Macrolanguage)
1167
+ // https://github.com/yishn/chinese-tokenizer
1168
+ tokenizer = {
1169
+ tokenize: (str) =>
1170
+ chineseTokenizer(str).map((results) => results.text)
1171
+ };
1115
1172
  language = 'chinese';
1116
1173
  stopwords = stopwordsZh;
1117
1174
  stemword = false;
@@ -1200,7 +1257,6 @@ class SpamScanner {
1200
1257
  // <https://github.com/NaturalNode/natural/issues/533>
1201
1258
  //
1202
1259
  // NOTE: we're doing this for all languages now, not just en
1203
- // if (locale === 'en')
1204
1260
  //
1205
1261
  string = contractions.expand(string);
1206
1262
 
@@ -1214,6 +1270,9 @@ class SpamScanner {
1214
1270
  //
1215
1271
  const tokens = [];
1216
1272
  for (const token of tokenizer.tokenize(string.toLowerCase())) {
1273
+ // zh tokenizer yields empty strings
1274
+ if (token === '' || token === ' ') continue;
1275
+
1217
1276
  // whitelist words from being stemmed (safeguard)
1218
1277
  if (
1219
1278
  this.WHITELISTED_WORDS.has(token) ||
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "spamscanner",
3
3
  "description": "Spam Scanner - The Best Anti-Spam Scanning Service and Anti-Spam API",
4
- "version": "5.0.0",
4
+ "version": "5.1.2",
5
5
  "author": "Niftylettuce, LLC. <niftylettuce@gmail.com> (https://niftylettuce.com/)",
6
6
  "bugs": {
7
7
  "url": "https://github.com/spamscanner/spamscanner/issues",
@@ -13,7 +13,9 @@
13
13
  ],
14
14
  "dependencies": {
15
15
  "@ladjs/naivebayes": "^0.1.0",
16
+ "array-join-conjunction": "^1.0.0",
16
17
  "bitcoin-regex": "^2.0.0",
18
+ "chinese-tokenizer": "^2.4.0",
17
19
  "clamscan": "^2.1.2",
18
20
  "credit-card-regex": "^3.0.0",
19
21
  "crypto-random-string": "3",
@@ -66,7 +68,6 @@
66
68
  "devDependencies": {
67
69
  "@commitlint/cli": "^17.0.2",
68
70
  "@commitlint/config-conventional": "^17.0.2",
69
- "@ladjs/redis": "^1.0.7",
70
71
  "ava": "^4.3.0",
71
72
  "cross-env": "^7.0.3",
72
73
  "delay": "^5.0.0",
@@ -74,6 +75,8 @@
74
75
  "eslint-config-xo-lass": "^2.0.1",
75
76
  "fixpack": "^4.0.0",
76
77
  "husky": "^8.0.1",
78
+ "ioredis": "^5.0.6",
79
+ "ioredis-mock": "^8.2.2",
77
80
  "is-ci": "^3.0.1",
78
81
  "lint-staged": "^13.0.1",
79
82
  "lookpath": "^1.2.2",
@@ -91,6 +94,7 @@
91
94
  "node": ">=14"
92
95
  },
93
96
  "files": [
97
+ "cedict_1_0_ts_utf-8_mdbg.txt",
94
98
  "package.json",
95
99
  "index.js",
96
100
  "vocabulary-limit.js",