spamscanner 4.0.0 → 5.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.js CHANGED
@@ -1,8 +1,10 @@
1
1
  const dns = require('dns');
2
2
  const fs = require('fs');
3
- const { promisify } = require('util');
3
+ const path = require('path');
4
+ const process = require('process');
5
+ const { debuglog } = require('util');
4
6
 
5
- // eslint-disable-next-line node/no-deprecated-api
7
+ // eslint-disable-next-line n/no-deprecated-api
6
8
  const punycode = require('punycode');
7
9
 
8
10
  const ClamScan = require('clamscan');
@@ -12,7 +14,6 @@ const RE2 = require('re2');
12
14
  const bitcoinRegex = require('bitcoin-regex');
13
15
  const contractions = require('expand-contractions');
14
16
  const creditCardRegex = require('credit-card-regex');
15
- const debug = require('debug')('spamscanner');
16
17
  const emailRegexSafe = require('email-regex-safe');
17
18
  const emojiPatterns = require('emoji-patterns');
18
19
  const escapeStringRegexp = require('escape-string-regexp');
@@ -46,12 +47,28 @@ const toEmoji = require('gemoji/name-to-emoji');
46
47
  const universalify = require('universalify');
47
48
  const urlRegexSafe = require('url-regex-safe');
48
49
  const validator = require('validator');
50
+ const which = require('which');
49
51
  const { Iconv } = require('iconv');
50
52
  const { codes } = require('currency-codes');
51
53
  const { fromUrl, NO_HOSTNAME } = require('parse-domain');
52
54
  const { parse } = require('node-html-parser');
53
55
  const { simpleParser } = require('mailparser');
54
56
 
57
+ const debug = debuglog('spamscanner');
58
+
59
+ //
60
+ // NOTE: we periodically need to update this
61
+ //
62
+ // Source from: CC-CEDICT
63
+ // Licensed under Creative Commons Attribution-ShareAlike 4.0 International License
64
+ // <https://www.mdbg.net/chinese/dictionary?page=cc-cedict>
65
+ //
66
+ // <https://github.com/yishn/chinese-tokenizer>
67
+ //
68
+ const chineseTokenizer = require('chinese-tokenizer').loadFile(
69
+ path.join(__dirname, 'cedict_1_0_ts_utf-8_mdbg.txt')
70
+ );
71
+
55
72
  const aggressiveTokenizer = new natural.AggressiveTokenizer();
56
73
  const orthographyTokenizer = new natural.OrthographyTokenizer({
57
74
  language: 'fi'
@@ -69,20 +86,115 @@ const aggressiveTokenizerSv = new natural.AggressiveTokenizerSv();
69
86
  const aggressiveTokenizerRu = new natural.AggressiveTokenizerRu();
70
87
  const aggressiveTokenizerVi = new natural.AggressiveTokenizerVi();
71
88
 
72
- const stopwordsEn = require('natural/lib/natural/util/stopwords').words;
73
- const stopwordsEs = require('natural/lib/natural/util/stopwords_es').words;
74
- const stopwordsFa = require('natural/lib/natural/util/stopwords_fa').words;
75
- const stopwordsFr = require('natural/lib/natural/util/stopwords_fr').words;
76
- const stopwordsId = require('natural/lib/natural/util/stopwords_id').words;
77
- const stopwordsJa = require('natural/lib/natural/util/stopwords_ja').words;
78
- const stopwordsIt = require('natural/lib/natural/util/stopwords_it').words;
79
- const stopwordsNl = require('natural/lib/natural/util/stopwords_nl').words;
80
- const stopwordsNo = require('natural/lib/natural/util/stopwords_no').words;
81
- const stopwordsPl = require('natural/lib/natural/util/stopwords_pl').words;
82
- const stopwordsPt = require('natural/lib/natural/util/stopwords_pt').words;
83
- const stopwordsRu = require('natural/lib/natural/util/stopwords_ru').words;
84
- const stopwordsSv = require('natural/lib/natural/util/stopwords_sv').words;
85
- const stopwordsZh = require('natural/lib/natural/util/stopwords_zh').words;
89
+ const stopwordsEn = new Set([
90
+ ...require('natural/lib/natural/util/stopwords').words,
91
+ ...sw.eng
92
+ ]);
93
+ const stopwordsEs = new Set([
94
+ ...require('natural/lib/natural/util/stopwords_es').words,
95
+ ...sw.spa
96
+ ]);
97
+ const stopwordsFa = new Set([
98
+ ...require('natural/lib/natural/util/stopwords_fa').words,
99
+ ...sw.fas
100
+ ]);
101
+ const stopwordsFr = new Set([
102
+ ...require('natural/lib/natural/util/stopwords_fr').words,
103
+ ...sw.fra
104
+ ]);
105
+ const stopwordsId = new Set([
106
+ ...require('natural/lib/natural/util/stopwords_id').words,
107
+ ...sw.ind
108
+ ]);
109
+ const stopwordsJa = new Set([
110
+ ...require('natural/lib/natural/util/stopwords_ja').words,
111
+ ...sw.jpn
112
+ ]);
113
+ const stopwordsIt = new Set([
114
+ ...require('natural/lib/natural/util/stopwords_it').words,
115
+ ...sw.ita
116
+ ]);
117
+ const stopwordsNl = new Set([
118
+ ...require('natural/lib/natural/util/stopwords_nl').words,
119
+ ...sw.nld
120
+ ]);
121
+ const stopwordsNo = new Set([
122
+ ...require('natural/lib/natural/util/stopwords_no').words,
123
+ ...sw.nob
124
+ ]);
125
+ const stopwordsPl = new Set([
126
+ ...require('natural/lib/natural/util/stopwords_pl').words,
127
+ ...sw.pol
128
+ ]);
129
+ const stopwordsPt = new Set([
130
+ ...require('natural/lib/natural/util/stopwords_pt').words,
131
+ ...sw.por,
132
+ ...sw.porBr
133
+ ]);
134
+ const stopwordsRu = new Set([
135
+ ...require('natural/lib/natural/util/stopwords_ru').words,
136
+ ...sw.rus
137
+ ]);
138
+ const stopwordsSv = new Set([
139
+ ...require('natural/lib/natural/util/stopwords_sv').words,
140
+ ...sw.swe
141
+ ]);
142
+ const stopwordsZh = new Set([
143
+ ...require('natural/lib/natural/util/stopwords_zh').words,
144
+ ...sw.zho
145
+ ]);
146
+
147
+ const stopwordsRon = new Set(sw.ron);
148
+ const stopwordsTur = new Set(sw.tur);
149
+ const stopwordsVie = new Set(sw.vie);
150
+ const stopwordsDeu = new Set(sw.deu);
151
+ const stopwordsHun = new Set(sw.hun);
152
+ const stopwordsAra = new Set(sw.ara);
153
+ const stopwordsDan = new Set(sw.dan);
154
+ const stopwordsFin = new Set(sw.fin);
155
+
156
+ // TODO: add stopword pairing for these langs:
157
+ // afr
158
+ // ben
159
+ // bre
160
+ // bul
161
+ // cat
162
+ // ces
163
+ // ell
164
+ // epo
165
+ // est
166
+ // eus
167
+ // fra
168
+ // gle
169
+ // glg
170
+ // guj
171
+ // hau
172
+ // heb
173
+ // hin
174
+ // hrv
175
+ // hye
176
+ // kor
177
+ // kur
178
+ // lat
179
+ // lav
180
+ // lgg
181
+ // lggNd
182
+ // lit
183
+ // mar
184
+ // msa
185
+ // mya
186
+ // panGu
187
+ // slk
188
+ // slv
189
+ // som
190
+ // sot
191
+ // swa
192
+ // tgl
193
+ // tha
194
+ // ukr
195
+ // urd
196
+ // yor
197
+ // zul
86
198
 
87
199
  // <https://stackoverflow.com/a/41353282>
88
200
  // <https://www.ietf.org/rfc/rfc3986.txt>
@@ -94,17 +206,18 @@ const PKG = require('./package.json');
94
206
 
95
207
  const VOCABULARY_LIMIT = require('./vocabulary-limit.js');
96
208
 
209
+ // TODO: convert this into a Map
97
210
  const ISO_CODE_MAPPING = require('./iso-code-mapping.json');
98
211
 
212
+ const ISO_CODE_MAPPING_KEYS = Object.keys(ISO_CODE_MAPPING);
213
+
99
214
  // <https://kb.smarshmail.com/Article/23567>
100
- const EXECUTABLES = require('./executables.json');
215
+ const EXECUTABLES = new Set(require('./executables.json'));
101
216
 
102
217
  const REPLACEMENT_WORDS = require('./replacement-words.json');
103
218
 
104
219
  const locales = new Set(i18nLocales.map((l) => l.toLowerCase()));
105
220
 
106
- const readFile = promisify(fs.readFile);
107
-
108
221
  const normalizeUrlOptions = {
109
222
  stripProtocol: true,
110
223
  stripWWW: false,
@@ -154,7 +267,8 @@ for (const code of codes()) {
154
267
  const symbol = getSymbolFromCurrency(code);
155
268
  if (
156
269
  typeof symbol === 'string' &&
157
- !currencySymbols.includes(symbol) &&
270
+ // eslint-disable-next-line unicorn/prefer-includes
271
+ currencySymbols.indexOf(symbol) === -1 &&
158
272
  !new RE2(/^[a-z]+$/i).test(symbol)
159
273
  )
160
274
  currencySymbols.push(escapeStringRegexp(symbol));
@@ -187,7 +301,9 @@ const isURLOptions = {
187
301
  class SpamScanner {
188
302
  constructor(config = {}) {
189
303
  this.config = {
190
- debug: process.env.NODE_ENV === 'test',
304
+ debug:
305
+ process.env.NODE_ENV === 'test' ||
306
+ process.env.NODE_ENV === 'development',
191
307
  checkIDNHomographAttack: false,
192
308
  // note that if you attempt to train an existing `scanner.classifier`
193
309
  // then you will need to re-use these, so we suggest you store them
@@ -312,19 +428,20 @@ class SpamScanner {
312
428
  userAgent: `${PKG.name}/${PKG.version}`,
313
429
  timeout: ms('10s'),
314
430
  clamscan: {
431
+ debugMode:
432
+ process.env.NODE_ENV === 'test' ||
433
+ process.env.NODE_ENV === 'development',
434
+ clamscan: {
435
+ path: which.sync('clamscan', { nothrow: true })
436
+ },
315
437
  clamdscan: {
316
438
  timeout: ms('10s'),
439
+ path: which.sync('clamdscan', { nothrow: true }),
317
440
  socket: macosVersion.isMacOS
318
441
  ? '/tmp/clamd.socket'
319
442
  : '/var/run/clamav/clamd.ctl'
320
443
  }
321
444
  },
322
- franc: {
323
- minLength: 100,
324
- // we can only support languages available
325
- // in stopwords and natural's tokenizer methods
326
- only: Object.keys(ISO_CODE_MAPPING)
327
- },
328
445
  hasha: {
329
446
  algorithm: 'sha256'
330
447
  },
@@ -339,6 +456,11 @@ class SpamScanner {
339
456
  client: false,
340
457
  cachePrefix: 'spamscanner',
341
458
  ttlMs: ms('1h'),
459
+ // franc
460
+ franc: {
461
+ minLength: 5,
462
+ only: ISO_CODE_MAPPING_KEYS
463
+ },
342
464
  ...config
343
465
  };
344
466
 
@@ -416,9 +538,7 @@ class SpamScanner {
416
538
  // cache in the background
417
539
  this.config.client
418
540
  .set(key, `${isAdult}:${isMalware}`, 'PX', this.config.ttlMs)
419
- // eslint-disable-next-line promise/prefer-await-to-then
420
541
  .then(this.config.logger.info)
421
- // eslint-disable-next-line promise/prefer-await-to-then
422
542
  .catch(this.config.logger.error);
423
543
  return { isAdult, isMalware };
424
544
  };
@@ -432,6 +552,27 @@ class SpamScanner {
432
552
  throw new Error(
433
553
  `Locale of ${this.config.locale} was not valid according to locales list.`
434
554
  );
555
+
556
+ //
557
+ // set up regex helpers
558
+ //
559
+ this.EMAIL_REPLACEMENT_REGEX = new RE2(this.config.replacements.email, 'g');
560
+ const replacementRegexes = [];
561
+ for (const key of Object.keys(this.config.replacements)) {
562
+ replacementRegexes.push(
563
+ escapeStringRegexp(this.config.replacements[key])
564
+ );
565
+ }
566
+
567
+ this.REPLACEMENTS_REGEX = new RE2(
568
+ new RegExp(replacementRegexes.join('|'), 'g')
569
+ );
570
+
571
+ //
572
+ // set up helper Map and Sets for fast lookup
573
+ // (Set.has is 2x faster than includes, and 50% faster than indexOf)
574
+ //
575
+ this.WHITELISTED_WORDS = new Set(Object.values(this.config.replacements));
435
576
  }
436
577
 
437
578
  getHostname(link) {
@@ -521,15 +662,12 @@ class SpamScanner {
521
662
  const stream = isStream(attachment.content)
522
663
  ? attachment.content
523
664
  : intoStream(attachment.content);
524
- const { is_infected: isInfected, viruses } =
525
- await clamscan.scan_stream(stream);
665
+ const { isInfected, viruses } = await clamscan.scanStream(stream);
526
666
  const name = isSANB(attachment.filename)
527
667
  ? `"${attachment.filename}"`
528
668
  : `#${i + 1}`;
529
669
  if (isInfected)
530
- messages.push(
531
- `Attachment ${name} was infected with "${viruses}".`
532
- );
670
+ messages.push(`Attachment ${name} was infected with ${viruses}.`);
533
671
  } catch (err) {
534
672
  this.config.logger.error(err);
535
673
  }
@@ -547,13 +685,16 @@ class SpamScanner {
547
685
 
548
686
  let gtube = false;
549
687
 
550
- if (isSANB(mail.html) && mail.html.includes(GTUBE)) gtube = true;
688
+ // eslint-disable-next-line unicorn/prefer-includes
689
+ if (isSANB(mail.html) && mail.html.indexOf(GTUBE) !== -1) gtube = true;
551
690
 
552
- if (isSANB(mail.text) && !gtube && mail.text.includes(GTUBE)) gtube = true;
691
+ // eslint-disable-next-line unicorn/prefer-includes
692
+ if (isSANB(mail.text) && !gtube && mail.text.indexOf(GTUBE) !== -1)
693
+ gtube = true;
553
694
 
554
695
  if (gtube)
555
696
  messages.push(
556
- 'Message detected to contain the GTUBE test from <https://spamassassin.apache.org/gtube/>.'
697
+ 'Message detected to contain the GTUBE test from https://spamassassin.apache.org/gtube/.'
557
698
  );
558
699
 
559
700
  return messages;
@@ -619,8 +760,6 @@ class SpamScanner {
619
760
  //
620
761
  // However we don't recommend this and therefore have our servers set to standard Cloudflare DNS
621
762
  //
622
- // TODO: we need to do two lookups in parallel, one against adult and one against malware
623
- // and also make sure the messages aren't duplicated when we concatenate final array of messages
624
763
  const [isAdult, isMalware] = await Promise.all([
625
764
  this.malwareLookup('https://family.cloudflare-dns.com/dns-query', name),
626
765
  this.malwareLookup('https://security.cloudflare-dns.com/dns-query', name)
@@ -742,14 +881,14 @@ class SpamScanner {
742
881
  })
743
882
  .match(URL_REGEX) || [];
744
883
 
745
- const array = [];
884
+ const array = new Set();
746
885
  for (const url of urls) {
747
886
  const normalized = this.getNormalizedUrl(url);
748
887
 
749
- if (normalized && !array.includes(normalized)) array.push(normalized);
888
+ if (normalized) array.add(normalized);
750
889
  }
751
890
 
752
- return array;
891
+ return [...array];
753
892
  }
754
893
 
755
894
  parseLocale(locale) {
@@ -763,12 +902,6 @@ class SpamScanner {
763
902
  // <https://github.com/NaturalNode/natural#stemmers>
764
903
  // eslint-disable-next-line complexity
765
904
  async getTokens(string, locale, isHTML = false) {
766
- // get the current email replacement regex
767
- const EMAIL_REPLACEMENT_REGEX = new RE2(
768
- this.config.replacements.email,
769
- 'g'
770
- );
771
-
772
905
  //
773
906
  // parse HTML for <html> tag with lang attr
774
907
  // otherwise if that wasn't found then look for this
@@ -816,17 +949,6 @@ class SpamScanner {
816
949
 
817
950
  if (isHTML) string = sanitizeHtml(string, this.config.sanitizeHtml);
818
951
 
819
- const replacementRegexes = [];
820
- for (const key of Object.keys(this.config.replacements)) {
821
- replacementRegexes.push(
822
- escapeStringRegexp(this.config.replacements[key])
823
- );
824
- }
825
-
826
- const REPLACEMENTS_REGEX = new RE2(
827
- new RegExp(replacementRegexes.join('|'), 'g')
828
- );
829
-
830
952
  string = striptags(string, [], ' ')
831
953
  .trim()
832
954
  // replace newlines
@@ -835,7 +957,7 @@ class SpamScanner {
835
957
  // attackers may try to inject our replacements into the message
836
958
  // therefore we should strip all of them before doing any replacements
837
959
  //
838
- .replace(REPLACEMENTS_REGEX, ' ');
960
+ .replace(this.REPLACEMENTS_REGEX, ' ');
839
961
 
840
962
  //
841
963
  // we should instead use language detection to determine
@@ -845,6 +967,7 @@ class SpamScanner {
845
967
  // <https://github.com/FGRibreau/node-language-detect> (not too accurate)
846
968
  //
847
969
  const detectedLanguage = franc(string, this.config.franc);
970
+
848
971
  if (
849
972
  detectedLanguage !== 'und' &&
850
973
  isSANB(ISO_CODE_MAPPING[detectedLanguage])
@@ -853,7 +976,8 @@ class SpamScanner {
853
976
 
854
977
  locale = this.parseLocale(isSANB(locale) ? locale : this.config.locale);
855
978
 
856
- if (!locales.has(locale)) {
979
+ // NOTE: "in" and "po" are valid locales but not from i18n
980
+ if (!locales.has(locale) && locale !== 'in' && locale !== 'po') {
857
981
  debug(`Locale ${locale} was not valid and will use default`);
858
982
  locale = this.parseLocale(this.config.locale);
859
983
  }
@@ -865,103 +989,149 @@ class SpamScanner {
865
989
  let stopwords = stopwordsEn;
866
990
  let language = 'english';
867
991
  let stemword = 'default';
992
+
868
993
  switch (locale) {
869
994
  case 'ar':
995
+ // arb
996
+ // ISO 639-3 = ara
997
+ stopwords = stopwordsAra;
870
998
  language = 'arabic';
871
999
  break;
872
1000
  case 'da':
1001
+ // dan
873
1002
  language = 'danish';
1003
+ stopwords = stopwordsDan;
874
1004
  break;
875
1005
  case 'nl':
1006
+ // nld
876
1007
  stopwords = stopwordsNl;
877
1008
  language = 'dutch';
878
1009
  break;
879
1010
  case 'en':
1011
+ // eng
880
1012
  language = 'english';
881
1013
  break;
882
1014
  case 'fi':
1015
+ // fin
883
1016
  language = 'finnish';
884
1017
  tokenizer = orthographyTokenizer;
1018
+ stopwords = stopwordsFin;
885
1019
  break;
886
1020
  case 'fa':
1021
+ // fas (Persian/Farsi)
887
1022
  language = 'farsi';
888
1023
  tokenizer = aggressiveTokenizerFa;
889
1024
  stopwords = stopwordsFa;
890
1025
  stemword = natural.PorterStemmerFa.stem.bind(natural.PorterStemmerFa);
891
1026
  break;
892
1027
  case 'fr':
1028
+ // fra
893
1029
  language = 'french';
894
1030
  tokenizer = aggressiveTokenizerFr;
895
1031
  stopwords = stopwordsFr;
896
1032
  break;
897
1033
  case 'de':
1034
+ // deu
898
1035
  language = 'german';
1036
+ stopwords = stopwordsDeu;
899
1037
  break;
900
1038
  case 'hu':
1039
+ // hun
901
1040
  language = 'hungarian';
1041
+ stopwords = stopwordsHun;
902
1042
  break;
903
1043
  case 'in':
1044
+ // ind
904
1045
  language = 'indonesian';
905
1046
  tokenizer = aggressiveTokenizerId;
906
1047
  stopwords = stopwordsId;
907
1048
  break;
908
1049
  case 'it':
1050
+ // ita
909
1051
  language = 'italian';
910
1052
  tokenizer = aggressiveTokenizerIt;
911
1053
  stopwords = stopwordsIt;
912
1054
  break;
913
1055
  case 'ja':
1056
+ // jpn
914
1057
  tokenizer = tokenizerJa;
915
1058
  stopwords = stopwordsJa;
916
1059
  stemword = natural.StemmerJa.stem.bind(natural.StemmerJa);
917
1060
  break;
918
1061
  case 'nb':
1062
+ // nob
1063
+ language = 'norwegian';
1064
+ tokenizer = aggressiveTokenizerNo;
1065
+ stopwords = stopwordsNo;
1066
+ break;
919
1067
  case 'nn':
1068
+ // nno
1069
+ // ISO 639-3 = nob
920
1070
  language = 'norwegian';
921
1071
  tokenizer = aggressiveTokenizerNo;
922
1072
  stopwords = stopwordsNo;
923
1073
  break;
924
1074
  case 'po':
1075
+ // pol
925
1076
  language = 'polish';
926
1077
  tokenizer = aggressiveTokenizerPl;
927
1078
  stopwords = stopwordsPl;
928
1079
  stemword = false;
929
1080
  break;
930
1081
  case 'pt':
1082
+ // por
931
1083
  language = 'portuguese';
932
1084
  tokenizer = aggressiveTokenizerPt;
933
1085
  stopwords = stopwordsPt;
934
1086
  break;
935
1087
  case 'es':
1088
+ // spa
936
1089
  language = 'spanish';
937
1090
  tokenizer = aggressiveTokenizerEs;
938
1091
  stopwords = stopwordsEs;
939
1092
  break;
940
1093
  case 'sv':
1094
+ // swe
941
1095
  language = 'swedish';
942
1096
  tokenizer = aggressiveTokenizerSv;
943
1097
  stopwords = stopwordsSv;
944
1098
  break;
945
1099
  case 'ro':
1100
+ // ron
946
1101
  language = 'romanian';
1102
+ stopwords = stopwordsRon;
947
1103
  break;
948
1104
  case 'ru':
1105
+ // rus
949
1106
  language = 'russian';
950
1107
  tokenizer = aggressiveTokenizerRu;
951
1108
  stopwords = stopwordsRu;
952
1109
  break;
953
1110
  case 'ta':
1111
+ // tam
1112
+ // NOTE: no stopwords available
954
1113
  language = 'tamil';
955
1114
  break;
956
1115
  case 'tr':
1116
+ // tur
957
1117
  language = 'turkish';
1118
+ stopwords = stopwordsTur;
958
1119
  break;
959
1120
  case 'vi':
1121
+ // vie
960
1122
  language = 'vietnamese';
961
1123
  tokenizer = aggressiveTokenizerVi;
1124
+ stopwords = stopwordsVie;
962
1125
  stemword = false;
963
1126
  break;
964
1127
  case 'zh':
1128
+ // cmn
1129
+ // ISO 639-3 = zho (Chinese, Macrolanguage)
1130
+ // https://github.com/yishn/chinese-tokenizer
1131
+ tokenizer = {
1132
+ tokenize: (str) =>
1133
+ chineseTokenizer(str).map((results) => results.text)
1134
+ };
965
1135
  language = 'chinese';
966
1136
  stopwords = stopwordsZh;
967
1137
  stemword = false;
@@ -979,7 +1149,7 @@ class SpamScanner {
979
1149
  string
980
1150
  .split(' ')
981
1151
  .map((_string) =>
982
- _string.startsWith(':') &&
1152
+ _string.indexOf(':') === 0 &&
983
1153
  _string.endsWith(':') &&
984
1154
  typeof toEmoji[_string.slice(1, -1)] === 'string'
985
1155
  ? toEmoji[_string.slice(1, -1)]
@@ -1027,7 +1197,10 @@ class SpamScanner {
1027
1197
 
1028
1198
  // now we ensure that URL's and EMAIL's are properly spaced out
1029
1199
  // (e.g. in case ?email=some@email.com was in a URL)
1030
- .replace(EMAIL_REPLACEMENT_REGEX, ` ${this.config.replacements.email} `)
1200
+ .replace(
1201
+ this.EMAIL_REPLACEMENT_REGEX,
1202
+ ` ${this.config.replacements.email} `
1203
+ )
1031
1204
 
1032
1205
  // TODO: replace file paths, file dirs, dotfiles, and dotdirs
1033
1206
 
@@ -1042,12 +1215,14 @@ class SpamScanner {
1042
1215
  // replace currency
1043
1216
  .replace(CURRENCY_REGEX, ` ${this.config.replacements.currency} `);
1044
1217
 
1218
+ //
1045
1219
  // expand contractions so "they're" -> [ they, are ] vs. [ they, re ]
1046
1220
  // <https://github.com/NaturalNode/natural/issues/533>
1047
- if (locale === 'en') string = contractions.expand(string);
1048
-
1049
- // whitelist exclusions
1050
- const whitelistedWords = Object.values(this.config.replacements);
1221
+ //
1222
+ // NOTE: we're doing this for all languages now, not just en
1223
+ // if (locale === 'en')
1224
+ //
1225
+ string = contractions.expand(string);
1051
1226
 
1052
1227
  //
1053
1228
  // Future research:
@@ -1059,45 +1234,37 @@ class SpamScanner {
1059
1234
  //
1060
1235
  const tokens = [];
1061
1236
  for (const token of tokenizer.tokenize(string.toLowerCase())) {
1237
+ // zh tokenizer yields empty strings
1238
+ if (token === '' || token === ' ') continue;
1239
+
1062
1240
  // whitelist words from being stemmed (safeguard)
1063
1241
  if (
1064
- whitelistedWords.includes(token) ||
1065
- token.startsWith(this.config.replacements.initialism) ||
1066
- token.startsWith(this.config.replacements.abbrevation)
1242
+ this.WHITELISTED_WORDS.has(token) ||
1243
+ token.indexOf(this.config.replacements.initialism) === 0 ||
1244
+ token.indexOf(this.config.replacements.abbrevation) === 0
1067
1245
  ) {
1068
1246
  tokens.push(token);
1069
1247
  continue;
1070
1248
  }
1071
1249
 
1072
- if (
1073
- stopwords.includes(token) ||
1074
- (sw[locale] && sw[locale].includes(token)) ||
1075
- (locale !== 'en' &&
1076
- (stopwordsEn.includes(token) || sw.en.includes(token)))
1077
- )
1250
+ if (stopwords.has(token) || (locale !== 'en' && stopwordsEn.has(token))) {
1078
1251
  continue;
1252
+ }
1079
1253
 
1080
1254
  // locale specific stopwords to ignore
1081
1255
  let localeStem;
1082
1256
  if (typeof stemword === 'function') {
1083
1257
  localeStem = stemword(token);
1084
- if (
1085
- localeStem &&
1086
- (stopwords.includes(localeStem) ||
1087
- (sw[locale] && sw[locale].includes(localeStem)))
1088
- )
1258
+ if (localeStem && stopwords.has(localeStem)) {
1089
1259
  continue;
1260
+ }
1090
1261
  }
1091
1262
 
1092
1263
  // always check against English stemwords
1093
1264
  let englishStem;
1094
1265
  if (locale !== 'en') {
1095
1266
  englishStem = snowball.stemword(token, 'english');
1096
- if (
1097
- englishStem &&
1098
- (stopwordsEn.includes(englishStem) || sw.en.includes(englishStem))
1099
- )
1100
- continue;
1267
+ if (englishStem && stopwordsEn.has(englishStem)) continue;
1101
1268
  }
1102
1269
 
1103
1270
  tokens.push(
@@ -1105,6 +1272,8 @@ class SpamScanner {
1105
1272
  );
1106
1273
  }
1107
1274
 
1275
+ debug('locale', locale, 'tokens', tokens);
1276
+
1108
1277
  if (this.config.debug) return tokens;
1109
1278
 
1110
1279
  // we should sha256 all tokens with hasha if not in debug mode
@@ -1117,7 +1286,7 @@ class SpamScanner {
1117
1286
  let source = string;
1118
1287
  if (isBuffer(string)) source = string.toString();
1119
1288
  else if (typeof string === 'string' && isValidPath(string))
1120
- source = await readFile(string);
1289
+ source = await fs.promises.readFile(string);
1121
1290
 
1122
1291
  const tokens = [];
1123
1292
  const mail = await simpleParser(source, this.config.simpleParser);
@@ -1155,12 +1324,11 @@ class SpamScanner {
1155
1324
 
1156
1325
  // eslint-disable-next-line complexity
1157
1326
  async getPhishingResults(mail) {
1158
- const messages = [];
1159
-
1327
+ const messages = new Set();
1160
1328
  //
1161
1329
  // NOTE: all links pushed are lowercased
1162
1330
  //
1163
- const links = [];
1331
+ const links = new Set();
1164
1332
 
1165
1333
  // parse <a> tags with different org domain in text vs the link
1166
1334
  if (isSANB(mail.html)) {
@@ -1170,7 +1338,7 @@ class SpamScanner {
1170
1338
  // elements concatenate to form a URL which is malicious or phishing
1171
1339
  //
1172
1340
  for (const link of this.getUrls(striptags(mail.html, [], ' ').trim())) {
1173
- if (!links.includes(link)) links.push(link);
1341
+ links.add(link);
1174
1342
  }
1175
1343
 
1176
1344
  //
@@ -1212,7 +1380,7 @@ class SpamScanner {
1212
1380
  // (this is needed because some have "Web:%20http://google.com" for example in href tags)
1213
1381
  [href] = this.getUrls(href);
1214
1382
  // eslint-disable-next-line max-depth
1215
- if (href && !links.includes(href)) links.push(href);
1383
+ if (href) links.add(href);
1216
1384
  }
1217
1385
 
1218
1386
  // the text content could contain multiple URL's
@@ -1222,7 +1390,7 @@ class SpamScanner {
1222
1390
  isSANB(href) &&
1223
1391
  validator.isURL(href, isURLOptions)
1224
1392
  ) {
1225
- const string = `Anchor link with href of "${href}" and inner text value of "${textContent}"`;
1393
+ const string = `Anchor link with href of ${href} and inner text value of "${textContent}"`;
1226
1394
  // eslint-disable-next-line max-depth
1227
1395
  if (this.config.checkIDNHomographAttack) {
1228
1396
  const anchorUrlHostname = this.getHostname(href);
@@ -1231,8 +1399,8 @@ class SpamScanner {
1231
1399
  const anchorUrlHostnameToASCII =
1232
1400
  punycode.toASCII(anchorUrlHostname);
1233
1401
  // eslint-disable-next-line max-depth
1234
- if (anchorUrlHostnameToASCII.startsWith('xn--'))
1235
- messages.push(
1402
+ if (anchorUrlHostnameToASCII.indexOf('xn--') === 0)
1403
+ messages.add(
1236
1404
  `${string} has possible IDN homograph attack from anchor hostname.`
1237
1405
  );
1238
1406
  }
@@ -1241,8 +1409,8 @@ class SpamScanner {
1241
1409
  // eslint-disable-next-line max-depth
1242
1410
  for (const link of this.getUrls(textContent)) {
1243
1411
  // this link should have already been included but just in case
1244
- // eslint-disable-next-line max-depth
1245
- if (!links.includes(link)) links.push(link);
1412
+
1413
+ links.add(link);
1246
1414
 
1247
1415
  // eslint-disable-next-line max-depth
1248
1416
  if (this.config.checkIDNHomographAttack) {
@@ -1252,8 +1420,8 @@ class SpamScanner {
1252
1420
  const innerTextUrlHostnameToASCII =
1253
1421
  punycode.toASCII(innerTextUrlHostname);
1254
1422
  // eslint-disable-next-line max-depth
1255
- if (innerTextUrlHostnameToASCII.startsWith('xn--'))
1256
- messages.push(
1423
+ if (innerTextUrlHostnameToASCII.indexOf('xn--') === 0)
1424
+ messages.add(
1257
1425
  `${string} has possible IDN homograph attack from inner text hostname.`
1258
1426
  );
1259
1427
  }
@@ -1269,7 +1437,7 @@ class SpamScanner {
1269
1437
  for (const prop of MAIL_PHISHING_PROPS) {
1270
1438
  if (isSANB(mail[prop])) {
1271
1439
  for (const link of this.getUrls(mail[prop])) {
1272
- if (!links.includes(link)) links.push(link);
1440
+ links.add(link);
1273
1441
  }
1274
1442
  }
1275
1443
  }
@@ -1279,9 +1447,9 @@ class SpamScanner {
1279
1447
  const urlHostname = this.getHostname(link);
1280
1448
  if (urlHostname) {
1281
1449
  const toASCII = punycode.toASCII(urlHostname);
1282
- if (toASCII.startsWith('xn--'))
1283
- messages.push(
1284
- `Possible IDN homograph attack from link of "${link}" with punycode converted hostname of "${toASCII}".`
1450
+ if (toASCII.indexOf('xn--') === 0)
1451
+ messages.add(
1452
+ `Possible IDN homograph attack from link of ${link} with punycode converted hostname of ${toASCII}.`
1285
1453
  );
1286
1454
  }
1287
1455
  }
@@ -1290,28 +1458,25 @@ class SpamScanner {
1290
1458
  // check against Cloudflare malware/phishing/adult DNS lookup
1291
1459
  // if it returns `0.0.0.0` it means it was flagged
1292
1460
  await Promise.all(
1293
- links.map(async (link) => {
1461
+ [...links].map(async (link) => {
1294
1462
  try {
1295
1463
  const urlHostname = this.getHostname(link);
1296
1464
  if (urlHostname) {
1297
1465
  const toASCII = punycode.toASCII(urlHostname);
1298
- const adultMessage = `Link hostname of "${toASCII}" was detected by Cloudflare's Family DNS to contain adult-related content, phishing, and/or malware.`;
1299
- const malwareMessage = `Link hostname of ${toASCII}" was detected by Cloudflare's Security DNS to contain phishing and/or malware.`;
1466
+ const adultMessage = `Link hostname of ${toASCII} was detected by Cloudflare's Family DNS to contain adult-related content, phishing, and/or malware.`;
1467
+ const malwareMessage = `Link hostname of ${toASCII} was detected by Cloudflare's Security DNS to contain phishing and/or malware.`;
1300
1468
 
1301
1469
  // if it already included both messages then return early
1302
- if (
1303
- messages.includes(adultMessage) &&
1304
- messages.includes(malwareMessage)
1305
- )
1470
+ if (messages.has(adultMessage) && messages.has(malwareMessage))
1306
1471
  return;
1307
1472
 
1308
1473
  const { isAdult, isMalware } =
1309
1474
  await this.memoizedIsCloudflareBlocked(toASCII);
1310
1475
 
1311
- if (isAdult && !messages.includes(adultMessage))
1312
- messages.push(adultMessage);
1313
- if (isMalware && !messages.includes(malwareMessage))
1314
- messages.push(malwareMessage);
1476
+ if (isAdult && !messages.has(adultMessage))
1477
+ messages.add(adultMessage);
1478
+ if (isMalware && !messages.has(malwareMessage))
1479
+ messages.add(malwareMessage);
1315
1480
  }
1316
1481
  } catch (err) {
1317
1482
  this.config.logger.error(err);
@@ -1319,7 +1484,7 @@ class SpamScanner {
1319
1484
  })
1320
1485
  );
1321
1486
 
1322
- return { messages, links };
1487
+ return { messages: [...messages], links: [...links] };
1323
1488
  }
1324
1489
 
1325
1490
  // getNSFWResults() {
@@ -1340,7 +1505,7 @@ class SpamScanner {
1340
1505
  try {
1341
1506
  const fileType = await FileType.fromBuffer(attachment.content);
1342
1507
 
1343
- if (fileType && fileType.ext && EXECUTABLES.includes(fileType.ext))
1508
+ if (fileType && fileType.ext && EXECUTABLES.has(fileType.ext))
1344
1509
  messages.push(
1345
1510
  `Attachment's "magic number" indicated it was a dangerous executable with a ".${fileType.ext}" extension.`
1346
1511
  );
@@ -1355,7 +1520,7 @@ class SpamScanner {
1355
1520
  punycode.toUnicode(attachment.filename.split('?')[0])
1356
1521
  );
1357
1522
  const ext = fileExtension(filename);
1358
- if (ext && EXECUTABLES.includes(ext))
1523
+ if (ext && EXECUTABLES.has(ext))
1359
1524
  messages.push(
1360
1525
  `Attachment's file name indicated it was a dangerous executable with a ".${ext}" extension.`
1361
1526
  );
@@ -1363,7 +1528,7 @@ class SpamScanner {
1363
1528
 
1364
1529
  if (isSANB(attachment.contentType)) {
1365
1530
  const ext = mime.extension(attachment.contentType);
1366
- if (isSANB(ext) && EXECUTABLES.includes(ext))
1531
+ if (isSANB(ext) && EXECUTABLES.has(ext))
1367
1532
  messages.push(
1368
1533
  `Attachment's Content-Type was a dangerous executable with a ".${ext}" extension.`
1369
1534
  );