spamscanner 3.0.7 → 5.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.js CHANGED
@@ -1,8 +1,10 @@
1
1
  const dns = require('dns');
2
2
  const fs = require('fs');
3
- const { promisify } = require('util');
3
+ const path = require('path');
4
+ const process = require('process');
5
+ const { debuglog } = require('util');
4
6
 
5
- // eslint-disable-next-line node/no-deprecated-api
7
+ // eslint-disable-next-line n/no-deprecated-api
6
8
  const punycode = require('punycode');
7
9
 
8
10
  const ClamScan = require('clamscan');
@@ -12,7 +14,6 @@ const RE2 = require('re2');
12
14
  const bitcoinRegex = require('bitcoin-regex');
13
15
  const contractions = require('expand-contractions');
14
16
  const creditCardRegex = require('credit-card-regex');
15
- const debug = require('debug')('spamscanner');
16
17
  const emailRegexSafe = require('email-regex-safe');
17
18
  const emojiPatterns = require('emoji-patterns');
18
19
  const escapeStringRegexp = require('escape-string-regexp');
@@ -46,12 +47,28 @@ const toEmoji = require('gemoji/name-to-emoji');
46
47
  const universalify = require('universalify');
47
48
  const urlRegexSafe = require('url-regex-safe');
48
49
  const validator = require('validator');
50
+ const which = require('which');
49
51
  const { Iconv } = require('iconv');
50
52
  const { codes } = require('currency-codes');
51
53
  const { fromUrl, NO_HOSTNAME } = require('parse-domain');
52
54
  const { parse } = require('node-html-parser');
53
55
  const { simpleParser } = require('mailparser');
54
56
 
57
+ const debug = debuglog('spamscanner');
58
+
59
+ //
60
+ // NOTE: we periodically need to update this
61
+ //
62
+ // Source from: CC-CEDICT
63
+ // Licensed under Creative Commons Attribution-ShareAlike 4.0 International License
64
+ // <https://www.mdbg.net/chinese/dictionary?page=cc-cedict>
65
+ //
66
+ // <https://github.com/yishn/chinese-tokenizer>
67
+ //
68
+ const chineseTokenizer = require('chinese-tokenizer').loadFile(
69
+ path.join(__dirname, 'cedict_1_0_ts_utf-8_mdbg.txt')
70
+ );
71
+
55
72
  const aggressiveTokenizer = new natural.AggressiveTokenizer();
56
73
  const orthographyTokenizer = new natural.OrthographyTokenizer({
57
74
  language: 'fi'
@@ -69,20 +86,115 @@ const aggressiveTokenizerSv = new natural.AggressiveTokenizerSv();
69
86
  const aggressiveTokenizerRu = new natural.AggressiveTokenizerRu();
70
87
  const aggressiveTokenizerVi = new natural.AggressiveTokenizerVi();
71
88
 
72
- const stopwordsEn = require('natural/lib/natural/util/stopwords').words;
73
- const stopwordsEs = require('natural/lib/natural/util/stopwords_es').words;
74
- const stopwordsFa = require('natural/lib/natural/util/stopwords_fa').words;
75
- const stopwordsFr = require('natural/lib/natural/util/stopwords_fr').words;
76
- const stopwordsId = require('natural/lib/natural/util/stopwords_id').words;
77
- const stopwordsJa = require('natural/lib/natural/util/stopwords_ja').words;
78
- const stopwordsIt = require('natural/lib/natural/util/stopwords_it').words;
79
- const stopwordsNl = require('natural/lib/natural/util/stopwords_nl').words;
80
- const stopwordsNo = require('natural/lib/natural/util/stopwords_no').words;
81
- const stopwordsPl = require('natural/lib/natural/util/stopwords_pl').words;
82
- const stopwordsPt = require('natural/lib/natural/util/stopwords_pt').words;
83
- const stopwordsRu = require('natural/lib/natural/util/stopwords_ru').words;
84
- const stopwordsSv = require('natural/lib/natural/util/stopwords_sv').words;
85
- const stopwordsZh = require('natural/lib/natural/util/stopwords_zh').words;
89
+ const stopwordsEn = new Set([
90
+ ...require('natural/lib/natural/util/stopwords').words,
91
+ ...sw.eng
92
+ ]);
93
+ const stopwordsEs = new Set([
94
+ ...require('natural/lib/natural/util/stopwords_es').words,
95
+ ...sw.spa
96
+ ]);
97
+ const stopwordsFa = new Set([
98
+ ...require('natural/lib/natural/util/stopwords_fa').words,
99
+ ...sw.fas
100
+ ]);
101
+ const stopwordsFr = new Set([
102
+ ...require('natural/lib/natural/util/stopwords_fr').words,
103
+ ...sw.fra
104
+ ]);
105
+ const stopwordsId = new Set([
106
+ ...require('natural/lib/natural/util/stopwords_id').words,
107
+ ...sw.ind
108
+ ]);
109
+ const stopwordsJa = new Set([
110
+ ...require('natural/lib/natural/util/stopwords_ja').words,
111
+ ...sw.jpn
112
+ ]);
113
+ const stopwordsIt = new Set([
114
+ ...require('natural/lib/natural/util/stopwords_it').words,
115
+ ...sw.ita
116
+ ]);
117
+ const stopwordsNl = new Set([
118
+ ...require('natural/lib/natural/util/stopwords_nl').words,
119
+ ...sw.nld
120
+ ]);
121
+ const stopwordsNo = new Set([
122
+ ...require('natural/lib/natural/util/stopwords_no').words,
123
+ ...sw.nob
124
+ ]);
125
+ const stopwordsPl = new Set([
126
+ ...require('natural/lib/natural/util/stopwords_pl').words,
127
+ ...sw.pol
128
+ ]);
129
+ const stopwordsPt = new Set([
130
+ ...require('natural/lib/natural/util/stopwords_pt').words,
131
+ ...sw.por,
132
+ ...sw.porBr
133
+ ]);
134
+ const stopwordsRu = new Set([
135
+ ...require('natural/lib/natural/util/stopwords_ru').words,
136
+ ...sw.rus
137
+ ]);
138
+ const stopwordsSv = new Set([
139
+ ...require('natural/lib/natural/util/stopwords_sv').words,
140
+ ...sw.swe
141
+ ]);
142
+ const stopwordsZh = new Set([
143
+ ...require('natural/lib/natural/util/stopwords_zh').words,
144
+ ...sw.zho
145
+ ]);
146
+
147
+ const stopwordsRon = new Set(sw.ron);
148
+ const stopwordsTur = new Set(sw.tur);
149
+ const stopwordsVie = new Set(sw.vie);
150
+ const stopwordsDeu = new Set(sw.deu);
151
+ const stopwordsHun = new Set(sw.hun);
152
+ const stopwordsAra = new Set(sw.ara);
153
+ const stopwordsDan = new Set(sw.dan);
154
+ const stopwordsFin = new Set(sw.fin);
155
+
156
+ // TODO: add stopword pairing for these langs:
157
+ // afr
158
+ // ben
159
+ // bre
160
+ // bul
161
+ // cat
162
+ // ces
163
+ // ell
164
+ // epo
165
+ // est
166
+ // eus
167
+ // fra
168
+ // gle
169
+ // glg
170
+ // guj
171
+ // hau
172
+ // heb
173
+ // hin
174
+ // hrv
175
+ // hye
176
+ // kor
177
+ // kur
178
+ // lat
179
+ // lav
180
+ // lgg
181
+ // lggNd
182
+ // lit
183
+ // mar
184
+ // msa
185
+ // mya
186
+ // panGu
187
+ // slk
188
+ // slv
189
+ // som
190
+ // sot
191
+ // swa
192
+ // tgl
193
+ // tha
194
+ // ukr
195
+ // urd
196
+ // yor
197
+ // zul
86
198
 
87
199
  // <https://stackoverflow.com/a/41353282>
88
200
  // <https://www.ietf.org/rfc/rfc3986.txt>
@@ -92,19 +204,20 @@ const ENDING_RESERVED_REGEX = new RE2(
92
204
 
93
205
  const PKG = require('./package.json');
94
206
 
95
- const VOCABULARY_LIMIT = require('./vocabulary-limit');
207
+ const VOCABULARY_LIMIT = require('./vocabulary-limit.js');
208
+
209
+ // TODO: convert this into a Map
210
+ const ISO_CODE_MAPPING = require('./iso-code-mapping.json');
96
211
 
97
- const ISO_CODE_MAPPING = require('./iso-code-mapping');
212
+ const ISO_CODE_MAPPING_KEYS = Object.keys(ISO_CODE_MAPPING);
98
213
 
99
214
  // <https://kb.smarshmail.com/Article/23567>
100
- const EXECUTABLES = require('./executables');
215
+ const EXECUTABLES = new Set(require('./executables.json'));
101
216
 
102
- const REPLACEMENT_WORDS = require('./replacement-words');
217
+ const REPLACEMENT_WORDS = require('./replacement-words.json');
103
218
 
104
219
  const locales = new Set(i18nLocales.map((l) => l.toLowerCase()));
105
220
 
106
- const readFile = promisify(fs.readFile);
107
-
108
221
  const normalizeUrlOptions = {
109
222
  stripProtocol: true,
110
223
  stripWWW: false,
@@ -154,7 +267,8 @@ for (const code of codes()) {
154
267
  const symbol = getSymbolFromCurrency(code);
155
268
  if (
156
269
  typeof symbol === 'string' &&
157
- !currencySymbols.includes(symbol) &&
270
+ // eslint-disable-next-line unicorn/prefer-includes
271
+ currencySymbols.indexOf(symbol) === -1 &&
158
272
  !new RE2(/^[a-z]+$/i).test(symbol)
159
273
  )
160
274
  currencySymbols.push(escapeStringRegexp(symbol));
@@ -187,11 +301,13 @@ const isURLOptions = {
187
301
  class SpamScanner {
188
302
  constructor(config = {}) {
189
303
  this.config = {
190
- debug: process.env.NODE_ENV === 'test',
304
+ debug:
305
+ process.env.NODE_ENV === 'test' ||
306
+ process.env.NODE_ENV === 'development',
191
307
  checkIDNHomographAttack: false,
192
308
  // note that if you attempt to train an existing `scanner.classifier`
193
309
  // then you will need to re-use these, so we suggest you store them
194
- replacements: config.replacements || require('./replacements'),
310
+ replacements: config.replacements || require('./replacements.js'),
195
311
  // <https://nodemailer.com/extras/mailparser/>
196
312
  // NOTE: `iconv` package's Iconv cannot be used in worker threads
197
313
  // AND it can not also be shared in worker threads either (e.g. cloned)
@@ -203,7 +319,7 @@ class SpamScanner {
203
319
  // `wget --mirror --passive-ftp ftp://ftp.ietf.org/ietf-mail-archive`
204
320
  // `wget --mirror --passive-ftp ftp://ftp.ietf.org/concluded-wg-ietf-mail-archive`
205
321
  // (spam dataset is private at the moment)
206
- classifier: config.classifier || require('./get-classifier'),
322
+ classifier: config.classifier || require('./get-classifier.js'),
207
323
  // default locale validated against i18n-locales
208
324
  locale: 'en',
209
325
  // we recommend to use axe/cabin, see https://cabinjs.com
@@ -310,21 +426,22 @@ class SpamScanner {
310
426
  allowedAttributes: false
311
427
  },
312
428
  userAgent: `${PKG.name}/${PKG.version}`,
313
- timeout: ms('5s'),
429
+ timeout: ms('10s'),
314
430
  clamscan: {
431
+ debugMode:
432
+ process.env.NODE_ENV === 'test' ||
433
+ process.env.NODE_ENV === 'development',
434
+ clamscan: {
435
+ path: which.sync('clamscan', { nothrow: true })
436
+ },
315
437
  clamdscan: {
316
438
  timeout: ms('10s'),
439
+ path: which.sync('clamdscan', { nothrow: true }),
317
440
  socket: macosVersion.isMacOS
318
441
  ? '/tmp/clamd.socket'
319
442
  : '/var/run/clamav/clamd.ctl'
320
443
  }
321
444
  },
322
- franc: {
323
- minLength: 100,
324
- // we can only support languages available
325
- // in stopwords and natural's tokenizer methods
326
- only: Object.keys(ISO_CODE_MAPPING)
327
- },
328
445
  hasha: {
329
446
  algorithm: 'sha256'
330
447
  },
@@ -339,6 +456,21 @@ class SpamScanner {
339
456
  client: false,
340
457
  cachePrefix: 'spamscanner',
341
458
  ttlMs: ms('1h'),
459
+ // franc
460
+ franc: {
461
+ // NOTE: if locale was passed and was valid
462
+ // then we need to compare it against english
463
+ // and if it was english detected (and not und)
464
+ // then switch the detected locale to english
465
+ minLength: 5,
466
+ // we can only support languages available
467
+ // in stopwords and natural's tokenizer methods
468
+ // and if it was detected to be english, compare against all languages
469
+ // otherwise if not, then compare only against english
470
+ // (namely we need to check against JP/ZH, but perhaps _all_ in future)
471
+ // (the edge case is that someone could spoof a language and it could go undetected and tokenization bugs occur)
472
+ only: ISO_CODE_MAPPING_KEYS
473
+ },
342
474
  ...config
343
475
  };
344
476
 
@@ -416,7 +548,6 @@ class SpamScanner {
416
548
  // cache in the background
417
549
  this.config.client
418
550
  .set(key, `${isAdult}:${isMalware}`, 'PX', this.config.ttlMs)
419
- // eslint-disable-next-line promise/prefer-await-to-then
420
551
  .then(this.config.logger.info)
421
552
  .catch(this.config.logger.error);
422
553
  return { isAdult, isMalware };
@@ -431,6 +562,27 @@ class SpamScanner {
431
562
  throw new Error(
432
563
  `Locale of ${this.config.locale} was not valid according to locales list.`
433
564
  );
565
+
566
+ //
567
+ // set up regex helpers
568
+ //
569
+ this.EMAIL_REPLACEMENT_REGEX = new RE2(this.config.replacements.email, 'g');
570
+ const replacementRegexes = [];
571
+ for (const key of Object.keys(this.config.replacements)) {
572
+ replacementRegexes.push(
573
+ escapeStringRegexp(this.config.replacements[key])
574
+ );
575
+ }
576
+
577
+ this.REPLACEMENTS_REGEX = new RE2(
578
+ new RegExp(replacementRegexes.join('|'), 'g')
579
+ );
580
+
581
+ //
582
+ // set up helper Map and Sets for fast lookup
583
+ // (Set.has is 2x faster than includes, and 50% faster than indexOf)
584
+ //
585
+ this.WHITELISTED_WORDS = new Set(Object.values(this.config.replacements));
434
586
  }
435
587
 
436
588
  getHostname(link) {
@@ -520,17 +672,12 @@ class SpamScanner {
520
672
  const stream = isStream(attachment.content)
521
673
  ? attachment.content
522
674
  : intoStream(attachment.content);
523
- const {
524
- is_infected: isInfected,
525
- viruses
526
- } = await clamscan.scan_stream(stream);
675
+ const { isInfected, viruses } = await clamscan.scanStream(stream);
527
676
  const name = isSANB(attachment.filename)
528
677
  ? `"${attachment.filename}"`
529
678
  : `#${i + 1}`;
530
679
  if (isInfected)
531
- messages.push(
532
- `Attachment ${name} was infected with "${viruses}".`
533
- );
680
+ messages.push(`Attachment ${name} was infected with ${viruses}.`);
534
681
  } catch (err) {
535
682
  this.config.logger.error(err);
536
683
  }
@@ -548,13 +695,16 @@ class SpamScanner {
548
695
 
549
696
  let gtube = false;
550
697
 
551
- if (isSANB(mail.html) && mail.html.includes(GTUBE)) gtube = true;
698
+ // eslint-disable-next-line unicorn/prefer-includes
699
+ if (isSANB(mail.html) && mail.html.indexOf(GTUBE) !== -1) gtube = true;
552
700
 
553
- if (isSANB(mail.text) && !gtube && mail.text.includes(GTUBE)) gtube = true;
701
+ // eslint-disable-next-line unicorn/prefer-includes
702
+ if (isSANB(mail.text) && !gtube && mail.text.indexOf(GTUBE) !== -1)
703
+ gtube = true;
554
704
 
555
705
  if (gtube)
556
706
  messages.push(
557
- 'Message detected to contain the GTUBE test from <https://spamassassin.apache.org/gtube/>.'
707
+ 'Message detected to contain the GTUBE test from https://spamassassin.apache.org/gtube/.'
558
708
  );
559
709
 
560
710
  return messages;
@@ -597,9 +747,8 @@ class SpamScanner {
597
747
  records[0] === '0.0.0.0'
598
748
  );
599
749
  } catch (err) {
600
- this.config.logger.error(err);
601
- // return true if there is an error with DNS lookups
602
- return true;
750
+ this.config.logger.warn(err);
751
+ return false;
603
752
  }
604
753
  }
605
754
  }
@@ -621,8 +770,6 @@ class SpamScanner {
621
770
  //
622
771
  // However we don't recommend this and therefore have our servers set to standard Cloudflare DNS
623
772
  //
624
- // TODO: we need to do two lookups in parallel, one against adult and one against malware
625
- // and also make sure the messages aren't duplicated when we concatenate final array of messages
626
773
  const [isAdult, isMalware] = await Promise.all([
627
774
  this.malwareLookup('https://family.cloudflare-dns.com/dns-query', name),
628
775
  this.malwareLookup('https://security.cloudflare-dns.com/dns-query', name)
@@ -744,14 +891,14 @@ class SpamScanner {
744
891
  })
745
892
  .match(URL_REGEX) || [];
746
893
 
747
- const array = [];
894
+ const array = new Set();
748
895
  for (const url of urls) {
749
896
  const normalized = this.getNormalizedUrl(url);
750
897
 
751
- if (normalized && !array.includes(normalized)) array.push(normalized);
898
+ if (normalized) array.add(normalized);
752
899
  }
753
900
 
754
- return array;
901
+ return [...array];
755
902
  }
756
903
 
757
904
  parseLocale(locale) {
@@ -765,12 +912,6 @@ class SpamScanner {
765
912
  // <https://github.com/NaturalNode/natural#stemmers>
766
913
  // eslint-disable-next-line complexity
767
914
  async getTokens(string, locale, isHTML = false) {
768
- // get the current email replacement regex
769
- const EMAIL_REPLACEMENT_REGEX = new RE2(
770
- this.config.replacements.email,
771
- 'g'
772
- );
773
-
774
915
  //
775
916
  // parse HTML for <html> tag with lang attr
776
917
  // otherwise if that wasn't found then look for this
@@ -818,17 +959,6 @@ class SpamScanner {
818
959
 
819
960
  if (isHTML) string = sanitizeHtml(string, this.config.sanitizeHtml);
820
961
 
821
- const replacementRegexes = [];
822
- for (const key of Object.keys(this.config.replacements)) {
823
- replacementRegexes.push(
824
- escapeStringRegexp(this.config.replacements[key])
825
- );
826
- }
827
-
828
- const REPLACEMENTS_REGEX = new RE2(
829
- new RegExp(replacementRegexes.join('|'), 'g')
830
- );
831
-
832
962
  string = striptags(string, [], ' ')
833
963
  .trim()
834
964
  // replace newlines
@@ -837,7 +967,7 @@ class SpamScanner {
837
967
  // attackers may try to inject our replacements into the message
838
968
  // therefore we should strip all of them before doing any replacements
839
969
  //
840
- .replace(REPLACEMENTS_REGEX, ' ');
970
+ .replace(this.REPLACEMENTS_REGEX, ' ');
841
971
 
842
972
  //
843
973
  // we should instead use language detection to determine
@@ -847,6 +977,7 @@ class SpamScanner {
847
977
  // <https://github.com/FGRibreau/node-language-detect> (not too accurate)
848
978
  //
849
979
  const detectedLanguage = franc(string, this.config.franc);
980
+
850
981
  if (
851
982
  detectedLanguage !== 'und' &&
852
983
  isSANB(ISO_CODE_MAPPING[detectedLanguage])
@@ -855,7 +986,8 @@ class SpamScanner {
855
986
 
856
987
  locale = this.parseLocale(isSANB(locale) ? locale : this.config.locale);
857
988
 
858
- if (!locales.has(locale)) {
989
+ // NOTE: "in" and "po" are valid locales but not from i18n
990
+ if (!locales.has(locale) && locale !== 'in' && locale !== 'po') {
859
991
  debug(`Locale ${locale} was not valid and will use default`);
860
992
  locale = this.parseLocale(this.config.locale);
861
993
  }
@@ -867,103 +999,149 @@ class SpamScanner {
867
999
  let stopwords = stopwordsEn;
868
1000
  let language = 'english';
869
1001
  let stemword = 'default';
1002
+
870
1003
  switch (locale) {
871
1004
  case 'ar':
1005
+ // arb
1006
+ // ISO 639-3 = ara
1007
+ stopwords = stopwordsAra;
872
1008
  language = 'arabic';
873
1009
  break;
874
1010
  case 'da':
1011
+ // dan
875
1012
  language = 'danish';
1013
+ stopwords = stopwordsDan;
876
1014
  break;
877
1015
  case 'nl':
1016
+ // nld
878
1017
  stopwords = stopwordsNl;
879
1018
  language = 'dutch';
880
1019
  break;
881
1020
  case 'en':
1021
+ // eng
882
1022
  language = 'english';
883
1023
  break;
884
1024
  case 'fi':
1025
+ // fin
885
1026
  language = 'finnish';
886
1027
  tokenizer = orthographyTokenizer;
1028
+ stopwords = stopwordsFin;
887
1029
  break;
888
1030
  case 'fa':
1031
+ // fas (Persian/Farsi)
889
1032
  language = 'farsi';
890
1033
  tokenizer = aggressiveTokenizerFa;
891
1034
  stopwords = stopwordsFa;
892
1035
  stemword = natural.PorterStemmerFa.stem.bind(natural.PorterStemmerFa);
893
1036
  break;
894
1037
  case 'fr':
1038
+ // fra
895
1039
  language = 'french';
896
1040
  tokenizer = aggressiveTokenizerFr;
897
1041
  stopwords = stopwordsFr;
898
1042
  break;
899
1043
  case 'de':
1044
+ // deu
900
1045
  language = 'german';
1046
+ stopwords = stopwordsDeu;
901
1047
  break;
902
1048
  case 'hu':
1049
+ // hun
903
1050
  language = 'hungarian';
1051
+ stopwords = stopwordsHun;
904
1052
  break;
905
1053
  case 'in':
1054
+ // ind
906
1055
  language = 'indonesian';
907
1056
  tokenizer = aggressiveTokenizerId;
908
1057
  stopwords = stopwordsId;
909
1058
  break;
910
1059
  case 'it':
1060
+ // ita
911
1061
  language = 'italian';
912
1062
  tokenizer = aggressiveTokenizerIt;
913
1063
  stopwords = stopwordsIt;
914
1064
  break;
915
1065
  case 'ja':
1066
+ // jpn
916
1067
  tokenizer = tokenizerJa;
917
1068
  stopwords = stopwordsJa;
918
1069
  stemword = natural.StemmerJa.stem.bind(natural.StemmerJa);
919
1070
  break;
920
1071
  case 'nb':
1072
+ // nob
1073
+ language = 'norwegian';
1074
+ tokenizer = aggressiveTokenizerNo;
1075
+ stopwords = stopwordsNo;
1076
+ break;
921
1077
  case 'nn':
1078
+ // nno
1079
+ // ISO 639-3 = nob
922
1080
  language = 'norwegian';
923
1081
  tokenizer = aggressiveTokenizerNo;
924
1082
  stopwords = stopwordsNo;
925
1083
  break;
926
1084
  case 'po':
1085
+ // pol
927
1086
  language = 'polish';
928
1087
  tokenizer = aggressiveTokenizerPl;
929
1088
  stopwords = stopwordsPl;
930
1089
  stemword = false;
931
1090
  break;
932
1091
  case 'pt':
1092
+ // por
933
1093
  language = 'portuguese';
934
1094
  tokenizer = aggressiveTokenizerPt;
935
1095
  stopwords = stopwordsPt;
936
1096
  break;
937
1097
  case 'es':
1098
+ // spa
938
1099
  language = 'spanish';
939
1100
  tokenizer = aggressiveTokenizerEs;
940
1101
  stopwords = stopwordsEs;
941
1102
  break;
942
1103
  case 'sv':
1104
+ // swe
943
1105
  language = 'swedish';
944
1106
  tokenizer = aggressiveTokenizerSv;
945
1107
  stopwords = stopwordsSv;
946
1108
  break;
947
1109
  case 'ro':
1110
+ // ron
948
1111
  language = 'romanian';
1112
+ stopwords = stopwordsRon;
949
1113
  break;
950
1114
  case 'ru':
1115
+ // rus
951
1116
  language = 'russian';
952
1117
  tokenizer = aggressiveTokenizerRu;
953
1118
  stopwords = stopwordsRu;
954
1119
  break;
955
1120
  case 'ta':
1121
+ // tam
1122
+ // NOTE: no stopwords available
956
1123
  language = 'tamil';
957
1124
  break;
958
1125
  case 'tr':
1126
+ // tur
959
1127
  language = 'turkish';
1128
+ stopwords = stopwordsTur;
960
1129
  break;
961
1130
  case 'vi':
1131
+ // vie
962
1132
  language = 'vietnamese';
963
1133
  tokenizer = aggressiveTokenizerVi;
1134
+ stopwords = stopwordsVie;
964
1135
  stemword = false;
965
1136
  break;
966
1137
  case 'zh':
1138
+ // cmn
1139
+ // ISO 639-3 = zho (Chinese, Macrolanguage)
1140
+ // https://github.com/yishn/chinese-tokenizer
1141
+ tokenizer = {
1142
+ tokenize: (str) =>
1143
+ chineseTokenizer(str).map((results) => results.text)
1144
+ };
967
1145
  language = 'chinese';
968
1146
  stopwords = stopwordsZh;
969
1147
  stemword = false;
@@ -981,7 +1159,7 @@ class SpamScanner {
981
1159
  string
982
1160
  .split(' ')
983
1161
  .map((_string) =>
984
- _string.startsWith(':') &&
1162
+ _string.indexOf(':') === 0 &&
985
1163
  _string.endsWith(':') &&
986
1164
  typeof toEmoji[_string.slice(1, -1)] === 'string'
987
1165
  ? toEmoji[_string.slice(1, -1)]
@@ -1029,7 +1207,10 @@ class SpamScanner {
1029
1207
 
1030
1208
  // now we ensure that URL's and EMAIL's are properly spaced out
1031
1209
  // (e.g. in case ?email=some@email.com was in a URL)
1032
- .replace(EMAIL_REPLACEMENT_REGEX, ` ${this.config.replacements.email} `)
1210
+ .replace(
1211
+ this.EMAIL_REPLACEMENT_REGEX,
1212
+ ` ${this.config.replacements.email} `
1213
+ )
1033
1214
 
1034
1215
  // TODO: replace file paths, file dirs, dotfiles, and dotdirs
1035
1216
 
@@ -1044,12 +1225,14 @@ class SpamScanner {
1044
1225
  // replace currency
1045
1226
  .replace(CURRENCY_REGEX, ` ${this.config.replacements.currency} `);
1046
1227
 
1228
+ //
1047
1229
  // expand contractions so "they're" -> [ they, are ] vs. [ they, re ]
1048
1230
  // <https://github.com/NaturalNode/natural/issues/533>
1049
- if (locale === 'en') string = contractions.expand(string);
1050
-
1051
- // whitelist exclusions
1052
- const whitelistedWords = Object.values(this.config.replacements);
1231
+ //
1232
+ // NOTE: we're doing this for all languages now, not just en
1233
+ // if (locale === 'en')
1234
+ //
1235
+ string = contractions.expand(string);
1053
1236
 
1054
1237
  //
1055
1238
  // Future research:
@@ -1061,45 +1244,37 @@ class SpamScanner {
1061
1244
  //
1062
1245
  const tokens = [];
1063
1246
  for (const token of tokenizer.tokenize(string.toLowerCase())) {
1247
+ // zh tokenizr yields empty strings
1248
+ if (token === '' || token === ' ') continue;
1249
+
1064
1250
  // whitelist words from being stemmed (safeguard)
1065
1251
  if (
1066
- whitelistedWords.includes(token) ||
1067
- token.startsWith(this.config.replacements.initialism) ||
1068
- token.startsWith(this.config.replacements.abbrevation)
1252
+ this.WHITELISTED_WORDS.has(token) ||
1253
+ token.indexOf(this.config.replacements.initialism) === 0 ||
1254
+ token.indexOf(this.config.replacements.abbrevation) === 0
1069
1255
  ) {
1070
1256
  tokens.push(token);
1071
1257
  continue;
1072
1258
  }
1073
1259
 
1074
- if (
1075
- stopwords.includes(token) ||
1076
- (sw[locale] && sw[locale].includes(token)) ||
1077
- (locale !== 'en' &&
1078
- (stopwordsEn.includes(token) || sw.en.includes(token)))
1079
- )
1260
+ if (stopwords.has(token) || (locale !== 'en' && stopwordsEn.has(token))) {
1080
1261
  continue;
1262
+ }
1081
1263
 
1082
1264
  // locale specific stopwords to ignore
1083
1265
  let localeStem;
1084
1266
  if (typeof stemword === 'function') {
1085
1267
  localeStem = stemword(token);
1086
- if (
1087
- localeStem &&
1088
- (stopwords.includes(localeStem) ||
1089
- (sw[locale] && sw[locale].includes(localeStem)))
1090
- )
1268
+ if (localeStem && stopwords.has(localeStem)) {
1091
1269
  continue;
1270
+ }
1092
1271
  }
1093
1272
 
1094
1273
  // always check against English stemwords
1095
1274
  let englishStem;
1096
1275
  if (locale !== 'en') {
1097
1276
  englishStem = snowball.stemword(token, 'english');
1098
- if (
1099
- englishStem &&
1100
- (stopwordsEn.includes(englishStem) || sw.en.includes(englishStem))
1101
- )
1102
- continue;
1277
+ if (englishStem && stopwordsEn.has(englishStem)) continue;
1103
1278
  }
1104
1279
 
1105
1280
  tokens.push(
@@ -1107,6 +1282,8 @@ class SpamScanner {
1107
1282
  );
1108
1283
  }
1109
1284
 
1285
+ debug('locale', locale, 'tokens', tokens);
1286
+
1110
1287
  if (this.config.debug) return tokens;
1111
1288
 
1112
1289
  // we should sha256 all tokens with hasha if not in debug mode
@@ -1119,7 +1296,7 @@ class SpamScanner {
1119
1296
  let source = string;
1120
1297
  if (isBuffer(string)) source = string.toString();
1121
1298
  else if (typeof string === 'string' && isValidPath(string))
1122
- source = await readFile(string);
1299
+ source = await fs.promises.readFile(string);
1123
1300
 
1124
1301
  const tokens = [];
1125
1302
  const mail = await simpleParser(source, this.config.simpleParser);
@@ -1157,12 +1334,11 @@ class SpamScanner {
1157
1334
 
1158
1335
  // eslint-disable-next-line complexity
1159
1336
  async getPhishingResults(mail) {
1160
- const messages = [];
1161
-
1337
+ const messages = new Set();
1162
1338
  //
1163
1339
  // NOTE: all links pushed are lowercased
1164
1340
  //
1165
- const links = [];
1341
+ const links = new Set();
1166
1342
 
1167
1343
  // parse <a> tags with different org domain in text vs the link
1168
1344
  if (isSANB(mail.html)) {
@@ -1172,7 +1348,7 @@ class SpamScanner {
1172
1348
  // elements concatenate to form a URL which is malicious or phishing
1173
1349
  //
1174
1350
  for (const link of this.getUrls(striptags(mail.html, [], ' ').trim())) {
1175
- if (!links.includes(link)) links.push(link);
1351
+ links.add(link);
1176
1352
  }
1177
1353
 
1178
1354
  //
@@ -1214,7 +1390,7 @@ class SpamScanner {
1214
1390
  // (this is needed because some have "Web:%20http://google.com" for example in href tags)
1215
1391
  [href] = this.getUrls(href);
1216
1392
  // eslint-disable-next-line max-depth
1217
- if (href && !links.includes(href)) links.push(href);
1393
+ if (href) links.add(href);
1218
1394
  }
1219
1395
 
1220
1396
  // the text content could contain multiple URL's
@@ -1224,18 +1400,17 @@ class SpamScanner {
1224
1400
  isSANB(href) &&
1225
1401
  validator.isURL(href, isURLOptions)
1226
1402
  ) {
1227
- const string = `Anchor link with href of "${href}" and inner text value of "${textContent}"`;
1403
+ const string = `Anchor link with href of ${href} and inner text value of "${textContent}"`;
1228
1404
  // eslint-disable-next-line max-depth
1229
1405
  if (this.config.checkIDNHomographAttack) {
1230
1406
  const anchorUrlHostname = this.getHostname(href);
1231
1407
  // eslint-disable-next-line max-depth
1232
1408
  if (anchorUrlHostname) {
1233
- const anchorUrlHostnameToASCII = punycode.toASCII(
1234
- anchorUrlHostname
1235
- );
1409
+ const anchorUrlHostnameToASCII =
1410
+ punycode.toASCII(anchorUrlHostname);
1236
1411
  // eslint-disable-next-line max-depth
1237
- if (anchorUrlHostnameToASCII.startsWith('xn--'))
1238
- messages.push(
1412
+ if (anchorUrlHostnameToASCII.indexOf('xn--') === 0)
1413
+ messages.add(
1239
1414
  `${string} has possible IDN homograph attack from anchor hostname.`
1240
1415
  );
1241
1416
  }
@@ -1244,20 +1419,19 @@ class SpamScanner {
1244
1419
  // eslint-disable-next-line max-depth
1245
1420
  for (const link of this.getUrls(textContent)) {
1246
1421
  // this link should have already been included but just in case
1247
- // eslint-disable-next-line max-depth
1248
- if (!links.includes(link)) links.push(link);
1422
+
1423
+ links.add(link);
1249
1424
 
1250
1425
  // eslint-disable-next-line max-depth
1251
1426
  if (this.config.checkIDNHomographAttack) {
1252
1427
  const innerTextUrlHostname = this.getHostname(link);
1253
1428
  // eslint-disable-next-line max-depth
1254
1429
  if (innerTextUrlHostname) {
1255
- const innerTextUrlHostnameToASCII = punycode.toASCII(
1256
- innerTextUrlHostname
1257
- );
1430
+ const innerTextUrlHostnameToASCII =
1431
+ punycode.toASCII(innerTextUrlHostname);
1258
1432
  // eslint-disable-next-line max-depth
1259
- if (innerTextUrlHostnameToASCII.startsWith('xn--'))
1260
- messages.push(
1433
+ if (innerTextUrlHostnameToASCII.indexOf('xn--') === 0)
1434
+ messages.add(
1261
1435
  `${string} has possible IDN homograph attack from inner text hostname.`
1262
1436
  );
1263
1437
  }
@@ -1273,7 +1447,7 @@ class SpamScanner {
1273
1447
  for (const prop of MAIL_PHISHING_PROPS) {
1274
1448
  if (isSANB(mail[prop])) {
1275
1449
  for (const link of this.getUrls(mail[prop])) {
1276
- if (!links.includes(link)) links.push(link);
1450
+ links.add(link);
1277
1451
  }
1278
1452
  }
1279
1453
  }
@@ -1283,9 +1457,9 @@ class SpamScanner {
1283
1457
  const urlHostname = this.getHostname(link);
1284
1458
  if (urlHostname) {
1285
1459
  const toASCII = punycode.toASCII(urlHostname);
1286
- if (toASCII.startsWith('xn--'))
1287
- messages.push(
1288
- `Possible IDN homograph attack from link of "${link}" with punycode converted hostname of "${toASCII}".`
1460
+ if (toASCII.indexOf('xn--') === 0)
1461
+ messages.add(
1462
+ `Possible IDN homograph attack from link of ${link} with punycode converted hostname of ${toASCII}.`
1289
1463
  );
1290
1464
  }
1291
1465
  }
@@ -1294,30 +1468,25 @@ class SpamScanner {
1294
1468
  // check against Cloudflare malware/phishing/adult DNS lookup
1295
1469
  // if it returns `0.0.0.0` it means it was flagged
1296
1470
  await Promise.all(
1297
- links.map(async (link) => {
1471
+ [...links].map(async (link) => {
1298
1472
  try {
1299
1473
  const urlHostname = this.getHostname(link);
1300
1474
  if (urlHostname) {
1301
1475
  const toASCII = punycode.toASCII(urlHostname);
1302
- const adultMessage = `Link hostname of "${toASCII}" was detected by Cloudflare's Family DNS to contain adult-related content, phishing, and/or malware.`;
1303
- const malwareMessage = `Link hostname of ${toASCII}" was detected by Cloudflare's Security DNS to contain phishing and/or malware.`;
1476
+ const adultMessage = `Link hostname of ${toASCII} was detected by Cloudflare's Family DNS to contain adult-related content, phishing, and/or malware.`;
1477
+ const malwareMessage = `Link hostname of ${toASCII} was detected by Cloudflare's Security DNS to contain phishing and/or malware.`;
1304
1478
 
1305
1479
  // if it already included both messages then return early
1306
- if (
1307
- messages.includes(adultMessage) &&
1308
- messages.includes(malwareMessage)
1309
- )
1480
+ if (messages.has(adultMessage) && messages.has(malwareMessage))
1310
1481
  return;
1311
1482
 
1312
- const {
1313
- isAdult,
1314
- isMalware
1315
- } = await this.memoizedIsCloudflareBlocked(toASCII);
1483
+ const { isAdult, isMalware } =
1484
+ await this.memoizedIsCloudflareBlocked(toASCII);
1316
1485
 
1317
- if (isAdult && !messages.includes(adultMessage))
1318
- messages.push(adultMessage);
1319
- if (isMalware && !messages.includes(malwareMessage))
1320
- messages.push(malwareMessage);
1486
+ if (isAdult && !messages.has(adultMessage))
1487
+ messages.add(adultMessage);
1488
+ if (isMalware && !messages.has(malwareMessage))
1489
+ messages.add(malwareMessage);
1321
1490
  }
1322
1491
  } catch (err) {
1323
1492
  this.config.logger.error(err);
@@ -1325,7 +1494,7 @@ class SpamScanner {
1325
1494
  })
1326
1495
  );
1327
1496
 
1328
- return { messages, links };
1497
+ return { messages: [...messages], links: [...links] };
1329
1498
  }
1330
1499
 
1331
1500
  // getNSFWResults() {
@@ -1346,7 +1515,7 @@ class SpamScanner {
1346
1515
  try {
1347
1516
  const fileType = await FileType.fromBuffer(attachment.content);
1348
1517
 
1349
- if (fileType && fileType.ext && EXECUTABLES.includes(fileType.ext))
1518
+ if (fileType && fileType.ext && EXECUTABLES.has(fileType.ext))
1350
1519
  messages.push(
1351
1520
  `Attachment's "magic number" indicated it was a dangerous executable with a ".${fileType.ext}" extension.`
1352
1521
  );
@@ -1361,7 +1530,7 @@ class SpamScanner {
1361
1530
  punycode.toUnicode(attachment.filename.split('?')[0])
1362
1531
  );
1363
1532
  const ext = fileExtension(filename);
1364
- if (ext && EXECUTABLES.includes(ext))
1533
+ if (ext && EXECUTABLES.has(ext))
1365
1534
  messages.push(
1366
1535
  `Attachment's file name indicated it was a dangerous executable with a ".${ext}" extension.`
1367
1536
  );
@@ -1369,7 +1538,7 @@ class SpamScanner {
1369
1538
 
1370
1539
  if (isSANB(attachment.contentType)) {
1371
1540
  const ext = mime.extension(attachment.contentType);
1372
- if (isSANB(ext) && EXECUTABLES.includes(ext))
1541
+ if (isSANB(ext) && EXECUTABLES.has(ext))
1373
1542
  messages.push(
1374
1543
  `Attachment's Content-Type was a dangerous executable with a ".${ext}" extension.`
1375
1544
  );