spamscanner 4.0.0 → 5.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +79 -8
- package/cedict_1_0_ts_utf-8_mdbg.txt +120853 -0
- package/index.js +284 -119
- package/package.json +37 -73
- package/vocabulary-limit.js +3 -1
package/index.js
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
const dns = require('dns');
|
|
2
2
|
const fs = require('fs');
|
|
3
|
-
const
|
|
3
|
+
const path = require('path');
|
|
4
|
+
const process = require('process');
|
|
5
|
+
const { debuglog } = require('util');
|
|
4
6
|
|
|
5
|
-
// eslint-disable-next-line
|
|
7
|
+
// eslint-disable-next-line n/no-deprecated-api
|
|
6
8
|
const punycode = require('punycode');
|
|
7
9
|
|
|
8
10
|
const ClamScan = require('clamscan');
|
|
@@ -12,7 +14,6 @@ const RE2 = require('re2');
|
|
|
12
14
|
const bitcoinRegex = require('bitcoin-regex');
|
|
13
15
|
const contractions = require('expand-contractions');
|
|
14
16
|
const creditCardRegex = require('credit-card-regex');
|
|
15
|
-
const debug = require('debug')('spamscanner');
|
|
16
17
|
const emailRegexSafe = require('email-regex-safe');
|
|
17
18
|
const emojiPatterns = require('emoji-patterns');
|
|
18
19
|
const escapeStringRegexp = require('escape-string-regexp');
|
|
@@ -46,12 +47,28 @@ const toEmoji = require('gemoji/name-to-emoji');
|
|
|
46
47
|
const universalify = require('universalify');
|
|
47
48
|
const urlRegexSafe = require('url-regex-safe');
|
|
48
49
|
const validator = require('validator');
|
|
50
|
+
const which = require('which');
|
|
49
51
|
const { Iconv } = require('iconv');
|
|
50
52
|
const { codes } = require('currency-codes');
|
|
51
53
|
const { fromUrl, NO_HOSTNAME } = require('parse-domain');
|
|
52
54
|
const { parse } = require('node-html-parser');
|
|
53
55
|
const { simpleParser } = require('mailparser');
|
|
54
56
|
|
|
57
|
+
const debug = debuglog('spamscanner');
|
|
58
|
+
|
|
59
|
+
//
|
|
60
|
+
// NOTE: we periodically need to update this
|
|
61
|
+
//
|
|
62
|
+
// Source from: CC-CEDICT
|
|
63
|
+
// Licensed under Creative Commons Attribution-ShareAlike 4.0 International License
|
|
64
|
+
// <https://www.mdbg.net/chinese/dictionary?page=cc-cedict>
|
|
65
|
+
//
|
|
66
|
+
// <https://github.com/yishn/chinese-tokenizer>
|
|
67
|
+
//
|
|
68
|
+
const chineseTokenizer = require('chinese-tokenizer').loadFile(
|
|
69
|
+
path.join(__dirname, 'cedict_1_0_ts_utf-8_mdbg.txt')
|
|
70
|
+
);
|
|
71
|
+
|
|
55
72
|
const aggressiveTokenizer = new natural.AggressiveTokenizer();
|
|
56
73
|
const orthographyTokenizer = new natural.OrthographyTokenizer({
|
|
57
74
|
language: 'fi'
|
|
@@ -69,20 +86,115 @@ const aggressiveTokenizerSv = new natural.AggressiveTokenizerSv();
|
|
|
69
86
|
const aggressiveTokenizerRu = new natural.AggressiveTokenizerRu();
|
|
70
87
|
const aggressiveTokenizerVi = new natural.AggressiveTokenizerVi();
|
|
71
88
|
|
|
72
|
-
const stopwordsEn =
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
const
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
const
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
const
|
|
85
|
-
|
|
89
|
+
const stopwordsEn = new Set([
|
|
90
|
+
...require('natural/lib/natural/util/stopwords').words,
|
|
91
|
+
...sw.eng
|
|
92
|
+
]);
|
|
93
|
+
const stopwordsEs = new Set([
|
|
94
|
+
...require('natural/lib/natural/util/stopwords_es').words,
|
|
95
|
+
...sw.spa
|
|
96
|
+
]);
|
|
97
|
+
const stopwordsFa = new Set([
|
|
98
|
+
...require('natural/lib/natural/util/stopwords_fa').words,
|
|
99
|
+
...sw.fas
|
|
100
|
+
]);
|
|
101
|
+
const stopwordsFr = new Set([
|
|
102
|
+
...require('natural/lib/natural/util/stopwords_fr').words,
|
|
103
|
+
...sw.fra
|
|
104
|
+
]);
|
|
105
|
+
const stopwordsId = new Set([
|
|
106
|
+
...require('natural/lib/natural/util/stopwords_id').words,
|
|
107
|
+
...sw.ind
|
|
108
|
+
]);
|
|
109
|
+
const stopwordsJa = new Set([
|
|
110
|
+
...require('natural/lib/natural/util/stopwords_ja').words,
|
|
111
|
+
...sw.jpn
|
|
112
|
+
]);
|
|
113
|
+
const stopwordsIt = new Set([
|
|
114
|
+
...require('natural/lib/natural/util/stopwords_it').words,
|
|
115
|
+
...sw.ita
|
|
116
|
+
]);
|
|
117
|
+
const stopwordsNl = new Set([
|
|
118
|
+
...require('natural/lib/natural/util/stopwords_nl').words,
|
|
119
|
+
...sw.nld
|
|
120
|
+
]);
|
|
121
|
+
const stopwordsNo = new Set([
|
|
122
|
+
...require('natural/lib/natural/util/stopwords_no').words,
|
|
123
|
+
...sw.nob
|
|
124
|
+
]);
|
|
125
|
+
const stopwordsPl = new Set([
|
|
126
|
+
...require('natural/lib/natural/util/stopwords_pl').words,
|
|
127
|
+
...sw.pol
|
|
128
|
+
]);
|
|
129
|
+
const stopwordsPt = new Set([
|
|
130
|
+
...require('natural/lib/natural/util/stopwords_pt').words,
|
|
131
|
+
...sw.por,
|
|
132
|
+
...sw.porBr
|
|
133
|
+
]);
|
|
134
|
+
const stopwordsRu = new Set([
|
|
135
|
+
...require('natural/lib/natural/util/stopwords_ru').words,
|
|
136
|
+
...sw.rus
|
|
137
|
+
]);
|
|
138
|
+
const stopwordsSv = new Set([
|
|
139
|
+
...require('natural/lib/natural/util/stopwords_sv').words,
|
|
140
|
+
...sw.swe
|
|
141
|
+
]);
|
|
142
|
+
const stopwordsZh = new Set([
|
|
143
|
+
...require('natural/lib/natural/util/stopwords_zh').words,
|
|
144
|
+
...sw.zho
|
|
145
|
+
]);
|
|
146
|
+
|
|
147
|
+
const stopwordsRon = new Set(sw.ron);
|
|
148
|
+
const stopwordsTur = new Set(sw.tur);
|
|
149
|
+
const stopwordsVie = new Set(sw.vie);
|
|
150
|
+
const stopwordsDeu = new Set(sw.deu);
|
|
151
|
+
const stopwordsHun = new Set(sw.hun);
|
|
152
|
+
const stopwordsAra = new Set(sw.ara);
|
|
153
|
+
const stopwordsDan = new Set(sw.dan);
|
|
154
|
+
const stopwordsFin = new Set(sw.fin);
|
|
155
|
+
|
|
156
|
+
// TODO: add stopword pairing for these langs:
|
|
157
|
+
// afr
|
|
158
|
+
// ben
|
|
159
|
+
// bre
|
|
160
|
+
// bul
|
|
161
|
+
// cat
|
|
162
|
+
// ces
|
|
163
|
+
// ell
|
|
164
|
+
// epo
|
|
165
|
+
// est
|
|
166
|
+
// eus
|
|
167
|
+
// fra (NOTE: already paired above via stopwordsFr, which spreads sw.fra)
|
|
168
|
+
// gle
|
|
169
|
+
// glg
|
|
170
|
+
// guj
|
|
171
|
+
// hau
|
|
172
|
+
// heb
|
|
173
|
+
// hin
|
|
174
|
+
// hrv
|
|
175
|
+
// hye
|
|
176
|
+
// kor
|
|
177
|
+
// kur
|
|
178
|
+
// lat
|
|
179
|
+
// lav
|
|
180
|
+
// lgg
|
|
181
|
+
// lggNd
|
|
182
|
+
// lit
|
|
183
|
+
// mar
|
|
184
|
+
// msa
|
|
185
|
+
// mya
|
|
186
|
+
// panGu
|
|
187
|
+
// slk
|
|
188
|
+
// slv
|
|
189
|
+
// som
|
|
190
|
+
// sot
|
|
191
|
+
// swa
|
|
192
|
+
// tgl
|
|
193
|
+
// tha
|
|
194
|
+
// ukr
|
|
195
|
+
// urd
|
|
196
|
+
// yor
|
|
197
|
+
// zul
|
|
86
198
|
|
|
87
199
|
// <https://stackoverflow.com/a/41353282>
|
|
88
200
|
// <https://www.ietf.org/rfc/rfc3986.txt>
|
|
@@ -94,17 +206,18 @@ const PKG = require('./package.json');
|
|
|
94
206
|
|
|
95
207
|
const VOCABULARY_LIMIT = require('./vocabulary-limit.js');
|
|
96
208
|
|
|
209
|
+
// TODO: convert this into a Map
|
|
97
210
|
const ISO_CODE_MAPPING = require('./iso-code-mapping.json');
|
|
98
211
|
|
|
212
|
+
const ISO_CODE_MAPPING_KEYS = Object.keys(ISO_CODE_MAPPING);
|
|
213
|
+
|
|
99
214
|
// <https://kb.smarshmail.com/Article/23567>
|
|
100
|
-
const EXECUTABLES = require('./executables.json');
|
|
215
|
+
const EXECUTABLES = new Set(require('./executables.json'));
|
|
101
216
|
|
|
102
217
|
const REPLACEMENT_WORDS = require('./replacement-words.json');
|
|
103
218
|
|
|
104
219
|
const locales = new Set(i18nLocales.map((l) => l.toLowerCase()));
|
|
105
220
|
|
|
106
|
-
const readFile = promisify(fs.readFile);
|
|
107
|
-
|
|
108
221
|
const normalizeUrlOptions = {
|
|
109
222
|
stripProtocol: true,
|
|
110
223
|
stripWWW: false,
|
|
@@ -154,7 +267,8 @@ for (const code of codes()) {
|
|
|
154
267
|
const symbol = getSymbolFromCurrency(code);
|
|
155
268
|
if (
|
|
156
269
|
typeof symbol === 'string' &&
|
|
157
|
-
|
|
270
|
+
// eslint-disable-next-line unicorn/prefer-includes
|
|
271
|
+
currencySymbols.indexOf(symbol) === -1 &&
|
|
158
272
|
!new RE2(/^[a-z]+$/i).test(symbol)
|
|
159
273
|
)
|
|
160
274
|
currencySymbols.push(escapeStringRegexp(symbol));
|
|
@@ -187,7 +301,9 @@ const isURLOptions = {
|
|
|
187
301
|
class SpamScanner {
|
|
188
302
|
constructor(config = {}) {
|
|
189
303
|
this.config = {
|
|
190
|
-
debug:
|
|
304
|
+
debug:
|
|
305
|
+
process.env.NODE_ENV === 'test' ||
|
|
306
|
+
process.env.NODE_ENV === 'development',
|
|
191
307
|
checkIDNHomographAttack: false,
|
|
192
308
|
// note that if you attempt to train an existing `scanner.classifier`
|
|
193
309
|
// then you will need to re-use these, so we suggest you store them
|
|
@@ -312,19 +428,20 @@ class SpamScanner {
|
|
|
312
428
|
userAgent: `${PKG.name}/${PKG.version}`,
|
|
313
429
|
timeout: ms('10s'),
|
|
314
430
|
clamscan: {
|
|
431
|
+
debugMode:
|
|
432
|
+
process.env.NODE_ENV === 'test' ||
|
|
433
|
+
process.env.NODE_ENV === 'development',
|
|
434
|
+
clamscan: {
|
|
435
|
+
path: which.sync('clamscan', { nothrow: true })
|
|
436
|
+
},
|
|
315
437
|
clamdscan: {
|
|
316
438
|
timeout: ms('10s'),
|
|
439
|
+
path: which.sync('clamdscan', { nothrow: true }),
|
|
317
440
|
socket: macosVersion.isMacOS
|
|
318
441
|
? '/tmp/clamd.socket'
|
|
319
442
|
: '/var/run/clamav/clamd.ctl'
|
|
320
443
|
}
|
|
321
444
|
},
|
|
322
|
-
franc: {
|
|
323
|
-
minLength: 100,
|
|
324
|
-
// we can only support languages available
|
|
325
|
-
// in stopwords and natural's tokenizer methods
|
|
326
|
-
only: Object.keys(ISO_CODE_MAPPING)
|
|
327
|
-
},
|
|
328
445
|
hasha: {
|
|
329
446
|
algorithm: 'sha256'
|
|
330
447
|
},
|
|
@@ -339,6 +456,11 @@ class SpamScanner {
|
|
|
339
456
|
client: false,
|
|
340
457
|
cachePrefix: 'spamscanner',
|
|
341
458
|
ttlMs: ms('1h'),
|
|
459
|
+
// franc
|
|
460
|
+
franc: {
|
|
461
|
+
minLength: 5,
|
|
462
|
+
only: ISO_CODE_MAPPING_KEYS
|
|
463
|
+
},
|
|
342
464
|
...config
|
|
343
465
|
};
|
|
344
466
|
|
|
@@ -416,9 +538,7 @@ class SpamScanner {
|
|
|
416
538
|
// cache in the background
|
|
417
539
|
this.config.client
|
|
418
540
|
.set(key, `${isAdult}:${isMalware}`, 'PX', this.config.ttlMs)
|
|
419
|
-
// eslint-disable-next-line promise/prefer-await-to-then
|
|
420
541
|
.then(this.config.logger.info)
|
|
421
|
-
// eslint-disable-next-line promise/prefer-await-to-then
|
|
422
542
|
.catch(this.config.logger.error);
|
|
423
543
|
return { isAdult, isMalware };
|
|
424
544
|
};
|
|
@@ -432,6 +552,27 @@ class SpamScanner {
|
|
|
432
552
|
throw new Error(
|
|
433
553
|
`Locale of ${this.config.locale} was not valid according to locales list.`
|
|
434
554
|
);
|
|
555
|
+
|
|
556
|
+
//
|
|
557
|
+
// set up regex helpers
|
|
558
|
+
//
|
|
559
|
+
this.EMAIL_REPLACEMENT_REGEX = new RE2(this.config.replacements.email, 'g');
|
|
560
|
+
const replacementRegexes = [];
|
|
561
|
+
for (const key of Object.keys(this.config.replacements)) {
|
|
562
|
+
replacementRegexes.push(
|
|
563
|
+
escapeStringRegexp(this.config.replacements[key])
|
|
564
|
+
);
|
|
565
|
+
}
|
|
566
|
+
|
|
567
|
+
this.REPLACEMENTS_REGEX = new RE2(
|
|
568
|
+
new RegExp(replacementRegexes.join('|'), 'g')
|
|
569
|
+
);
|
|
570
|
+
|
|
571
|
+
//
|
|
572
|
+
// set up helper Map and Sets for fast lookup
|
|
573
|
+
// (Set.has is 2x faster than includes, and 50% faster than indexOf)
|
|
574
|
+
//
|
|
575
|
+
this.WHITELISTED_WORDS = new Set(Object.values(this.config.replacements));
|
|
435
576
|
}
|
|
436
577
|
|
|
437
578
|
getHostname(link) {
|
|
@@ -521,15 +662,12 @@ class SpamScanner {
|
|
|
521
662
|
const stream = isStream(attachment.content)
|
|
522
663
|
? attachment.content
|
|
523
664
|
: intoStream(attachment.content);
|
|
524
|
-
const {
|
|
525
|
-
await clamscan.scan_stream(stream);
|
|
665
|
+
const { isInfected, viruses } = await clamscan.scanStream(stream);
|
|
526
666
|
const name = isSANB(attachment.filename)
|
|
527
667
|
? `"${attachment.filename}"`
|
|
528
668
|
: `#${i + 1}`;
|
|
529
669
|
if (isInfected)
|
|
530
|
-
messages.push(
|
|
531
|
-
`Attachment ${name} was infected with "${viruses}".`
|
|
532
|
-
);
|
|
670
|
+
messages.push(`Attachment ${name} was infected with ${viruses}.`);
|
|
533
671
|
} catch (err) {
|
|
534
672
|
this.config.logger.error(err);
|
|
535
673
|
}
|
|
@@ -547,13 +685,16 @@ class SpamScanner {
|
|
|
547
685
|
|
|
548
686
|
let gtube = false;
|
|
549
687
|
|
|
550
|
-
|
|
688
|
+
// eslint-disable-next-line unicorn/prefer-includes
|
|
689
|
+
if (isSANB(mail.html) && mail.html.indexOf(GTUBE) !== -1) gtube = true;
|
|
551
690
|
|
|
552
|
-
|
|
691
|
+
// eslint-disable-next-line unicorn/prefer-includes
|
|
692
|
+
if (isSANB(mail.text) && !gtube && mail.text.indexOf(GTUBE) !== -1)
|
|
693
|
+
gtube = true;
|
|
553
694
|
|
|
554
695
|
if (gtube)
|
|
555
696
|
messages.push(
|
|
556
|
-
'Message detected to contain the GTUBE test from
|
|
697
|
+
'Message detected to contain the GTUBE test from https://spamassassin.apache.org/gtube/.'
|
|
557
698
|
);
|
|
558
699
|
|
|
559
700
|
return messages;
|
|
@@ -619,8 +760,6 @@ class SpamScanner {
|
|
|
619
760
|
//
|
|
620
761
|
// However we don't recommend this and therefore have our servers set to standard Cloudflare DNS
|
|
621
762
|
//
|
|
622
|
-
// TODO: we need to do two lookups in parallel, one against adult and one against malware
|
|
623
|
-
// and also make sure the messages aren't duplicated when we concatenate final array of messages
|
|
624
763
|
const [isAdult, isMalware] = await Promise.all([
|
|
625
764
|
this.malwareLookup('https://family.cloudflare-dns.com/dns-query', name),
|
|
626
765
|
this.malwareLookup('https://security.cloudflare-dns.com/dns-query', name)
|
|
@@ -742,14 +881,14 @@ class SpamScanner {
|
|
|
742
881
|
})
|
|
743
882
|
.match(URL_REGEX) || [];
|
|
744
883
|
|
|
745
|
-
const array =
|
|
884
|
+
const array = new Set();
|
|
746
885
|
for (const url of urls) {
|
|
747
886
|
const normalized = this.getNormalizedUrl(url);
|
|
748
887
|
|
|
749
|
-
if (normalized
|
|
888
|
+
if (normalized) array.add(normalized);
|
|
750
889
|
}
|
|
751
890
|
|
|
752
|
-
return array;
|
|
891
|
+
return [...array];
|
|
753
892
|
}
|
|
754
893
|
|
|
755
894
|
parseLocale(locale) {
|
|
@@ -763,12 +902,6 @@ class SpamScanner {
|
|
|
763
902
|
// <https://github.com/NaturalNode/natural#stemmers>
|
|
764
903
|
// eslint-disable-next-line complexity
|
|
765
904
|
async getTokens(string, locale, isHTML = false) {
|
|
766
|
-
// get the current email replacement regex
|
|
767
|
-
const EMAIL_REPLACEMENT_REGEX = new RE2(
|
|
768
|
-
this.config.replacements.email,
|
|
769
|
-
'g'
|
|
770
|
-
);
|
|
771
|
-
|
|
772
905
|
//
|
|
773
906
|
// parse HTML for <html> tag with lang attr
|
|
774
907
|
// otherwise if that wasn't found then look for this
|
|
@@ -816,17 +949,6 @@ class SpamScanner {
|
|
|
816
949
|
|
|
817
950
|
if (isHTML) string = sanitizeHtml(string, this.config.sanitizeHtml);
|
|
818
951
|
|
|
819
|
-
const replacementRegexes = [];
|
|
820
|
-
for (const key of Object.keys(this.config.replacements)) {
|
|
821
|
-
replacementRegexes.push(
|
|
822
|
-
escapeStringRegexp(this.config.replacements[key])
|
|
823
|
-
);
|
|
824
|
-
}
|
|
825
|
-
|
|
826
|
-
const REPLACEMENTS_REGEX = new RE2(
|
|
827
|
-
new RegExp(replacementRegexes.join('|'), 'g')
|
|
828
|
-
);
|
|
829
|
-
|
|
830
952
|
string = striptags(string, [], ' ')
|
|
831
953
|
.trim()
|
|
832
954
|
// replace newlines
|
|
@@ -835,7 +957,7 @@ class SpamScanner {
|
|
|
835
957
|
// attackers may try to inject our replacements into the message
|
|
836
958
|
// therefore we should strip all of them before doing any replacements
|
|
837
959
|
//
|
|
838
|
-
.replace(REPLACEMENTS_REGEX, ' ');
|
|
960
|
+
.replace(this.REPLACEMENTS_REGEX, ' ');
|
|
839
961
|
|
|
840
962
|
//
|
|
841
963
|
// we should instead use language detection to determine
|
|
@@ -845,6 +967,7 @@ class SpamScanner {
|
|
|
845
967
|
// <https://github.com/FGRibreau/node-language-detect> (not too accurate)
|
|
846
968
|
//
|
|
847
969
|
const detectedLanguage = franc(string, this.config.franc);
|
|
970
|
+
|
|
848
971
|
if (
|
|
849
972
|
detectedLanguage !== 'und' &&
|
|
850
973
|
isSANB(ISO_CODE_MAPPING[detectedLanguage])
|
|
@@ -853,7 +976,8 @@ class SpamScanner {
|
|
|
853
976
|
|
|
854
977
|
locale = this.parseLocale(isSANB(locale) ? locale : this.config.locale);
|
|
855
978
|
|
|
856
|
-
|
|
979
|
+
// NOTE: "in" and "po" are valid locales but not from i18n
|
|
980
|
+
if (!locales.has(locale) && locale !== 'in' && locale !== 'po') {
|
|
857
981
|
debug(`Locale ${locale} was not valid and will use default`);
|
|
858
982
|
locale = this.parseLocale(this.config.locale);
|
|
859
983
|
}
|
|
@@ -865,103 +989,149 @@ class SpamScanner {
|
|
|
865
989
|
let stopwords = stopwordsEn;
|
|
866
990
|
let language = 'english';
|
|
867
991
|
let stemword = 'default';
|
|
992
|
+
|
|
868
993
|
switch (locale) {
|
|
869
994
|
case 'ar':
|
|
995
|
+
// arb
|
|
996
|
+
// ISO 639-3 = ara
|
|
997
|
+
stopwords = stopwordsAra;
|
|
870
998
|
language = 'arabic';
|
|
871
999
|
break;
|
|
872
1000
|
case 'da':
|
|
1001
|
+
// dan
|
|
873
1002
|
language = 'danish';
|
|
1003
|
+
stopwords = stopwordsDan;
|
|
874
1004
|
break;
|
|
875
1005
|
case 'nl':
|
|
1006
|
+
// nld
|
|
876
1007
|
stopwords = stopwordsNl;
|
|
877
1008
|
language = 'dutch';
|
|
878
1009
|
break;
|
|
879
1010
|
case 'en':
|
|
1011
|
+
// eng
|
|
880
1012
|
language = 'english';
|
|
881
1013
|
break;
|
|
882
1014
|
case 'fi':
|
|
1015
|
+
// fin
|
|
883
1016
|
language = 'finnish';
|
|
884
1017
|
tokenizer = orthographyTokenizer;
|
|
1018
|
+
stopwords = stopwordsFin;
|
|
885
1019
|
break;
|
|
886
1020
|
case 'fa':
|
|
1021
|
+
// fas (Persian/Farsi)
|
|
887
1022
|
language = 'farsi';
|
|
888
1023
|
tokenizer = aggressiveTokenizerFa;
|
|
889
1024
|
stopwords = stopwordsFa;
|
|
890
1025
|
stemword = natural.PorterStemmerFa.stem.bind(natural.PorterStemmerFa);
|
|
891
1026
|
break;
|
|
892
1027
|
case 'fr':
|
|
1028
|
+
// fra
|
|
893
1029
|
language = 'french';
|
|
894
1030
|
tokenizer = aggressiveTokenizerFr;
|
|
895
1031
|
stopwords = stopwordsFr;
|
|
896
1032
|
break;
|
|
897
1033
|
case 'de':
|
|
1034
|
+
// deu
|
|
898
1035
|
language = 'german';
|
|
1036
|
+
stopwords = stopwordsDeu;
|
|
899
1037
|
break;
|
|
900
1038
|
case 'hu':
|
|
1039
|
+
// hun
|
|
901
1040
|
language = 'hungarian';
|
|
1041
|
+
stopwords = stopwordsHun;
|
|
902
1042
|
break;
|
|
903
1043
|
case 'in':
|
|
1044
|
+
// ind
|
|
904
1045
|
language = 'indonesian';
|
|
905
1046
|
tokenizer = aggressiveTokenizerId;
|
|
906
1047
|
stopwords = stopwordsId;
|
|
907
1048
|
break;
|
|
908
1049
|
case 'it':
|
|
1050
|
+
// ita
|
|
909
1051
|
language = 'italian';
|
|
910
1052
|
tokenizer = aggressiveTokenizerIt;
|
|
911
1053
|
stopwords = stopwordsIt;
|
|
912
1054
|
break;
|
|
913
1055
|
case 'ja':
|
|
1056
|
+
// jpn
|
|
914
1057
|
tokenizer = tokenizerJa;
|
|
915
1058
|
stopwords = stopwordsJa;
|
|
916
1059
|
stemword = natural.StemmerJa.stem.bind(natural.StemmerJa);
|
|
917
1060
|
break;
|
|
918
1061
|
case 'nb':
|
|
1062
|
+
// nob
|
|
1063
|
+
language = 'norwegian';
|
|
1064
|
+
tokenizer = aggressiveTokenizerNo;
|
|
1065
|
+
stopwords = stopwordsNo;
|
|
1066
|
+
break;
|
|
919
1067
|
case 'nn':
|
|
1068
|
+
// nno
|
|
1069
|
+
// ISO 639-3 = nob
|
|
920
1070
|
language = 'norwegian';
|
|
921
1071
|
tokenizer = aggressiveTokenizerNo;
|
|
922
1072
|
stopwords = stopwordsNo;
|
|
923
1073
|
break;
|
|
924
1074
|
case 'po':
|
|
1075
|
+
// pol
|
|
925
1076
|
language = 'polish';
|
|
926
1077
|
tokenizer = aggressiveTokenizerPl;
|
|
927
1078
|
stopwords = stopwordsPl;
|
|
928
1079
|
stemword = false;
|
|
929
1080
|
break;
|
|
930
1081
|
case 'pt':
|
|
1082
|
+
// por
|
|
931
1083
|
language = 'portuguese';
|
|
932
1084
|
tokenizer = aggressiveTokenizerPt;
|
|
933
1085
|
stopwords = stopwordsPt;
|
|
934
1086
|
break;
|
|
935
1087
|
case 'es':
|
|
1088
|
+
// spa
|
|
936
1089
|
language = 'spanish';
|
|
937
1090
|
tokenizer = aggressiveTokenizerEs;
|
|
938
1091
|
stopwords = stopwordsEs;
|
|
939
1092
|
break;
|
|
940
1093
|
case 'sv':
|
|
1094
|
+
// swe
|
|
941
1095
|
language = 'swedish';
|
|
942
1096
|
tokenizer = aggressiveTokenizerSv;
|
|
943
1097
|
stopwords = stopwordsSv;
|
|
944
1098
|
break;
|
|
945
1099
|
case 'ro':
|
|
1100
|
+
// ron
|
|
946
1101
|
language = 'romanian';
|
|
1102
|
+
stopwords = stopwordsRon;
|
|
947
1103
|
break;
|
|
948
1104
|
case 'ru':
|
|
1105
|
+
// rus
|
|
949
1106
|
language = 'russian';
|
|
950
1107
|
tokenizer = aggressiveTokenizerRu;
|
|
951
1108
|
stopwords = stopwordsRu;
|
|
952
1109
|
break;
|
|
953
1110
|
case 'ta':
|
|
1111
|
+
// tam
|
|
1112
|
+
// NOTE: no stopwords available
|
|
954
1113
|
language = 'tamil';
|
|
955
1114
|
break;
|
|
956
1115
|
case 'tr':
|
|
1116
|
+
// tur
|
|
957
1117
|
language = 'turkish';
|
|
1118
|
+
stopwords = stopwordsTur;
|
|
958
1119
|
break;
|
|
959
1120
|
case 'vi':
|
|
1121
|
+
// vie
|
|
960
1122
|
language = 'vietnamese';
|
|
961
1123
|
tokenizer = aggressiveTokenizerVi;
|
|
1124
|
+
stopwords = stopwordsVie;
|
|
962
1125
|
stemword = false;
|
|
963
1126
|
break;
|
|
964
1127
|
case 'zh':
|
|
1128
|
+
// cmn
|
|
1129
|
+
// ISO 639-3 = zho (Chinese, Macrolanguage)
|
|
1130
|
+
// https://github.com/yishn/chinese-tokenizer
|
|
1131
|
+
tokenizer = {
|
|
1132
|
+
tokenize: (str) =>
|
|
1133
|
+
chineseTokenizer(str).map((results) => results.text)
|
|
1134
|
+
};
|
|
965
1135
|
language = 'chinese';
|
|
966
1136
|
stopwords = stopwordsZh;
|
|
967
1137
|
stemword = false;
|
|
@@ -979,7 +1149,7 @@ class SpamScanner {
|
|
|
979
1149
|
string
|
|
980
1150
|
.split(' ')
|
|
981
1151
|
.map((_string) =>
|
|
982
|
-
_string.
|
|
1152
|
+
_string.indexOf(':') === 0 &&
|
|
983
1153
|
_string.endsWith(':') &&
|
|
984
1154
|
typeof toEmoji[_string.slice(1, -1)] === 'string'
|
|
985
1155
|
? toEmoji[_string.slice(1, -1)]
|
|
@@ -1027,7 +1197,10 @@ class SpamScanner {
|
|
|
1027
1197
|
|
|
1028
1198
|
// now we ensure that URLs and emails are properly spaced out
|
|
1029
1199
|
// (e.g. in case ?email=some@email.com was in a URL)
|
|
1030
|
-
.replace(
|
|
1200
|
+
.replace(
|
|
1201
|
+
this.EMAIL_REPLACEMENT_REGEX,
|
|
1202
|
+
` ${this.config.replacements.email} `
|
|
1203
|
+
)
|
|
1031
1204
|
|
|
1032
1205
|
// TODO: replace file paths, file dirs, dotfiles, and dotdirs
|
|
1033
1206
|
|
|
@@ -1042,12 +1215,14 @@ class SpamScanner {
|
|
|
1042
1215
|
// replace currency
|
|
1043
1216
|
.replace(CURRENCY_REGEX, ` ${this.config.replacements.currency} `);
|
|
1044
1217
|
|
|
1218
|
+
//
|
|
1045
1219
|
// expand contractions so "they're" -> [ they, are ] vs. [ they, re ]
|
|
1046
1220
|
// <https://github.com/NaturalNode/natural/issues/533>
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
//
|
|
1050
|
-
|
|
1221
|
+
//
|
|
1222
|
+
// NOTE: we're doing this for all languages now, not just en
|
|
1223
|
+
// if (locale === 'en')
|
|
1224
|
+
//
|
|
1225
|
+
string = contractions.expand(string);
|
|
1051
1226
|
|
|
1052
1227
|
//
|
|
1053
1228
|
// Future research:
|
|
@@ -1059,45 +1234,37 @@ class SpamScanner {
|
|
|
1059
1234
|
//
|
|
1060
1235
|
const tokens = [];
|
|
1061
1236
|
for (const token of tokenizer.tokenize(string.toLowerCase())) {
|
|
1237
|
+
// zh tokenizer yields empty strings
|
|
1238
|
+
if (token === '' || token === ' ') continue;
|
|
1239
|
+
|
|
1062
1240
|
// whitelist words from being stemmed (safeguard)
|
|
1063
1241
|
if (
|
|
1064
|
-
|
|
1065
|
-
token.
|
|
1066
|
-
token.
|
|
1242
|
+
this.WHITELISTED_WORDS.has(token) ||
|
|
1243
|
+
token.indexOf(this.config.replacements.initialism) === 0 ||
|
|
1244
|
+
token.indexOf(this.config.replacements.abbrevation) === 0
|
|
1067
1245
|
) {
|
|
1068
1246
|
tokens.push(token);
|
|
1069
1247
|
continue;
|
|
1070
1248
|
}
|
|
1071
1249
|
|
|
1072
|
-
if (
|
|
1073
|
-
stopwords.includes(token) ||
|
|
1074
|
-
(sw[locale] && sw[locale].includes(token)) ||
|
|
1075
|
-
(locale !== 'en' &&
|
|
1076
|
-
(stopwordsEn.includes(token) || sw.en.includes(token)))
|
|
1077
|
-
)
|
|
1250
|
+
if (stopwords.has(token) || (locale !== 'en' && stopwordsEn.has(token))) {
|
|
1078
1251
|
continue;
|
|
1252
|
+
}
|
|
1079
1253
|
|
|
1080
1254
|
// locale specific stopwords to ignore
|
|
1081
1255
|
let localeStem;
|
|
1082
1256
|
if (typeof stemword === 'function') {
|
|
1083
1257
|
localeStem = stemword(token);
|
|
1084
|
-
if (
|
|
1085
|
-
localeStem &&
|
|
1086
|
-
(stopwords.includes(localeStem) ||
|
|
1087
|
-
(sw[locale] && sw[locale].includes(localeStem)))
|
|
1088
|
-
)
|
|
1258
|
+
if (localeStem && stopwords.has(localeStem)) {
|
|
1089
1259
|
continue;
|
|
1260
|
+
}
|
|
1090
1261
|
}
|
|
1091
1262
|
|
|
1092
1263
|
// always check against English stemwords
|
|
1093
1264
|
let englishStem;
|
|
1094
1265
|
if (locale !== 'en') {
|
|
1095
1266
|
englishStem = snowball.stemword(token, 'english');
|
|
1096
|
-
if (
|
|
1097
|
-
englishStem &&
|
|
1098
|
-
(stopwordsEn.includes(englishStem) || sw.en.includes(englishStem))
|
|
1099
|
-
)
|
|
1100
|
-
continue;
|
|
1267
|
+
if (englishStem && stopwordsEn.has(englishStem)) continue;
|
|
1101
1268
|
}
|
|
1102
1269
|
|
|
1103
1270
|
tokens.push(
|
|
@@ -1105,6 +1272,8 @@ class SpamScanner {
|
|
|
1105
1272
|
);
|
|
1106
1273
|
}
|
|
1107
1274
|
|
|
1275
|
+
debug('locale', locale, 'tokens', tokens);
|
|
1276
|
+
|
|
1108
1277
|
if (this.config.debug) return tokens;
|
|
1109
1278
|
|
|
1110
1279
|
// we should sha256 all tokens with hasha if not in debug mode
|
|
@@ -1117,7 +1286,7 @@ class SpamScanner {
|
|
|
1117
1286
|
let source = string;
|
|
1118
1287
|
if (isBuffer(string)) source = string.toString();
|
|
1119
1288
|
else if (typeof string === 'string' && isValidPath(string))
|
|
1120
|
-
source = await readFile(string);
|
|
1289
|
+
source = await fs.promises.readFile(string);
|
|
1121
1290
|
|
|
1122
1291
|
const tokens = [];
|
|
1123
1292
|
const mail = await simpleParser(source, this.config.simpleParser);
|
|
@@ -1155,12 +1324,11 @@ class SpamScanner {
|
|
|
1155
1324
|
|
|
1156
1325
|
// eslint-disable-next-line complexity
|
|
1157
1326
|
async getPhishingResults(mail) {
|
|
1158
|
-
const messages =
|
|
1159
|
-
|
|
1327
|
+
const messages = new Set();
|
|
1160
1328
|
//
|
|
1161
1329
|
// NOTE: all links pushed are lowercased
|
|
1162
1330
|
//
|
|
1163
|
-
const links =
|
|
1331
|
+
const links = new Set();
|
|
1164
1332
|
|
|
1165
1333
|
// parse <a> tags with different org domain in text vs the link
|
|
1166
1334
|
if (isSANB(mail.html)) {
|
|
@@ -1170,7 +1338,7 @@ class SpamScanner {
|
|
|
1170
1338
|
// elements concatenate to form a URL which is malicious or phishing
|
|
1171
1339
|
//
|
|
1172
1340
|
for (const link of this.getUrls(striptags(mail.html, [], ' ').trim())) {
|
|
1173
|
-
|
|
1341
|
+
links.add(link);
|
|
1174
1342
|
}
|
|
1175
1343
|
|
|
1176
1344
|
//
|
|
@@ -1212,7 +1380,7 @@ class SpamScanner {
|
|
|
1212
1380
|
// (this is needed because some have "Web:%20http://google.com" for example in href tags)
|
|
1213
1381
|
[href] = this.getUrls(href);
|
|
1214
1382
|
// eslint-disable-next-line max-depth
|
|
1215
|
-
if (href
|
|
1383
|
+
if (href) links.add(href);
|
|
1216
1384
|
}
|
|
1217
1385
|
|
|
1218
1386
|
// the text content could contain multiple URLs
|
|
@@ -1222,7 +1390,7 @@ class SpamScanner {
|
|
|
1222
1390
|
isSANB(href) &&
|
|
1223
1391
|
validator.isURL(href, isURLOptions)
|
|
1224
1392
|
) {
|
|
1225
|
-
const string = `Anchor link with href of
|
|
1393
|
+
const string = `Anchor link with href of ${href} and inner text value of "${textContent}"`;
|
|
1226
1394
|
// eslint-disable-next-line max-depth
|
|
1227
1395
|
if (this.config.checkIDNHomographAttack) {
|
|
1228
1396
|
const anchorUrlHostname = this.getHostname(href);
|
|
@@ -1231,8 +1399,8 @@ class SpamScanner {
|
|
|
1231
1399
|
const anchorUrlHostnameToASCII =
|
|
1232
1400
|
punycode.toASCII(anchorUrlHostname);
|
|
1233
1401
|
// eslint-disable-next-line max-depth
|
|
1234
|
-
if (anchorUrlHostnameToASCII.
|
|
1235
|
-
messages.
|
|
1402
|
+
if (anchorUrlHostnameToASCII.indexOf('xn--') === 0)
|
|
1403
|
+
messages.add(
|
|
1236
1404
|
`${string} has possible IDN homograph attack from anchor hostname.`
|
|
1237
1405
|
);
|
|
1238
1406
|
}
|
|
@@ -1241,8 +1409,8 @@ class SpamScanner {
|
|
|
1241
1409
|
// eslint-disable-next-line max-depth
|
|
1242
1410
|
for (const link of this.getUrls(textContent)) {
|
|
1243
1411
|
// this link should have already been included but just in case
|
|
1244
|
-
|
|
1245
|
-
|
|
1412
|
+
|
|
1413
|
+
links.add(link);
|
|
1246
1414
|
|
|
1247
1415
|
// eslint-disable-next-line max-depth
|
|
1248
1416
|
if (this.config.checkIDNHomographAttack) {
|
|
@@ -1252,8 +1420,8 @@ class SpamScanner {
|
|
|
1252
1420
|
const innerTextUrlHostnameToASCII =
|
|
1253
1421
|
punycode.toASCII(innerTextUrlHostname);
|
|
1254
1422
|
// eslint-disable-next-line max-depth
|
|
1255
|
-
if (innerTextUrlHostnameToASCII.
|
|
1256
|
-
messages.
|
|
1423
|
+
if (innerTextUrlHostnameToASCII.indexOf('xn--') === 0)
|
|
1424
|
+
messages.add(
|
|
1257
1425
|
`${string} has possible IDN homograph attack from inner text hostname.`
|
|
1258
1426
|
);
|
|
1259
1427
|
}
|
|
@@ -1269,7 +1437,7 @@ class SpamScanner {
|
|
|
1269
1437
|
for (const prop of MAIL_PHISHING_PROPS) {
|
|
1270
1438
|
if (isSANB(mail[prop])) {
|
|
1271
1439
|
for (const link of this.getUrls(mail[prop])) {
|
|
1272
|
-
|
|
1440
|
+
links.add(link);
|
|
1273
1441
|
}
|
|
1274
1442
|
}
|
|
1275
1443
|
}
|
|
@@ -1279,9 +1447,9 @@ class SpamScanner {
|
|
|
1279
1447
|
const urlHostname = this.getHostname(link);
|
|
1280
1448
|
if (urlHostname) {
|
|
1281
1449
|
const toASCII = punycode.toASCII(urlHostname);
|
|
1282
|
-
if (toASCII.
|
|
1283
|
-
messages.
|
|
1284
|
-
`Possible IDN homograph attack from link of
|
|
1450
|
+
if (toASCII.indexOf('xn--') === 0)
|
|
1451
|
+
messages.add(
|
|
1452
|
+
`Possible IDN homograph attack from link of ${link} with punycode converted hostname of ${toASCII}.`
|
|
1285
1453
|
);
|
|
1286
1454
|
}
|
|
1287
1455
|
}
|
|
@@ -1290,28 +1458,25 @@ class SpamScanner {
|
|
|
1290
1458
|
// check against Cloudflare malware/phishing/adult DNS lookup
|
|
1291
1459
|
// if it returns `0.0.0.0` it means it was flagged
|
|
1292
1460
|
await Promise.all(
|
|
1293
|
-
links.map(async (link) => {
|
|
1461
|
+
[...links].map(async (link) => {
|
|
1294
1462
|
try {
|
|
1295
1463
|
const urlHostname = this.getHostname(link);
|
|
1296
1464
|
if (urlHostname) {
|
|
1297
1465
|
const toASCII = punycode.toASCII(urlHostname);
|
|
1298
|
-
const adultMessage = `Link hostname of
|
|
1299
|
-
const malwareMessage = `Link hostname of ${toASCII}
|
|
1466
|
+
const adultMessage = `Link hostname of ${toASCII} was detected by Cloudflare's Family DNS to contain adult-related content, phishing, and/or malware.`;
|
|
1467
|
+
const malwareMessage = `Link hostname of ${toASCII} was detected by Cloudflare's Security DNS to contain phishing and/or malware.`;
|
|
1300
1468
|
|
|
1301
1469
|
// if it already included both messages then return early
|
|
1302
|
-
if (
|
|
1303
|
-
messages.includes(adultMessage) &&
|
|
1304
|
-
messages.includes(malwareMessage)
|
|
1305
|
-
)
|
|
1470
|
+
if (messages.has(adultMessage) && messages.has(malwareMessage))
|
|
1306
1471
|
return;
|
|
1307
1472
|
|
|
1308
1473
|
const { isAdult, isMalware } =
|
|
1309
1474
|
await this.memoizedIsCloudflareBlocked(toASCII);
|
|
1310
1475
|
|
|
1311
|
-
if (isAdult && !messages.
|
|
1312
|
-
messages.
|
|
1313
|
-
if (isMalware && !messages.
|
|
1314
|
-
messages.
|
|
1476
|
+
if (isAdult && !messages.has(adultMessage))
|
|
1477
|
+
messages.add(adultMessage);
|
|
1478
|
+
if (isMalware && !messages.has(malwareMessage))
|
|
1479
|
+
messages.add(malwareMessage);
|
|
1315
1480
|
}
|
|
1316
1481
|
} catch (err) {
|
|
1317
1482
|
this.config.logger.error(err);
|
|
@@ -1319,7 +1484,7 @@ class SpamScanner {
|
|
|
1319
1484
|
})
|
|
1320
1485
|
);
|
|
1321
1486
|
|
|
1322
|
-
return { messages, links };
|
|
1487
|
+
return { messages: [...messages], links: [...links] };
|
|
1323
1488
|
}
|
|
1324
1489
|
|
|
1325
1490
|
// getNSFWResults() {
|
|
@@ -1340,7 +1505,7 @@ class SpamScanner {
|
|
|
1340
1505
|
try {
|
|
1341
1506
|
const fileType = await FileType.fromBuffer(attachment.content);
|
|
1342
1507
|
|
|
1343
|
-
if (fileType && fileType.ext && EXECUTABLES.
|
|
1508
|
+
if (fileType && fileType.ext && EXECUTABLES.has(fileType.ext))
|
|
1344
1509
|
messages.push(
|
|
1345
1510
|
`Attachment's "magic number" indicated it was a dangerous executable with a ".${fileType.ext}" extension.`
|
|
1346
1511
|
);
|
|
@@ -1355,7 +1520,7 @@ class SpamScanner {
|
|
|
1355
1520
|
punycode.toUnicode(attachment.filename.split('?')[0])
|
|
1356
1521
|
);
|
|
1357
1522
|
const ext = fileExtension(filename);
|
|
1358
|
-
if (ext && EXECUTABLES.
|
|
1523
|
+
if (ext && EXECUTABLES.has(ext))
|
|
1359
1524
|
messages.push(
|
|
1360
1525
|
`Attachment's file name indicated it was a dangerous executable with a ".${ext}" extension.`
|
|
1361
1526
|
);
|
|
@@ -1363,7 +1528,7 @@ class SpamScanner {
|
|
|
1363
1528
|
|
|
1364
1529
|
if (isSANB(attachment.contentType)) {
|
|
1365
1530
|
const ext = mime.extension(attachment.contentType);
|
|
1366
|
-
if (isSANB(ext) && EXECUTABLES.
|
|
1531
|
+
if (isSANB(ext) && EXECUTABLES.has(ext))
|
|
1367
1532
|
messages.push(
|
|
1368
1533
|
`Attachment's Content-Type was a dangerous executable with a ".${ext}" extension.`
|
|
1369
1534
|
);
|