spamscanner 5.0.0 → 5.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -4
- package/cedict_1_0_ts_utf-8_mdbg.txt +120853 -0
- package/index.js +75 -16
- package/package.json +6 -2
package/index.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
const process = require('process');
|
|
2
1
|
const dns = require('dns');
|
|
3
2
|
const fs = require('fs');
|
|
3
|
+
const path = require('path');
|
|
4
|
+
const process = require('process');
|
|
4
5
|
const { debuglog } = require('util');
|
|
5
6
|
|
|
6
7
|
// eslint-disable-next-line n/no-deprecated-api
|
|
@@ -10,6 +11,7 @@ const ClamScan = require('clamscan');
|
|
|
10
11
|
const FileType = require('file-type');
|
|
11
12
|
const NaiveBayes = require('@ladjs/naivebayes');
|
|
12
13
|
const RE2 = require('re2');
|
|
14
|
+
const arrayJoinConjunction = require('array-join-conjunction');
|
|
13
15
|
const bitcoinRegex = require('bitcoin-regex');
|
|
14
16
|
const contractions = require('expand-contractions');
|
|
15
17
|
const creditCardRegex = require('credit-card-regex');
|
|
@@ -55,6 +57,19 @@ const { simpleParser } = require('mailparser');
|
|
|
55
57
|
|
|
56
58
|
const debug = debuglog('spamscanner');
|
|
57
59
|
|
|
60
|
+
//
|
|
61
|
+
// NOTE: we periodically need to update this
|
|
62
|
+
//
|
|
63
|
+
// Source from: CC-CEDICT
|
|
64
|
+
// Licensed under Creative Commons Attribution-ShareAlike 4.0 International License
|
|
65
|
+
// <https://www.mdbg.net/chinese/dictionary?page=cc-cedict>
|
|
66
|
+
//
|
|
67
|
+
// <https://github.com/yishn/chinese-tokenizer>
|
|
68
|
+
//
|
|
69
|
+
const chineseTokenizer = require('chinese-tokenizer').loadFile(
|
|
70
|
+
path.join(__dirname, 'cedict_1_0_ts_utf-8_mdbg.txt')
|
|
71
|
+
);
|
|
72
|
+
|
|
58
73
|
const aggressiveTokenizer = new natural.AggressiveTokenizer();
|
|
59
74
|
const orthographyTokenizer = new natural.OrthographyTokenizer({
|
|
60
75
|
language: 'fi'
|
|
@@ -195,6 +210,8 @@ const VOCABULARY_LIMIT = require('./vocabulary-limit.js');
|
|
|
195
210
|
// TODO: convert this into a Map
|
|
196
211
|
const ISO_CODE_MAPPING = require('./iso-code-mapping.json');
|
|
197
212
|
|
|
213
|
+
const ISO_CODE_MAPPING_KEYS = Object.keys(ISO_CODE_MAPPING);
|
|
214
|
+
|
|
198
215
|
// <https://kb.smarshmail.com/Article/23567>
|
|
199
216
|
const EXECUTABLES = new Set(require('./executables.json'));
|
|
200
217
|
|
|
@@ -426,12 +443,6 @@ class SpamScanner {
|
|
|
426
443
|
: '/var/run/clamav/clamd.ctl'
|
|
427
444
|
}
|
|
428
445
|
},
|
|
429
|
-
franc: {
|
|
430
|
-
minLength: 100,
|
|
431
|
-
// we can only support languages available
|
|
432
|
-
// in stopwords and natural's tokenizer methods
|
|
433
|
-
only: Object.keys(ISO_CODE_MAPPING)
|
|
434
|
-
},
|
|
435
446
|
hasha: {
|
|
436
447
|
algorithm: 'sha256'
|
|
437
448
|
},
|
|
@@ -446,6 +457,15 @@ class SpamScanner {
|
|
|
446
457
|
client: false,
|
|
447
458
|
cachePrefix: 'spamscanner',
|
|
448
459
|
ttlMs: ms('1h'),
|
|
460
|
+
// franc
|
|
461
|
+
franc: {
|
|
462
|
+
minLength: 5,
|
|
463
|
+
only: ISO_CODE_MAPPING_KEYS
|
|
464
|
+
},
|
|
465
|
+
// if franc detects multiple languages that have >= % threshold
|
|
466
|
+
// then if the locale detected was one of them, what is the probability
|
|
467
|
+
// it must have in order to override all the other matches
|
|
468
|
+
detectedLocaleOverrideProbability: 0.9,
|
|
449
469
|
...config
|
|
450
470
|
};
|
|
451
471
|
|
|
@@ -652,7 +672,11 @@ class SpamScanner {
|
|
|
652
672
|
? `"${attachment.filename}"`
|
|
653
673
|
: `#${i + 1}`;
|
|
654
674
|
if (isInfected)
|
|
655
|
-
messages.push(
|
|
675
|
+
messages.push(
|
|
676
|
+
`Attachment ${name} was infected with ${arrayJoinConjunction(
|
|
677
|
+
viruses
|
|
678
|
+
)}.`
|
|
679
|
+
);
|
|
656
680
|
} catch (err) {
|
|
657
681
|
this.config.logger.error(err);
|
|
658
682
|
}
|
|
@@ -951,12 +975,41 @@ class SpamScanner {
|
|
|
951
975
|
// <https://github.com/wooorm/franc/issues/86> (accurate with min length)
|
|
952
976
|
// <https://github.com/FGRibreau/node-language-detect> (not too accurate)
|
|
953
977
|
//
|
|
954
|
-
const
|
|
955
|
-
if (
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
978
|
+
const detectedLanguages = franc.all(string, this.config.franc);
|
|
979
|
+
if (Array.isArray(detectedLanguages) && detectedLanguages.length > 0) {
|
|
980
|
+
let detected = this.config.locale;
|
|
981
|
+
let probability = 0;
|
|
982
|
+
for (const lang of detectedLanguages) {
|
|
983
|
+
// if it was undetermined then break out and revert to default (English)
|
|
984
|
+
if (lang[0] && lang[0] === 'und') break;
|
|
985
|
+
|
|
986
|
+
//
|
|
987
|
+
// otherwise only use detected languages that have >= 90% accuracy
|
|
988
|
+
// and if no matches were found, then revert to use English as it's most likely spam
|
|
989
|
+
// (we can assume that users would understand a different language sent to them is spam)
|
|
990
|
+
// (so we can assume that language is spoofed to bypass English, the most widely spoken)
|
|
991
|
+
//
|
|
992
|
+
if (lang[0] && ISO_CODE_MAPPING[lang[0]] && lang[1]) {
|
|
993
|
+
// we don't want to check anything lower than our threshold
|
|
994
|
+
if (lang[1] < this.config.detectedLocaleOverrideProbability) break;
|
|
995
|
+
if (probability >= lang[1]) {
|
|
996
|
+
// exit early since we found a match that matched the passed locale
|
|
997
|
+
// eslint-disable-next-line max-depth
|
|
998
|
+
if (locale && locale === ISO_CODE_MAPPING[lang[0]]) {
|
|
999
|
+
detected = locale;
|
|
1000
|
+
probability = lang[1];
|
|
1001
|
+
break;
|
|
1002
|
+
}
|
|
1003
|
+
} else {
|
|
1004
|
+
detected = ISO_CODE_MAPPING[lang[0]];
|
|
1005
|
+
probability = lang[1];
|
|
1006
|
+
}
|
|
1007
|
+
}
|
|
1008
|
+
}
|
|
1009
|
+
|
|
1010
|
+
// override the locale based off detected
|
|
1011
|
+
locale = detected;
|
|
1012
|
+
}
|
|
960
1013
|
|
|
961
1014
|
locale = this.parseLocale(isSANB(locale) ? locale : this.config.locale);
|
|
962
1015
|
|
|
@@ -1110,8 +1163,12 @@ class SpamScanner {
|
|
|
1110
1163
|
break;
|
|
1111
1164
|
case 'zh':
|
|
1112
1165
|
// cmn
|
|
1113
|
-
// TODO: use this instead https://github.com/yishn/chinese-tokenizer
|
|
1114
1166
|
// ISO 639-3 = zho (Chinese, Macrolanguage)
|
|
1167
|
+
// https://github.com/yishn/chinese-tokenizer
|
|
1168
|
+
tokenizer = {
|
|
1169
|
+
tokenize: (str) =>
|
|
1170
|
+
chineseTokenizer(str).map((results) => results.text)
|
|
1171
|
+
};
|
|
1115
1172
|
language = 'chinese';
|
|
1116
1173
|
stopwords = stopwordsZh;
|
|
1117
1174
|
stemword = false;
|
|
@@ -1200,7 +1257,6 @@ class SpamScanner {
|
|
|
1200
1257
|
// <https://github.com/NaturalNode/natural/issues/533>
|
|
1201
1258
|
//
|
|
1202
1259
|
// NOTE: we're doing this for all languages now, not just en
|
|
1203
|
-
// if (locale === 'en')
|
|
1204
1260
|
//
|
|
1205
1261
|
string = contractions.expand(string);
|
|
1206
1262
|
|
|
@@ -1214,6 +1270,9 @@ class SpamScanner {
|
|
|
1214
1270
|
//
|
|
1215
1271
|
const tokens = [];
|
|
1216
1272
|
for (const token of tokenizer.tokenize(string.toLowerCase())) {
|
|
1273
|
+
// zh tokenizer yields empty strings
|
|
1274
|
+
if (token === '' || token === ' ') continue;
|
|
1275
|
+
|
|
1217
1276
|
// whitelist words from being stemmed (safeguard)
|
|
1218
1277
|
if (
|
|
1219
1278
|
this.WHITELISTED_WORDS.has(token) ||
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "spamscanner",
|
|
3
3
|
"description": "Spam Scanner - The Best Anti-Spam Scanning Service and Anti-Spam API",
|
|
4
|
-
"version": "5.0.0",
|
|
4
|
+
"version": "5.1.2",
|
|
5
5
|
"author": "Niftylettuce, LLC. <niftylettuce@gmail.com> (https://niftylettuce.com/)",
|
|
6
6
|
"bugs": {
|
|
7
7
|
"url": "https://github.com/spamscanner/spamscanner/issues",
|
|
@@ -13,7 +13,9 @@
|
|
|
13
13
|
],
|
|
14
14
|
"dependencies": {
|
|
15
15
|
"@ladjs/naivebayes": "^0.1.0",
|
|
16
|
+
"array-join-conjunction": "^1.0.0",
|
|
16
17
|
"bitcoin-regex": "^2.0.0",
|
|
18
|
+
"chinese-tokenizer": "^2.4.0",
|
|
17
19
|
"clamscan": "^2.1.2",
|
|
18
20
|
"credit-card-regex": "^3.0.0",
|
|
19
21
|
"crypto-random-string": "3",
|
|
@@ -66,7 +68,6 @@
|
|
|
66
68
|
"devDependencies": {
|
|
67
69
|
"@commitlint/cli": "^17.0.2",
|
|
68
70
|
"@commitlint/config-conventional": "^17.0.2",
|
|
69
|
-
"@ladjs/redis": "^1.0.7",
|
|
70
71
|
"ava": "^4.3.0",
|
|
71
72
|
"cross-env": "^7.0.3",
|
|
72
73
|
"delay": "^5.0.0",
|
|
@@ -74,6 +75,8 @@
|
|
|
74
75
|
"eslint-config-xo-lass": "^2.0.1",
|
|
75
76
|
"fixpack": "^4.0.0",
|
|
76
77
|
"husky": "^8.0.1",
|
|
78
|
+
"ioredis": "^5.0.6",
|
|
79
|
+
"ioredis-mock": "^8.2.2",
|
|
77
80
|
"is-ci": "^3.0.1",
|
|
78
81
|
"lint-staged": "^13.0.1",
|
|
79
82
|
"lookpath": "^1.2.2",
|
|
@@ -91,6 +94,7 @@
|
|
|
91
94
|
"node": ">=14"
|
|
92
95
|
},
|
|
93
96
|
"files": [
|
|
97
|
+
"cedict_1_0_ts_utf-8_mdbg.txt",
|
|
94
98
|
"package.json",
|
|
95
99
|
"index.js",
|
|
96
100
|
"vocabulary-limit.js",
|