spamscanner 5.0.0 → 5.1.0
- package/README.md +2 -1
- package/cedict_1_0_ts_utf-8_mdbg.txt +120853 -0
- package/index.js +41 -8
- package/package.json +3 -1
package/index.js
CHANGED
@@ -1,6 +1,7 @@
-const process = require('process');
 const dns = require('dns');
 const fs = require('fs');
+const path = require('path');
+const process = require('process');
 const { debuglog } = require('util');

 // eslint-disable-next-line n/no-deprecated-api
@@ -55,6 +56,19 @@ const { simpleParser } = require('mailparser');

 const debug = debuglog('spamscanner');

+//
+// NOTE: we periodically need to update this
+//
+// Source from: CC-CEDICT
+// Licensed under Creative Commons Attribution-ShareAlike 4.0 International License
+// <https://www.mdbg.net/chinese/dictionary?page=cc-cedict>
+//
+// <https://github.com/yishn/chinese-tokenizer>
+//
+const chineseTokenizer = require('chinese-tokenizer').loadFile(
+  path.join(__dirname, 'cedict_1_0_ts_utf-8_mdbg.txt')
+);
+
 const aggressiveTokenizer = new natural.AggressiveTokenizer();
 const orthographyTokenizer = new natural.OrthographyTokenizer({
   language: 'fi'
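Note (not part of the diff): chinese-tokenizer's loadFile() parses the CC-CEDICT dictionary once at startup and returns a tokenize function whose results carry a text property, which is how index.js uses it further down. A minimal sketch of that behavior, assuming the dictionary file ships next to index.js as the package.json change below arranges; the exact segmentation depends on the dictionary contents:

const path = require('path');

// loadFile() reads the CC-CEDICT data and returns a tokenizer function
const tokenize = require('chinese-tokenizer').loadFile(
  path.join(__dirname, 'cedict_1_0_ts_utf-8_mdbg.txt')
);

// each result is an object; `.text` is the matched substring
const words = tokenize('我是一个学生').map((result) => result.text);
// => roughly ['我', '是', '一个', '学生']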
@@ -195,6 +209,8 @@ const VOCABULARY_LIMIT = require('./vocabulary-limit.js');
 // TODO: convert this into a Map
 const ISO_CODE_MAPPING = require('./iso-code-mapping.json');

+const ISO_CODE_MAPPING_KEYS = Object.keys(ISO_CODE_MAPPING);
+
 // <https://kb.smarshmail.com/Article/23567>
 const EXECUTABLES = new Set(require('./executables.json'));

@@ -426,12 +442,6 @@ class SpamScanner {
           : '/var/run/clamav/clamd.ctl'
        }
      },
-     franc: {
-       minLength: 100,
-       // we can only support languages available
-       // in stopwords and natural's tokenizer methods
-       only: Object.keys(ISO_CODE_MAPPING)
-     },
      hasha: {
        algorithm: 'sha256'
      },
@@ -446,6 +456,21 @@ class SpamScanner {
      client: false,
      cachePrefix: 'spamscanner',
      ttlMs: ms('1h'),
+     // franc
+     franc: {
+       // NOTE: if locale was passed and was valid
+       // then we need to compare it against english
+       // and if it was english detected (and not und)
+       // then switch the detected locale to english
+       minLength: 5,
+       // we can only support languages available
+       // in stopwords and natural's tokenizer methods
+       // and if it was detected to be english, compare against all languages
+       // otherwise if not, then compare only against english
+       // (namely we need to check against JP/ZH, but perhaps _all_ in future)
+       // (the edge case is that someone could spoof a language and it go undetected and tokenization bugs occur)
+       only: ISO_CODE_MAPPING_KEYS
+     },
      ...config
    };

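Note (not part of the diff): this object is passed straight to franc via franc(string, this.config.franc) in the detection code below; minLength is the shortest string it will attempt to classify and only restricts which ISO 639-3 codes it may return, with 'und' for anything it cannot classify. A rough sketch of the new defaults, using a short hypothetical stand-in for ISO_CODE_MAPPING_KEYS:

// `franc` is the detector index.js already has in scope;
// this `only` list is a hypothetical stand-in for ISO_CODE_MAPPING_KEYS
const only = ['eng', 'cmn', 'jpn', 'spa'];

franc('Hola, ¿cómo estás hoy?', { minLength: 5, only });
// => most likely 'spa'

franc('Hi', { minLength: 5, only });
// => 'und' (shorter than minLength, so detection is skipped)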
@@ -952,6 +977,7 @@ class SpamScanner {
    // <https://github.com/FGRibreau/node-language-detect> (not too accurate)
    //
    const detectedLanguage = franc(string, this.config.franc);
+
    if (
      detectedLanguage !== 'und' &&
      isSANB(ISO_CODE_MAPPING[detectedLanguage])
@@ -1110,8 +1136,12 @@ class SpamScanner {
        break;
      case 'zh':
        // cmn
-       // TODO: use this instead https://github.com/yishn/chinese-tokenizer
        // ISO 639-3 = zho (Chinese, Macrolanguage)
+       // https://github.com/yishn/chinese-tokenizer
+       tokenizer = {
+         tokenize: (str) =>
+           chineseTokenizer(str).map((results) => results.text)
+       };
        language = 'chinese';
        stopwords = stopwordsZh;
        stemword = false;
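Note (not part of the diff): the other case branches assign one of natural's tokenizers, which all expose tokenize(string) returning an array of strings, so the zh branch wraps chinese-tokenizer in the same shape. A small sketch of the adapter in isolation:

// chineseTokenizer(str) yields result objects; mapping to `.text` keeps the
// downstream for-of loop working on plain string tokens
const tokenizer = {
  tokenize: (str) => chineseTokenizer(str).map((results) => results.text)
};

for (const token of tokenizer.tokenize('我是一个学生')) {
  if (token === '' || token === ' ') continue; // it can yield empty strings
  // token is a plain string here, e.g. '学生'
}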
@@ -1214,6 +1244,9 @@ class SpamScanner {
    //
    const tokens = [];
    for (const token of tokenizer.tokenize(string.toLowerCase())) {
+     // zh tokenizer yields empty strings
+     if (token === '' || token === ' ') continue;
+
      // whitelist words from being stemmed (safeguard)
      if (
        this.WHITELISTED_WORDS.has(token) ||
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "spamscanner",
   "description": "Spam Scanner - The Best Anti-Spam Scanning Service and Anti-Spam API",
-  "version": "5.0.0",
+  "version": "5.1.0",
   "author": "Niftylettuce, LLC. <niftylettuce@gmail.com> (https://niftylettuce.com/)",
   "bugs": {
     "url": "https://github.com/spamscanner/spamscanner/issues",
@@ -14,6 +14,7 @@
   "dependencies": {
     "@ladjs/naivebayes": "^0.1.0",
     "bitcoin-regex": "^2.0.0",
+    "chinese-tokenizer": "^2.4.0",
     "clamscan": "^2.1.2",
     "credit-card-regex": "^3.0.0",
     "crypto-random-string": "3",
@@ -91,6 +92,7 @@
     "node": ">=14"
   },
   "files": [
+    "cedict_1_0_ts_utf-8_mdbg.txt",
     "package.json",
     "index.js",
     "vocabulary-limit.js",