spamscanner 5.0.0 → 5.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.js CHANGED
@@ -1,6 +1,7 @@
1
- const process = require('process');
2
1
  const dns = require('dns');
3
2
  const fs = require('fs');
3
+ const path = require('path');
4
+ const process = require('process');
4
5
  const { debuglog } = require('util');
5
6
 
6
7
  // eslint-disable-next-line n/no-deprecated-api
@@ -55,6 +56,19 @@ const { simpleParser } = require('mailparser');
55
56
 
56
57
  const debug = debuglog('spamscanner');
57
58
 
59
+ //
60
+ // NOTE: we periodically need to update this
61
+ //
62
+ // Source from: CC-CEDICT
63
+ // Licensed under Creative Commons Attribution-ShareAlike 4.0 International License
64
+ // <https://www.mdbg.net/chinese/dictionary?page=cc-cedict>
65
+ //
66
+ // <https://github.com/yishn/chinese-tokenizer>
67
+ //
68
+ const chineseTokenizer = require('chinese-tokenizer').loadFile(
69
+ path.join(__dirname, 'cedict_1_0_ts_utf-8_mdbg.txt')
70
+ );
71
+
58
72
  const aggressiveTokenizer = new natural.AggressiveTokenizer();
59
73
  const orthographyTokenizer = new natural.OrthographyTokenizer({
60
74
  language: 'fi'
@@ -195,6 +209,8 @@ const VOCABULARY_LIMIT = require('./vocabulary-limit.js');
195
209
  // TODO: convert this into a Map
196
210
  const ISO_CODE_MAPPING = require('./iso-code-mapping.json');
197
211
 
212
+ const ISO_CODE_MAPPING_KEYS = Object.keys(ISO_CODE_MAPPING);
213
+
198
214
  // <https://kb.smarshmail.com/Article/23567>
199
215
  const EXECUTABLES = new Set(require('./executables.json'));
200
216
 
@@ -426,12 +442,6 @@ class SpamScanner {
426
442
  : '/var/run/clamav/clamd.ctl'
427
443
  }
428
444
  },
429
- franc: {
430
- minLength: 100,
431
- // we can only support languages available
432
- // in stopwords and natural's tokenizer methods
433
- only: Object.keys(ISO_CODE_MAPPING)
434
- },
435
445
  hasha: {
436
446
  algorithm: 'sha256'
437
447
  },
@@ -446,6 +456,21 @@ class SpamScanner {
446
456
  client: false,
447
457
  cachePrefix: 'spamscanner',
448
458
  ttlMs: ms('1h'),
459
+ // franc
460
+ franc: {
461
+ // NOTE: if locale was passed and was valid
462
+ // then we need to compare it against english
463
+ // and if it was english detected (and not und)
464
+ // then switch the detected locale to english
465
+ minLength: 5,
466
+ // we can only support languages available
467
+ // in stopwords and natural's tokenizer methods
468
+ // and if it was detected to be english, compare against all languages
469
+ // otherwise if not, then compare only against english
470
+ // (namely we need to check against JP/ZH, but perhaps _all_ in future)
471
+ // (the edge case is that someone could spoof a language and it go undetected and tokenization bugs occur)
472
+ only: ISO_CODE_MAPPING_KEYS
473
+ },
449
474
  ...config
450
475
  };
451
476
 
@@ -952,6 +977,7 @@ class SpamScanner {
952
977
  // <https://github.com/FGRibreau/node-language-detect> (not too accurate)
953
978
  //
954
979
  const detectedLanguage = franc(string, this.config.franc);
980
+
955
981
  if (
956
982
  detectedLanguage !== 'und' &&
957
983
  isSANB(ISO_CODE_MAPPING[detectedLanguage])
@@ -1110,8 +1136,12 @@ class SpamScanner {
1110
1136
  break;
1111
1137
  case 'zh':
1112
1138
  // cmn
1113
- // TODO: use this instead https://github.com/yishn/chinese-tokenizer
1114
1139
  // ISO 639-3 = zho (Chinese, Macrolanguage)
1140
+ // https://github.com/yishn/chinese-tokenizer
1141
+ tokenizer = {
1142
+ tokenize: (str) =>
1143
+ chineseTokenizer(str).map((results) => results.text)
1144
+ };
1115
1145
  language = 'chinese';
1116
1146
  stopwords = stopwordsZh;
1117
1147
  stemword = false;
@@ -1214,6 +1244,9 @@ class SpamScanner {
1214
1244
  //
1215
1245
  const tokens = [];
1216
1246
  for (const token of tokenizer.tokenize(string.toLowerCase())) {
1247
+ // zh tokenizer yields empty strings
1248
+ if (token === '' || token === ' ') continue;
1249
+
1217
1250
  // whitelist words from being stemmed (safeguard)
1218
1251
  if (
1219
1252
  this.WHITELISTED_WORDS.has(token) ||
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "spamscanner",
3
3
  "description": "Spam Scanner - The Best Anti-Spam Scanning Service and Anti-Spam API",
4
- "version": "5.0.0",
4
+ "version": "5.1.0",
5
5
  "author": "Niftylettuce, LLC. <niftylettuce@gmail.com> (https://niftylettuce.com/)",
6
6
  "bugs": {
7
7
  "url": "https://github.com/spamscanner/spamscanner/issues",
@@ -14,6 +14,7 @@
14
14
  "dependencies": {
15
15
  "@ladjs/naivebayes": "^0.1.0",
16
16
  "bitcoin-regex": "^2.0.0",
17
+ "chinese-tokenizer": "^2.4.0",
17
18
  "clamscan": "^2.1.2",
18
19
  "credit-card-regex": "^3.0.0",
19
20
  "crypto-random-string": "3",
@@ -91,6 +92,7 @@
91
92
  "node": ">=14"
92
93
  },
93
94
  "files": [
95
+ "cedict_1_0_ts_utf-8_mdbg.txt",
94
96
  "package.json",
95
97
  "index.js",
96
98
  "vocabulary-limit.js",