spamscanner 5.1.0 → 5.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -54,7 +54,7 @@
54
54
 
55
55
  ## Foreword
56
56
 
57
- Spam Scanner is a tool and service built by [@niftylettuce][niftylettuce] after hitting countless roadblocks with existing spam-detection solutions. In other words, it's our current [plan][plan-for-spam] for [spam][better-plan-for-spam].
57
+ Spam Scanner is a tool and service created after hitting countless roadblocks with existing spam-detection solutions. In other words, it's our current [plan][plan-for-spam] for [spam][better-plan-for-spam].
58
58
 
59
59
  Our goal is to build and utilize a scalable, performant, simple, easy to maintain, and powerful API for use in our service at [Forward Email][forward-email] to limit spam and provide other measures to prevent attacks on our users.
60
60
 
@@ -566,7 +566,7 @@ Note that in [Forward Email][forward-email] we use the `client` approach as we h
566
566
 
567
567
  ## Debugging
568
568
 
569
- Spam Scanner has built-in debug output via `util.debuglog('spamscanner')`.
569
+ Spam Scanner has built-in debug output via `util.debuglog('spamscanner')`. You can also pass `debug: true` to your instance to get more verbose output.
570
570
 
571
571
  This means you can run your app with `NODE_DEBUG=spamscanner node app.js` to get useful debug output to your console.
572
572
 
@@ -606,8 +606,6 @@ This means you can run your app with `NODE_DEBUG=spamscanner node app.js` to get
606
606
 
607
607
  [homograph-attack]: https://en.wikipedia.org/wiki/IDN_homograph_attack
608
608
 
609
- [niftylettuce]: https://github.com/niftylettuce
610
-
611
609
  [forward-email]: https://forwardemail.net
612
610
 
613
611
  [rspamd]: https://rspamd.com/
package/get-classifier.js CHANGED
@@ -1,5 +1,8 @@
1
+ const { debuglog } = require('util');
2
+
1
3
  const NaiveBayes = require('@ladjs/naivebayes');
2
- const debug = require('debug')('spamscanner');
4
+
5
+ const debug = debuglog('spamscanner');
3
6
 
4
7
  let classifier = new NaiveBayes().toJsonObject();
5
8
 
package/index.js CHANGED
@@ -11,6 +11,7 @@ const ClamScan = require('clamscan');
11
11
  const FileType = require('file-type');
12
12
  const NaiveBayes = require('@ladjs/naivebayes');
13
13
  const RE2 = require('re2');
14
+ const arrayJoinConjunction = require('array-join-conjunction');
14
15
  const bitcoinRegex = require('bitcoin-regex');
15
16
  const contractions = require('expand-contractions');
16
17
  const creditCardRegex = require('credit-card-regex');
@@ -458,19 +459,13 @@ class SpamScanner {
458
459
  ttlMs: ms('1h'),
459
460
  // franc
460
461
  franc: {
461
- // NOTE: if locale was passed and was valid
462
- // then we need to compare it against english
463
- // and if it was english detected (and not und)
464
- // then switch the detected locale to english
465
462
  minLength: 5,
466
- // we can only support languages available
467
- // in stopwords and natural's tokenizer methods
468
- // and if it was detected to be english, compare against all languages
469
- // otherwise if not, then compare only against english
470
- // (namely we need to check against JP/ZH, but perhaps _all_ in future)
471
- // (the edge case is that someone could spoof a language and it go undetected and tokenization bugs occur)
472
463
  only: ISO_CODE_MAPPING_KEYS
473
464
  },
465
+ // if franc detects multiple languages that have >= % threshold
466
+ // then if the locale detected was one of them, what is the probability
467
+ // it must have in order to override all the other matches
468
+ detectedLocaleOverrideProbability: 0.9,
474
469
  ...config
475
470
  };
476
471
 
@@ -501,7 +496,7 @@ class SpamScanner {
501
496
  return tokens;
502
497
  };
503
498
 
504
- this.clamscan = new ClamScan();
499
+ this.clamscan = this.config.clamscan === false ? false : new ClamScan();
505
500
 
506
501
  this.getTokensAndMailFromSource = universalify.fromPromise(
507
502
  this.getTokensAndMailFromSource.bind(this)
@@ -659,7 +654,13 @@ class SpamScanner {
659
654
  async getVirusResults(mail) {
660
655
  const messages = [];
661
656
 
662
- if (!Array.isArray(mail.attachments)) return messages;
657
+ if (!this.clamscan) {
658
+ debug('clamscan disabled');
659
+ return messages;
660
+ }
661
+
662
+ if (!Array.isArray(mail.attachments) || mail.attachments.length === 0)
663
+ return messages;
663
664
 
664
665
  try {
665
666
  // if it was already loaded, clamscan won't reload itself
@@ -677,7 +678,11 @@ class SpamScanner {
677
678
  ? `"${attachment.filename}"`
678
679
  : `#${i + 1}`;
679
680
  if (isInfected)
680
- messages.push(`Attachment ${name} was infected with ${viruses}.`);
681
+ messages.push(
682
+ `Attachment ${name} was infected with ${arrayJoinConjunction(
683
+ viruses
684
+ )}.`
685
+ );
681
686
  } catch (err) {
682
687
  this.config.logger.error(err);
683
688
  }
@@ -976,13 +981,41 @@ class SpamScanner {
976
981
  // <https://github.com/wooorm/franc/issues/86> (accurate with min length)
977
982
  // <https://github.com/FGRibreau/node-language-detect> (not too accurate)
978
983
  //
979
- const detectedLanguage = franc(string, this.config.franc);
984
+ const detectedLanguages = franc.all(string, this.config.franc);
985
+ if (Array.isArray(detectedLanguages) && detectedLanguages.length > 0) {
986
+ let detected = this.config.locale;
987
+ let probability = 0;
988
+ for (const lang of detectedLanguages) {
989
+ // if it was undetermined then break out and revert to default (English)
990
+ if (lang[0] && lang[0] === 'und') break;
991
+
992
+ //
993
+ // otherwise only use detected languages whose probability is >= the configured detectedLocaleOverrideProbability (0.9 by default)
994
+ // and if no matches were found, then revert to use English as it's most likely spam
995
+ // (we can assume that users would understand a different language sent to them is spam)
996
+ // (so we can assume that language is spoofed to bypass English, the most widely spoken)
997
+ //
998
+ if (lang[0] && ISO_CODE_MAPPING[lang[0]] && lang[1]) {
999
+ // we don't want to check anything lower than our threshold
1000
+ if (lang[1] < this.config.detectedLocaleOverrideProbability) break;
1001
+ if (probability >= lang[1]) {
1002
+ // exit early since we found a match that matched the passed locale
1003
+ // eslint-disable-next-line max-depth
1004
+ if (locale && locale === ISO_CODE_MAPPING[lang[0]]) {
1005
+ detected = locale;
1006
+ probability = lang[1];
1007
+ break;
1008
+ }
1009
+ } else {
1010
+ detected = ISO_CODE_MAPPING[lang[0]];
1011
+ probability = lang[1];
1012
+ }
1013
+ }
1014
+ }
980
1015
 
981
- if (
982
- detectedLanguage !== 'und' &&
983
- isSANB(ISO_CODE_MAPPING[detectedLanguage])
984
- )
985
- locale = ISO_CODE_MAPPING[detectedLanguage];
1016
+ // override the locale based off detected
1017
+ locale = detected;
1018
+ }
986
1019
 
987
1020
  locale = this.parseLocale(isSANB(locale) ? locale : this.config.locale);
988
1021
 
@@ -1230,7 +1263,6 @@ class SpamScanner {
1230
1263
  // <https://github.com/NaturalNode/natural/issues/533>
1231
1264
  //
1232
1265
  // NOTE: we're doing this for all languages now, not just en
1233
- // if (locale === 'en')
1234
1266
  //
1235
1267
  string = contractions.expand(string);
1236
1268
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "spamscanner",
3
3
  "description": "Spam Scanner - The Best Anti-Spam Scanning Service and Anti-Spam API",
4
- "version": "5.1.0",
4
+ "version": "5.1.3",
5
5
  "author": "Niftylettuce, LLC. <niftylettuce@gmail.com> (https://niftylettuce.com/)",
6
6
  "bugs": {
7
7
  "url": "https://github.com/spamscanner/spamscanner/issues",
@@ -13,6 +13,7 @@
13
13
  ],
14
14
  "dependencies": {
15
15
  "@ladjs/naivebayes": "^0.1.0",
16
+ "array-join-conjunction": "^1.0.0",
16
17
  "bitcoin-regex": "^2.0.0",
17
18
  "chinese-tokenizer": "^2.4.0",
18
19
  "clamscan": "^2.1.2",
@@ -67,14 +68,16 @@
67
68
  "devDependencies": {
68
69
  "@commitlint/cli": "^17.0.2",
69
70
  "@commitlint/config-conventional": "^17.0.2",
70
- "@ladjs/redis": "^1.0.7",
71
71
  "ava": "^4.3.0",
72
+ "chance": "^1.1.8",
72
73
  "cross-env": "^7.0.3",
73
74
  "delay": "^5.0.0",
74
75
  "eslint": "^8.17.0",
75
76
  "eslint-config-xo-lass": "^2.0.1",
76
77
  "fixpack": "^4.0.0",
77
78
  "husky": "^8.0.1",
79
+ "ioredis": "^5.0.6",
80
+ "ioredis-mock": "^8.2.2",
78
81
  "is-ci": "^3.0.1",
79
82
  "lint-staged": "^13.0.1",
80
83
  "lookpath": "^1.2.2",
@@ -83,6 +86,7 @@
83
86
  "numeral": "^2.0.6",
84
87
  "nyc": "^15.1.0",
85
88
  "p-map": "4",
89
+ "p-queue": "6",
86
90
  "read-dir-deep": "^7.0.1",
87
91
  "remark-cli": "^10.0.1",
88
92
  "remark-preset-github": "^4.0.4",
package/replacements.js CHANGED
@@ -1,5 +1,8 @@
1
+ const { debuglog } = require('util');
2
+
1
3
  const cryptoRandomString = require('crypto-random-string');
2
- const debug = require('debug')('spamscanner');
4
+
5
+ const debug = debuglog('spamscanner');
3
6
 
4
7
  const REPLACEMENT_WORDS = require('./replacement-words.json');
5
8