despamilator 2.0.1 → 2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. data/History.txt +7 -0
  2. data/lib/despamilator.rb +38 -7
  3. data/lib/despamilator/filter.rb +39 -23
  4. data/lib/despamilator/filter/gtubs_test_filter.rb +4 -4
  5. data/lib/despamilator/filter/html_tags.rb +9 -7
  6. data/lib/despamilator/filter/ip_address_url.rb +6 -4
  7. data/lib/despamilator/filter/long_words.rb +7 -5
  8. data/lib/despamilator/filter/mixed_case.rb +21 -0
  9. data/lib/despamilator/filter/naughty_words.rb +5 -5
  10. data/lib/despamilator/filter/numbers_and_words.rb +19 -11
  11. data/lib/despamilator/filter/obfuscated_urls.rb +41 -0
  12. data/lib/despamilator/filter/prices.rb +19 -0
  13. data/lib/despamilator/filter/script_tag.rb +4 -4
  14. data/lib/despamilator/filter/shouting.rb +9 -6
  15. data/lib/despamilator/filter/spammy_tlds.rb +22 -0
  16. data/lib/despamilator/filter/square_brackets.rb +5 -5
  17. data/lib/despamilator/filter/trailing_number.rb +4 -4
  18. data/lib/despamilator/filter/unusual_characters.rb +5 -5
  19. data/lib/despamilator/filter/urls.rb +7 -9
  20. data/lib/despamilator/filter/very_long_domain_name.rb +27 -0
  21. data/lib/despamilator/filter/weird_punctuation.rb +44 -0
  22. data/lib/despamilator/subject.rb +30 -0
  23. data/lib/despamilator/subject/text.rb +32 -0
  24. data/lib/despamilator/version.rb +3 -0
  25. metadata +29 -75
  26. data/.rspec +0 -2
  27. data/.rvmrc +0 -1
  28. data/Gemfile +0 -12
  29. data/Gemfile.lock +0 -47
  30. data/Manifest.txt +0 -46
  31. data/PostInstall.txt +0 -1
  32. data/Rakefile +0 -39
  33. data/conf/unusual_characters.txt +0 -6674
  34. data/despamilator.gemspec +0 -38
  35. data/lib/despamilator/filter_base.rb +0 -82
  36. data/scripts/despamilator_score.rb +0 -25
  37. data/scripts/from_file.rb +0 -26
  38. data/spec/despamilator_spec.rb +0 -13
  39. data/spec/filter_base_spec.rb +0 -30
  40. data/spec/filters/gtubs_test_filter_spec.rb +0 -9
  41. data/spec/filters/html_tags_spec.rb +0 -129
  42. data/spec/filters/ip_address_url_spec.rb +0 -11
  43. data/spec/filters/long_words_spec.rb +0 -11
  44. data/spec/filters/naughty_words_spec.rb +0 -11
  45. data/spec/filters/numbers_and_words_spec.rb +0 -34
  46. data/spec/filters/script_tag_spec.rb +0 -22
  47. data/spec/filters/shouting_spec.rb +0 -45
  48. data/spec/filters/square_brackets_spec.rb +0 -11
  49. data/spec/filters/trailing_number_spec.rb +0 -10
  50. data/spec/filters/unusual_characters_spec.rb +0 -9
  51. data/spec/filters/urls_spec.rb +0 -11
  52. data/spec/helpers/corpus_helper.rb +0 -5
  53. data/spec/helpers/filter_helper.rb +0 -59
  54. data/spec/helpers/spec_helper.rb +0 -6
  55. data/tasks/test.rake +0 -6
data/History.txt CHANGED
@@ -63,3 +63,10 @@
63
63
 
64
64
  * Rejigged url and html tag scores.
65
65
 
66
+ === 2.0.1
67
+
68
+ * Minor bug fix in shouting filter.
69
+
70
+ === IN PROGRESS
71
+
72
+ * Deprecated the "matched_by" method. Replaced by "matches".
data/lib/despamilator.rb CHANGED
@@ -1,6 +1,11 @@
1
1
  $:.unshift(File.dirname(__FILE__)) unless $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
2
2
 
3
- require 'despamilator/filter'
3
+ Dir.glob(File.join(File.dirname(__FILE__), 'despamilator', 'filter', '*.rb')).each do |filter_file|
4
+ require filter_file
5
+ end
6
+
7
+ require 'despamilator/subject'
8
+ require 'ostruct'
4
9
 
5
10
  #== SYNOPSIS:
6
11
  #
@@ -14,25 +19,39 @@ require 'despamilator/filter'
14
19
  # dspam.matched_by #=> array of matching filters
15
20
 
16
21
  class Despamilator
17
- VERSION = "2.0.1"
18
22
 
19
23
  # Constructor. Takes the text you wish to parse and score.
20
24
 
21
25
  def initialize text
22
- @filters = Despamilator::Filter.new text
26
+ @subject = Despamilator::Subject.new text
27
+ run_filters @subject
23
28
  end
24
29
 
25
30
  # Returns the total score as a Float.
26
31
 
27
32
  def score
28
- @filters.score
33
+ @subject.score
34
+ end
35
+
36
+ def matched_by
37
+ warn 'Despamilator.matched_by is deprecated, please use Despamilator.matches by 2011-12-31.'
38
+
39
+ matches.map do |match|
40
+ filter = match[:filter]
41
+
42
+ OpenStruct.new(
43
+ :name => filter.name,
44
+ :description => filter.description,
45
+ :score => match[:score]
46
+ )
47
+ end
29
48
  end
30
49
 
31
- # Returns an array of filters that have matched and contributed to the score.
50
+ # Returns an array of scores and filters that have matched and contributed to the score.
32
51
  # Each element holds a :score and the matching :filter (a child of the Despamilator::Filter class).
33
52
 
34
- def matched_by
35
- @filters.matches
53
+ def matches
54
+ @subject.matches
36
55
  end
37
56
 
38
57
  # Generic Test for Unsolicited Bulk Submissions. Similar to SpamAssassin's GTUBE.
@@ -41,4 +60,16 @@ class Despamilator
41
60
  def self.gtubs_test_string
42
61
  '89913b8a065b7092721fe995877e097681683af9d3ab767146d5d6fd050fc0bda7ab99f4232d94a1'
43
62
  end
63
+
64
+ private
65
+
66
+ def run_filters subject
67
+ filter_namespace = Object.const_get('DespamilatorFilter')
68
+
69
+ filter_namespace.constants.each do |filter_class|
70
+ filter = filter_namespace.const_get(filter_class).new
71
+ filter.parse(subject)
72
+ end
73
+ end
74
+
44
75
  end
data/lib/despamilator/filter.rb CHANGED
@@ -1,36 +1,52 @@
1
1
  class Despamilator
2
- class Filter
3
- attr_accessor :matches, :score
4
2
 
5
- def initialize text
6
- @matches ||= []
7
- @score ||= 0
8
- run_filters text
9
- end
3
+ #This class is the base class of all the despamilator filters.
4
+ #
5
+ #== EXAMPLE:
6
+ #
7
+ #This example is to detect the letter "a". Put the code in
8
+ #lib/despamilator/filter/detect_letter_a.rb:
9
+ #
10
+ # require 'despamilator/filter'
11
+ #
12
+ # module DespamilatorFilter
13
+ #
14
+ # class DetectLetterA < Despamilator::Filter
15
+ #
16
+ # def name
17
+ # 'Detecting the letter A'
18
+ # end
19
+ #
20
+ # def description
21
+ # 'Detects the letter "a" in a string for no reason other than a demo'
22
+ # end
23
+ #
24
+ # def parse subject
25
+ # if subject.text.downcase.count(/a/) > 0
26
+ # # add 0.1 to the score of the text
27
+ # subject.register_match!({:score => 0.1, :filter => self})
28
+ # end
29
+ # end
30
+ # end
10
31
 
11
- private
32
+ class Filter
12
33
 
13
- def run_filters text
14
- filter_namespace = Object.const_get('DespamilatorFilter')
34
+ # The nice description of the filter. Usually no more than a sentence.
15
35
 
16
- filter_namespace.constants.each do |filter_class|
17
- execute_filter(filter_namespace.const_get(filter_class).new, text)
18
- end
36
+ def description
37
+ raise "No description defined for #{self.class}"
19
38
  end
20
39
 
21
- private
40
+ # This method parses some text. The score is assigned to the same instance.
22
41
 
23
- def execute_filter filter, text
24
- filter.parse text.dup
25
-
26
- if filter.matched?
27
- @matches.push(filter)
28
- @score += filter.score
29
- end
42
+ def parse text
43
+ raise "No parser defined for #{self.class}"
30
44
  end
31
45
 
32
- Dir.glob(File.join(File.dirname(__FILE__), 'filter', '*.rb')).each do |filter_file|
33
- require filter_file
46
+ # The one or two word name for the filter.
47
+
48
+ def name
49
+ raise "No name defined for #{self.class}"
34
50
  end
35
51
 
36
52
  end
data/lib/despamilator/filter/gtubs_test_filter.rb CHANGED
@@ -1,8 +1,8 @@
1
- require 'despamilator/filter_base'
1
+ require 'despamilator/filter'
2
2
 
3
3
  module DespamilatorFilter
4
4
 
5
- class GtubsTestFilter < Despamilator::FilterBase
5
+ class GtubsTestFilter < Despamilator::Filter
6
6
 
7
7
  def name
8
8
  'GTubs Test Filter'
@@ -12,8 +12,8 @@ module DespamilatorFilter
12
12
  'Detects the special test string (Despamilator.gtubs_test_string) and assigns a big score.'
13
13
  end
14
14
 
15
- def parse text
16
- self.append_score = 100 if text == Despamilator.gtubs_test_string
15
+ def parse subject
16
+ subject.register_match!({:score => 100, :filter => self}) if subject.text == Despamilator.gtubs_test_string
17
17
  end
18
18
 
19
19
  end
data/lib/despamilator/filter/html_tags.rb CHANGED
@@ -1,15 +1,17 @@
1
- require 'despamilator/filter_base'
2
-
3
1
  module DespamilatorFilter
4
2
 
5
- class HtmlTags < Despamilator::FilterBase
3
+ class HtmlTags < Despamilator::Filter
6
4
 
7
- def parse text
8
- text.downcase!
5
+ def parse subject
6
+ text = subject.text.downcase
9
7
 
10
8
  html_tags.each do |tag|
11
- if text.match(/<\s*#{tag}\W/) || text.match(/<\n*#{tag}\W/) || text.match(/\W#{tag}\s*\//) || text.match(/\W#{tag}\n*\//)
12
- self.append_score = 0.6
9
+ opening_elements = text.count(/<\s*#{tag}\W/)
10
+ closing_elements = text.count(/\W#{tag}\s*\/>/)
11
+
12
+ if opening_elements > 0 or closing_elements > 0
13
+ safest_element_count = opening_elements > closing_elements ? opening_elements : closing_elements
14
+ subject.register_match!({:score => 0.6 * safest_element_count, :filter => self})
13
15
  end
14
16
  end
15
17
  end
data/lib/despamilator/filter/ip_address_url.rb CHANGED
@@ -1,8 +1,8 @@
1
- require 'despamilator/filter_base'
1
+ require 'despamilator/filter'
2
2
 
3
3
  module DespamilatorFilter
4
4
 
5
- class IPAddressURL < Despamilator::FilterBase
5
+ class IPAddressURL < Despamilator::Filter
6
6
 
7
7
  def name
8
8
  'IP Address URL'
@@ -12,8 +12,10 @@ module DespamilatorFilter
12
12
  'Detects IP address URLs'
13
13
  end
14
14
 
15
- def parse text
16
- self.append_score = 0.5 if text.downcase.scan(/http:\/\/\d+\.\d+\.\d+\.\d+/).length > 0
15
+ def parse subject
16
+ subject.register_match!({
17
+ :score => 0.5, :filter => self
18
+ }) if subject.text.downcase.count(/http:\/\/\d+\.\d+\.\d+\.\d+/) > 0
17
19
  end
18
20
 
19
21
  end
data/lib/despamilator/filter/long_words.rb CHANGED
@@ -1,8 +1,8 @@
1
- require 'despamilator/filter_base'
1
+ require 'despamilator/filter'
2
2
 
3
3
  module DespamilatorFilter
4
4
 
5
- class LongWords < Despamilator::FilterBase
5
+ class LongWords < Despamilator::Filter
6
6
 
7
7
  def name
8
8
  'Long Words'
@@ -12,9 +12,11 @@ module DespamilatorFilter
12
12
  'Detects long and unbroken strings'
13
13
  end
14
14
 
15
- def parse text
16
- text.split(/\W+/).each do |word|
17
- self.append_score = 0.1 if word.length > 20
15
+ def parse subject
16
+ subject.text.without_uris.words.each do |word|
17
+ subject.register_match!({
18
+ :score => 0.1, :filter => self
19
+ }) if word.length > 20
18
20
  end
19
21
  end
20
22
 
data/lib/despamilator/filter/mixed_case.rb ADDED
@@ -0,0 +1,21 @@
1
+ module DespamilatorFilter
2
+
3
+ class MixedCase < Despamilator::Filter
4
+ def name
5
+ 'Mixed Case String'
6
+ end
7
+
8
+ def description
9
+ 'Detects mixed case strings.'
10
+ end
11
+
12
+ def parse subject
13
+ text = subject.text.without_uris
14
+ count = text.remove_and_count!(/[a-z][A-Z]/)
15
+ count += text.remove_and_count!(/[a-z][A-Z][a-z]/)
16
+ subject.register_match!({:score => 0.1 * count, :filter => self}) if count > 0
17
+ end
18
+
19
+ end
20
+
21
+ end
data/lib/despamilator/filter/naughty_words.rb CHANGED
@@ -1,8 +1,8 @@
1
- require 'despamilator/filter_base'
1
+ require 'despamilator/filter'
2
2
 
3
3
  module DespamilatorFilter
4
4
 
5
- class NaughtyWords < Despamilator::FilterBase
5
+ class NaughtyWords < Despamilator::Filter
6
6
 
7
7
  def name
8
8
  'Naughty Words'
@@ -12,11 +12,11 @@ module DespamilatorFilter
12
12
  'Detects cheeky words'
13
13
  end
14
14
 
15
- def parse text
16
- text.downcase!
15
+ def parse subject
16
+ text = subject.text.downcase
17
17
 
18
18
  naughty_words.each do |word|
19
- self.append_score = 0.1 if text =~ /\b#{word}s?\b/
19
+ subject.register_match!({:score => 0.1, :filter => self}) if text =~ /\b#{word}s?\b/
20
20
  end
21
21
  end
22
22
 
data/lib/despamilator/filter/numbers_and_words.rb CHANGED
@@ -1,17 +1,11 @@
1
- require 'despamilator/filter_base'
1
+ require 'despamilator/filter'
2
2
 
3
3
  module DespamilatorFilter
4
4
 
5
- class NumbersAndWords < Despamilator::FilterBase
5
+ class NumbersAndWords < Despamilator::Filter
6
6
 
7
- def parse text
8
- text.downcase!
9
-
10
- # strip out "good numbers"
11
- text.gsub!(/h[1-6]/, '')
12
- text.gsub!(/(^|\b)\d+($|\b)/, '')
13
- text.gsub!(/(^|\b)\d+(,|\.)\d+($|\b)/, '')
14
- text.gsub!(/(^|\b)\d+(st|nd|rd|th)($|\b)/, '')
7
+ def parse subject
8
+ text = tidy_text(subject)
15
9
 
16
10
  [
17
11
  /\w\d+/,
@@ -25,7 +19,7 @@ module DespamilatorFilter
25
19
  matches.each do |to_remove|
26
20
  to_remove = to_remove.to_s
27
21
  text.sub!(to_remove, '') unless to_remove.empty?
28
- self.append_score = 0.1
22
+ subject.register_match!({:score => 0.1, :filter => self})
29
23
  end
30
24
  end
31
25
  end
@@ -38,6 +32,20 @@ module DespamilatorFilter
38
32
  'Detects unusual number/word combinations'
39
33
  end
40
34
 
35
+ private
36
+
37
+ def tidy_text subject
38
+ text = subject.text.without_uris
39
+ text.downcase!
40
+
41
+ # strip out "good numbers"
42
+ text.gsub!(/h[1-6]/, '')
43
+ text.gsub!(/(^|\b)\d+($|\b)/, '')
44
+ text.gsub!(/(^|\b)\d+(,|\.)\d+($|\b)/, '')
45
+ text.gsub!(/(^|\b)\d+(st|nd|rd|th)($|\b)/, '')
46
+
47
+ text
48
+ end
41
49
  end
42
50
 
43
51
  end
data/lib/despamilator/filter/obfuscated_urls.rb ADDED
@@ -0,0 +1,41 @@
1
+ module DespamilatorFilter
2
+
3
+ class ObfuscatedURLs < Despamilator::Filter
4
+ def name
5
+ 'Obfuscated URLs'
6
+ end
7
+
8
+ def description
9
+ 'Finds lame attempts at obfuscating urls.'
10
+ end
11
+
12
+ def parse subject
13
+ text = subject.text.without_uris.downcase
14
+ count = find_space_separated_parts text
15
+ count += find_space_separated_characters text
16
+
17
+ # weird maths below is due to some issue with ruby 1.9.2 multiplying floats by 3 (?!)
18
+ subject.register_match!({score: (4.0 * count) / 10, filter: self}) if count > 0
19
+ end
20
+
21
+ private
22
+
23
+ def find_space_separated_parts text
24
+ text.count(/www\s+\w+\s+com/)
25
+ end
26
+
27
+ def find_space_separated_characters text
28
+ count = 0
29
+
30
+ text.split(/[a-z][a-z]/).each do |candidate|
31
+ candidate.strip!
32
+ candidate.gsub!(/\s+/, '')
33
+ count += 1 if candidate =~ /\w{5,}\.\w{2,3}/
34
+ end
35
+
36
+ count
37
+ end
38
+
39
+ end
40
+
41
+ end
data/lib/despamilator/filter/prices.rb ADDED
@@ -0,0 +1,19 @@
1
+ module DespamilatorFilter
2
+
3
+ class Prices < Despamilator::Filter
4
+ def name
5
+ 'Prices'
6
+ end
7
+
8
+ def description
9
+ 'Detects prices in text.'
10
+ end
11
+
12
+ def parse subject
13
+ price_count = subject.text.count(/\$\s*\d+/)
14
+ subject.register_match!({:score => 0.075 * price_count, :filter => self}) if price_count > 0
15
+ end
16
+
17
+ end
18
+
19
+ end
data/lib/despamilator/filter/script_tag.rb CHANGED
@@ -1,11 +1,11 @@
1
- require 'despamilator/filter_base'
1
+ require 'despamilator/filter'
2
2
 
3
3
  module DespamilatorFilter
4
4
 
5
- class ScriptTag < Despamilator::FilterBase
5
+ class ScriptTag < Despamilator::Filter
6
6
 
7
- def parse text
8
- self.append_score = 1 if text.downcase.match(/<\/?script(>|\s+|\n|\r)/)
7
+ def parse subject
8
+ subject.register_match!({:score => 1, :filter => self}) if subject.text.downcase.match(/<\/?script(>|\s+|\n|\r)/)
9
9
  end
10
10
 
11
11
  def name
data/lib/despamilator/filter/shouting.rb CHANGED
@@ -1,8 +1,8 @@
1
- require 'despamilator/filter_base'
1
+ require 'despamilator/filter'
2
2
 
3
3
  module DespamilatorFilter
4
4
 
5
- class Shouting < Despamilator::FilterBase
5
+ class Shouting < Despamilator::Filter
6
6
 
7
7
  def name
8
8
  'Shouting'
@@ -12,17 +12,20 @@ module DespamilatorFilter
12
12
  'Detects and scores shouting (all caps)'
13
13
  end
14
14
 
15
- def parse text
15
+ def parse subject
16
16
  # strip HTML
17
- text.gsub!(/<\/?[^>]*>/, "")
17
+ text = subject.text.gsub(/<\/?[^>]*>/, "")
18
18
 
19
19
  return if text.length < 20
20
20
 
21
21
  uppercased = text.scan(/[A-Z][A-Z]+/).join.length
22
- lowercased = text.scan(/[a-z]/).length
22
+ lowercased = text.count(/[a-z]/)
23
23
 
24
24
  if uppercased > 0
25
- self.append_score = (uppercased.to_f / (uppercased + lowercased)) * 0.5
25
+ subject.register_match!({
26
+ :score => (uppercased.to_f / (uppercased + lowercased)) * 0.5,
27
+ :filter => self
28
+ })
26
29
  end
27
30
  end
28
31