despamilator 2.0.1 → 2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. data/History.txt +7 -0
  2. data/lib/despamilator.rb +38 -7
  3. data/lib/despamilator/filter.rb +39 -23
  4. data/lib/despamilator/filter/gtubs_test_filter.rb +4 -4
  5. data/lib/despamilator/filter/html_tags.rb +9 -7
  6. data/lib/despamilator/filter/ip_address_url.rb +6 -4
  7. data/lib/despamilator/filter/long_words.rb +7 -5
  8. data/lib/despamilator/filter/mixed_case.rb +21 -0
  9. data/lib/despamilator/filter/naughty_words.rb +5 -5
  10. data/lib/despamilator/filter/numbers_and_words.rb +19 -11
  11. data/lib/despamilator/filter/obfuscated_urls.rb +41 -0
  12. data/lib/despamilator/filter/prices.rb +19 -0
  13. data/lib/despamilator/filter/script_tag.rb +4 -4
  14. data/lib/despamilator/filter/shouting.rb +9 -6
  15. data/lib/despamilator/filter/spammy_tlds.rb +22 -0
  16. data/lib/despamilator/filter/square_brackets.rb +5 -5
  17. data/lib/despamilator/filter/trailing_number.rb +4 -4
  18. data/lib/despamilator/filter/unusual_characters.rb +5 -5
  19. data/lib/despamilator/filter/urls.rb +7 -9
  20. data/lib/despamilator/filter/very_long_domain_name.rb +27 -0
  21. data/lib/despamilator/filter/weird_punctuation.rb +44 -0
  22. data/lib/despamilator/subject.rb +30 -0
  23. data/lib/despamilator/subject/text.rb +32 -0
  24. data/lib/despamilator/version.rb +3 -0
  25. metadata +29 -75
  26. data/.rspec +0 -2
  27. data/.rvmrc +0 -1
  28. data/Gemfile +0 -12
  29. data/Gemfile.lock +0 -47
  30. data/Manifest.txt +0 -46
  31. data/PostInstall.txt +0 -1
  32. data/Rakefile +0 -39
  33. data/conf/unusual_characters.txt +0 -6674
  34. data/despamilator.gemspec +0 -38
  35. data/lib/despamilator/filter_base.rb +0 -82
  36. data/scripts/despamilator_score.rb +0 -25
  37. data/scripts/from_file.rb +0 -26
  38. data/spec/despamilator_spec.rb +0 -13
  39. data/spec/filter_base_spec.rb +0 -30
  40. data/spec/filters/gtubs_test_filter_spec.rb +0 -9
  41. data/spec/filters/html_tags_spec.rb +0 -129
  42. data/spec/filters/ip_address_url_spec.rb +0 -11
  43. data/spec/filters/long_words_spec.rb +0 -11
  44. data/spec/filters/naughty_words_spec.rb +0 -11
  45. data/spec/filters/numbers_and_words_spec.rb +0 -34
  46. data/spec/filters/script_tag_spec.rb +0 -22
  47. data/spec/filters/shouting_spec.rb +0 -45
  48. data/spec/filters/square_brackets_spec.rb +0 -11
  49. data/spec/filters/trailing_number_spec.rb +0 -10
  50. data/spec/filters/unusual_characters_spec.rb +0 -9
  51. data/spec/filters/urls_spec.rb +0 -11
  52. data/spec/helpers/corpus_helper.rb +0 -5
  53. data/spec/helpers/filter_helper.rb +0 -59
  54. data/spec/helpers/spec_helper.rb +0 -6
  55. data/tasks/test.rake +0 -6
data/History.txt CHANGED
@@ -63,3 +63,10 @@
63
63
 
64
64
  * Rejigged url and html tag scores.
65
65
 
66
+ === 2.0.1
67
+
68
+ * Minor bug fix in shouting filter.
69
+
70
+ === IN PROGRESS
71
+
72
+ * Deprecated the "matched_by" method. Replaced by "matches".
data/lib/despamilator.rb CHANGED
@@ -1,6 +1,11 @@
1
1
  $:.unshift(File.dirname(__FILE__)) unless $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
2
2
 
3
- require 'despamilator/filter'
3
+ Dir.glob(File.join(File.dirname(__FILE__), 'despamilator', 'filter', '*.rb')).each do |filter_file|
4
+ require filter_file
5
+ end
6
+
7
+ require 'despamilator/subject'
8
+ require 'ostruct'
4
9
 
5
10
  #== SYNOPSIS:
6
11
  #
@@ -14,25 +19,39 @@ require 'despamilator/filter'
14
19
  # dspam.matched_by #=> array of matching filters
15
20
 
16
21
  class Despamilator
17
- VERSION = "2.0.1"
18
22
 
19
23
  # Constructor. Takes the text you wish to parse and score.
20
24
 
21
25
  def initialize text
22
- @filters = Despamilator::Filter.new text
26
+ @subject = Despamilator::Subject.new text
27
+ run_filters @subject
23
28
  end
24
29
 
25
30
  # Returns the total score as a Float.
26
31
 
27
32
  def score
28
- @filters.score
33
+ @subject.score
34
+ end
35
+
36
+ def matched_by
37
+ warn 'Despamilator.matched_by is deprecated, please use Despamilator.matches by 2011-12-31.'
38
+
39
+ matches.map do |match|
40
+ filter = match[:filter]
41
+
42
+ OpenStruct.new(
43
+ :name => filter.name,
44
+ :description => filter.description,
45
+ :score => match[:score]
46
+ )
47
+ end
29
48
  end
30
49
 
31
- # Returns an array of filters that have matched and contributed to the score.
50
+ # Returns an array of scores and filters that have matched and contributed to the score.
32
51
  # Each element holds a :score and the :filter that matched (a child of the Despamilator::Filter class).
33
52
 
34
- def matched_by
35
- @filters.matches
53
+ def matches
54
+ @subject.matches
36
55
  end
37
56
 
38
57
  # Generic Test for Unsolicited Bulk Submissions. Similar to SpamAssassin's GTUBE.
@@ -41,4 +60,16 @@ class Despamilator
41
60
  def self.gtubs_test_string
42
61
  '89913b8a065b7092721fe995877e097681683af9d3ab767146d5d6fd050fc0bda7ab99f4232d94a1'
43
62
  end
63
+
64
+ private
65
+
66
+ def run_filters subject
67
+ filter_namespace = Object.const_get('DespamilatorFilter')
68
+
69
+ filter_namespace.constants.each do |filter_class|
70
+ filter = filter_namespace.const_get(filter_class).new
71
+ filter.parse(subject)
72
+ end
73
+ end
74
+
44
75
  end
@@ -1,36 +1,52 @@
1
1
  class Despamilator
2
- class Filter
3
- attr_accessor :matches, :score
4
2
 
5
- def initialize text
6
- @matches ||= []
7
- @score ||= 0
8
- run_filters text
9
- end
3
+ #This class is the base class of all the despamilator filters.
4
+ #
5
+ #== EXAMPLE:
6
+ #
7
+ #This example is to detect the letter "a". Put the code in
8
+ #lib/despamilator/filter/detect_letter_a.rb:
9
+ #
10
+ # require 'despamilator/filter'
11
+ #
12
+ # module DespamilatorFilter
13
+ #
14
+ # class DetectLetterA < Despamilator::Filter
15
+ #
16
+ # def name
17
+ # 'Detecting the letter A'
18
+ # end
19
+ #
20
+ # def description
21
+ # 'Detects the letter "a" in a string for no reason other than a demo'
22
+ # end
23
+ #
24
+ # def parse text
25
+ # if text.downcase.scan(/a/)
26
+ # # add 0.1 to the score of the text
27
+ # self.append_score = 0.1
28
+ # end
29
+ # end
30
+ # end
10
31
 
11
- private
32
+ class Filter
12
33
 
13
- def run_filters text
14
- filter_namespace = Object.const_get('DespamilatorFilter')
34
+ # The nice description of the filter. Usually no more than a sentence.
15
35
 
16
- filter_namespace.constants.each do |filter_class|
17
- execute_filter(filter_namespace.const_get(filter_class).new, text)
18
- end
36
+ def description
37
+ raise "No description defined for #{self.class}"
19
38
  end
20
39
 
21
- private
40
+ # This method parses some text. The score is assigned to the same instance.
22
41
 
23
- def execute_filter filter, text
24
- filter.parse text.dup
25
-
26
- if filter.matched?
27
- @matches.push(filter)
28
- @score += filter.score
29
- end
42
+ def parse text
43
+ raise "No parser defined for #{self.class}"
30
44
  end
31
45
 
32
- Dir.glob(File.join(File.dirname(__FILE__), 'filter', '*.rb')).each do |filter_file|
33
- require filter_file
46
+ # The one or two word name for the filter.
47
+
48
+ def name
49
+ raise "No name defined for #{self.class}"
34
50
  end
35
51
 
36
52
  end
@@ -1,8 +1,8 @@
1
- require 'despamilator/filter_base'
1
+ require 'despamilator/filter'
2
2
 
3
3
  module DespamilatorFilter
4
4
 
5
- class GtubsTestFilter < Despamilator::FilterBase
5
+ class GtubsTestFilter < Despamilator::Filter
6
6
 
7
7
  def name
8
8
  'GTubs Test Filter'
@@ -12,8 +12,8 @@ module DespamilatorFilter
12
12
  'Detects the special test string (Despamilator.gtubs_test_string) and assigns a big score.'
13
13
  end
14
14
 
15
- def parse text
16
- self.append_score = 100 if text == Despamilator.gtubs_test_string
15
+ def parse subject
16
+ subject.register_match!({:score => 100, :filter => self}) if subject.text == Despamilator.gtubs_test_string
17
17
  end
18
18
 
19
19
  end
@@ -1,15 +1,17 @@
1
- require 'despamilator/filter_base'
2
-
3
1
  module DespamilatorFilter
4
2
 
5
- class HtmlTags < Despamilator::FilterBase
3
+ class HtmlTags < Despamilator::Filter
6
4
 
7
- def parse text
8
- text.downcase!
5
+ def parse subject
6
+ text = subject.text.downcase
9
7
 
10
8
  html_tags.each do |tag|
11
- if text.match(/<\s*#{tag}\W/) || text.match(/<\n*#{tag}\W/) || text.match(/\W#{tag}\s*\//) || text.match(/\W#{tag}\n*\//)
12
- self.append_score = 0.6
9
+ opening_elements = text.count(/<\s*#{tag}\W/)
10
+ closing_elements = text.count(/\W#{tag}\s*\/>/)
11
+
12
+ if opening_elements > 0 or closing_elements > 0
13
+ safest_element_count = opening_elements > closing_elements ? opening_elements : closing_elements
14
+ subject.register_match!({:score => 0.6 * safest_element_count, :filter => self})
13
15
  end
14
16
  end
15
17
  end
@@ -1,8 +1,8 @@
1
- require 'despamilator/filter_base'
1
+ require 'despamilator/filter'
2
2
 
3
3
  module DespamilatorFilter
4
4
 
5
- class IPAddressURL < Despamilator::FilterBase
5
+ class IPAddressURL < Despamilator::Filter
6
6
 
7
7
  def name
8
8
  'IP Address URL'
@@ -12,8 +12,10 @@ module DespamilatorFilter
12
12
  'Detects IP address URLs'
13
13
  end
14
14
 
15
- def parse text
16
- self.append_score = 0.5 if text.downcase.scan(/http:\/\/\d+\.\d+\.\d+\.\d+/).length > 0
15
+ def parse subject
16
+ subject.register_match!({
17
+ :score => 0.5, :filter => self
18
+ }) if subject.text.downcase.count(/http:\/\/\d+\.\d+\.\d+\.\d+/) > 0
17
19
  end
18
20
 
19
21
  end
@@ -1,8 +1,8 @@
1
- require 'despamilator/filter_base'
1
+ require 'despamilator/filter'
2
2
 
3
3
  module DespamilatorFilter
4
4
 
5
- class LongWords < Despamilator::FilterBase
5
+ class LongWords < Despamilator::Filter
6
6
 
7
7
  def name
8
8
  'Long Words'
@@ -12,9 +12,11 @@ module DespamilatorFilter
12
12
  'Detects long and unbroken strings'
13
13
  end
14
14
 
15
- def parse text
16
- text.split(/\W+/).each do |word|
17
- self.append_score = 0.1 if word.length > 20
15
+ def parse subject
16
+ subject.text.without_uris.words.each do |word|
17
+ subject.register_match!({
18
+ :score => 0.1, :filter => self
19
+ }) if word.length > 20
18
20
  end
19
21
  end
20
22
 
@@ -0,0 +1,21 @@
1
+ module DespamilatorFilter
2
+
3
+ class MixedCase < Despamilator::Filter
4
+ def name
5
+ 'Mixed Case String'
6
+ end
7
+
8
+ def description
9
+ 'Detects mixed case strings.'
10
+ end
11
+
12
+ def parse subject
13
+ text = subject.text.without_uris
14
+ count = text.remove_and_count!(/[a-z][A-Z]/)
15
+ count += text.remove_and_count!(/[a-z][A-Z][a-z]/)
16
+ subject.register_match!({:score => 0.1 * count, :filter => self}) if count > 0
17
+ end
18
+
19
+ end
20
+
21
+ end
@@ -1,8 +1,8 @@
1
- require 'despamilator/filter_base'
1
+ require 'despamilator/filter'
2
2
 
3
3
  module DespamilatorFilter
4
4
 
5
- class NaughtyWords < Despamilator::FilterBase
5
+ class NaughtyWords < Despamilator::Filter
6
6
 
7
7
  def name
8
8
  'Naughty Words'
@@ -12,11 +12,11 @@ module DespamilatorFilter
12
12
  'Detects cheeky words'
13
13
  end
14
14
 
15
- def parse text
16
- text.downcase!
15
+ def parse subject
16
+ text = subject.text.downcase
17
17
 
18
18
  naughty_words.each do |word|
19
- self.append_score = 0.1 if text =~ /\b#{word}s?\b/
19
+ subject.register_match!({:score => 0.1, :filter => self}) if text =~ /\b#{word}s?\b/
20
20
  end
21
21
  end
22
22
 
@@ -1,17 +1,11 @@
1
- require 'despamilator/filter_base'
1
+ require 'despamilator/filter'
2
2
 
3
3
  module DespamilatorFilter
4
4
 
5
- class NumbersAndWords < Despamilator::FilterBase
5
+ class NumbersAndWords < Despamilator::Filter
6
6
 
7
- def parse text
8
- text.downcase!
9
-
10
- # strip out "good numbers"
11
- text.gsub!(/h[1-6]/, '')
12
- text.gsub!(/(^|\b)\d+($|\b)/, '')
13
- text.gsub!(/(^|\b)\d+(,|\.)\d+($|\b)/, '')
14
- text.gsub!(/(^|\b)\d+(st|nd|rd|th)($|\b)/, '')
7
+ def parse subject
8
+ text = tidy_text(subject)
15
9
 
16
10
  [
17
11
  /\w\d+/,
@@ -25,7 +19,7 @@ module DespamilatorFilter
25
19
  matches.each do |to_remove|
26
20
  to_remove = to_remove.to_s
27
21
  text.sub!(to_remove, '') unless to_remove.empty?
28
- self.append_score = 0.1
22
+ subject.register_match!({:score => 0.1, :filter => self})
29
23
  end
30
24
  end
31
25
  end
@@ -38,6 +32,20 @@ module DespamilatorFilter
38
32
  'Detects unusual number/word combinations'
39
33
  end
40
34
 
35
+ private
36
+
37
+ def tidy_text subject
38
+ text = subject.text.without_uris
39
+ text.downcase!
40
+
41
+ # strip out "good numbers"
42
+ text.gsub!(/h[1-6]/, '')
43
+ text.gsub!(/(^|\b)\d+($|\b)/, '')
44
+ text.gsub!(/(^|\b)\d+(,|\.)\d+($|\b)/, '')
45
+ text.gsub!(/(^|\b)\d+(st|nd|rd|th)($|\b)/, '')
46
+
47
+ text
48
+ end
41
49
  end
42
50
 
43
51
  end
@@ -0,0 +1,41 @@
1
+ module DespamilatorFilter
2
+
3
+ class ObfuscatedURLs < Despamilator::Filter
4
+ def name
5
+ 'Obfuscated URLs'
6
+ end
7
+
8
+ def description
9
+ 'Finds lame attempts at obfuscating urls.'
10
+ end
11
+
12
+ def parse subject
13
+ text = subject.text.without_uris.downcase
14
+ count = find_space_separated_parts text
15
+ count += find_space_separated_characters text
16
+
17
+ # weird maths below is due to some issue with ruby 1.9.2 multiplying floats by 3 (?!)
18
+ subject.register_match!({score: (4.0 * count) / 10, filter: self}) if count > 0
19
+ end
20
+
21
+ private
22
+
23
+ def find_space_separated_parts text
24
+ text.count(/www\s+\w+\s+com/)
25
+ end
26
+
27
+ def find_space_separated_characters text
28
+ count = 0
29
+
30
+ text.split(/[a-z][a-z]/).each do |candidate|
31
+ candidate.strip!
32
+ candidate.gsub!(/\s+/, '')
33
+ count += 1 if candidate =~ /\w{5,}\.\w{2,3}/
34
+ end
35
+
36
+ count
37
+ end
38
+
39
+ end
40
+
41
+ end
@@ -0,0 +1,19 @@
1
+ module DespamilatorFilter
2
+
3
+ class Prices < Despamilator::Filter
4
+ def name
5
+ 'Prices'
6
+ end
7
+
8
+ def description
9
+ 'Detects prices in text.'
10
+ end
11
+
12
+ def parse subject
13
+ price_count = subject.text.count(/\$\s*\d+/)
14
+ subject.register_match!({:score => 0.075 * price_count, :filter => self}) if price_count > 0
15
+ end
16
+
17
+ end
18
+
19
+ end
@@ -1,11 +1,11 @@
1
- require 'despamilator/filter_base'
1
+ require 'despamilator/filter'
2
2
 
3
3
  module DespamilatorFilter
4
4
 
5
- class ScriptTag < Despamilator::FilterBase
5
+ class ScriptTag < Despamilator::Filter
6
6
 
7
- def parse text
8
- self.append_score = 1 if text.downcase.match(/<\/?script(>|\s+|\n|\r)/)
7
+ def parse subject
8
+ subject.register_match!({:score => 1, :filter => self}) if subject.text.downcase.match(/<\/?script(>|\s+|\n|\r)/)
9
9
  end
10
10
 
11
11
  def name
@@ -1,8 +1,8 @@
1
- require 'despamilator/filter_base'
1
+ require 'despamilator/filter'
2
2
 
3
3
  module DespamilatorFilter
4
4
 
5
- class Shouting < Despamilator::FilterBase
5
+ class Shouting < Despamilator::Filter
6
6
 
7
7
  def name
8
8
  'Shouting'
@@ -12,17 +12,20 @@ module DespamilatorFilter
12
12
  'Detects and scores shouting (all caps)'
13
13
  end
14
14
 
15
- def parse text
15
+ def parse subject
16
16
  # strip HTML
17
- text.gsub!(/<\/?[^>]*>/, "")
17
+ text = subject.text.gsub(/<\/?[^>]*>/, "")
18
18
 
19
19
  return if text.length < 20
20
20
 
21
21
  uppercased = text.scan(/[A-Z][A-Z]+/).join.length
22
- lowercased = text.scan(/[a-z]/).length
22
+ lowercased = text.count(/[a-z]/)
23
23
 
24
24
  if uppercased > 0
25
- self.append_score = (uppercased.to_f / (uppercased + lowercased)) * 0.5
25
+ subject.register_match!({
26
+ :score => (uppercased.to_f / (uppercased + lowercased)) * 0.5,
27
+ :filter => self
28
+ })
26
29
  end
27
30
  end
28
31