despamilator 2.0.1 → 2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. data/History.txt +7 -0
  2. data/lib/despamilator.rb +38 -7
  3. data/lib/despamilator/filter.rb +39 -23
  4. data/lib/despamilator/filter/gtubs_test_filter.rb +4 -4
  5. data/lib/despamilator/filter/html_tags.rb +9 -7
  6. data/lib/despamilator/filter/ip_address_url.rb +6 -4
  7. data/lib/despamilator/filter/long_words.rb +7 -5
  8. data/lib/despamilator/filter/mixed_case.rb +21 -0
  9. data/lib/despamilator/filter/naughty_words.rb +5 -5
  10. data/lib/despamilator/filter/numbers_and_words.rb +19 -11
  11. data/lib/despamilator/filter/obfuscated_urls.rb +41 -0
  12. data/lib/despamilator/filter/prices.rb +19 -0
  13. data/lib/despamilator/filter/script_tag.rb +4 -4
  14. data/lib/despamilator/filter/shouting.rb +9 -6
  15. data/lib/despamilator/filter/spammy_tlds.rb +22 -0
  16. data/lib/despamilator/filter/square_brackets.rb +5 -5
  17. data/lib/despamilator/filter/trailing_number.rb +4 -4
  18. data/lib/despamilator/filter/unusual_characters.rb +5 -5
  19. data/lib/despamilator/filter/urls.rb +7 -9
  20. data/lib/despamilator/filter/very_long_domain_name.rb +27 -0
  21. data/lib/despamilator/filter/weird_punctuation.rb +44 -0
  22. data/lib/despamilator/subject.rb +30 -0
  23. data/lib/despamilator/subject/text.rb +32 -0
  24. data/lib/despamilator/version.rb +3 -0
  25. metadata +29 -75
  26. data/.rspec +0 -2
  27. data/.rvmrc +0 -1
  28. data/Gemfile +0 -12
  29. data/Gemfile.lock +0 -47
  30. data/Manifest.txt +0 -46
  31. data/PostInstall.txt +0 -1
  32. data/Rakefile +0 -39
  33. data/conf/unusual_characters.txt +0 -6674
  34. data/despamilator.gemspec +0 -38
  35. data/lib/despamilator/filter_base.rb +0 -82
  36. data/scripts/despamilator_score.rb +0 -25
  37. data/scripts/from_file.rb +0 -26
  38. data/spec/despamilator_spec.rb +0 -13
  39. data/spec/filter_base_spec.rb +0 -30
  40. data/spec/filters/gtubs_test_filter_spec.rb +0 -9
  41. data/spec/filters/html_tags_spec.rb +0 -129
  42. data/spec/filters/ip_address_url_spec.rb +0 -11
  43. data/spec/filters/long_words_spec.rb +0 -11
  44. data/spec/filters/naughty_words_spec.rb +0 -11
  45. data/spec/filters/numbers_and_words_spec.rb +0 -34
  46. data/spec/filters/script_tag_spec.rb +0 -22
  47. data/spec/filters/shouting_spec.rb +0 -45
  48. data/spec/filters/square_brackets_spec.rb +0 -11
  49. data/spec/filters/trailing_number_spec.rb +0 -10
  50. data/spec/filters/unusual_characters_spec.rb +0 -9
  51. data/spec/filters/urls_spec.rb +0 -11
  52. data/spec/helpers/corpus_helper.rb +0 -5
  53. data/spec/helpers/filter_helper.rb +0 -59
  54. data/spec/helpers/spec_helper.rb +0 -6
  55. data/tasks/test.rake +0 -6
data/despamilator.gemspec DELETED
@@ -1,38 +0,0 @@
1
- # -*- encoding: utf-8 -*-
2
-
3
- Gem::Specification.new do |s|
4
- s.name = %q{despamilator}
5
- s.version = "2.0.1"
6
-
7
- s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
- s.authors = ["Stephen Hardisty"]
9
- s.date = %q{2011-08-11}
10
- s.description = %q{Despamilator is a plugin based spam detector designed for use on your web forms borne out of two annoyances:
11
- Spam being submitted in my web forms and CAPTCHAS being intrusive. Despamilator will apply
12
- some commonly used heuristics from the world of anti-spam to help you decide whether your users are human or machine.}
13
- s.email = ["moowahaha@hotmail.com"]
14
- s.extra_rdoc_files = ["History.txt", "Manifest.txt", "PostInstall.txt", "conf/unusual_characters.txt"]
15
- s.files = [".rspec", ".rvmrc", "Gemfile", "Gemfile.lock", "History.txt", "Manifest.txt", "PostInstall.txt", "README.rdoc", "Rakefile", "conf/unusual_characters.txt", "despamilator.gemspec", "lib/despamilator.rb", "lib/despamilator/filter.rb", "lib/despamilator/filter/gtubs_test_filter.rb", "lib/despamilator/filter/html_tags.rb", "lib/despamilator/filter/ip_address_url.rb", "lib/despamilator/filter/long_words.rb", "lib/despamilator/filter/naughty_words.rb", "lib/despamilator/filter/numbers_and_words.rb", "lib/despamilator/filter/script_tag.rb", "lib/despamilator/filter/shouting.rb", "lib/despamilator/filter/square_brackets.rb", "lib/despamilator/filter/trailing_number.rb", "lib/despamilator/filter/unusual_characters.rb", "lib/despamilator/filter/urls.rb", "lib/despamilator/filter_base.rb", "scripts/despamilator_score.rb", "scripts/from_file.rb", "spec/despamilator_spec.rb", "spec/filter_base_spec.rb", "spec/filters/gtubs_test_filter_spec.rb", "spec/filters/html_tags_spec.rb", "spec/filters/ip_address_url_spec.rb", "spec/filters/long_words_spec.rb", "spec/filters/naughty_words_spec.rb", "spec/filters/numbers_and_words_spec.rb", "spec/filters/script_tag_spec.rb", "spec/filters/shouting_spec.rb", "spec/filters/square_brackets_spec.rb", "spec/filters/trailing_number_spec.rb", "spec/filters/unusual_characters_spec.rb", "spec/filters/urls_spec.rb", "spec/helpers/corpus_helper.rb", "spec/helpers/filter_helper.rb", "spec/helpers/spec_helper.rb", "tasks/test.rake"]
16
- s.homepage = %q{http://github.com/moowahaha/despamilator}
17
- s.post_install_message = %q{PostInstall.txt}
18
- s.rdoc_options = ["--main", "README.rdoc"]
19
- s.require_paths = ["lib"]
20
- s.rubyforge_project = %q{despamilator}
21
- s.rubygems_version = %q{1.5.2}
22
- s.summary = %q{Despamilator is a plugin based spam detector designed for use on your web forms borne out of two annoyances: Spam being submitted in my web forms and CAPTCHAS being intrusive}
23
-
24
- if s.respond_to? :specification_version then
25
- s.specification_version = 3
26
-
27
- if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
28
- s.add_development_dependency(%q<rubyforge>, [">= 2.0.4"])
29
- s.add_development_dependency(%q<hoe>, [">= 2.7.0"])
30
- else
31
- s.add_dependency(%q<rubyforge>, [">= 2.0.4"])
32
- s.add_dependency(%q<hoe>, [">= 2.7.0"])
33
- end
34
- else
35
- s.add_dependency(%q<rubyforge>, [">= 2.0.4"])
36
- s.add_dependency(%q<hoe>, [">= 2.7.0"])
37
- end
38
- end
data/lib/despamilator/filter_base.rb DELETED
@@ -1,82 +0,0 @@
1
- class Despamilator
2
-
3
- #This class is the base class of all the despamilator filters.
4
- #
5
- #== EXAMPLE:
6
- #
7
- #This example is to detect the letter "a". Put the code in
8
- #lib/despamilator/filter/detect_letter_a.rb:
9
- #
10
- # require 'despamilator/filter_base'
11
- #
12
- # module DespamilatorFilter
13
- #
14
- # class DetectLetterA < Despamilator::FilterBase
15
- #
16
- # def name
17
- # 'Detecting the letter A'
18
- # end
19
- #
20
- # def description
21
- # 'Detects the letter "a" in a string for no reason other than a demo'
22
- # end
23
- #
24
- # def parse text
25
- # if text.downcase.scan(/a/)
26
- # # add 0.1 to the score of the text
27
- # self.append_score = 0.1
28
- # end
29
- # end
30
- # end
31
-
32
- class FilterBase
33
- attr_accessor :text, :score, :matches
34
-
35
- # Constructor for the class.
36
-
37
- def initialize
38
- @matches = 0
39
- @score = 0
40
- @matched = false
41
- end
42
-
43
- # Returns the score the filter instance has calculated.
44
-
45
- def score
46
- # workaround for a bug in ruby 1.9.2's floats
47
- sprintf("%.3f", @score).to_f
48
- end
49
-
50
- # The nice description of the filter. Usually no more than a sentence.
51
-
52
- def description
53
- raise "No description defined for #{self.class}"
54
- end
55
-
56
- # This method parses some text. The score is assigned to the same instance.
57
-
58
- def parse text
59
- raise "No parse defined for #{self.class}"
60
- end
61
-
62
- # The one or two word name for the filter.
63
-
64
- def name
65
- raise "No name defined for #{self.class}"
66
- end
67
-
68
- # Boolean. Whether or not the filter matched anything.
69
-
70
- def matched?
71
- @score > 0
72
- end
73
-
74
- protected
75
-
76
- def append_score= new_score
77
- @matches += 1
78
- @score += new_score
79
- end
80
-
81
- end
82
- end
data/scripts/despamilator_score.rb DELETED
@@ -1,25 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require File.join(File.dirname(__FILE__), '..', 'lib', 'despamilator')
4
- require 'zlib'
5
-
6
- file = ARGV[0] || raise("Usage: despamilator_spec.rb [filename]")
7
-
8
- text = file =~ /\.gz$/i ? Zlib::GzipReader.open(file).read : File.open(file).read
9
-
10
- puts "Testing:"
11
- puts "========================"
12
- puts text
13
- puts "========================"
14
- puts "\n"
15
-
16
- dspam = Despamilator.new(text)
17
-
18
- puts "Total Score: #{dspam.score}\n\n"
19
-
20
- puts "Matched by..." unless dspam.matched_by.empty?
21
- dspam.matched_by.each do |match|
22
- puts "\tFilter: #{match.name}"
23
- puts "\tScore: #{match.score}"
24
- puts "\n"
25
- end
data/scripts/from_file.rb DELETED
@@ -1,26 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require 'cgi'
4
-
5
- slots = {};
6
-
7
- 1.upto(10000) do |i|
8
- slots[i] = true
9
- end
10
-
11
- dir = './spec/spam_corpus/'
12
-
13
- Dir[dir + '*.gz'].each do |file|
14
- slots.delete(file.scan(/\d+/).first.to_i)
15
- end
16
-
17
- slots = slots.keys.sort
18
-
19
- File.open(ARGV[0] || raise).each do |line|
20
- txt = dir + "#{slots.shift}.txt"
21
- File.open(txt, 'w') do |fh|
22
- fh.puts CGI.unescapeHTML(line)
23
- end
24
-
25
- `gzip #{txt}`
26
- end
data/spec/despamilator_spec.rb DELETED
@@ -1,13 +0,0 @@
1
- describe Despamilator do
2
- before :each do
3
- @dspam = Despamilator.new('this text is absolutely fine')
4
- end
5
-
6
- it "should return a zero score for fine text" do
7
- @dspam.score.should == 0
8
- end
9
-
10
- it "should return no matching filter for fine text" do
11
- @dspam.matched_by.should be_empty
12
- end
13
- end
data/spec/filter_base_spec.rb DELETED
@@ -1,30 +0,0 @@
1
- require 'despamilator/filter_base'
2
-
3
- class UnimplementedFilter < Despamilator::FilterBase
4
- end
5
-
6
- describe Despamilator::FilterBase do
7
- describe "abstract method" do
8
-
9
- before do
10
- @filter = UnimplementedFilter.new
11
- end
12
-
13
- [
14
- ['name', 'No name defined for UnimplementedFilter'],
15
- ['description', 'No description defined for UnimplementedFilter'],
16
-
17
- ].each do |method, exception|
18
-
19
- it "should throw an error when the child class has not implemented a '#{method}' method" do
20
- -> {@filter.send(method)}.should raise_error(exception)
21
- end
22
-
23
- end
24
-
25
- it "should throw an error when the child class has not implemented a 'parse' method" do
26
- -> {@filter.parse('abc')}.should raise_error('No parse defined for UnimplementedFilter')
27
- end
28
-
29
- end
30
- end
data/spec/filters/gtubs_test_filter_spec.rb DELETED
@@ -1,9 +0,0 @@
1
- describe DespamilatorFilter::GtubsTestFilter do
2
- the_name_should_be 'GTubs Test Filter'
3
- the_description_should_be 'Detects the special test string (Despamilator.gtubs_test_string) and assigns a big score.'
4
-
5
- despamilator_should_apply_the_filter_for(Despamilator.gtubs_test_string)
6
-
7
- a_single_match_of(Despamilator.gtubs_test_string, should_score: 100)
8
- a_multiple_match_of(Despamilator.gtubs_test_string, should_score: [100, 1.times])
9
- end
data/spec/filters/html_tags_spec.rb DELETED
@@ -1,129 +0,0 @@
1
- describe DespamilatorFilter::HtmlTags do
2
-
3
- the_name_should_be 'HTML tags'
4
- the_description_should_be 'Detects HTML tags in text'
5
-
6
- despamilator_should_apply_the_filter_for('<xmp>')
7
-
8
- a_single_match_of('<xmp>', should_score: 0.6)
9
- a_multiple_match_of('<h1></h1> <h2></h2>', should_score: [1.2, 2.times])
10
-
11
- [
12
- '!--',
13
- '!DOCTYPE',
14
- 'a',
15
- 'abbr',
16
- 'acronym',
17
- 'address',
18
- 'applet',
19
- 'area',
20
- 'b',
21
- 'base',
22
- 'basefont',
23
- 'bdo',
24
- 'big',
25
- 'blockquote',
26
- 'body',
27
- 'br',
28
- 'button',
29
- 'caption',
30
- 'center',
31
- 'cite',
32
- 'code',
33
- 'col',
34
- 'colgroup',
35
- 'dd',
36
- 'del',
37
- 'dfn',
38
- 'dir',
39
- 'div',
40
- 'dl',
41
- 'dt',
42
- 'em',
43
- 'fieldset',
44
- 'font',
45
- 'form',
46
- 'frame',
47
- 'frameset',
48
- 'h1',
49
- 'h2',
50
- 'h3',
51
- 'h4',
52
- 'h5',
53
- 'h6',
54
- 'head',
55
- 'hr',
56
- 'html',
57
- 'i',
58
- 'iframe',
59
- 'img',
60
- 'input',
61
- 'ins',
62
- 'isindex',
63
- 'kbd',
64
- 'label',
65
- 'legend',
66
- 'li',
67
- 'link',
68
- 'map',
69
- 'menu',
70
- 'meta',
71
- 'noframes',
72
- 'noscript',
73
- 'object',
74
- 'ol',
75
- 'optgroup',
76
- 'option',
77
- 'p',
78
- 'param',
79
- 'pre',
80
- 'q',
81
- 's',
82
- 'samp',
83
- 'select',
84
- 'small',
85
- 'span',
86
- 'strike',
87
- 'strong',
88
- 'style',
89
- 'sub',
90
- 'sup',
91
- 'table',
92
- 'tbody',
93
- 'td',
94
- 'textarea',
95
- 'tfoot',
96
- 'th',
97
- 'thead',
98
- 'title',
99
- 'tr',
100
- 'tt',
101
- 'u',
102
- 'ul',
103
- 'var',
104
- 'xmp'
105
- ].each do |script_tag|
106
- [script_tag.upcase, script_tag.downcase].each do |script_tag|
107
- [
108
- "<#{script_tag}>",
109
- "<#{script_tag}/>",
110
- "< #{script_tag} >",
111
- "<#{script_tag} />",
112
- "<\n#{script_tag}\n/>",
113
- "<\n#{script_tag} >",
114
- "<#{script_tag}\n/>",
115
- "<\r#{script_tag}\r/>"
116
- ].each do |tag|
117
- it "should detect '#{tag}'" do
118
- dspam = DespamilatorFilter::HtmlTags.new
119
- dspam.parse(tag)
120
- dspam.score.should == 0.6
121
- end
122
-
123
- end
124
-
125
- end
126
-
127
- end
128
-
129
- end
data/spec/filters/ip_address_url_spec.rb DELETED
@@ -1,11 +0,0 @@
1
- describe DespamilatorFilter::IPAddressURL do
2
-
3
- the_name_should_be 'IP Address URL'
4
- the_description_should_be 'Detects IP address URLs'
5
-
6
- despamilator_should_apply_the_filter_for('http://12.34.56.78/')
7
-
8
- a_single_match_of('http://12.34.56.78/', should_score: 0.5)
9
- a_multiple_match_of('http://12.34.56.78/ http://98.76.54.32/', should_score: [0.5, 1.times])
10
-
11
- end
data/spec/filters/long_words_spec.rb DELETED
@@ -1,11 +0,0 @@
1
- describe DespamilatorFilter::LongWords do
2
-
3
- the_name_should_be 'Long Words'
4
- the_description_should_be 'Detects long and unbroken strings'
5
-
6
- despamilator_should_apply_the_filter_for('honorificabilitudinitatibus')
7
-
8
- a_single_match_of('honorificabilitudinitatibus', should_score: 0.1)
9
- a_multiple_match_of('honorificabilitudinitatibus antidisestablishmentarianism', should_score: [0.2, 2.times])
10
-
11
- end
data/spec/filters/naughty_words_spec.rb DELETED
@@ -1,11 +0,0 @@
1
- describe DespamilatorFilter::NaughtyWords do
2
-
3
- the_name_should_be 'Naughty Words'
4
- the_description_should_be 'Detects cheeky words'
5
-
6
- despamilator_should_apply_the_filter_for('bondage')
7
-
8
- a_single_match_of('bondage', should_score: 0.1)
9
- a_multiple_match_of('viagra penis', should_score: [0.2, 2.times])
10
-
11
- end
data/spec/filters/numbers_and_words_spec.rb DELETED
@@ -1,34 +0,0 @@
1
- describe DespamilatorFilter::NumbersAndWords do
2
-
3
- the_name_should_be 'Numbers next to words'
4
- the_description_should_be 'Detects unusual number/word combinations'
5
-
6
- despamilator_should_apply_the_filter_for('X5T')
7
-
8
- a_single_match_of('X5T', should_score: 0.1)
9
- a_multiple_match_of('4g6hk', should_score: [0.2, 2.times])
10
-
11
- describe 'exceptions' do
12
- before :all do
13
- @filter = DespamilatorFilter::NumbersAndWords.new
14
- end
15
-
16
- [1, 4, 10, 100000, '1,000,000', '1st', '2nd', '3rd', '4th', '5th', '6th', '10th', '122nd'].each do |number|
17
- it "should return a blank for a #{number}" do
18
- @filter.parse(number.to_s)
19
- @filter.score.should == 0
20
- end
21
- end
22
-
23
- [1, 2, 3, 4, 5, 6].each do |tag_no|
24
- header_tag = "h#{tag_no}"
25
-
26
- it "should ignore html header tag #{header_tag}" do
27
- @filter.parse(header_tag)
28
- @filter.score.should == 0
29
- end
30
- end
31
-
32
- end
33
-
34
- end
data/spec/filters/script_tag_spec.rb DELETED
@@ -1,22 +0,0 @@
1
- describe DespamilatorFilter::ScriptTag do
2
-
3
- the_name_should_be 'Script tag'
4
- the_description_should_be 'Searches for variations for the HTML script tag'
5
-
6
- despamilator_should_apply_the_filter_for('<script>')
7
-
8
- a_single_match_of('<script>', should_score: 1)
9
- a_multiple_match_of('<script></script> <script></script>', should_score: [1, 1.times])
10
-
11
- describe "detecting various script tags" do
12
- ['<script type="whatever">', '<script></script>', '</script>', '<script>', "<script\n>"].each do |script_tag|
13
- [script_tag.upcase, script_tag.downcase].each do |script_tag|
14
- it "should detect '#{script_tag}' of a script tag" do
15
- dspam = Despamilator.new(script_tag)
16
- dspam.score.should == 1
17
- end
18
- end
19
- end
20
- end
21
-
22
- end