despamilator 2.0.1 → 2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. data/History.txt +7 -0
  2. data/lib/despamilator.rb +38 -7
  3. data/lib/despamilator/filter.rb +39 -23
  4. data/lib/despamilator/filter/gtubs_test_filter.rb +4 -4
  5. data/lib/despamilator/filter/html_tags.rb +9 -7
  6. data/lib/despamilator/filter/ip_address_url.rb +6 -4
  7. data/lib/despamilator/filter/long_words.rb +7 -5
  8. data/lib/despamilator/filter/mixed_case.rb +21 -0
  9. data/lib/despamilator/filter/naughty_words.rb +5 -5
  10. data/lib/despamilator/filter/numbers_and_words.rb +19 -11
  11. data/lib/despamilator/filter/obfuscated_urls.rb +41 -0
  12. data/lib/despamilator/filter/prices.rb +19 -0
  13. data/lib/despamilator/filter/script_tag.rb +4 -4
  14. data/lib/despamilator/filter/shouting.rb +9 -6
  15. data/lib/despamilator/filter/spammy_tlds.rb +22 -0
  16. data/lib/despamilator/filter/square_brackets.rb +5 -5
  17. data/lib/despamilator/filter/trailing_number.rb +4 -4
  18. data/lib/despamilator/filter/unusual_characters.rb +5 -5
  19. data/lib/despamilator/filter/urls.rb +7 -9
  20. data/lib/despamilator/filter/very_long_domain_name.rb +27 -0
  21. data/lib/despamilator/filter/weird_punctuation.rb +44 -0
  22. data/lib/despamilator/subject.rb +30 -0
  23. data/lib/despamilator/subject/text.rb +32 -0
  24. data/lib/despamilator/version.rb +3 -0
  25. metadata +29 -75
  26. data/.rspec +0 -2
  27. data/.rvmrc +0 -1
  28. data/Gemfile +0 -12
  29. data/Gemfile.lock +0 -47
  30. data/Manifest.txt +0 -46
  31. data/PostInstall.txt +0 -1
  32. data/Rakefile +0 -39
  33. data/conf/unusual_characters.txt +0 -6674
  34. data/despamilator.gemspec +0 -38
  35. data/lib/despamilator/filter_base.rb +0 -82
  36. data/scripts/despamilator_score.rb +0 -25
  37. data/scripts/from_file.rb +0 -26
  38. data/spec/despamilator_spec.rb +0 -13
  39. data/spec/filter_base_spec.rb +0 -30
  40. data/spec/filters/gtubs_test_filter_spec.rb +0 -9
  41. data/spec/filters/html_tags_spec.rb +0 -129
  42. data/spec/filters/ip_address_url_spec.rb +0 -11
  43. data/spec/filters/long_words_spec.rb +0 -11
  44. data/spec/filters/naughty_words_spec.rb +0 -11
  45. data/spec/filters/numbers_and_words_spec.rb +0 -34
  46. data/spec/filters/script_tag_spec.rb +0 -22
  47. data/spec/filters/shouting_spec.rb +0 -45
  48. data/spec/filters/square_brackets_spec.rb +0 -11
  49. data/spec/filters/trailing_number_spec.rb +0 -10
  50. data/spec/filters/unusual_characters_spec.rb +0 -9
  51. data/spec/filters/urls_spec.rb +0 -11
  52. data/spec/helpers/corpus_helper.rb +0 -5
  53. data/spec/helpers/filter_helper.rb +0 -59
  54. data/spec/helpers/spec_helper.rb +0 -6
  55. data/tasks/test.rake +0 -6
data/despamilator.gemspec DELETED
@@ -1,38 +0,0 @@
1
- # -*- encoding: utf-8 -*-
2
-
3
- Gem::Specification.new do |s|
4
- s.name = %q{despamilator}
5
- s.version = "2.0.1"
6
-
7
- s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
- s.authors = ["Stephen Hardisty"]
9
- s.date = %q{2011-08-11}
10
- s.description = %q{Despamilator is a plugin based spam detector designed for use on your web forms borne out of two annoyances:
11
- Spam being submitted in my web forms and CAPTCHAS being intrusive. Despamilator will apply
12
- some commonly used heuristics from the world of anti-spam to help you decide whether your users are human or machine.}
13
- s.email = ["moowahaha@hotmail.com"]
14
- s.extra_rdoc_files = ["History.txt", "Manifest.txt", "PostInstall.txt", "conf/unusual_characters.txt"]
15
- s.files = [".rspec", ".rvmrc", "Gemfile", "Gemfile.lock", "History.txt", "Manifest.txt", "PostInstall.txt", "README.rdoc", "Rakefile", "conf/unusual_characters.txt", "despamilator.gemspec", "lib/despamilator.rb", "lib/despamilator/filter.rb", "lib/despamilator/filter/gtubs_test_filter.rb", "lib/despamilator/filter/html_tags.rb", "lib/despamilator/filter/ip_address_url.rb", "lib/despamilator/filter/long_words.rb", "lib/despamilator/filter/naughty_words.rb", "lib/despamilator/filter/numbers_and_words.rb", "lib/despamilator/filter/script_tag.rb", "lib/despamilator/filter/shouting.rb", "lib/despamilator/filter/square_brackets.rb", "lib/despamilator/filter/trailing_number.rb", "lib/despamilator/filter/unusual_characters.rb", "lib/despamilator/filter/urls.rb", "lib/despamilator/filter_base.rb", "scripts/despamilator_score.rb", "scripts/from_file.rb", "spec/despamilator_spec.rb", "spec/filter_base_spec.rb", "spec/filters/gtubs_test_filter_spec.rb", "spec/filters/html_tags_spec.rb", "spec/filters/ip_address_url_spec.rb", "spec/filters/long_words_spec.rb", "spec/filters/naughty_words_spec.rb", "spec/filters/numbers_and_words_spec.rb", "spec/filters/script_tag_spec.rb", "spec/filters/shouting_spec.rb", "spec/filters/square_brackets_spec.rb", "spec/filters/trailing_number_spec.rb", "spec/filters/unusual_characters_spec.rb", "spec/filters/urls_spec.rb", "spec/helpers/corpus_helper.rb", "spec/helpers/filter_helper.rb", "spec/helpers/spec_helper.rb", "tasks/test.rake"]
16
- s.homepage = %q{http://github.com/moowahaha/despamilator}
17
- s.post_install_message = %q{PostInstall.txt}
18
- s.rdoc_options = ["--main", "README.rdoc"]
19
- s.require_paths = ["lib"]
20
- s.rubyforge_project = %q{despamilator}
21
- s.rubygems_version = %q{1.5.2}
22
- s.summary = %q{Despamilator is a plugin based spam detector designed for use on your web forms borne out of two annoyances: Spam being submitted in my web forms and CAPTCHAS being intrusive}
23
-
24
- if s.respond_to? :specification_version then
25
- s.specification_version = 3
26
-
27
- if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
28
- s.add_development_dependency(%q<rubyforge>, [">= 2.0.4"])
29
- s.add_development_dependency(%q<hoe>, [">= 2.7.0"])
30
- else
31
- s.add_dependency(%q<rubyforge>, [">= 2.0.4"])
32
- s.add_dependency(%q<hoe>, [">= 2.7.0"])
33
- end
34
- else
35
- s.add_dependency(%q<rubyforge>, [">= 2.0.4"])
36
- s.add_dependency(%q<hoe>, [">= 2.7.0"])
37
- end
38
- end
data/lib/despamilator/filter_base.rb DELETED
@@ -1,82 +0,0 @@
1
- class Despamilator
2
-
3
- #This class is the base class of all the despamilator filters.
4
- #
5
- #== EXAMPLE:
6
- #
7
- #This example is to detect the letter "a". Put the code in
8
- #lib/despamilator/filter/detect_letter_a.rb:
9
- #
10
- # require 'despamilator/filter_base'
11
- #
12
- # module DespamilatorFilter
13
- #
14
- # class DetectLetterA < Despamilator::FilterBase
15
- #
16
- # def name
17
- # 'Detecting the letter A'
18
- # end
19
- #
20
- # def description
21
- # 'Detects the letter "a" in a string for no reason other than a demo'
22
- # end
23
- #
24
- # def parse text
25
- # if text.downcase.scan(/a/)
26
- # # add 0.1 to the score of the text
27
- # self.append_score = 0.1
28
- # end
29
- # end
30
- # end
31
-
32
- class FilterBase
33
- attr_accessor :text, :score, :matches
34
-
35
- # Constructor for the class.
36
-
37
- def initialize
38
- @matches = 0
39
- @score = 0
40
- @matched = false
41
- end
42
-
43
- # Returns the score the filter instance has calculated.
44
-
45
- def score
46
- # workaround for a bug in ruby 1.9.2's floats
47
- sprintf("%.3f", @score).to_f
48
- end
49
-
50
- # The nice description of the filter. Usually no more than a sentence.
51
-
52
- def description
53
- raise "No description defined for #{self.class}"
54
- end
55
-
56
- # This method parses some text. The score is assigned to the same instance.
57
-
58
- def parse text
59
- raise "No parse defined for #{self.class}"
60
- end
61
-
62
- # The one or two word name for the filter.
63
-
64
- def name
65
- raise "No name defined for #{self.class}"
66
- end
67
-
68
- # Boolean. Whether or not the filter matched anything.
69
-
70
- def matched?
71
- @score > 0
72
- end
73
-
74
- protected
75
-
76
- def append_score= new_score
77
- @matches += 1
78
- @score += new_score
79
- end
80
-
81
- end
82
- end
data/scripts/despamilator_score.rb DELETED
@@ -1,25 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require File.join(File.dirname(__FILE__), '..', 'lib', 'despamilator')
4
- require 'zlib'
5
-
6
- file = ARGV[0] || raise("Usage: despamilator_spec.rb [filename]")
7
-
8
- text = file =~ /\.gz$/i ? Zlib::GzipReader.open(file).read : File.open(file).read
9
-
10
- puts "Testing:"
11
- puts "========================"
12
- puts text
13
- puts "========================"
14
- puts "\n"
15
-
16
- dspam = Despamilator.new(text)
17
-
18
- puts "Total Score: #{dspam.score}\n\n"
19
-
20
- puts "Matched by..." unless dspam.matched_by.empty?
21
- dspam.matched_by.each do |match|
22
- puts "\tFilter: #{match.name}"
23
- puts "\tScore: #{match.score}"
24
- puts "\n"
25
- end
data/scripts/from_file.rb DELETED
@@ -1,26 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require 'cgi'
4
-
5
- slots = {};
6
-
7
- 1.upto(10000) do |i|
8
- slots[i] = true
9
- end
10
-
11
- dir = './spec/spam_corpus/'
12
-
13
- Dir[dir + '*.gz'].each do |file|
14
- slots.delete(file.scan(/\d+/).first.to_i)
15
- end
16
-
17
- slots = slots.keys.sort
18
-
19
- File.open(ARGV[0] || raise).each do |line|
20
- txt = dir + "#{slots.shift}.txt"
21
- File.open(txt, 'w') do |fh|
22
- fh.puts CGI.unescapeHTML(line)
23
- end
24
-
25
- `gzip #{txt}`
26
- end
data/spec/despamilator_spec.rb DELETED
@@ -1,13 +0,0 @@
1
- describe Despamilator do
2
- before :each do
3
- @dspam = Despamilator.new('this text is absolutely fine')
4
- end
5
-
6
- it "should return a zero score for fine text" do
7
- @dspam.score.should == 0
8
- end
9
-
10
- it "should return no matching filter for fine text" do
11
- @dspam.matched_by.should be_empty
12
- end
13
- end
data/spec/filter_base_spec.rb DELETED
@@ -1,30 +0,0 @@
1
- require 'despamilator/filter_base'
2
-
3
- class UnimplementedFilter < Despamilator::FilterBase
4
- end
5
-
6
- describe Despamilator::FilterBase do
7
- describe "abstract method" do
8
-
9
- before do
10
- @filter = UnimplementedFilter.new
11
- end
12
-
13
- [
14
- ['name', 'No name defined for UnimplementedFilter'],
15
- ['description', 'No description defined for UnimplementedFilter'],
16
-
17
- ].each do |method, exception|
18
-
19
- it "should throw an error when the child class has not implemented a '#{method}' method" do
20
- -> {@filter.send(method)}.should raise_error(exception)
21
- end
22
-
23
- end
24
-
25
- it "should throw an error when the child class has not implemented a 'parse' method" do
26
- -> {@filter.parse('abc')}.should raise_error('No parse defined for UnimplementedFilter')
27
- end
28
-
29
- end
30
- end
data/spec/filters/gtubs_test_filter_spec.rb DELETED
@@ -1,9 +0,0 @@
1
- describe DespamilatorFilter::GtubsTestFilter do
2
- the_name_should_be 'GTubs Test Filter'
3
- the_description_should_be 'Detects the special test string (Despamilator.gtubs_test_string) and assigns a big score.'
4
-
5
- despamilator_should_apply_the_filter_for(Despamilator.gtubs_test_string)
6
-
7
- a_single_match_of(Despamilator.gtubs_test_string, should_score: 100)
8
- a_multiple_match_of(Despamilator.gtubs_test_string, should_score: [100, 1.times])
9
- end
data/spec/filters/html_tags_spec.rb DELETED
@@ -1,129 +0,0 @@
1
- describe DespamilatorFilter::HtmlTags do
2
-
3
- the_name_should_be 'HTML tags'
4
- the_description_should_be 'Detects HTML tags in text'
5
-
6
- despamilator_should_apply_the_filter_for('<xmp>')
7
-
8
- a_single_match_of('<xmp>', should_score: 0.6)
9
- a_multiple_match_of('<h1></h1> <h2></h2>', should_score: [1.2, 2.times])
10
-
11
- [
12
- '!--',
13
- '!DOCTYPE',
14
- 'a',
15
- 'abbr',
16
- 'acronym',
17
- 'address',
18
- 'applet',
19
- 'area',
20
- 'b',
21
- 'base',
22
- 'basefont',
23
- 'bdo',
24
- 'big',
25
- 'blockquote',
26
- 'body',
27
- 'br',
28
- 'button',
29
- 'caption',
30
- 'center',
31
- 'cite',
32
- 'code',
33
- 'col',
34
- 'colgroup',
35
- 'dd',
36
- 'del',
37
- 'dfn',
38
- 'dir',
39
- 'div',
40
- 'dl',
41
- 'dt',
42
- 'em',
43
- 'fieldset',
44
- 'font',
45
- 'form',
46
- 'frame',
47
- 'frameset',
48
- 'h1',
49
- 'h2',
50
- 'h3',
51
- 'h4',
52
- 'h5',
53
- 'h6',
54
- 'head',
55
- 'hr',
56
- 'html',
57
- 'i',
58
- 'iframe',
59
- 'img',
60
- 'input',
61
- 'ins',
62
- 'isindex',
63
- 'kbd',
64
- 'label',
65
- 'legend',
66
- 'li',
67
- 'link',
68
- 'map',
69
- 'menu',
70
- 'meta',
71
- 'noframes',
72
- 'noscript',
73
- 'object',
74
- 'ol',
75
- 'optgroup',
76
- 'option',
77
- 'p',
78
- 'param',
79
- 'pre',
80
- 'q',
81
- 's',
82
- 'samp',
83
- 'select',
84
- 'small',
85
- 'span',
86
- 'strike',
87
- 'strong',
88
- 'style',
89
- 'sub',
90
- 'sup',
91
- 'table',
92
- 'tbody',
93
- 'td',
94
- 'textarea',
95
- 'tfoot',
96
- 'th',
97
- 'thead',
98
- 'title',
99
- 'tr',
100
- 'tt',
101
- 'u',
102
- 'ul',
103
- 'var',
104
- 'xmp'
105
- ].each do |script_tag|
106
- [script_tag.upcase, script_tag.downcase].each do |script_tag|
107
- [
108
- "<#{script_tag}>",
109
- "<#{script_tag}/>",
110
- "< #{script_tag} >",
111
- "<#{script_tag} />",
112
- "<\n#{script_tag}\n/>",
113
- "<\n#{script_tag} >",
114
- "<#{script_tag}\n/>",
115
- "<\r#{script_tag}\r/>"
116
- ].each do |tag|
117
- it "should detect '#{tag}'" do
118
- dspam = DespamilatorFilter::HtmlTags.new
119
- dspam.parse(tag)
120
- dspam.score.should == 0.6
121
- end
122
-
123
- end
124
-
125
- end
126
-
127
- end
128
-
129
- end
data/spec/filters/ip_address_url_spec.rb DELETED
@@ -1,11 +0,0 @@
1
- describe DespamilatorFilter::IPAddressURL do
2
-
3
- the_name_should_be 'IP Address URL'
4
- the_description_should_be 'Detects IP address URLs'
5
-
6
- despamilator_should_apply_the_filter_for('http://12.34.56.78/')
7
-
8
- a_single_match_of('http://12.34.56.78/', should_score: 0.5)
9
- a_multiple_match_of('http://12.34.56.78/ http://98.76.54.32/', should_score: [0.5, 1.times])
10
-
11
- end
data/spec/filters/long_words_spec.rb DELETED
@@ -1,11 +0,0 @@
1
- describe DespamilatorFilter::LongWords do
2
-
3
- the_name_should_be 'Long Words'
4
- the_description_should_be 'Detects long and unbroken strings'
5
-
6
- despamilator_should_apply_the_filter_for('honorificabilitudinitatibus')
7
-
8
- a_single_match_of('honorificabilitudinitatibus', should_score: 0.1)
9
- a_multiple_match_of('honorificabilitudinitatibus antidisestablishmentarianism', should_score: [0.2, 2.times])
10
-
11
- end
data/spec/filters/naughty_words_spec.rb DELETED
@@ -1,11 +0,0 @@
1
- describe DespamilatorFilter::NaughtyWords do
2
-
3
- the_name_should_be 'Naughty Words'
4
- the_description_should_be 'Detects cheeky words'
5
-
6
- despamilator_should_apply_the_filter_for('bondage')
7
-
8
- a_single_match_of('bondage', should_score: 0.1)
9
- a_multiple_match_of('viagra penis', should_score: [0.2, 2.times])
10
-
11
- end
data/spec/filters/numbers_and_words_spec.rb DELETED
@@ -1,34 +0,0 @@
1
- describe DespamilatorFilter::NumbersAndWords do
2
-
3
- the_name_should_be 'Numbers next to words'
4
- the_description_should_be 'Detects unusual number/word combinations'
5
-
6
- despamilator_should_apply_the_filter_for('X5T')
7
-
8
- a_single_match_of('X5T', should_score: 0.1)
9
- a_multiple_match_of('4g6hk', should_score: [0.2, 2.times])
10
-
11
- describe 'exceptions' do
12
- before :all do
13
- @filter = DespamilatorFilter::NumbersAndWords.new
14
- end
15
-
16
- [1, 4, 10, 100000, '1,000,000', '1st', '2nd', '3rd', '4th', '5th', '6th', '10th', '122nd'].each do |number|
17
- it "should return a blank for a #{number}" do
18
- @filter.parse(number.to_s)
19
- @filter.score.should == 0
20
- end
21
- end
22
-
23
- [1, 2, 3, 4, 5, 6].each do |tag_no|
24
- header_tag = "h#{tag_no}"
25
-
26
- it "should ignore html header tag #{header_tag}" do
27
- @filter.parse(header_tag)
28
- @filter.score.should == 0
29
- end
30
- end
31
-
32
- end
33
-
34
- end
data/spec/filters/script_tag_spec.rb DELETED
@@ -1,22 +0,0 @@
1
- describe DespamilatorFilter::ScriptTag do
2
-
3
- the_name_should_be 'Script tag'
4
- the_description_should_be 'Searches for variations for the HTML script tag'
5
-
6
- despamilator_should_apply_the_filter_for('<script>')
7
-
8
- a_single_match_of('<script>', should_score: 1)
9
- a_multiple_match_of('<script></script> <script></script>', should_score: [1, 1.times])
10
-
11
- describe "detecting various script tags" do
12
- ['<script type="whatever">', '<script></script>', '</script>', '<script>', "<script\n>"].each do |script_tag|
13
- [script_tag.upcase, script_tag.downcase].each do |script_tag|
14
- it "should detect '#{script_tag}' of a script tag" do
15
- dspam = Despamilator.new(script_tag)
16
- dspam.score.should == 1
17
- end
18
- end
19
- end
20
- end
21
-
22
- end