despamilator 1.1 → 2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/despamilator.gemspec CHANGED
@@ -2,27 +2,26 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{despamilator}
5
- s.version = "1.1"
5
+ s.version = "2.0"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Stephen Hardisty"]
9
- s.date = %q{2011-01-26}
9
+ s.date = %q{2011-05-24}
10
10
  s.description = %q{Despamilator is a plugin based spam detector designed for use on your web forms borne out of two annoyances:
11
11
  Spam being submitted in my web forms and CAPTCHAS being intrusive. Despamilator will apply
12
12
  some commonly used heuristics from the world of anti-spam to help you decide whether your users are human or machine.}
13
13
  s.email = ["moowahaha@hotmail.com"]
14
- s.extra_rdoc_files = ["History.txt", "Manifest.txt", "PostInstall.txt"]
15
- s.files = [".rspec", ".rvmrc", "Gemfile", "Gemfile.lock", "History.txt", "Manifest.txt", "PostInstall.txt", "README.rdoc", "Rakefile", "despamilator.gemspec", "lib/despamilator.rb", "lib/despamilator/filter.rb", "lib/despamilator/filter/funky_consonant.rb", "lib/despamilator/filter/gtubs_test_filter.rb", "lib/despamilator/filter/html_tags.rb", "lib/despamilator/filter/ip_address_url.rb", "lib/despamilator/filter/long_words.rb", "lib/despamilator/filter/naughty_q.rb", "lib/despamilator/filter/naughty_words.rb", "lib/despamilator/filter/numbers_and_words.rb", "lib/despamilator/filter/script_tag.rb", "lib/despamilator/filter/shouting.rb", "lib/despamilator/filter/square_brackets.rb", "lib/despamilator/filter/trailing_number.rb", "lib/despamilator/filter/urls.rb", "lib/despamilator/filter_base.rb", "scripts/despamilator_score.rb", "spec/despamilator_spec.rb", "spec/filter_base_spec.rb", "spec/filters/funky_consonant_spec.rb", "spec/filters/gtubs_test_filter_spec.rb", "spec/filters/html_tags_spec.rb", "spec/filters/ip_address_url_spec.rb", "spec/filters/long_words_spec.rb", "spec/filters/naughty_q_spec.rb", "spec/filters/naughty_words_spec.rb", "spec/filters/numbers_and_words_spec.rb", "spec/filters/script_tag_spec.rb", "spec/filters/shouting_spec.rb", "spec/filters/square_brackets_spec.rb", "spec/filters/trailing_number_spec.rb", "spec/filters/urls_spec.rb", "spec/helpers/corpus_helper.rb", "spec/helpers/filter_helper.rb", "spec/helpers/spec_helper.rb", "tasks/test.rake"]
14
+ s.extra_rdoc_files = ["History.txt", "Manifest.txt", "PostInstall.txt", "conf/unusual_characters.txt"]
15
+ s.files = [".rspec", ".rvmrc", "Gemfile", "Gemfile.lock", "History.txt", "Manifest.txt", "PostInstall.txt", "README.rdoc", "Rakefile", "conf/unusual_characters.txt", "despamilator.gemspec", "lib/despamilator.rb", "lib/despamilator/filter.rb", "lib/despamilator/filter/gtubs_test_filter.rb", "lib/despamilator/filter/html_tags.rb", "lib/despamilator/filter/ip_address_url.rb", "lib/despamilator/filter/long_words.rb", "lib/despamilator/filter/naughty_words.rb", "lib/despamilator/filter/numbers_and_words.rb", "lib/despamilator/filter/script_tag.rb", "lib/despamilator/filter/shouting.rb", "lib/despamilator/filter/square_brackets.rb", "lib/despamilator/filter/trailing_number.rb", "lib/despamilator/filter/unusual_characters.rb", "lib/despamilator/filter/urls.rb", "lib/despamilator/filter_base.rb", "scripts/despamilator_score.rb", "scripts/from_file.rb", "spec/despamilator_spec.rb", "spec/filter_base_spec.rb", "spec/filters/gtubs_test_filter_spec.rb", "spec/filters/html_tags_spec.rb", "spec/filters/ip_address_url_spec.rb", "spec/filters/long_words_spec.rb", "spec/filters/naughty_words_spec.rb", "spec/filters/numbers_and_words_spec.rb", "spec/filters/script_tag_spec.rb", "spec/filters/shouting_spec.rb", "spec/filters/square_brackets_spec.rb", "spec/filters/trailing_number_spec.rb", "spec/filters/unusual_characters_spec.rb", "spec/filters/urls_spec.rb", "spec/helpers/corpus_helper.rb", "spec/helpers/filter_helper.rb", "spec/helpers/spec_helper.rb", "tasks/test.rake"]
16
16
  s.homepage = %q{http://github.com/moowahaha/despamilator}
17
17
  s.post_install_message = %q{PostInstall.txt}
18
18
  s.rdoc_options = ["--main", "README.rdoc"]
19
19
  s.require_paths = ["lib"]
20
20
  s.rubyforge_project = %q{despamilator}
21
- s.rubygems_version = %q{1.3.7}
21
+ s.rubygems_version = %q{1.5.2}
22
22
  s.summary = %q{Despamilator is a plugin based spam detector designed for use on your web forms borne out of two annoyances: Spam being submitted in my web forms and CAPTCHAS being intrusive}
23
23
 
24
24
  if s.respond_to? :specification_version then
25
- current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
26
25
  s.specification_version = 3
27
26
 
28
27
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
data/lib/despamilator.rb CHANGED
@@ -14,7 +14,7 @@ require 'despamilator/filter'
14
14
  # dspam.matched_by #=> array of matching filters
15
15
 
16
16
  class Despamilator
17
- VERSION = "1.1"
17
+ VERSION = "2.0"
18
18
 
19
19
  # Constructor. Takes the text you wish to parse and score.
20
20
 
@@ -9,7 +9,7 @@ module DespamilatorFilter
9
9
 
10
10
  html_tags.each do |tag|
11
11
  if text.match(/<\s*#{tag}\W/) || text.match(/<\n*#{tag}\W/) || text.match(/\W#{tag}\s*\//) || text.match(/\W#{tag}\n*\//)
12
- self.append_score = 0.45
12
+ self.append_score = 0.6
13
13
  end
14
14
  end
15
15
  end
@@ -16,12 +16,13 @@ module DespamilatorFilter
16
16
  text.downcase!
17
17
 
18
18
  naughty_words.each do |word|
19
- self.append_score = 0.1 if text =~ /\b#{word}\b/
19
+ self.append_score = 0.1 if text =~ /\b#{word}s?\b/
20
20
  end
21
21
  end
22
22
 
23
23
  def naughty_words
24
24
  %w{
25
+ underage
25
26
  penis
26
27
  viagra
27
28
  bondage
@@ -30,10 +31,9 @@ module DespamilatorFilter
30
31
  shit
31
32
  dick
32
33
  tits
33
- sex
34
34
  nude
35
35
  dicks
36
- shemales
36
+ shemale
37
37
  dildo
38
38
  porn
39
39
  cock
@@ -0,0 +1,47 @@
1
+ require 'despamilator/filter_base'
2
+
3
+ module DespamilatorFilter
4
+
5
+ class UnusualCharacters < Despamilator::FilterBase
6
+
7
# Returns the fixed display name of this filter.
+ def name
8
+ 'Unusual Characters'
9
+ end
10
+
11
# Returns a one-sentence summary of what this filter detects.
+ def description
12
+ 'Detects and scores each occurrence of an unusual 2 or 3 character combination'
13
+ end
14
+
15
# Scores +text+ for unusual character combinations.
#
# Makes sure the combination list is loaded, then assigns 0.05 through
# append_score= once for every 2- or 3-letter combination of the text that
# is present in that list.
#
# text - String to inspect.
def parse text
  initialize_combos

  tokenize(text).each do |token|
    # tokenize already yields Symbols, which is how @@combos is keyed.
    self.append_score = 0.05 if @@combos[token]
  end
end
21
+
22
+ private
23
+
24
# Breaks +text+ into the 2- and 3-letter combinations it contains.
#
# The text is lower-cased and split on every character outside a-z, so a
# combination never spans digits, whitespace or punctuation. For each
# position in each resulting word, the 3-letter combination starting there
# is emitted (when one fits), followed by the 2-letter combination.
#
# text - String to tokenize.
#
# Returns an Array of Symbols, e.g. "abcd" => [:abc, :ab, :bcd, :bc, :cd].
def tokenize text
  tokens = []
  text.downcase.split(/[^a-z]/).each do |word|
    # Stop one short of the end: a lone trailing letter forms no combination.
    0.upto(word.length - 2) do |start|
      # The window is always 3 characters wide, whatever the start index, so
      # every 3-letter combination inside the word is seen.
      window = word[start, 3]
      tokens << window.to_sym if window.length == 3
      tokens << window[0, 2].to_sym
    end
  end
  tokens
end
35
+
36
# Fills the class-level @@combos lookup from conf/unusual_characters.txt
# (located relative to this source file). The file is read only while the
# lookup is still empty, so later calls reuse what was loaded before.
#
# Each non-blank line, stripped of surrounding whitespace, becomes one
# Symbol key mapped to true.
#
# Returns the Hash of combinations.
def initialize_combos
  @@combos ||= {}
  return @@combos unless @@combos.empty?

  path = File.join(File.dirname(__FILE__), %w{.. .. .. conf unusual_characters.txt})

  # File.foreach closes the file once the last line has been read.
  File.foreach(path) do |line|
    combo = line.strip
    @@combos[combo.to_sym] = true unless combo.empty?
  end

  @@combos
end
44
+
45
+ end
46
+
47
+ end
@@ -16,9 +16,9 @@ module DespamilatorFilter
16
16
  text.downcase!
17
17
 
18
18
  text.gsub!(/http:\/\/\d+\.\d+\.\d+\.\d+/, '')
19
-
19
+
20
20
  1.upto(text.scan(/http:\/\//).length) do
21
- self.append_score = 0.5
21
+ self.append_score = 0.4
22
22
  end
23
23
  end
24
24
 
@@ -0,0 +1,26 @@
1
#!/usr/bin/env ruby

# Adds spam samples to the spec corpus.
#
# Usage: from_file.rb SAMPLES_FILE
#
# Every line of SAMPLES_FILE is HTML-unescaped, written to
# ./spec/spam_corpus/<n>.txt and gzipped, where <n> is the lowest number in
# 1..10000 not already used by a *.gz file in that directory. The corpus
# path is relative, so the script has to be started from the project root.

require 'cgi'

dir = './spec/spam_corpus/'
source = ARGV[0] || abort("Usage: #{$0} SAMPLES_FILE")

# Numbers already claimed by existing corpus files: the first run of digits
# in each file name. A name without digits counts as 0 and claims nothing.
taken = Dir[dir + '*.gz'].map { |file| File.basename(file)[/\d+/].to_i }
slots = (1..10000).to_a - taken

# File.foreach closes the samples file when the last line has been handled.
File.foreach(source) do |line|
  slot = slots.shift
  abort 'No free corpus slot left in 1..10000' if slot.nil?

  txt = "#{dir}#{slot}.txt"
  File.open(txt, 'w') do |fh|
    fh.puts CGI.unescapeHTML(line)
  end

  # Argument-list form: no shell is involved and a failed gzip is reported.
  system('gzip', txt) || abort("gzip failed for #{txt}")
end
@@ -5,8 +5,8 @@ describe DespamilatorFilter::HtmlTags do
5
5
 
6
6
  despamilator_should_apply_the_filter_for('<xmp>')
7
7
 
8
- a_single_match_of('<xmp>', should_score: 0.45)
9
- a_multiple_match_of('<h1></h1> <h2></h2>', should_score: [0.9, 2.times])
8
+ a_single_match_of('<xmp>', should_score: 0.6)
9
+ a_multiple_match_of('<h1></h1> <h2></h2>', should_score: [1.2, 2.times])
10
10
 
11
11
  [
12
12
  '!--',
@@ -117,7 +117,7 @@ describe DespamilatorFilter::HtmlTags do
117
117
  it "should detect '#{tag}'" do
118
118
  dspam = DespamilatorFilter::HtmlTags.new
119
119
  dspam.parse(tag)
120
- dspam.score.should == 0.45
120
+ dspam.score.should == 0.6
121
121
  end
122
122
 
123
123
  end
@@ -0,0 +1,9 @@
1
+ describe DespamilatorFilter::UnusualCharacters do
2
+ the_name_should_be 'Unusual Characters'
3
+ the_description_should_be 'Detects and scores each occurrence of an unusual 2 or 3 character combination'
4
+
5
+ despamilator_should_apply_the_filter_for('sx')
6
+
7
+ a_single_match_of('sx', should_score: 0.05)
8
+ a_multiple_match_of('sxsx', should_score: [0.1, 2.times])
9
+ end
@@ -3,9 +3,9 @@ describe DespamilatorFilter::URLs do
3
3
  the_name_should_be 'URLs'
4
4
  the_description_should_be 'Detects each url in a string'
5
5
 
6
- despamilator_should_apply_the_filter_for('zt')
6
+ despamilator_should_apply_the_filter_for('http://www.blah.com')
7
7
 
8
- a_single_match_of('http://www.blah.com', should_score: 0.5)
9
- a_multiple_match_of('http://www.blah.com http://www.poop.com', should_score: [1.0, 2.times])
8
+ a_single_match_of('http://www.blah.com', should_score: 0.4)
9
+ a_multiple_match_of('http://www.blah.com http://www.poop.com', should_score: [0.8, 2.times])
10
10
 
11
11
  end
metadata CHANGED
@@ -1,66 +1,57 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: despamilator
3
- version: !ruby/object:Gem::Version
4
- prerelease: false
5
- segments:
6
- - 1
7
- - 1
8
- version: "1.1"
3
+ version: !ruby/object:Gem::Version
4
+ version: '2.0'
5
+ prerelease:
9
6
  platform: ruby
10
- authors:
7
+ authors:
11
8
  - Stephen Hardisty
12
9
  autorequire:
13
10
  bindir: bin
14
11
  cert_chain: []
15
-
16
- date: 2011-01-26 00:00:00 +11:00
12
+ date: 2011-05-24 00:00:00.000000000 +10:00
17
13
  default_executable:
18
- dependencies:
19
- - !ruby/object:Gem::Dependency
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
20
16
  name: rubyforge
21
- prerelease: false
22
- requirement: &id001 !ruby/object:Gem::Requirement
17
+ requirement: &2730490 !ruby/object:Gem::Requirement
23
18
  none: false
24
- requirements:
25
- - - ">="
26
- - !ruby/object:Gem::Version
27
- segments:
28
- - 2
29
- - 0
30
- - 4
19
+ requirements:
20
+ - - ! '>='
21
+ - !ruby/object:Gem::Version
31
22
  version: 2.0.4
32
23
  type: :development
33
- version_requirements: *id001
34
- - !ruby/object:Gem::Dependency
35
- name: hoe
36
24
  prerelease: false
37
- requirement: &id002 !ruby/object:Gem::Requirement
25
+ version_requirements: *2730490
26
+ - !ruby/object:Gem::Dependency
27
+ name: hoe
28
+ requirement: &2730250 !ruby/object:Gem::Requirement
38
29
  none: false
39
- requirements:
40
- - - ">="
41
- - !ruby/object:Gem::Version
42
- segments:
43
- - 2
44
- - 7
45
- - 0
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
46
33
  version: 2.7.0
47
34
  type: :development
48
- version_requirements: *id002
49
- description: |-
50
- Despamilator is a plugin based spam detector designed for use on your web forms borne out of two annoyances:
51
- Spam being submitted in my web forms and CAPTCHAS being intrusive. Despamilator will apply
52
- some commonly used heuristics from the world of anti-spam to help you decide whether your users are human or machine.
53
- email:
35
+ prerelease: false
36
+ version_requirements: *2730250
37
+ description: ! 'Despamilator is a plugin based spam detector designed for use on your
38
+ web forms borne out of two annoyances:
39
+
40
+ Spam being submitted in my web forms and CAPTCHAS being intrusive. Despamilator
41
+ will apply
42
+
43
+ some commonly used heuristics from the world of anti-spam to help you decide whether
44
+ your users are human or machine.'
45
+ email:
54
46
  - moowahaha@hotmail.com
55
47
  executables: []
56
-
57
48
  extensions: []
58
-
59
- extra_rdoc_files:
49
+ extra_rdoc_files:
60
50
  - History.txt
61
51
  - Manifest.txt
62
52
  - PostInstall.txt
63
- files:
53
+ - conf/unusual_characters.txt
54
+ files:
64
55
  - .rspec
65
56
  - .rvmrc
66
57
  - Gemfile
@@ -70,38 +61,38 @@ files:
70
61
  - PostInstall.txt
71
62
  - README.rdoc
72
63
  - Rakefile
64
+ - conf/unusual_characters.txt
73
65
  - despamilator.gemspec
74
66
  - lib/despamilator.rb
75
67
  - lib/despamilator/filter.rb
76
- - lib/despamilator/filter/funky_consonant.rb
77
68
  - lib/despamilator/filter/gtubs_test_filter.rb
78
69
  - lib/despamilator/filter/html_tags.rb
79
70
  - lib/despamilator/filter/ip_address_url.rb
80
71
  - lib/despamilator/filter/long_words.rb
81
- - lib/despamilator/filter/naughty_q.rb
82
72
  - lib/despamilator/filter/naughty_words.rb
83
73
  - lib/despamilator/filter/numbers_and_words.rb
84
74
  - lib/despamilator/filter/script_tag.rb
85
75
  - lib/despamilator/filter/shouting.rb
86
76
  - lib/despamilator/filter/square_brackets.rb
87
77
  - lib/despamilator/filter/trailing_number.rb
78
+ - lib/despamilator/filter/unusual_characters.rb
88
79
  - lib/despamilator/filter/urls.rb
89
80
  - lib/despamilator/filter_base.rb
90
81
  - scripts/despamilator_score.rb
82
+ - scripts/from_file.rb
91
83
  - spec/despamilator_spec.rb
92
84
  - spec/filter_base_spec.rb
93
- - spec/filters/funky_consonant_spec.rb
94
85
  - spec/filters/gtubs_test_filter_spec.rb
95
86
  - spec/filters/html_tags_spec.rb
96
87
  - spec/filters/ip_address_url_spec.rb
97
88
  - spec/filters/long_words_spec.rb
98
- - spec/filters/naughty_q_spec.rb
99
89
  - spec/filters/naughty_words_spec.rb
100
90
  - spec/filters/numbers_and_words_spec.rb
101
91
  - spec/filters/script_tag_spec.rb
102
92
  - spec/filters/shouting_spec.rb
103
93
  - spec/filters/square_brackets_spec.rb
104
94
  - spec/filters/trailing_number_spec.rb
95
+ - spec/filters/unusual_characters_spec.rb
105
96
  - spec/filters/urls_spec.rb
106
97
  - spec/helpers/corpus_helper.rb
107
98
  - spec/helpers/filter_helper.rb
@@ -110,35 +101,30 @@ files:
110
101
  has_rdoc: true
111
102
  homepage: http://github.com/moowahaha/despamilator
112
103
  licenses: []
113
-
114
104
  post_install_message: PostInstall.txt
115
- rdoc_options:
105
+ rdoc_options:
116
106
  - --main
117
107
  - README.rdoc
118
- require_paths:
108
+ require_paths:
119
109
  - lib
120
- required_ruby_version: !ruby/object:Gem::Requirement
110
+ required_ruby_version: !ruby/object:Gem::Requirement
121
111
  none: false
122
- requirements:
123
- - - ">="
124
- - !ruby/object:Gem::Version
125
- segments:
126
- - 0
127
- version: "0"
128
- required_rubygems_version: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - ! '>='
114
+ - !ruby/object:Gem::Version
115
+ version: '0'
116
+ required_rubygems_version: !ruby/object:Gem::Requirement
129
117
  none: false
130
- requirements:
131
- - - ">="
132
- - !ruby/object:Gem::Version
133
- segments:
134
- - 0
135
- version: "0"
118
+ requirements:
119
+ - - ! '>='
120
+ - !ruby/object:Gem::Version
121
+ version: '0'
136
122
  requirements: []
137
-
138
123
  rubyforge_project: despamilator
139
- rubygems_version: 1.3.7
124
+ rubygems_version: 1.5.2
140
125
  signing_key:
141
126
  specification_version: 3
142
- summary: "Despamilator is a plugin based spam detector designed for use on your web forms borne out of two annoyances: Spam being submitted in my web forms and CAPTCHAS being intrusive"
127
+ summary: ! 'Despamilator is a plugin based spam detector designed for use on your
128
+ web forms borne out of two annoyances: Spam being submitted in my web forms and
129
+ CAPTCHAS being intrusive'
143
130
  test_files: []
144
-
@@ -1,31 +0,0 @@
1
- require 'despamilator/filter_base'
2
-
3
- module DespamilatorFilter
4
-
5
- class FunkyConsonant < Despamilator::FilterBase
6
-
7
- def name
8
- 'Funky Consonant'
9
- end
10
-
11
- def description
12
- 'Detects and scores each occurrence of a consonant next to an unlikely character'
13
- end
14
-
15
- def parse text
16
- text.downcase!
17
-
18
- consonant_pairs.each do |pair|
19
- [pair, pair.reverse].each do |combo_pair|
20
- self.append_score = 0.05 unless text.scan(/#{combo_pair}/).empty?
21
- end
22
- end
23
- end
24
-
25
- def consonant_pairs
26
- %w{ zt gb vk vt jk mj dm jm xz bn }
27
- end
28
-
29
- end
30
-
31
- end
@@ -1,31 +0,0 @@
1
- require 'despamilator/filter_base'
2
-
3
- module DespamilatorFilter
4
-
5
- class NaughtyQ < Despamilator::FilterBase
6
-
7
- def name
8
- 'Naughty Q'
9
- end
10
-
11
- def description
12
- 'Detects possible misuse of the letter Q (English language)'
13
- end
14
-
15
- def parse text
16
- post_matches = text.downcase.scan(/q(\w|\d)/)
17
- pre_matches = text.downcase.scan(/(\w|\d)q/)
18
-
19
- matches = post_matches + pre_matches
20
-
21
- return unless matches
22
-
23
- matches.each do |match|
24
- match = match.first
25
- self.append_score = 0.2 unless match == 'u' or match == 'a' or match == 'k'
26
- end
27
- end
28
-
29
- end
30
-
31
- end