despamilator 1.1 → 2.0

Sign up to get free protection for your applications and to get access to all the features.
data/despamilator.gemspec CHANGED
@@ -2,27 +2,26 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{despamilator}
5
- s.version = "1.1"
5
+ s.version = "2.0"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Stephen Hardisty"]
9
- s.date = %q{2011-01-26}
9
+ s.date = %q{2011-05-24}
10
10
  s.description = %q{Despamilator is a plugin based spam detector designed for use on your web forms borne out of two annoyances:
11
11
  Spam being submitted in my web forms and CAPTCHAS being intrusive. Despamilator will apply
12
12
  some commonly used heuristics from the world of anti-spam to help you decide whether your users are human or machine.}
13
13
  s.email = ["moowahaha@hotmail.com"]
14
- s.extra_rdoc_files = ["History.txt", "Manifest.txt", "PostInstall.txt"]
15
- s.files = [".rspec", ".rvmrc", "Gemfile", "Gemfile.lock", "History.txt", "Manifest.txt", "PostInstall.txt", "README.rdoc", "Rakefile", "despamilator.gemspec", "lib/despamilator.rb", "lib/despamilator/filter.rb", "lib/despamilator/filter/funky_consonant.rb", "lib/despamilator/filter/gtubs_test_filter.rb", "lib/despamilator/filter/html_tags.rb", "lib/despamilator/filter/ip_address_url.rb", "lib/despamilator/filter/long_words.rb", "lib/despamilator/filter/naughty_q.rb", "lib/despamilator/filter/naughty_words.rb", "lib/despamilator/filter/numbers_and_words.rb", "lib/despamilator/filter/script_tag.rb", "lib/despamilator/filter/shouting.rb", "lib/despamilator/filter/square_brackets.rb", "lib/despamilator/filter/trailing_number.rb", "lib/despamilator/filter/urls.rb", "lib/despamilator/filter_base.rb", "scripts/despamilator_score.rb", "spec/despamilator_spec.rb", "spec/filter_base_spec.rb", "spec/filters/funky_consonant_spec.rb", "spec/filters/gtubs_test_filter_spec.rb", "spec/filters/html_tags_spec.rb", "spec/filters/ip_address_url_spec.rb", "spec/filters/long_words_spec.rb", "spec/filters/naughty_q_spec.rb", "spec/filters/naughty_words_spec.rb", "spec/filters/numbers_and_words_spec.rb", "spec/filters/script_tag_spec.rb", "spec/filters/shouting_spec.rb", "spec/filters/square_brackets_spec.rb", "spec/filters/trailing_number_spec.rb", "spec/filters/urls_spec.rb", "spec/helpers/corpus_helper.rb", "spec/helpers/filter_helper.rb", "spec/helpers/spec_helper.rb", "tasks/test.rake"]
14
+ s.extra_rdoc_files = ["History.txt", "Manifest.txt", "PostInstall.txt", "conf/unusual_characters.txt"]
15
+ s.files = [".rspec", ".rvmrc", "Gemfile", "Gemfile.lock", "History.txt", "Manifest.txt", "PostInstall.txt", "README.rdoc", "Rakefile", "conf/unusual_characters.txt", "despamilator.gemspec", "lib/despamilator.rb", "lib/despamilator/filter.rb", "lib/despamilator/filter/gtubs_test_filter.rb", "lib/despamilator/filter/html_tags.rb", "lib/despamilator/filter/ip_address_url.rb", "lib/despamilator/filter/long_words.rb", "lib/despamilator/filter/naughty_words.rb", "lib/despamilator/filter/numbers_and_words.rb", "lib/despamilator/filter/script_tag.rb", "lib/despamilator/filter/shouting.rb", "lib/despamilator/filter/square_brackets.rb", "lib/despamilator/filter/trailing_number.rb", "lib/despamilator/filter/unusual_characters.rb", "lib/despamilator/filter/urls.rb", "lib/despamilator/filter_base.rb", "scripts/despamilator_score.rb", "scripts/from_file.rb", "spec/despamilator_spec.rb", "spec/filter_base_spec.rb", "spec/filters/gtubs_test_filter_spec.rb", "spec/filters/html_tags_spec.rb", "spec/filters/ip_address_url_spec.rb", "spec/filters/long_words_spec.rb", "spec/filters/naughty_words_spec.rb", "spec/filters/numbers_and_words_spec.rb", "spec/filters/script_tag_spec.rb", "spec/filters/shouting_spec.rb", "spec/filters/square_brackets_spec.rb", "spec/filters/trailing_number_spec.rb", "spec/filters/unusual_characters_spec.rb", "spec/filters/urls_spec.rb", "spec/helpers/corpus_helper.rb", "spec/helpers/filter_helper.rb", "spec/helpers/spec_helper.rb", "tasks/test.rake"]
16
16
  s.homepage = %q{http://github.com/moowahaha/despamilator}
17
17
  s.post_install_message = %q{PostInstall.txt}
18
18
  s.rdoc_options = ["--main", "README.rdoc"]
19
19
  s.require_paths = ["lib"]
20
20
  s.rubyforge_project = %q{despamilator}
21
- s.rubygems_version = %q{1.3.7}
21
+ s.rubygems_version = %q{1.5.2}
22
22
  s.summary = %q{Despamilator is a plugin based spam detector designed for use on your web forms borne out of two annoyances: Spam being submitted in my web forms and CAPTCHAS being intrusive}
23
23
 
24
24
  if s.respond_to? :specification_version then
25
- current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
26
25
  s.specification_version = 3
27
26
 
28
27
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
data/lib/despamilator.rb CHANGED
@@ -14,7 +14,7 @@ require 'despamilator/filter'
14
14
  # dspam.matched_by #=> array of matching filters
15
15
 
16
16
  class Despamilator
17
- VERSION = "1.1"
17
+ VERSION = "2.0"
18
18
 
19
19
  # Constructor. Takes the text you wish to parse and score.
20
20
 
@@ -9,7 +9,7 @@ module DespamilatorFilter
9
9
 
10
10
  html_tags.each do |tag|
11
11
  if text.match(/<\s*#{tag}\W/) || text.match(/<\n*#{tag}\W/) || text.match(/\W#{tag}\s*\//) || text.match(/\W#{tag}\n*\//)
12
- self.append_score = 0.45
12
+ self.append_score = 0.6
13
13
  end
14
14
  end
15
15
  end
@@ -16,12 +16,13 @@ module DespamilatorFilter
16
16
  text.downcase!
17
17
 
18
18
  naughty_words.each do |word|
19
- self.append_score = 0.1 if text =~ /\b#{word}\b/
19
+ self.append_score = 0.1 if text =~ /\b#{word}s?\b/
20
20
  end
21
21
  end
22
22
 
23
23
  def naughty_words
24
24
  %w{
25
+ underage
25
26
  penis
26
27
  viagra
27
28
  bondage
@@ -30,10 +31,9 @@ module DespamilatorFilter
30
31
  shit
31
32
  dick
32
33
  tits
33
- sex
34
34
  nude
35
35
  dicks
36
- shemales
36
+ shemale
37
37
  dildo
38
38
  porn
39
39
  cock
@@ -0,0 +1,47 @@
1
+ require 'despamilator/filter_base'
2
+
3
module DespamilatorFilter

  # Filter that looks at the letter combinations inside each word of the
  # text. Every 2- and 3-letter sequence is looked up in the list read from
  # conf/unusual_characters.txt, and each hit assigns 0.05 to append_score.
  # NOTE(review): append_score= comes from Despamilator::FilterBase and
  # presumably accumulates the score -- confirm against FilterBase.
  class UnusualCharacters < Despamilator::FilterBase

    def name
      'Unusual Characters'
    end

    def description
      'Detects and scores each occurrence of an unusual 2 or 3 character combination'
    end

    # Scores +text+: one 0.05 increment per token (see #tokenize) that is
    # present in the loaded combination list.
    def parse text
      initialize_combos
      tokenize(text).each do |token|
        self.append_score = 0.05 if @@combos[token]
      end
    end

    private

    # Returns an Array of Symbols holding every 3-letter and 2-letter window
    # of every run of a-z characters in the lower-cased +text+. For each
    # start position the 3-letter window (when one fits) is emitted before
    # the 2-letter one. Runs shorter than two letters produce no tokens.
    def tokenize text
      tokens = []
      text.downcase.split(/[^a-z]/).each do |word|
        # The last position that can still start a 2-letter window is
        # length - 2; for shorter words the range is empty.
        0.upto(word.length - 2) do |i|
          # String#[] takes (start, length): always ask for 3 characters.
          # Asking for i + 3 made interior windows too long, so their
          # 3-letter combinations were silently skipped.
          window = word[i, 3]
          tokens << window.to_sym if window.length == 3
          tokens << window[0, 2].to_sym
        end
      end
      tokens
    end

    # Loads the combination list into the class-wide @@combos hash the first
    # time it is needed; later calls return the already populated hash.
    # The file is expected to contain one combination per line.
    def initialize_combos
      @@combos ||= {}
      return @@combos unless @@combos.empty?

      path = File.join(File.dirname(__FILE__), %w{.. .. .. conf unusual_characters.txt})
      # File.foreach closes the file once iteration finishes.
      File.foreach(path) do |line|
        combo = line.strip
        @@combos[combo.to_sym] = true unless combo.empty?
      end
      @@combos
    end

  end

end
@@ -16,9 +16,9 @@ module DespamilatorFilter
16
16
  text.downcase!
17
17
 
18
18
  text.gsub!(/http:\/\/\d+\.\d+\.\d+\.\d+/, '')
19
-
19
+
20
20
  1.upto(text.scan(/http:\/\//).length) do
21
- self.append_score = 0.5
21
+ self.append_score = 0.4
22
22
  end
23
23
  end
24
24
 
@@ -0,0 +1,26 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'cgi'
4
+
5
# Imports spam samples into the spec corpus: each line of the file named by
# the first command-line argument is HTML-unescaped, written to the next
# free "<number>.txt" slot under ./spec/spam_corpus/ and then gzipped.

dir = './spec/spam_corpus/'

# Slot numbers already taken, read from the first run of digits in the path
# of every existing .gz file in the corpus directory.
used = Dir[dir + '*.gz'].map { |file| file.scan(/\d+/).first.to_i }

# Free slot numbers between 1 and 10000, in ascending order.
slots = (1..10000).to_a - used

source = ARGV[0]
raise ArgumentError, "usage: #{$PROGRAM_NAME} FILE" if source.nil?

# File.foreach closes the input once every line has been read.
File.foreach(source) do |line|
  slot = slots.shift
  # Without this guard every remaining line would be written to ".txt".
  raise "no free corpus slot left in #{dir}" if slot.nil?

  txt = dir + "#{slot}.txt"
  File.open(txt, 'w') do |fh|
    fh.puts CGI.unescapeHTML(line)
  end

  # Argument-list form: gzip is run directly, without a shell.
  warn "gzip failed for #{txt}" unless system('gzip', txt)
end
@@ -5,8 +5,8 @@ describe DespamilatorFilter::HtmlTags do
5
5
 
6
6
  despamilator_should_apply_the_filter_for('<xmp>')
7
7
 
8
- a_single_match_of('<xmp>', should_score: 0.45)
9
- a_multiple_match_of('<h1></h1> <h2></h2>', should_score: [0.9, 2.times])
8
+ a_single_match_of('<xmp>', should_score: 0.6)
9
+ a_multiple_match_of('<h1></h1> <h2></h2>', should_score: [1.2, 2.times])
10
10
 
11
11
  [
12
12
  '!--',
@@ -117,7 +117,7 @@ describe DespamilatorFilter::HtmlTags do
117
117
  it "should detect '#{tag}'" do
118
118
  dspam = DespamilatorFilter::HtmlTags.new
119
119
  dspam.parse(tag)
120
- dspam.score.should == 0.45
120
+ dspam.score.should == 0.6
121
121
  end
122
122
 
123
123
  end
@@ -0,0 +1,9 @@
1
# Spec for the unusual-characters filter, built from the shared filter spec
# helpers: checks the filter's name and description, that Despamilator
# applies it to 'sx', and the score for one and for two occurrences.
describe(DespamilatorFilter::UnusualCharacters) do
  the_name_should_be('Unusual Characters')
  the_description_should_be('Detects and scores each occurrence of an unusual 2 or 3 character combination')

  despamilator_should_apply_the_filter_for 'sx'

  # One occurrence scores 0.05; the doubled input is expected to total 0.1
  # over two matches.
  a_single_match_of 'sx', :should_score => 0.05
  a_multiple_match_of 'sxsx', :should_score => [0.1, 2.times]
end
@@ -3,9 +3,9 @@ describe DespamilatorFilter::URLs do
3
3
  the_name_should_be 'URLs'
4
4
  the_description_should_be 'Detects each url in a string'
5
5
 
6
- despamilator_should_apply_the_filter_for('zt')
6
+ despamilator_should_apply_the_filter_for('http://www.blah.com')
7
7
 
8
- a_single_match_of('http://www.blah.com', should_score: 0.5)
9
- a_multiple_match_of('http://www.blah.com http://www.poop.com', should_score: [1.0, 2.times])
8
+ a_single_match_of('http://www.blah.com', should_score: 0.4)
9
+ a_multiple_match_of('http://www.blah.com http://www.poop.com', should_score: [0.8, 2.times])
10
10
 
11
11
  end
metadata CHANGED
@@ -1,66 +1,57 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: despamilator
3
- version: !ruby/object:Gem::Version
4
- prerelease: false
5
- segments:
6
- - 1
7
- - 1
8
- version: "1.1"
3
+ version: !ruby/object:Gem::Version
4
+ version: '2.0'
5
+ prerelease:
9
6
  platform: ruby
10
- authors:
7
+ authors:
11
8
  - Stephen Hardisty
12
9
  autorequire:
13
10
  bindir: bin
14
11
  cert_chain: []
15
-
16
- date: 2011-01-26 00:00:00 +11:00
12
+ date: 2011-05-24 00:00:00.000000000 +10:00
17
13
  default_executable:
18
- dependencies:
19
- - !ruby/object:Gem::Dependency
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
20
16
  name: rubyforge
21
- prerelease: false
22
- requirement: &id001 !ruby/object:Gem::Requirement
17
+ requirement: &2730490 !ruby/object:Gem::Requirement
23
18
  none: false
24
- requirements:
25
- - - ">="
26
- - !ruby/object:Gem::Version
27
- segments:
28
- - 2
29
- - 0
30
- - 4
19
+ requirements:
20
+ - - ! '>='
21
+ - !ruby/object:Gem::Version
31
22
  version: 2.0.4
32
23
  type: :development
33
- version_requirements: *id001
34
- - !ruby/object:Gem::Dependency
35
- name: hoe
36
24
  prerelease: false
37
- requirement: &id002 !ruby/object:Gem::Requirement
25
+ version_requirements: *2730490
26
+ - !ruby/object:Gem::Dependency
27
+ name: hoe
28
+ requirement: &2730250 !ruby/object:Gem::Requirement
38
29
  none: false
39
- requirements:
40
- - - ">="
41
- - !ruby/object:Gem::Version
42
- segments:
43
- - 2
44
- - 7
45
- - 0
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
46
33
  version: 2.7.0
47
34
  type: :development
48
- version_requirements: *id002
49
- description: |-
50
- Despamilator is a plugin based spam detector designed for use on your web forms borne out of two annoyances:
51
- Spam being submitted in my web forms and CAPTCHAS being intrusive. Despamilator will apply
52
- some commonly used heuristics from the world of anti-spam to help you decide whether your users are human or machine.
53
- email:
35
+ prerelease: false
36
+ version_requirements: *2730250
37
+ description: ! 'Despamilator is a plugin based spam detector designed for use on your
38
+ web forms borne out of two annoyances:
39
+
40
+ Spam being submitted in my web forms and CAPTCHAS being intrusive. Despamilator
41
+ will apply
42
+
43
+ some commonly used heuristics from the world of anti-spam to help you decide whether
44
+ your users are human or machine.'
45
+ email:
54
46
  - moowahaha@hotmail.com
55
47
  executables: []
56
-
57
48
  extensions: []
58
-
59
- extra_rdoc_files:
49
+ extra_rdoc_files:
60
50
  - History.txt
61
51
  - Manifest.txt
62
52
  - PostInstall.txt
63
- files:
53
+ - conf/unusual_characters.txt
54
+ files:
64
55
  - .rspec
65
56
  - .rvmrc
66
57
  - Gemfile
@@ -70,38 +61,38 @@ files:
70
61
  - PostInstall.txt
71
62
  - README.rdoc
72
63
  - Rakefile
64
+ - conf/unusual_characters.txt
73
65
  - despamilator.gemspec
74
66
  - lib/despamilator.rb
75
67
  - lib/despamilator/filter.rb
76
- - lib/despamilator/filter/funky_consonant.rb
77
68
  - lib/despamilator/filter/gtubs_test_filter.rb
78
69
  - lib/despamilator/filter/html_tags.rb
79
70
  - lib/despamilator/filter/ip_address_url.rb
80
71
  - lib/despamilator/filter/long_words.rb
81
- - lib/despamilator/filter/naughty_q.rb
82
72
  - lib/despamilator/filter/naughty_words.rb
83
73
  - lib/despamilator/filter/numbers_and_words.rb
84
74
  - lib/despamilator/filter/script_tag.rb
85
75
  - lib/despamilator/filter/shouting.rb
86
76
  - lib/despamilator/filter/square_brackets.rb
87
77
  - lib/despamilator/filter/trailing_number.rb
78
+ - lib/despamilator/filter/unusual_characters.rb
88
79
  - lib/despamilator/filter/urls.rb
89
80
  - lib/despamilator/filter_base.rb
90
81
  - scripts/despamilator_score.rb
82
+ - scripts/from_file.rb
91
83
  - spec/despamilator_spec.rb
92
84
  - spec/filter_base_spec.rb
93
- - spec/filters/funky_consonant_spec.rb
94
85
  - spec/filters/gtubs_test_filter_spec.rb
95
86
  - spec/filters/html_tags_spec.rb
96
87
  - spec/filters/ip_address_url_spec.rb
97
88
  - spec/filters/long_words_spec.rb
98
- - spec/filters/naughty_q_spec.rb
99
89
  - spec/filters/naughty_words_spec.rb
100
90
  - spec/filters/numbers_and_words_spec.rb
101
91
  - spec/filters/script_tag_spec.rb
102
92
  - spec/filters/shouting_spec.rb
103
93
  - spec/filters/square_brackets_spec.rb
104
94
  - spec/filters/trailing_number_spec.rb
95
+ - spec/filters/unusual_characters_spec.rb
105
96
  - spec/filters/urls_spec.rb
106
97
  - spec/helpers/corpus_helper.rb
107
98
  - spec/helpers/filter_helper.rb
@@ -110,35 +101,30 @@ files:
110
101
  has_rdoc: true
111
102
  homepage: http://github.com/moowahaha/despamilator
112
103
  licenses: []
113
-
114
104
  post_install_message: PostInstall.txt
115
- rdoc_options:
105
+ rdoc_options:
116
106
  - --main
117
107
  - README.rdoc
118
- require_paths:
108
+ require_paths:
119
109
  - lib
120
- required_ruby_version: !ruby/object:Gem::Requirement
110
+ required_ruby_version: !ruby/object:Gem::Requirement
121
111
  none: false
122
- requirements:
123
- - - ">="
124
- - !ruby/object:Gem::Version
125
- segments:
126
- - 0
127
- version: "0"
128
- required_rubygems_version: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - ! '>='
114
+ - !ruby/object:Gem::Version
115
+ version: '0'
116
+ required_rubygems_version: !ruby/object:Gem::Requirement
129
117
  none: false
130
- requirements:
131
- - - ">="
132
- - !ruby/object:Gem::Version
133
- segments:
134
- - 0
135
- version: "0"
118
+ requirements:
119
+ - - ! '>='
120
+ - !ruby/object:Gem::Version
121
+ version: '0'
136
122
  requirements: []
137
-
138
123
  rubyforge_project: despamilator
139
- rubygems_version: 1.3.7
124
+ rubygems_version: 1.5.2
140
125
  signing_key:
141
126
  specification_version: 3
142
- summary: "Despamilator is a plugin based spam detector designed for use on your web forms borne out of two annoyances: Spam being submitted in my web forms and CAPTCHAS being intrusive"
127
+ summary: ! 'Despamilator is a plugin based spam detector designed for use on your
128
+ web forms borne out of two annoyances: Spam being submitted in my web forms and
129
+ CAPTCHAS being intrusive'
143
130
  test_files: []
144
-
@@ -1,31 +0,0 @@
1
- require 'despamilator/filter_base'
2
-
3
- module DespamilatorFilter
4
-
5
- class FunkyConsonant < Despamilator::FilterBase
6
-
7
- def name
8
- 'Funky Consonant'
9
- end
10
-
11
- def description
12
- 'Detects and scores each occurrence of a consonant next to an unlikely character'
13
- end
14
-
15
- def parse text
16
- text.downcase!
17
-
18
- consonant_pairs.each do |pair|
19
- [pair, pair.reverse].each do |combo_pair|
20
- self.append_score = 0.05 unless text.scan(/#{combo_pair}/).empty?
21
- end
22
- end
23
- end
24
-
25
- def consonant_pairs
26
- %w{ zt gb vk vt jk mj dm jm xz bn }
27
- end
28
-
29
- end
30
-
31
- end
@@ -1,31 +0,0 @@
1
- require 'despamilator/filter_base'
2
-
3
- module DespamilatorFilter
4
-
5
- class NaughtyQ < Despamilator::FilterBase
6
-
7
- def name
8
- 'Naughty Q'
9
- end
10
-
11
- def description
12
- 'Detects possible misuse of the letter Q (English language)'
13
- end
14
-
15
- def parse text
16
- post_matches = text.downcase.scan(/q(\w|\d)/)
17
- pre_matches = text.downcase.scan(/(\w|\d)q/)
18
-
19
- matches = post_matches + pre_matches
20
-
21
- return unless matches
22
-
23
- matches.each do |match|
24
- match = match.first
25
- self.append_score = 0.2 unless match == 'u' or match == 'a' or match == 'k'
26
- end
27
- end
28
-
29
- end
30
-
31
- end