despamilator 1.1 → 2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.rspec +0 -1
- data/History.txt +7 -0
- data/Manifest.txt +4 -4
- data/conf/unusual_characters.txt +6674 -0
- data/despamilator.gemspec +5 -6
- data/lib/despamilator.rb +1 -1
- data/lib/despamilator/filter/html_tags.rb +1 -1
- data/lib/despamilator/filter/naughty_words.rb +3 -3
- data/lib/despamilator/filter/unusual_characters.rb +47 -0
- data/lib/despamilator/filter/urls.rb +2 -2
- data/scripts/from_file.rb +26 -0
- data/spec/filters/html_tags_spec.rb +3 -3
- data/spec/filters/unusual_characters_spec.rb +9 -0
- data/spec/filters/urls_spec.rb +3 -3
- metadata +53 -67
- data/lib/despamilator/filter/funky_consonant.rb +0 -31
- data/lib/despamilator/filter/naughty_q.rb +0 -31
- data/spec/filters/funky_consonant_spec.rb +0 -9
- data/spec/filters/naughty_q_spec.rb +0 -11
data/despamilator.gemspec
CHANGED
@@ -2,27 +2,26 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{despamilator}
|
5
|
-
s.version = "1.1"
|
5
|
+
s.version = "2.0"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Stephen Hardisty"]
|
9
|
-
s.date = %q{2011-01-26}
|
9
|
+
s.date = %q{2011-05-24}
|
10
10
|
s.description = %q{Despamilator is a plugin based spam detector designed for use on your web forms borne out of two annoyances:
|
11
11
|
Spam being submitted in my web forms and CAPTCHAS being intrusive. Despamilator will apply
|
12
12
|
some commonly used heuristics from the world of anti-spam to help you decide whether your users are human or machine.}
|
13
13
|
s.email = ["moowahaha@hotmail.com"]
|
14
|
-
s.extra_rdoc_files = ["History.txt", "Manifest.txt", "PostInstall.txt"]
|
15
|
-
s.files = [".rspec", ".rvmrc", "Gemfile", "Gemfile.lock", "History.txt", "Manifest.txt", "PostInstall.txt", "README.rdoc", "Rakefile", "
|
14
|
+
s.extra_rdoc_files = ["History.txt", "Manifest.txt", "PostInstall.txt", "conf/unusual_characters.txt"]
|
15
|
+
s.files = [".rspec", ".rvmrc", "Gemfile", "Gemfile.lock", "History.txt", "Manifest.txt", "PostInstall.txt", "README.rdoc", "Rakefile", "conf/unusual_characters.txt", "despamilator.gemspec", "lib/despamilator.rb", "lib/despamilator/filter.rb", "lib/despamilator/filter/gtubs_test_filter.rb", "lib/despamilator/filter/html_tags.rb", "lib/despamilator/filter/ip_address_url.rb", "lib/despamilator/filter/long_words.rb", "lib/despamilator/filter/naughty_words.rb", "lib/despamilator/filter/numbers_and_words.rb", "lib/despamilator/filter/script_tag.rb", "lib/despamilator/filter/shouting.rb", "lib/despamilator/filter/square_brackets.rb", "lib/despamilator/filter/trailing_number.rb", "lib/despamilator/filter/unusual_characters.rb", "lib/despamilator/filter/urls.rb", "lib/despamilator/filter_base.rb", "scripts/despamilator_score.rb", "scripts/from_file.rb", "spec/despamilator_spec.rb", "spec/filter_base_spec.rb", "spec/filters/gtubs_test_filter_spec.rb", "spec/filters/html_tags_spec.rb", "spec/filters/ip_address_url_spec.rb", "spec/filters/long_words_spec.rb", "spec/filters/naughty_words_spec.rb", "spec/filters/numbers_and_words_spec.rb", "spec/filters/script_tag_spec.rb", "spec/filters/shouting_spec.rb", "spec/filters/square_brackets_spec.rb", "spec/filters/trailing_number_spec.rb", "spec/filters/unusual_characters_spec.rb", "spec/filters/urls_spec.rb", "spec/helpers/corpus_helper.rb", "spec/helpers/filter_helper.rb", "spec/helpers/spec_helper.rb", "tasks/test.rake"]
|
16
16
|
s.homepage = %q{http://github.com/moowahaha/despamilator}
|
17
17
|
s.post_install_message = %q{PostInstall.txt}
|
18
18
|
s.rdoc_options = ["--main", "README.rdoc"]
|
19
19
|
s.require_paths = ["lib"]
|
20
20
|
s.rubyforge_project = %q{despamilator}
|
21
|
-
s.rubygems_version = %q{1.
|
21
|
+
s.rubygems_version = %q{1.5.2}
|
22
22
|
s.summary = %q{Despamilator is a plugin based spam detector designed for use on your web forms borne out of two annoyances: Spam being submitted in my web forms and CAPTCHAS being intrusive}
|
23
23
|
|
24
24
|
if s.respond_to? :specification_version then
|
25
|
-
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
26
25
|
s.specification_version = 3
|
27
26
|
|
28
27
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
data/lib/despamilator.rb
CHANGED
@@ -16,12 +16,13 @@ module DespamilatorFilter
|
|
16
16
|
text.downcase!
|
17
17
|
|
18
18
|
naughty_words.each do |word|
|
19
|
-
self.append_score = 0.1 if text =~ /\b#{word}
|
19
|
+
self.append_score = 0.1 if text =~ /\b#{word}s?\b/
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
23
|
def naughty_words
|
24
24
|
%w{
|
25
|
+
underage
|
25
26
|
penis
|
26
27
|
viagra
|
27
28
|
bondage
|
@@ -30,10 +31,9 @@ module DespamilatorFilter
|
|
30
31
|
shit
|
31
32
|
dick
|
32
33
|
tits
|
33
|
-
sex
|
34
34
|
nude
|
35
35
|
dicks
|
36
|
-
|
36
|
+
shemale
|
37
37
|
dildo
|
38
38
|
porn
|
39
39
|
cock
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'despamilator/filter_base'
|
2
|
+
|
3
|
+
module DespamilatorFilter
|
4
|
+
|
5
|
+
class UnusualCharacters < Despamilator::FilterBase
|
6
|
+
|
7
|
+
def name
|
8
|
+
'Unusual Characters'
|
9
|
+
end
|
10
|
+
|
11
|
+
def description
|
12
|
+
'Detects and scores each occurrence of an unusual 2 or 3 character combination'
|
13
|
+
end
|
14
|
+
|
15
|
+
def parse text
|
16
|
+
initialize_combos
|
17
|
+
tokenize(text).each do |token|
|
18
|
+
self.append_score = 0.05 if @@combos[token.to_sym]
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
private
|
23
|
+
|
24
|
+
# Splits +text+ into candidate letter combinations.
#
# The text is lower-cased and broken into words on every character that is
# not a-z. For each position in each word this emits the 3-letter window
# starting there (when a full 3 letters remain), followed by the 2-letter
# window (when at least 2 letters remain).
#
# text - String to tokenize.
#
# Returns an Array of Symbols in order of appearance.
def tokenize text
  tokens = []
  text.downcase.split(/[^a-z]/).each do |word|
    word.length.times do |i|
      # The second argument of String#[] is a length, not an end index, so
      # it must stay 3; letting it grow with i skipped most 3-letter windows.
      window = word[i, 3]
      tokens << window.to_sym if window.length == 3
      tokens << window[0, 2].to_sym if window.length > 1
    end
  end
  tokens
end
|
35
|
+
|
36
|
+
# Loads the list of character combinations once and caches it in the
# @@combos class variable, keyed by Symbol.
#
# The list is read from conf/unusual_characters.txt, resolved three
# directories up from this file, one entry per line with surrounding
# whitespace stripped. Once the cache is non-empty, later calls return it
# without reading the file again.
#
# Returns the Hash of Symbol => true.
def initialize_combos
  @@combos ||= {}
  return @@combos unless @@combos.empty?

  path = File.join(File.dirname(__FILE__), %w{.. .. .. conf unusual_characters.txt})
  # File.foreach closes the file once iteration finishes; a bare
  # File.open(...).each left the handle open.
  File.foreach(path) do |line|
    @@combos[line.strip.to_sym] = true
  end
  @@combos
end
|
44
|
+
|
45
|
+
end
|
46
|
+
|
47
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Adds one gzipped corpus file per line of the input file named by the first
# command-line argument.
#
# Corpus files live in ./spec/spam_corpus/ and are named <number>.txt.gz.
# Numbers in 1..10000 already used by an existing .gz file are skipped. Each
# input line is HTML-unescaped, written to the lowest free number, and then
# compressed with gzip.

require 'cgi'

dir = './spec/spam_corpus/'

# Numbers already claimed by existing corpus files (first run of digits in
# each path; a path with no digits counts as 0 and so claims nothing).
taken = Dir[dir + '*.gz'].map { |file| file.scan(/\d+/).first.to_i }
slots = (1..10000).to_a - taken

source = ARGV[0] || raise('usage: from_file.rb FILE')

# File.foreach closes the input when done; File.open(...).each left it open.
File.foreach(source) do |line|
  slot = slots.shift
  # Without this guard an exhausted slot list would write to ".txt".
  raise 'no free corpus slots left in 1..10000' if slot.nil?

  txt = dir + "#{slot}.txt"
  File.open(txt, 'w') do |fh|
    fh.puts CGI.unescapeHTML(line)
  end

  # Argument-list form runs gzip directly, without a shell.
  system('gzip', txt)
end
|
@@ -5,8 +5,8 @@ describe DespamilatorFilter::HtmlTags do
|
|
5
5
|
|
6
6
|
despamilator_should_apply_the_filter_for('<xmp>')
|
7
7
|
|
8
|
-
a_single_match_of('<xmp>', should_score: 0.
|
9
|
-
a_multiple_match_of('<h1></h1> <h2></h2>', should_score: [
|
8
|
+
a_single_match_of('<xmp>', should_score: 0.6)
|
9
|
+
a_multiple_match_of('<h1></h1> <h2></h2>', should_score: [1.2, 2.times])
|
10
10
|
|
11
11
|
[
|
12
12
|
'!--',
|
@@ -117,7 +117,7 @@ describe DespamilatorFilter::HtmlTags do
|
|
117
117
|
it "should detect '#{tag}'" do
|
118
118
|
dspam = DespamilatorFilter::HtmlTags.new
|
119
119
|
dspam.parse(tag)
|
120
|
-
dspam.score.should == 0.
|
120
|
+
dspam.score.should == 0.6
|
121
121
|
end
|
122
122
|
|
123
123
|
end
|
@@ -0,0 +1,9 @@
|
|
1
|
+
describe DespamilatorFilter::UnusualCharacters do
|
2
|
+
the_name_should_be 'Unusual Characters'
|
3
|
+
the_description_should_be 'Detects and scores each occurrence of an unusual 2 or 3 character combination'
|
4
|
+
|
5
|
+
despamilator_should_apply_the_filter_for('sx')
|
6
|
+
|
7
|
+
a_single_match_of('sx', should_score: 0.05)
|
8
|
+
a_multiple_match_of('sxsx', should_score: [0.1, 2.times])
|
9
|
+
end
|
data/spec/filters/urls_spec.rb
CHANGED
@@ -3,9 +3,9 @@ describe DespamilatorFilter::URLs do
|
|
3
3
|
the_name_should_be 'URLs'
|
4
4
|
the_description_should_be 'Detects each url in a string'
|
5
5
|
|
6
|
-
despamilator_should_apply_the_filter_for('
|
6
|
+
despamilator_should_apply_the_filter_for('http://www.blah.com')
|
7
7
|
|
8
|
-
a_single_match_of('http://www.blah.com', should_score: 0.
|
9
|
-
a_multiple_match_of('http://www.blah.com http://www.poop.com', should_score: [
|
8
|
+
a_single_match_of('http://www.blah.com', should_score: 0.4)
|
9
|
+
a_multiple_match_of('http://www.blah.com http://www.poop.com', should_score: [0.8, 2.times])
|
10
10
|
|
11
11
|
end
|
metadata
CHANGED
@@ -1,66 +1,57 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: despamilator
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
|
6
|
-
- 1
|
7
|
-
- 1
|
8
|
-
version: "1.1"
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '2.0'
|
5
|
+
prerelease:
|
9
6
|
platform: ruby
|
10
|
-
authors:
|
7
|
+
authors:
|
11
8
|
- Stephen Hardisty
|
12
9
|
autorequire:
|
13
10
|
bindir: bin
|
14
11
|
cert_chain: []
|
15
|
-
|
16
|
-
date: 2011-01-26 00:00:00 +11:00
|
12
|
+
date: 2011-05-24 00:00:00.000000000 +10:00
|
17
13
|
default_executable:
|
18
|
-
dependencies:
|
19
|
-
- !ruby/object:Gem::Dependency
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
20
16
|
name: rubyforge
|
21
|
-
|
22
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
17
|
+
requirement: &2730490 !ruby/object:Gem::Requirement
|
23
18
|
none: false
|
24
|
-
requirements:
|
25
|
-
- -
|
26
|
-
- !ruby/object:Gem::Version
|
27
|
-
segments:
|
28
|
-
- 2
|
29
|
-
- 0
|
30
|
-
- 4
|
19
|
+
requirements:
|
20
|
+
- - ! '>='
|
21
|
+
- !ruby/object:Gem::Version
|
31
22
|
version: 2.0.4
|
32
23
|
type: :development
|
33
|
-
version_requirements: *id001
|
34
|
-
- !ruby/object:Gem::Dependency
|
35
|
-
name: hoe
|
36
24
|
prerelease: false
|
37
|
-
|
25
|
+
version_requirements: *2730490
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: hoe
|
28
|
+
requirement: &2730250 !ruby/object:Gem::Requirement
|
38
29
|
none: false
|
39
|
-
requirements:
|
40
|
-
- -
|
41
|
-
- !ruby/object:Gem::Version
|
42
|
-
segments:
|
43
|
-
- 2
|
44
|
-
- 7
|
45
|
-
- 0
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
46
33
|
version: 2.7.0
|
47
34
|
type: :development
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: *2730250
|
37
|
+
description: ! 'Despamilator is a plugin based spam detector designed for use on your
|
38
|
+
web forms borne out of two annoyances:
|
39
|
+
|
40
|
+
Spam being submitted in my web forms and CAPTCHAS being intrusive. Despamilator
|
41
|
+
will apply
|
42
|
+
|
43
|
+
some commonly used heuristics from the world of anti-spam to help you decide whether
|
44
|
+
your users are human or machine.'
|
45
|
+
email:
|
54
46
|
- moowahaha@hotmail.com
|
55
47
|
executables: []
|
56
|
-
|
57
48
|
extensions: []
|
58
|
-
|
59
|
-
extra_rdoc_files:
|
49
|
+
extra_rdoc_files:
|
60
50
|
- History.txt
|
61
51
|
- Manifest.txt
|
62
52
|
- PostInstall.txt
|
63
|
-
|
53
|
+
- conf/unusual_characters.txt
|
54
|
+
files:
|
64
55
|
- .rspec
|
65
56
|
- .rvmrc
|
66
57
|
- Gemfile
|
@@ -70,38 +61,38 @@ files:
|
|
70
61
|
- PostInstall.txt
|
71
62
|
- README.rdoc
|
72
63
|
- Rakefile
|
64
|
+
- conf/unusual_characters.txt
|
73
65
|
- despamilator.gemspec
|
74
66
|
- lib/despamilator.rb
|
75
67
|
- lib/despamilator/filter.rb
|
76
|
-
- lib/despamilator/filter/funky_consonant.rb
|
77
68
|
- lib/despamilator/filter/gtubs_test_filter.rb
|
78
69
|
- lib/despamilator/filter/html_tags.rb
|
79
70
|
- lib/despamilator/filter/ip_address_url.rb
|
80
71
|
- lib/despamilator/filter/long_words.rb
|
81
|
-
- lib/despamilator/filter/naughty_q.rb
|
82
72
|
- lib/despamilator/filter/naughty_words.rb
|
83
73
|
- lib/despamilator/filter/numbers_and_words.rb
|
84
74
|
- lib/despamilator/filter/script_tag.rb
|
85
75
|
- lib/despamilator/filter/shouting.rb
|
86
76
|
- lib/despamilator/filter/square_brackets.rb
|
87
77
|
- lib/despamilator/filter/trailing_number.rb
|
78
|
+
- lib/despamilator/filter/unusual_characters.rb
|
88
79
|
- lib/despamilator/filter/urls.rb
|
89
80
|
- lib/despamilator/filter_base.rb
|
90
81
|
- scripts/despamilator_score.rb
|
82
|
+
- scripts/from_file.rb
|
91
83
|
- spec/despamilator_spec.rb
|
92
84
|
- spec/filter_base_spec.rb
|
93
|
-
- spec/filters/funky_consonant_spec.rb
|
94
85
|
- spec/filters/gtubs_test_filter_spec.rb
|
95
86
|
- spec/filters/html_tags_spec.rb
|
96
87
|
- spec/filters/ip_address_url_spec.rb
|
97
88
|
- spec/filters/long_words_spec.rb
|
98
|
-
- spec/filters/naughty_q_spec.rb
|
99
89
|
- spec/filters/naughty_words_spec.rb
|
100
90
|
- spec/filters/numbers_and_words_spec.rb
|
101
91
|
- spec/filters/script_tag_spec.rb
|
102
92
|
- spec/filters/shouting_spec.rb
|
103
93
|
- spec/filters/square_brackets_spec.rb
|
104
94
|
- spec/filters/trailing_number_spec.rb
|
95
|
+
- spec/filters/unusual_characters_spec.rb
|
105
96
|
- spec/filters/urls_spec.rb
|
106
97
|
- spec/helpers/corpus_helper.rb
|
107
98
|
- spec/helpers/filter_helper.rb
|
@@ -110,35 +101,30 @@ files:
|
|
110
101
|
has_rdoc: true
|
111
102
|
homepage: http://github.com/moowahaha/despamilator
|
112
103
|
licenses: []
|
113
|
-
|
114
104
|
post_install_message: PostInstall.txt
|
115
|
-
rdoc_options:
|
105
|
+
rdoc_options:
|
116
106
|
- --main
|
117
107
|
- README.rdoc
|
118
|
-
require_paths:
|
108
|
+
require_paths:
|
119
109
|
- lib
|
120
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
110
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
121
111
|
none: false
|
122
|
-
requirements:
|
123
|
-
- -
|
124
|
-
- !ruby/object:Gem::Version
|
125
|
-
|
126
|
-
|
127
|
-
version: "0"
|
128
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
112
|
+
requirements:
|
113
|
+
- - ! '>='
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
version: '0'
|
116
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
129
117
|
none: false
|
130
|
-
requirements:
|
131
|
-
- -
|
132
|
-
- !ruby/object:Gem::Version
|
133
|
-
|
134
|
-
- 0
|
135
|
-
version: "0"
|
118
|
+
requirements:
|
119
|
+
- - ! '>='
|
120
|
+
- !ruby/object:Gem::Version
|
121
|
+
version: '0'
|
136
122
|
requirements: []
|
137
|
-
|
138
123
|
rubyforge_project: despamilator
|
139
|
-
rubygems_version: 1.
|
124
|
+
rubygems_version: 1.5.2
|
140
125
|
signing_key:
|
141
126
|
specification_version: 3
|
142
|
-
summary:
|
127
|
+
summary: ! 'Despamilator is a plugin based spam detector designed for use on your
|
128
|
+
web forms borne out of two annoyances: Spam being submitted in my web forms and
|
129
|
+
CAPTCHAS being intrusive'
|
143
130
|
test_files: []
|
144
|
-
|
@@ -1,31 +0,0 @@
|
|
1
|
-
require 'despamilator/filter_base'
|
2
|
-
|
3
|
-
module DespamilatorFilter
|
4
|
-
|
5
|
-
class FunkyConsonant < Despamilator::FilterBase
|
6
|
-
|
7
|
-
def name
|
8
|
-
'Funky Consonant'
|
9
|
-
end
|
10
|
-
|
11
|
-
def description
|
12
|
-
'Detects and scores each occurrence of a consonant next to an unlikely character'
|
13
|
-
end
|
14
|
-
|
15
|
-
def parse text
|
16
|
-
text.downcase!
|
17
|
-
|
18
|
-
consonant_pairs.each do |pair|
|
19
|
-
[pair, pair.reverse].each do |combo_pair|
|
20
|
-
self.append_score = 0.05 unless text.scan(/#{combo_pair}/).empty?
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
def consonant_pairs
|
26
|
-
%w{ zt gb vk vt jk mj dm jm xz bn }
|
27
|
-
end
|
28
|
-
|
29
|
-
end
|
30
|
-
|
31
|
-
end
|
@@ -1,31 +0,0 @@
|
|
1
|
-
require 'despamilator/filter_base'
|
2
|
-
|
3
|
-
module DespamilatorFilter
|
4
|
-
|
5
|
-
class NaughtyQ < Despamilator::FilterBase
|
6
|
-
|
7
|
-
def name
|
8
|
-
'Naughty Q'
|
9
|
-
end
|
10
|
-
|
11
|
-
def description
|
12
|
-
'Detects possible misuse of the letter Q (English language)'
|
13
|
-
end
|
14
|
-
|
15
|
-
def parse text
|
16
|
-
post_matches = text.downcase.scan(/q(\w|\d)/)
|
17
|
-
pre_matches = text.downcase.scan(/(\w|\d)q/)
|
18
|
-
|
19
|
-
matches = post_matches + pre_matches
|
20
|
-
|
21
|
-
return unless matches
|
22
|
-
|
23
|
-
matches.each do |match|
|
24
|
-
match = match.first
|
25
|
-
self.append_score = 0.2 unless match == 'u' or match == 'a' or match == 'k'
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
end
|
30
|
-
|
31
|
-
end
|