despamilator 1.1 → 2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +0 -1
- data/History.txt +7 -0
- data/Manifest.txt +4 -4
- data/conf/unusual_characters.txt +6674 -0
- data/despamilator.gemspec +5 -6
- data/lib/despamilator.rb +1 -1
- data/lib/despamilator/filter/html_tags.rb +1 -1
- data/lib/despamilator/filter/naughty_words.rb +3 -3
- data/lib/despamilator/filter/unusual_characters.rb +47 -0
- data/lib/despamilator/filter/urls.rb +2 -2
- data/scripts/from_file.rb +26 -0
- data/spec/filters/html_tags_spec.rb +3 -3
- data/spec/filters/unusual_characters_spec.rb +9 -0
- data/spec/filters/urls_spec.rb +3 -3
- metadata +53 -67
- data/lib/despamilator/filter/funky_consonant.rb +0 -31
- data/lib/despamilator/filter/naughty_q.rb +0 -31
- data/spec/filters/funky_consonant_spec.rb +0 -9
- data/spec/filters/naughty_q_spec.rb +0 -11
data/despamilator.gemspec
CHANGED
@@ -2,27 +2,26 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{despamilator}
|
5
|
-
s.version = "1.1"
|
5
|
+
s.version = "2.0"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Stephen Hardisty"]
|
9
|
-
s.date = %q{2011-01-26}
|
9
|
+
s.date = %q{2011-05-24}
|
10
10
|
s.description = %q{Despamilator is a plugin based spam detector designed for use on your web forms borne out of two annoyances:
|
11
11
|
Spam being submitted in my web forms and CAPTCHAS being intrusive. Despamilator will apply
|
12
12
|
some commonly used heuristics from the world of anti-spam to help you decide whether your users are human or machine.}
|
13
13
|
s.email = ["moowahaha@hotmail.com"]
|
14
|
-
s.extra_rdoc_files = ["History.txt", "Manifest.txt", "PostInstall.txt"]
|
15
|
-
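s.files = [".rspec", ".rvmrc", "Gemfile", "Gemfile.lock", "History.txt", "Manifest.txt", "PostInstall.txt", "README.rdoc", "Rakefile", "despamilator.gemspec", "lib/despamilator.rb", "lib/despamilator/filter.rb", "lib/despamilator/filter/funky_consonant.rb", "lib/despamilator/filter/gtubs_test_filter.rb", "lib/despamilator/filter/html_tags.rb", "lib/despamilator/filter/ip_address_url.rb", "lib/despamilator/filter/long_words.rb", "lib/despamilator/filter/naughty_q.rb", "lib/despamilator/filter/naughty_words.rb", "lib/despamilator/filter/numbers_and_words.rb", "lib/despamilator/filter/script_tag.rb", "lib/despamilator/filter/shouting.rb", "lib/despamilator/filter/square_brackets.rb", "lib/despamilator/filter/trailing_number.rb", "lib/despamilator/filter/urls.rb", "lib/despamilator/filter_base.rb", "scripts/despamilator_score.rb", "spec/despamilator_spec.rb", "spec/filter_base_spec.rb", "spec/filters/funky_consonant_spec.rb", "spec/filters/gtubs_test_filter_spec.rb", "spec/filters/html_tags_spec.rb", "spec/filters/ip_address_url_spec.rb", "spec/filters/long_words_spec.rb", "spec/filters/naughty_q_spec.rb", "spec/filters/naughty_words_spec.rb", "spec/filters/numbers_and_words_spec.rb", "spec/filters/script_tag_spec.rb", "spec/filters/shouting_spec.rb", "spec/filters/square_brackets_spec.rb", "spec/filters/trailing_number_spec.rb", "spec/filters/urls_spec.rb", "spec/helpers/corpus_helper.rb", "spec/helpers/filter_helper.rb", "spec/helpers/spec_helper.rb", "tasks/test.rake"]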
|
14
|
+
s.extra_rdoc_files = ["History.txt", "Manifest.txt", "PostInstall.txt", "conf/unusual_characters.txt"]
|
15
|
+
s.files = [".rspec", ".rvmrc", "Gemfile", "Gemfile.lock", "History.txt", "Manifest.txt", "PostInstall.txt", "README.rdoc", "Rakefile", "conf/unusual_characters.txt", "despamilator.gemspec", "lib/despamilator.rb", "lib/despamilator/filter.rb", "lib/despamilator/filter/gtubs_test_filter.rb", "lib/despamilator/filter/html_tags.rb", "lib/despamilator/filter/ip_address_url.rb", "lib/despamilator/filter/long_words.rb", "lib/despamilator/filter/naughty_words.rb", "lib/despamilator/filter/numbers_and_words.rb", "lib/despamilator/filter/script_tag.rb", "lib/despamilator/filter/shouting.rb", "lib/despamilator/filter/square_brackets.rb", "lib/despamilator/filter/trailing_number.rb", "lib/despamilator/filter/unusual_characters.rb", "lib/despamilator/filter/urls.rb", "lib/despamilator/filter_base.rb", "scripts/despamilator_score.rb", "scripts/from_file.rb", "spec/despamilator_spec.rb", "spec/filter_base_spec.rb", "spec/filters/gtubs_test_filter_spec.rb", "spec/filters/html_tags_spec.rb", "spec/filters/ip_address_url_spec.rb", "spec/filters/long_words_spec.rb", "spec/filters/naughty_words_spec.rb", "spec/filters/numbers_and_words_spec.rb", "spec/filters/script_tag_spec.rb", "spec/filters/shouting_spec.rb", "spec/filters/square_brackets_spec.rb", "spec/filters/trailing_number_spec.rb", "spec/filters/unusual_characters_spec.rb", "spec/filters/urls_spec.rb", "spec/helpers/corpus_helper.rb", "spec/helpers/filter_helper.rb", "spec/helpers/spec_helper.rb", "tasks/test.rake"]
|
16
16
|
s.homepage = %q{http://github.com/moowahaha/despamilator}
|
17
17
|
s.post_install_message = %q{PostInstall.txt}
|
18
18
|
s.rdoc_options = ["--main", "README.rdoc"]
|
19
19
|
s.require_paths = ["lib"]
|
20
20
|
s.rubyforge_project = %q{despamilator}
|
21
|
-
s.rubygems_version = %q{1.
|
21
|
+
s.rubygems_version = %q{1.5.2}
|
22
22
|
s.summary = %q{Despamilator is a plugin based spam detector designed for use on your web forms borne out of two annoyances: Spam being submitted in my web forms and CAPTCHAS being intrusive}
|
23
23
|
|
24
24
|
if s.respond_to? :specification_version then
|
25
|
-
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
26
25
|
s.specification_version = 3
|
27
26
|
|
28
27
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
data/lib/despamilator.rb
CHANGED
@@ -16,12 +16,13 @@ module DespamilatorFilter
|
|
16
16
|
text.downcase!
|
17
17
|
|
18
18
|
naughty_words.each do |word|
|
19
|
-
self.append_score = 0.1 if text =~ /\b#{word}
|
19
|
+
self.append_score = 0.1 if text =~ /\b#{word}s?\b/
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
23
|
def naughty_words
|
24
24
|
%w{
|
25
|
+
underage
|
25
26
|
penis
|
26
27
|
viagra
|
27
28
|
bondage
|
@@ -30,10 +31,9 @@ module DespamilatorFilter
|
|
30
31
|
shit
|
31
32
|
dick
|
32
33
|
tits
|
33
|
-
sex
|
34
34
|
nude
|
35
35
|
dicks
|
36
|
-
|
36
|
+
shemale
|
37
37
|
dildo
|
38
38
|
porn
|
39
39
|
cock
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'despamilator/filter_base'
|
2
|
+
|
3
|
+
module DespamilatorFilter
|
4
|
+
|
5
|
+
class UnusualCharacters < Despamilator::FilterBase
|
6
|
+
|
7
|
+
def name
|
8
|
+
'Unusual Characters'
|
9
|
+
end
|
10
|
+
|
11
|
+
def description
|
12
|
+
'Detects and scores each occurrence of an unusual 2 or 3 character combination'
|
13
|
+
end
|
14
|
+
|
15
|
+
def parse text
|
16
|
+
initialize_combos
|
17
|
+
tokenize(text).each do |token|
|
18
|
+
self.append_score = 0.05 if @@combos[token.to_sym]
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
private
|
23
|
+
|
24
|
+
def tokenize text
|
25
|
+
tokens = []
|
26
|
+
text.downcase.split(/[^a-z]/).each do |word|
|
27
|
+
word.chars.each_with_index do |c, i|
|
28
|
+
substr = word[i,i+3]
|
29
|
+
tokens << substr.to_sym if substr.length == 3
|
30
|
+
tokens << substr[0,2].to_sym if substr.length > 1
|
31
|
+
end
|
32
|
+
end
|
33
|
+
tokens
|
34
|
+
end
|
35
|
+
|
36
|
+
def initialize_combos
|
37
|
+
@@combos ||= {}
|
38
|
+
return @@combos unless @@combos.empty?
|
39
|
+
|
40
|
+
File.open(File.join(File.dirname(__FILE__), %w{.. .. .. conf unusual_characters.txt}), 'r').each do |line|
|
41
|
+
@@combos[line.strip.to_sym] = true
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
end
|
46
|
+
|
47
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'cgi'
|
4
|
+
|
5
|
+
slots = {};
|
6
|
+
|
7
|
+
1.upto(10000) do |i|
|
8
|
+
slots[i] = true
|
9
|
+
end
|
10
|
+
|
11
|
+
dir = './spec/spam_corpus/'
|
12
|
+
|
13
|
+
Dir[dir + '*.gz'].each do |file|
|
14
|
+
slots.delete(file.scan(/\d+/).first.to_i)
|
15
|
+
end
|
16
|
+
|
17
|
+
slots = slots.keys.sort
|
18
|
+
|
19
|
+
File.open(ARGV[0] || raise).each do |line|
|
20
|
+
txt = dir + "#{slots.shift}.txt"
|
21
|
+
File.open(txt, 'w') do |fh|
|
22
|
+
fh.puts CGI.unescapeHTML(line)
|
23
|
+
end
|
24
|
+
|
25
|
+
`gzip #{txt}`
|
26
|
+
end
|
@@ -5,8 +5,8 @@ describe DespamilatorFilter::HtmlTags do
|
|
5
5
|
|
6
6
|
despamilator_should_apply_the_filter_for('<xmp>')
|
7
7
|
|
8
|
-
a_single_match_of('<xmp>', should_score: 0.
|
9
|
-
a_multiple_match_of('<h1></h1> <h2></h2>', should_score: [
|
8
|
+
a_single_match_of('<xmp>', should_score: 0.6)
|
9
|
+
a_multiple_match_of('<h1></h1> <h2></h2>', should_score: [1.2, 2.times])
|
10
10
|
|
11
11
|
[
|
12
12
|
'!--',
|
@@ -117,7 +117,7 @@ describe DespamilatorFilter::HtmlTags do
|
|
117
117
|
it "should detect '#{tag}'" do
|
118
118
|
dspam = DespamilatorFilter::HtmlTags.new
|
119
119
|
dspam.parse(tag)
|
120
|
-
dspam.score.should == 0.
|
120
|
+
dspam.score.should == 0.6
|
121
121
|
end
|
122
122
|
|
123
123
|
end
|
@@ -0,0 +1,9 @@
|
|
1
|
+
describe DespamilatorFilter::UnusualCharacters do
|
2
|
+
the_name_should_be 'Unusual Characters'
|
3
|
+
the_description_should_be 'Detects and scores each occurrence of an unusual 2 or 3 character combination'
|
4
|
+
|
5
|
+
despamilator_should_apply_the_filter_for('sx')
|
6
|
+
|
7
|
+
a_single_match_of('sx', should_score: 0.05)
|
8
|
+
a_multiple_match_of('sxsx', should_score: [0.1, 2.times])
|
9
|
+
end
|
data/spec/filters/urls_spec.rb
CHANGED
@@ -3,9 +3,9 @@ describe DespamilatorFilter::URLs do
|
|
3
3
|
the_name_should_be 'URLs'
|
4
4
|
the_description_should_be 'Detects each url in a string'
|
5
5
|
|
6
|
-
despamilator_should_apply_the_filter_for('
|
6
|
+
despamilator_should_apply_the_filter_for('http://www.blah.com')
|
7
7
|
|
8
|
-
a_single_match_of('http://www.blah.com', should_score: 0.
|
9
|
-
a_multiple_match_of('http://www.blah.com http://www.poop.com', should_score: [
|
8
|
+
a_single_match_of('http://www.blah.com', should_score: 0.4)
|
9
|
+
a_multiple_match_of('http://www.blah.com http://www.poop.com', should_score: [0.8, 2.times])
|
10
10
|
|
11
11
|
end
|
metadata
CHANGED
@@ -1,66 +1,57 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: despamilator
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
|
6
|
-
- 1
|
7
|
-
- 1
|
8
|
-
version: "1.1"
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '2.0'
|
5
|
+
prerelease:
|
9
6
|
platform: ruby
|
10
|
-
authors:
|
7
|
+
authors:
|
11
8
|
- Stephen Hardisty
|
12
9
|
autorequire:
|
13
10
|
bindir: bin
|
14
11
|
cert_chain: []
|
15
|
-
|
16
|
-
date: 2011-01-26 00:00:00 +11:00
|
12
|
+
date: 2011-05-24 00:00:00.000000000 +10:00
|
17
13
|
default_executable:
|
18
|
-
dependencies:
|
19
|
-
- !ruby/object:Gem::Dependency
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
20
16
|
name: rubyforge
|
21
|
-
|
22
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
17
|
+
requirement: &2730490 !ruby/object:Gem::Requirement
|
23
18
|
none: false
|
24
|
-
requirements:
|
25
|
-
- -
|
26
|
-
- !ruby/object:Gem::Version
|
27
|
-
segments:
|
28
|
-
- 2
|
29
|
-
- 0
|
30
|
-
- 4
|
19
|
+
requirements:
|
20
|
+
- - ! '>='
|
21
|
+
- !ruby/object:Gem::Version
|
31
22
|
version: 2.0.4
|
32
23
|
type: :development
|
33
|
-
version_requirements: *id001
|
34
|
-
- !ruby/object:Gem::Dependency
|
35
|
-
name: hoe
|
36
24
|
prerelease: false
|
37
|
-
|
25
|
+
version_requirements: *2730490
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: hoe
|
28
|
+
requirement: &2730250 !ruby/object:Gem::Requirement
|
38
29
|
none: false
|
39
|
-
requirements:
|
40
|
-
- -
|
41
|
-
- !ruby/object:Gem::Version
|
42
|
-
segments:
|
43
|
-
- 2
|
44
|
-
- 7
|
45
|
-
- 0
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
46
33
|
version: 2.7.0
|
47
34
|
type: :development
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: *2730250
|
37
|
+
description: ! 'Despamilator is a plugin based spam detector designed for use on your
|
38
|
+
web forms borne out of two annoyances:
|
39
|
+
|
40
|
+
Spam being submitted in my web forms and CAPTCHAS being intrusive. Despamilator
|
41
|
+
will apply
|
42
|
+
|
43
|
+
some commonly used heuristics from the world of anti-spam to help you decide whether
|
44
|
+
your users are human or machine.'
|
45
|
+
email:
|
54
46
|
- moowahaha@hotmail.com
|
55
47
|
executables: []
|
56
|
-
|
57
48
|
extensions: []
|
58
|
-
|
59
|
-
extra_rdoc_files:
|
49
|
+
extra_rdoc_files:
|
60
50
|
- History.txt
|
61
51
|
- Manifest.txt
|
62
52
|
- PostInstall.txt
|
63
|
-
|
53
|
+
- conf/unusual_characters.txt
|
54
|
+
files:
|
64
55
|
- .rspec
|
65
56
|
- .rvmrc
|
66
57
|
- Gemfile
|
@@ -70,38 +61,38 @@ files:
|
|
70
61
|
- PostInstall.txt
|
71
62
|
- README.rdoc
|
72
63
|
- Rakefile
|
64
|
+
- conf/unusual_characters.txt
|
73
65
|
- despamilator.gemspec
|
74
66
|
- lib/despamilator.rb
|
75
67
|
- lib/despamilator/filter.rb
|
76
|
-
- lib/despamilator/filter/funky_consonant.rb
|
77
68
|
- lib/despamilator/filter/gtubs_test_filter.rb
|
78
69
|
- lib/despamilator/filter/html_tags.rb
|
79
70
|
- lib/despamilator/filter/ip_address_url.rb
|
80
71
|
- lib/despamilator/filter/long_words.rb
|
81
|
-
- lib/despamilator/filter/naughty_q.rb
|
82
72
|
- lib/despamilator/filter/naughty_words.rb
|
83
73
|
- lib/despamilator/filter/numbers_and_words.rb
|
84
74
|
- lib/despamilator/filter/script_tag.rb
|
85
75
|
- lib/despamilator/filter/shouting.rb
|
86
76
|
- lib/despamilator/filter/square_brackets.rb
|
87
77
|
- lib/despamilator/filter/trailing_number.rb
|
78
|
+
- lib/despamilator/filter/unusual_characters.rb
|
88
79
|
- lib/despamilator/filter/urls.rb
|
89
80
|
- lib/despamilator/filter_base.rb
|
90
81
|
- scripts/despamilator_score.rb
|
82
|
+
- scripts/from_file.rb
|
91
83
|
- spec/despamilator_spec.rb
|
92
84
|
- spec/filter_base_spec.rb
|
93
|
-
- spec/filters/funky_consonant_spec.rb
|
94
85
|
- spec/filters/gtubs_test_filter_spec.rb
|
95
86
|
- spec/filters/html_tags_spec.rb
|
96
87
|
- spec/filters/ip_address_url_spec.rb
|
97
88
|
- spec/filters/long_words_spec.rb
|
98
|
-
- spec/filters/naughty_q_spec.rb
|
99
89
|
- spec/filters/naughty_words_spec.rb
|
100
90
|
- spec/filters/numbers_and_words_spec.rb
|
101
91
|
- spec/filters/script_tag_spec.rb
|
102
92
|
- spec/filters/shouting_spec.rb
|
103
93
|
- spec/filters/square_brackets_spec.rb
|
104
94
|
- spec/filters/trailing_number_spec.rb
|
95
|
+
- spec/filters/unusual_characters_spec.rb
|
105
96
|
- spec/filters/urls_spec.rb
|
106
97
|
- spec/helpers/corpus_helper.rb
|
107
98
|
- spec/helpers/filter_helper.rb
|
@@ -110,35 +101,30 @@ files:
|
|
110
101
|
has_rdoc: true
|
111
102
|
homepage: http://github.com/moowahaha/despamilator
|
112
103
|
licenses: []
|
113
|
-
|
114
104
|
post_install_message: PostInstall.txt
|
115
|
-
rdoc_options:
|
105
|
+
rdoc_options:
|
116
106
|
- --main
|
117
107
|
- README.rdoc
|
118
|
-
require_paths:
|
108
|
+
require_paths:
|
119
109
|
- lib
|
120
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
110
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
121
111
|
none: false
|
122
|
-
requirements:
|
123
|
-
- -
|
124
|
-
- !ruby/object:Gem::Version
|
125
|
-
|
126
|
-
|
127
|
-
version: "0"
|
128
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
112
|
+
requirements:
|
113
|
+
- - ! '>='
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
version: '0'
|
116
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
129
117
|
none: false
|
130
|
-
requirements:
|
131
|
-
- -
|
132
|
-
- !ruby/object:Gem::Version
|
133
|
-
|
134
|
-
- 0
|
135
|
-
version: "0"
|
118
|
+
requirements:
|
119
|
+
- - ! '>='
|
120
|
+
- !ruby/object:Gem::Version
|
121
|
+
version: '0'
|
136
122
|
requirements: []
|
137
|
-
|
138
123
|
rubyforge_project: despamilator
|
139
|
-
rubygems_version: 1.
|
124
|
+
rubygems_version: 1.5.2
|
140
125
|
signing_key:
|
141
126
|
specification_version: 3
|
142
|
-
summary:
|
127
|
+
summary: ! 'Despamilator is a plugin based spam detector designed for use on your
|
128
|
+
web forms borne out of two annoyances: Spam being submitted in my web forms and
|
129
|
+
CAPTCHAS being intrusive'
|
143
130
|
test_files: []
|
144
|
-
|
@@ -1,31 +0,0 @@
|
|
1
|
-
require 'despamilator/filter_base'
|
2
|
-
|
3
|
-
module DespamilatorFilter
|
4
|
-
|
5
|
-
class FunkyConsonant < Despamilator::FilterBase
|
6
|
-
|
7
|
-
def name
|
8
|
-
'Funky Consonant'
|
9
|
-
end
|
10
|
-
|
11
|
-
def description
|
12
|
-
'Detects and scores each occurrence of a consonant next to an unlikely character'
|
13
|
-
end
|
14
|
-
|
15
|
-
def parse text
|
16
|
-
text.downcase!
|
17
|
-
|
18
|
-
consonant_pairs.each do |pair|
|
19
|
-
[pair, pair.reverse].each do |combo_pair|
|
20
|
-
self.append_score = 0.05 unless text.scan(/#{combo_pair}/).empty?
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
def consonant_pairs
|
26
|
-
%w{ zt gb vk vt jk mj dm jm xz bn }
|
27
|
-
end
|
28
|
-
|
29
|
-
end
|
30
|
-
|
31
|
-
end
|
@@ -1,31 +0,0 @@
|
|
1
|
-
require 'despamilator/filter_base'
|
2
|
-
|
3
|
-
module DespamilatorFilter
|
4
|
-
|
5
|
-
class NaughtyQ < Despamilator::FilterBase
|
6
|
-
|
7
|
-
def name
|
8
|
-
'Naughty Q'
|
9
|
-
end
|
10
|
-
|
11
|
-
def description
|
12
|
-
'Detects possible misuse of the letter Q (English language)'
|
13
|
-
end
|
14
|
-
|
15
|
-
def parse text
|
16
|
-
post_matches = text.downcase.scan(/q(\w|\d)/)
|
17
|
-
pre_matches = text.downcase.scan(/(\w|\d)q/)
|
18
|
-
|
19
|
-
matches = post_matches + pre_matches
|
20
|
-
|
21
|
-
return unless matches
|
22
|
-
|
23
|
-
matches.each do |match|
|
24
|
-
match = match.first
|
25
|
-
self.append_score = 0.2 unless match == 'u' or match == 'a' or match == 'k'
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
end
|
30
|
-
|
31
|
-
end
|