despamilator 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/Manifest.txt +44 -0
- data/PostInstall.txt +1 -0
- data/README.rdoc +107 -0
- data/Rakefile +33 -0
- data/despamilator.gemspec +42 -0
- data/lib/despamilator/filter/html_tags.rb +116 -0
- data/lib/despamilator/filter/naughty_q.rb +17 -0
- data/lib/despamilator/filter/numbers_and_words.rb +33 -0
- data/lib/despamilator/filter/script_tag.rb +13 -0
- data/lib/despamilator/filter.rb +52 -0
- data/lib/despamilator/filter_base.rb +37 -0
- data/lib/despamilator.rb +19 -0
- data/pkg/despamilator-0.1/History.txt +4 -0
- data/pkg/despamilator-0.1/Manifest.txt +21 -0
- data/pkg/despamilator-0.1/PostInstall.txt +1 -0
- data/pkg/despamilator-0.1/README.rdoc +107 -0
- data/pkg/despamilator-0.1/Rakefile +33 -0
- data/pkg/despamilator-0.1/despamilator.gemspec +42 -0
- data/pkg/despamilator-0.1/lib/despamilator/filter/html_tags.rb +116 -0
- data/pkg/despamilator-0.1/lib/despamilator/filter/naughty_q.rb +17 -0
- data/pkg/despamilator-0.1/lib/despamilator/filter/numbers_and_words.rb +33 -0
- data/pkg/despamilator-0.1/lib/despamilator/filter/script_tag.rb +13 -0
- data/pkg/despamilator-0.1/lib/despamilator/filter.rb +52 -0
- data/pkg/despamilator-0.1/lib/despamilator/filter_base.rb +37 -0
- data/pkg/despamilator-0.1/lib/despamilator.rb +19 -0
- data/pkg/despamilator-0.1/spec/despamilator_spec.rb +15 -0
- data/pkg/despamilator-0.1/spec/filters/html_tags_spec.rb +144 -0
- data/pkg/despamilator-0.1/spec/filters/naughty_q_spec.rb +39 -0
- data/pkg/despamilator-0.1/spec/filters/numbers_and_words_spec.rb +59 -0
- data/pkg/despamilator-0.1/spec/filters/script_tag_spec.rb +32 -0
- data/pkg/despamilator-0.1/spec/spec.opts +1 -0
- data/pkg/despamilator-0.1/spec/spec_helper.rb +10 -0
- data/pkg/despamilator-0.1/tasks/rspec.rake +21 -0
- data/pkg/despamilator-0.1.gem +0 -0
- data/pkg/despamilator-0.1.tgz +0 -0
- data/spec/despamilator_spec.rb +15 -0
- data/spec/filters/html_tags_spec.rb +144 -0
- data/spec/filters/naughty_q_spec.rb +39 -0
- data/spec/filters/numbers_and_words_spec.rb +59 -0
- data/spec/filters/script_tag_spec.rb +32 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +10 -0
- data/tasks/rspec.rake +21 -0
- metadata +155 -0
@@ -0,0 +1,52 @@
|
|
1
|
+
class Despamilator
  # Runs every filter script found in the "filter" directory next to this
  # file against a piece of text and collects the outcome.
  #
  # matches:: the filter instances whose +matched?+ returned true
  # score::   the sum of the scores of those matching filters
  class Filter
    attr_accessor :matches, :score

    # Loads all filter scripts and immediately runs them against +text+.
    # +text+ is converted with +to_s+, so non-String input is accepted.
    def initialize(text)
      # Evaluated filter containers, keyed by class name and shared by all
      # instances so each script is only read and evaluated once.
      @@loaded ||= {}
      @filters = []
      @matches = []
      @score   = 0
      load_filters text
      run_filters
    end

    private

    # Instantiates one filter object per script in the "filter" directory,
    # handing each its own copy of the text and the script's base name.
    def load_filters(text)
      # Sorted so filters run (and appear in +matches+) in a stable order
      # regardless of platform or Ruby version.
      Dir.glob(File.dirname(__FILE__) + "/filter/*.rb").sort.each do |filter_file|
        filter_name = classify_filename(filter_file)
        container   = (@@loaded[filter_name] ||= evaluate_filter(filter_name, filter_file))

        @filters.push(container.const_get(filter_name).new(text.to_s.dup, File.basename(filter_file)))
      end
    end

    # Reads a filter script and evaluates its contents as the body of a class
    # named +filter_name+ that inherits from Despamilator::FilterBase. The
    # class is defined inside a fresh anonymous container class, which is
    # returned.
    #
    # NOTE(review): the script is evaluated as code; this is only appropriate
    # because the scripts come from the directory beside this file.
    def evaluate_filter(filter_name, filter_file)
      # File.read opens read-only and closes the handle once done.
      filter_code = File.read(filter_file)
      container   = Class.new
      container.class_eval(
        "require 'despamilator/filter_base'\nclass #{filter_name} < Despamilator::FilterBase\n#{filter_code}\nend"
      )
      container
    end

    # Calls +parse+ on every loaded filter, then records those that matched
    # and adds their score to the running total.
    def run_filters
      @filters.each do |filter|
        filter.parse

        if filter.matched?
          @matches.push(filter)
          @score += filter.score
        end
      end
    end

    # Converts a script path such as "filter/naughty_q.rb" into the class
    # name "NaughtyQ": the ".rb" suffix is dropped and each
    # underscore-separated part is capitalized.
    def classify_filename(filename)
      File.basename(filename).sub(/\.rb\z/, '').split('_').map { |part| part.capitalize }.join
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
class Despamilator
  # Parent class for individual filters. It holds the text under inspection
  # and the running score/match count, and supplies placeholder
  # implementations of +name+, +description+ and +parse+ that raise until a
  # concrete filter overrides them.
  class FilterBase
    attr_accessor :text, :score, :filename, :matches

    # text::     the text this filter will inspect
    # filename:: name of the script the filter came from; used in the error
    #            messages raised by the placeholder methods
    def initialize(text, filename)
      @matches  = 0
      @filename = filename
      @score    = 0
      @text     = text
      @matched  = false
    end

    # Placeholder; raises RuntimeError unless overridden.
    def description
      raise "No description defined in #{filename}"
    end

    # Placeholder; raises RuntimeError unless overridden.
    #
    # The argument is optional so that a no-argument call reaches the
    # descriptive error below rather than failing with an ArgumentError.
    def parse(blah = nil)
      raise "No parser defined in #{filename}"
    end

    # Placeholder; raises RuntimeError unless overridden.
    def name
      raise "No name defined in #{filename}"
    end

    # True once any positive score has been accumulated.
    def matched?
      @score > 0
    end

    protected

    # Records one more match and adds +new_score+ to the filter's score.
    def append_score=(new_score)
      @matches += 1
      @score += new_score
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
$:.unshift(File.dirname(__FILE__)) unless $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
2
|
+
|
3
|
+
require 'despamilator/filter'
|
4
|
+
|
5
|
+
# Public entry point: wraps a Despamilator::Filter run over a piece of text
# and exposes its results.
class Despamilator
  VERSION = "0.2"

  # Builds a Despamilator::Filter for +text+ and keeps it for later queries.
  def initialize(text)
    @filter_run = Despamilator::Filter.new(text)
  end

  # Score reported by the underlying filter run.
  def score
    @filter_run.score
  end

  # Matches reported by the underlying filter run.
  def matched_by
    @filter_run.matches
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper.rb'
|
2
|
+
|
3
|
+
# Sanity checks: ordinary text must not be flagged.
describe Despamilator do
  before(:each) { @dspam = Despamilator.new('this text is absolutely fine') }

  it "should return a zero score for fine text" do
    @dspam.score.should == 0
  end

  it "should return no matching filter for fine text" do
    @dspam.matched_by.should be_empty
  end
end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper.rb'
|
2
|
+
|
3
|
+
# Specs for the HtmlTags filter: every listed tag name, in upper and lower
# case and in several opening/closing spellings, is expected to score 0.3.
context "HtmlTags" do
  describe "filtering" do
    %w[
      !-- !DOCTYPE a abbr acronym address applet area b base basefont bdo big
      blockquote body br button caption center cite code col colgroup dd del
      dfn dir div dl dt em fieldset font form frame frameset h1 h2 h3 h4 h5 h6
      head hr html i iframe img input ins isindex kbd label legend li link map
      menu meta noframes noscript object ol optgroup option p param pre q s
      samp select small span strike strong style sub sup table tbody td
      textarea tfoot th thead title tr tt u ul var xmp
    ].each do |tag_name|
      # Each nesting level gets its own variable name so no block parameter
      # shadows (or, on Ruby 1.8, overwrites) the one outside it.
      [tag_name.upcase, tag_name.downcase].each do |cased_tag|
        [
          "<#{cased_tag}",
          "#{cased_tag}/>",
          "<#{cased_tag}/>",
          "< #{cased_tag} ",
          "#{cased_tag} />",
          "<\n#{cased_tag}\n/>",
          "<\n#{cased_tag} ",
          "#{cased_tag}\n/>",
          "<\r#{cased_tag}\r/>"
        ].each do |fragment|
          it "should detect '#{fragment}'" do
            dspam = Despamilator.new(fragment)
            dspam.score.should == 0.3
          end
        end
      end
    end

    describe 'attributes' do
      before :all do
        @dspam = Despamilator.new('<xmp>').matched_by.first
      end

      it "should have a name" do
        @dspam.name.should == 'Detects HTML tags in text'
      end

      it "should have a description" do
        @dspam.description.should == 'Searches for various HTML tags'
      end

      it "should have a filename" do
        @dspam.filename.should == 'html_tags.rb'
      end
    end

    describe 'bug fixes' do
      it "should detect an h1" do
        Despamilator.new('<h1>TITLE!!</h1>').score.should == 0.3
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper.rb'
|
2
|
+
|
3
|
+
# Specs for the NaughtyQ filter and the attributes it exposes.
describe "NaughtyQ" do
  it "should return a score for 1 misplaced q" do
    Despamilator.new('qtu').score.should == 0.2
  end

  describe 'attributes' do
    before(:each) { @filter = Despamilator.new('qtqt').matched_by.first }

    it "should have a filename" do
      @filter.filename.should == 'naughty_q.rb'
    end

    it "should have a name" do
      @filter.name.should == 'Naughty Q'
    end

    it "should have a description" do
      @filter.description.should == 'Detects possible misuse of the letter Q (English language)'
    end

    it "should have a number of matches" do
      @filter.matches.should == 2
    end

    it "should have a score" do
      @filter.score.should == 0.4
    end
  end

  it "should score more for 3 misplaced q's" do
    # Compared as strings, exactly as the expectation was originally written.
    Despamilator.new('qtuqsq').score.to_s.should == 0.4.to_s
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper.rb'
|
2
|
+
|
3
|
+
# Specs for the NumbersAndWords filter: plain numbers, ordinals and HTML
# header tag names must score 0, while digit/letter mixes score 0.1 or 0.2.
describe "NumbersAndWords" do
  harmless_numbers = [1, 4, 10, 100000, '1,000,000', '1st', '2nd', '3rd', '4th', '5th', '6th', '10th', '122nd']
  single_combos    = ['wanga x5 mool', '4ghk', 'XTHL9']
  multiple_combos  = ['4wanga x5 mool', '4g6hk', 'XT7HL9', '77th8nd']

  harmless_numbers.each do |number|
    it "should return a blank for a #{number}" do
      Despamilator.new(number).score.should == 0
    end
  end

  single_combos.each do |string|
    it "should detect suspicious number word combos such as #{string}" do
      Despamilator.new(string).score.should == 0.1
    end
  end

  multiple_combos.each do |string|
    it "should detect multiple suspicious number word combos such as #{string}" do
      Despamilator.new(string).score.should == 0.2
    end
  end

  (1..6).each do |tag_no|
    header_tag = "h#{tag_no}"

    it "should ignore html header tag #{header_tag}" do
      Despamilator.new(header_tag).score.should == 0
    end
  end

  describe 'attributes' do
    before(:each) { @filter = Despamilator.new('X5T').matched_by.first }

    it "should have a filename" do
      @filter.filename.should == 'numbers_and_words.rb'
    end

    it "should have a name" do
      @filter.name.should == 'Numbers next to words'
    end

    it "should have a description" do
      @filter.description.should == 'Detects unusual number/word combinations'
    end

    it "should have a number of matches" do
      @filter.matches.should == 1
    end

    it "should have a score" do
      @filter.score.should == 0.1
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper.rb'
|
2
|
+
|
3
|
+
# Specs for the ScriptTag filter: each script-tag spelling, in upper and
# lower case, is expected to score 1.
context "ScriptTag" do
  describe "detecting various script tags" do
    ['<script type="whatever">', '<script></script>', '</script>', '<script>', "<script\n>"].each do |markup|
      # The inner block uses its own parameter name so it does not shadow
      # (or, on Ruby 1.8, overwrite) the outer one.
      [markup.upcase, markup.downcase].each do |cased_markup|
        it "should detect '#{cased_markup}' of a script tag" do
          dspam = Despamilator.new(cased_markup)
          dspam.score.should == 1
        end
      end
    end
  end

  describe 'attributes' do
    before :all do
      @dspam = Despamilator.new('<script>').matched_by.first
    end

    it "should have a name" do
      @dspam.name.should == 'Detects script tags in text'
    end

    it "should have a description" do
      @dspam.description.should == 'Searches for variations for the HTML script tag'
    end

    it "should have a filename" do
      @dspam.filename.should == 'script_tag.rb'
    end
  end
end
|
@@ -0,0 +1 @@
|
|
1
|
+
--colour
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# Load the "spec" library, retrying through RubyGems (unless NO_RUBYGEMS is
# set) when the first attempt fails.
begin
  require 'spec'
rescue LoadError
  require 'rubygems' unless ENV['NO_RUBYGEMS']
  require 'spec'
end

# Without the rake integration there is nothing to define: print install
# instructions and exit with status 0.
begin
  require 'spec/rake/spectask'
rescue LoadError
  puts <<-EOS
To use rspec for testing you must install rspec gem:
gem install rspec
EOS
  exit(0)
end

# The task picks up every *_spec.rb file below spec/, so the description
# names that directory.
desc "Run the specs under spec"
Spec::Rake::SpecTask.new do |t|
  t.spec_opts = ['--options', "spec/spec.opts"]
  t.spec_files = FileList['spec/**/*_spec.rb']
end
|