despamilator 2.0.1 → 2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +7 -0
- data/lib/despamilator.rb +38 -7
- data/lib/despamilator/filter.rb +39 -23
- data/lib/despamilator/filter/gtubs_test_filter.rb +4 -4
- data/lib/despamilator/filter/html_tags.rb +9 -7
- data/lib/despamilator/filter/ip_address_url.rb +6 -4
- data/lib/despamilator/filter/long_words.rb +7 -5
- data/lib/despamilator/filter/mixed_case.rb +21 -0
- data/lib/despamilator/filter/naughty_words.rb +5 -5
- data/lib/despamilator/filter/numbers_and_words.rb +19 -11
- data/lib/despamilator/filter/obfuscated_urls.rb +41 -0
- data/lib/despamilator/filter/prices.rb +19 -0
- data/lib/despamilator/filter/script_tag.rb +4 -4
- data/lib/despamilator/filter/shouting.rb +9 -6
- data/lib/despamilator/filter/spammy_tlds.rb +22 -0
- data/lib/despamilator/filter/square_brackets.rb +5 -5
- data/lib/despamilator/filter/trailing_number.rb +4 -4
- data/lib/despamilator/filter/unusual_characters.rb +5 -5
- data/lib/despamilator/filter/urls.rb +7 -9
- data/lib/despamilator/filter/very_long_domain_name.rb +27 -0
- data/lib/despamilator/filter/weird_punctuation.rb +44 -0
- data/lib/despamilator/subject.rb +30 -0
- data/lib/despamilator/subject/text.rb +32 -0
- data/lib/despamilator/version.rb +3 -0
- metadata +29 -75
- data/.rspec +0 -2
- data/.rvmrc +0 -1
- data/Gemfile +0 -12
- data/Gemfile.lock +0 -47
- data/Manifest.txt +0 -46
- data/PostInstall.txt +0 -1
- data/Rakefile +0 -39
- data/conf/unusual_characters.txt +0 -6674
- data/despamilator.gemspec +0 -38
- data/lib/despamilator/filter_base.rb +0 -82
- data/scripts/despamilator_score.rb +0 -25
- data/scripts/from_file.rb +0 -26
- data/spec/despamilator_spec.rb +0 -13
- data/spec/filter_base_spec.rb +0 -30
- data/spec/filters/gtubs_test_filter_spec.rb +0 -9
- data/spec/filters/html_tags_spec.rb +0 -129
- data/spec/filters/ip_address_url_spec.rb +0 -11
- data/spec/filters/long_words_spec.rb +0 -11
- data/spec/filters/naughty_words_spec.rb +0 -11
- data/spec/filters/numbers_and_words_spec.rb +0 -34
- data/spec/filters/script_tag_spec.rb +0 -22
- data/spec/filters/shouting_spec.rb +0 -45
- data/spec/filters/square_brackets_spec.rb +0 -11
- data/spec/filters/trailing_number_spec.rb +0 -10
- data/spec/filters/unusual_characters_spec.rb +0 -9
- data/spec/filters/urls_spec.rb +0 -11
- data/spec/helpers/corpus_helper.rb +0 -5
- data/spec/helpers/filter_helper.rb +0 -59
- data/spec/helpers/spec_helper.rb +0 -6
- data/tasks/test.rake +0 -6
data/History.txt
CHANGED
data/lib/despamilator.rb
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
$:.unshift(File.dirname(__FILE__)) unless $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
2
2
|
|
3
|
-
|
3
|
+
Dir.glob(File.join(File.dirname(__FILE__), 'despamilator', 'filter', '*.rb')).each do |filter_file|
|
4
|
+
require filter_file
|
5
|
+
end
|
6
|
+
|
7
|
+
require 'despamilator/subject'
|
8
|
+
require 'ostruct'
|
4
9
|
|
5
10
|
#== SYNOPSIS:
|
6
11
|
#
|
@@ -14,25 +19,39 @@ require 'despamilator/filter'
|
|
14
19
|
# dspam.matched_by #=> array of matching filters
|
15
20
|
|
16
21
|
class Despamilator
|
17
|
-
VERSION = "2.0.1"
|
18
22
|
|
19
23
|
# Constructor. Takes the text you wish to parse and score.
|
20
24
|
|
21
25
|
def initialize text
|
22
|
-
@
|
26
|
+
@subject = Despamilator::Subject.new text
|
27
|
+
run_filters @subject
|
23
28
|
end
|
24
29
|
|
25
30
|
# Returns the total score as a Float.
|
26
31
|
|
27
32
|
def score
|
28
|
-
@
|
33
|
+
@subject.score
|
34
|
+
end
|
35
|
+
|
36
|
+
def matched_by
|
37
|
+
warn 'Despamilator.matched_by is deprecated, please use Despamilator.matches by 2011-12-31.'
|
38
|
+
|
39
|
+
matches.map do |match|
|
40
|
+
filter = match[:filter]
|
41
|
+
|
42
|
+
OpenStruct.new(
|
43
|
+
:name => filter.name,
|
44
|
+
:description => filter.description,
|
45
|
+
:score => match[:score]
|
46
|
+
)
|
47
|
+
end
|
29
48
|
end
|
30
49
|
|
31
|
-
# Returns an array of filters that have matched and contributed to the score.
|
50
|
+
# Returns an array of scores and filters that have matched and contributed to the score.
|
32
51
|
# Each element is a hash holding the :score and the matching :filter (an instance of a Despamilator::Filter subclass).
|
33
52
|
|
34
|
-
def
|
35
|
-
@
|
53
|
+
def matches
|
54
|
+
@subject.matches
|
36
55
|
end
|
37
56
|
|
38
57
|
# Generic Test for Unsolicited Bulk Submissions. Similar to SpamAssassin's GTUBE.
|
@@ -41,4 +60,16 @@ class Despamilator
|
|
41
60
|
def self.gtubs_test_string
|
42
61
|
'89913b8a065b7092721fe995877e097681683af9d3ab767146d5d6fd050fc0bda7ab99f4232d94a1'
|
43
62
|
end
|
63
|
+
|
64
|
+
private
|
65
|
+
|
66
|
+
def run_filters subject
|
67
|
+
filter_namespace = Object.const_get('DespamilatorFilter')
|
68
|
+
|
69
|
+
filter_namespace.constants.each do |filter_class|
|
70
|
+
filter = filter_namespace.const_get(filter_class).new
|
71
|
+
filter.parse(subject)
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
44
75
|
end
|
data/lib/despamilator/filter.rb
CHANGED
@@ -1,36 +1,52 @@
|
|
1
1
|
class Despamilator
|
2
|
-
class Filter
|
3
|
-
attr_accessor :matches, :score
|
4
2
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
3
|
+
#This class is the base class of all the despamilator filters.
|
4
|
+
#
|
5
|
+
#== EXAMPLE:
|
6
|
+
#
|
7
|
+
#This example is to detect the letter "a". Put the code in
|
8
|
+
#lib/despamilator/filter/detect_letter_a.rb:
|
9
|
+
#
|
10
|
+
# require 'despamilator/filter'
|
11
|
+
#
|
12
|
+
# module DespamilatorFilter
|
13
|
+
#
|
14
|
+
# class DetectLetterA < Despamilator::Filter
|
15
|
+
#
|
16
|
+
# def name
|
17
|
+
# 'Detecting the letter A'
|
18
|
+
# end
|
19
|
+
#
|
20
|
+
# def description
|
21
|
+
# 'Detects the letter "a" in a string for no reason other than a demo'
|
22
|
+
# end
|
23
|
+
#
|
24
|
+
# def parse text
|
25
|
+
# if text.downcase.scan(/a/)
|
26
|
+
# # add 0.1 to the score of the text
|
27
|
+
# self.append_score = 0.1
|
28
|
+
# end
|
29
|
+
# end
|
30
|
+
# end
|
10
31
|
|
11
|
-
|
32
|
+
class Filter
|
12
33
|
|
13
|
-
|
14
|
-
filter_namespace = Object.const_get('DespamilatorFilter')
|
34
|
+
# The nice description of the filter. Usually no more than a sentence.
|
15
35
|
|
16
|
-
|
17
|
-
|
18
|
-
end
|
36
|
+
def description
|
37
|
+
raise "No description defined for #{self.class}"
|
19
38
|
end
|
20
39
|
|
21
|
-
|
40
|
+
# This method parses some text. The score is assigned to the same instance.
|
22
41
|
|
23
|
-
def
|
24
|
-
|
25
|
-
|
26
|
-
if filter.matched?
|
27
|
-
@matches.push(filter)
|
28
|
-
@score += filter.score
|
29
|
-
end
|
42
|
+
def parse text
|
43
|
+
raise "No parser defined for #{self.class}"
|
30
44
|
end
|
31
45
|
|
32
|
-
|
33
|
-
|
46
|
+
# The one or two word name for the filter.
|
47
|
+
|
48
|
+
def name
|
49
|
+
raise "No name defined for #{self.class}"
|
34
50
|
end
|
35
51
|
|
36
52
|
end
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require 'despamilator/
|
1
|
+
require 'despamilator/filter'
|
2
2
|
|
3
3
|
module DespamilatorFilter
|
4
4
|
|
5
|
-
class GtubsTestFilter < Despamilator::
|
5
|
+
class GtubsTestFilter < Despamilator::Filter
|
6
6
|
|
7
7
|
def name
|
8
8
|
'GTubs Test Filter'
|
@@ -12,8 +12,8 @@ module DespamilatorFilter
|
|
12
12
|
'Detects the special test string (Despamilator.gtubs_test_string) and assigns a big score.'
|
13
13
|
end
|
14
14
|
|
15
|
-
def parse
|
16
|
-
|
15
|
+
def parse subject
|
16
|
+
subject.register_match!({:score => 100, :filter => self}) if subject.text == Despamilator.gtubs_test_string
|
17
17
|
end
|
18
18
|
|
19
19
|
end
|
@@ -1,15 +1,17 @@
|
|
1
|
-
require 'despamilator/filter_base'
|
2
|
-
|
3
1
|
module DespamilatorFilter
|
4
2
|
|
5
|
-
class HtmlTags < Despamilator::
|
3
|
+
class HtmlTags < Despamilator::Filter
|
6
4
|
|
7
|
-
def parse
|
8
|
-
text.downcase
|
5
|
+
def parse subject
|
6
|
+
text = subject.text.downcase
|
9
7
|
|
10
8
|
html_tags.each do |tag|
|
11
|
-
|
12
|
-
|
9
|
+
opening_elements = text.count(/<\s*#{tag}\W/)
|
10
|
+
closing_elements = text.count(/\W#{tag}\s*\/>/)
|
11
|
+
|
12
|
+
if opening_elements > 0 or closing_elements > 0
|
13
|
+
safest_element_count = opening_elements > closing_elements ? opening_elements : closing_elements
|
14
|
+
subject.register_match!({:score => 0.6 * safest_element_count, :filter => self})
|
13
15
|
end
|
14
16
|
end
|
15
17
|
end
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require 'despamilator/
|
1
|
+
require 'despamilator/filter'
|
2
2
|
|
3
3
|
module DespamilatorFilter
|
4
4
|
|
5
|
-
class IPAddressURL < Despamilator::
|
5
|
+
class IPAddressURL < Despamilator::Filter
|
6
6
|
|
7
7
|
def name
|
8
8
|
'IP Address URL'
|
@@ -12,8 +12,10 @@ module DespamilatorFilter
|
|
12
12
|
'Detects IP address URLs'
|
13
13
|
end
|
14
14
|
|
15
|
-
def parse
|
16
|
-
|
15
|
+
def parse subject
|
16
|
+
subject.register_match!({
|
17
|
+
:score => 0.5, :filter => self
|
18
|
+
}) if subject.text.downcase.count(/http:\/\/\d+\.\d+\.\d+\.\d+/) > 0
|
17
19
|
end
|
18
20
|
|
19
21
|
end
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require 'despamilator/
|
1
|
+
require 'despamilator/filter'
|
2
2
|
|
3
3
|
module DespamilatorFilter
|
4
4
|
|
5
|
-
class LongWords < Despamilator::
|
5
|
+
class LongWords < Despamilator::Filter
|
6
6
|
|
7
7
|
def name
|
8
8
|
'Long Words'
|
@@ -12,9 +12,11 @@ module DespamilatorFilter
|
|
12
12
|
'Detects long and unbroken strings'
|
13
13
|
end
|
14
14
|
|
15
|
-
def parse
|
16
|
-
text.
|
17
|
-
|
15
|
+
def parse subject
|
16
|
+
subject.text.without_uris.words.each do |word|
|
17
|
+
subject.register_match!({
|
18
|
+
:score => 0.1, :filter => self
|
19
|
+
}) if word.length > 20
|
18
20
|
end
|
19
21
|
end
|
20
22
|
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module DespamilatorFilter
|
2
|
+
|
3
|
+
class MixedCase < Despamilator::Filter
|
4
|
+
def name
|
5
|
+
'Mixed Case String'
|
6
|
+
end
|
7
|
+
|
8
|
+
def description
|
9
|
+
'Detects mixed case strings.'
|
10
|
+
end
|
11
|
+
|
12
|
+
def parse subject
|
13
|
+
text = subject.text.without_uris
|
14
|
+
count = text.remove_and_count!(/[a-z][A-Z]/)
|
15
|
+
count += text.remove_and_count!(/[a-z][A-Z][a-z]/)
|
16
|
+
subject.register_match!({:score => 0.1 * count, :filter => self}) if count > 0
|
17
|
+
end
|
18
|
+
|
19
|
+
end
|
20
|
+
|
21
|
+
end
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require 'despamilator/
|
1
|
+
require 'despamilator/filter'
|
2
2
|
|
3
3
|
module DespamilatorFilter
|
4
4
|
|
5
|
-
class NaughtyWords < Despamilator::
|
5
|
+
class NaughtyWords < Despamilator::Filter
|
6
6
|
|
7
7
|
def name
|
8
8
|
'Naughty Words'
|
@@ -12,11 +12,11 @@ module DespamilatorFilter
|
|
12
12
|
'Detects cheeky words'
|
13
13
|
end
|
14
14
|
|
15
|
-
def parse
|
16
|
-
text.downcase
|
15
|
+
def parse subject
|
16
|
+
text = subject.text.downcase
|
17
17
|
|
18
18
|
naughty_words.each do |word|
|
19
|
-
|
19
|
+
subject.register_match!({:score => 0.1, :filter => self}) if text =~ /\b#{word}s?\b/
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
@@ -1,17 +1,11 @@
|
|
1
|
-
require 'despamilator/
|
1
|
+
require 'despamilator/filter'
|
2
2
|
|
3
3
|
module DespamilatorFilter
|
4
4
|
|
5
|
-
class NumbersAndWords < Despamilator::
|
5
|
+
class NumbersAndWords < Despamilator::Filter
|
6
6
|
|
7
|
-
def parse
|
8
|
-
text
|
9
|
-
|
10
|
-
# strip out "good numbers"
|
11
|
-
text.gsub!(/h[1-6]/, '')
|
12
|
-
text.gsub!(/(^|\b)\d+($|\b)/, '')
|
13
|
-
text.gsub!(/(^|\b)\d+(,|\.)\d+($|\b)/, '')
|
14
|
-
text.gsub!(/(^|\b)\d+(st|nd|rd|th)($|\b)/, '')
|
7
|
+
def parse subject
|
8
|
+
text = tidy_text(subject)
|
15
9
|
|
16
10
|
[
|
17
11
|
/\w\d+/,
|
@@ -25,7 +19,7 @@ module DespamilatorFilter
|
|
25
19
|
matches.each do |to_remove|
|
26
20
|
to_remove = to_remove.to_s
|
27
21
|
text.sub!(to_remove, '') unless to_remove.empty?
|
28
|
-
|
22
|
+
subject.register_match!({:score => 0.1, :filter => self})
|
29
23
|
end
|
30
24
|
end
|
31
25
|
end
|
@@ -38,6 +32,20 @@ module DespamilatorFilter
|
|
38
32
|
'Detects unusual number/word combinations'
|
39
33
|
end
|
40
34
|
|
35
|
+
private
|
36
|
+
|
37
|
+
def tidy_text subject
|
38
|
+
text = subject.text.without_uris
|
39
|
+
text.downcase!
|
40
|
+
|
41
|
+
# strip out "good numbers"
|
42
|
+
text.gsub!(/h[1-6]/, '')
|
43
|
+
text.gsub!(/(^|\b)\d+($|\b)/, '')
|
44
|
+
text.gsub!(/(^|\b)\d+(,|\.)\d+($|\b)/, '')
|
45
|
+
text.gsub!(/(^|\b)\d+(st|nd|rd|th)($|\b)/, '')
|
46
|
+
|
47
|
+
text
|
48
|
+
end
|
41
49
|
end
|
42
50
|
|
43
51
|
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module DespamilatorFilter
|
2
|
+
|
3
|
+
class ObfuscatedURLs < Despamilator::Filter
|
4
|
+
def name
|
5
|
+
'Obfuscated URLs'
|
6
|
+
end
|
7
|
+
|
8
|
+
def description
|
9
|
+
'Finds lame attempts at obfuscating urls.'
|
10
|
+
end
|
11
|
+
|
12
|
+
def parse subject
|
13
|
+
text = subject.text.without_uris.downcase
|
14
|
+
count = find_space_separated_parts text
|
15
|
+
count += find_space_separated_characters text
|
16
|
+
|
17
|
+
# weird maths below is due to some issue with ruby 1.9.2 multiplying floats by 3 (?!)
|
18
|
+
subject.register_match!({score: (4.0 * count) / 10, filter: self}) if count > 0
|
19
|
+
end
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
def find_space_separated_parts text
|
24
|
+
text.count(/www\s+\w+\s+com/)
|
25
|
+
end
|
26
|
+
|
27
|
+
def find_space_separated_characters text
|
28
|
+
count = 0
|
29
|
+
|
30
|
+
text.split(/[a-z][a-z]/).each do |candidate|
|
31
|
+
candidate.strip!
|
32
|
+
candidate.gsub!(/\s+/, '')
|
33
|
+
count += 1 if candidate =~ /\w{5,}\.\w{2,3}/
|
34
|
+
end
|
35
|
+
|
36
|
+
count
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
40
|
+
|
41
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module DespamilatorFilter
|
2
|
+
|
3
|
+
class Prices < Despamilator::Filter
|
4
|
+
def name
|
5
|
+
'Prices'
|
6
|
+
end
|
7
|
+
|
8
|
+
def description
|
9
|
+
'Detects prices in text.'
|
10
|
+
end
|
11
|
+
|
12
|
+
def parse subject
|
13
|
+
price_count = subject.text.count(/\$\s*\d+/)
|
14
|
+
subject.register_match!({:score => 0.075 * price_count, :filter => self}) if price_count > 0
|
15
|
+
end
|
16
|
+
|
17
|
+
end
|
18
|
+
|
19
|
+
end
|
@@ -1,11 +1,11 @@
|
|
1
|
-
require 'despamilator/
|
1
|
+
require 'despamilator/filter'
|
2
2
|
|
3
3
|
module DespamilatorFilter
|
4
4
|
|
5
|
-
class ScriptTag < Despamilator::
|
5
|
+
class ScriptTag < Despamilator::Filter
|
6
6
|
|
7
|
-
def parse
|
8
|
-
|
7
|
+
def parse subject
|
8
|
+
subject.register_match!({:score => 1, :filter => self}) if subject.text.downcase.match(/<\/?script(>|\s+|\n|\r)/)
|
9
9
|
end
|
10
10
|
|
11
11
|
def name
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require 'despamilator/
|
1
|
+
require 'despamilator/filter'
|
2
2
|
|
3
3
|
module DespamilatorFilter
|
4
4
|
|
5
|
-
class Shouting < Despamilator::
|
5
|
+
class Shouting < Despamilator::Filter
|
6
6
|
|
7
7
|
def name
|
8
8
|
'Shouting'
|
@@ -12,17 +12,20 @@ module DespamilatorFilter
|
|
12
12
|
'Detects and scores shouting (all caps)'
|
13
13
|
end
|
14
14
|
|
15
|
-
def parse
|
15
|
+
def parse subject
|
16
16
|
# strip HTML
|
17
|
-
text.gsub
|
17
|
+
text = subject.text.gsub(/<\/?[^>]*>/, "")
|
18
18
|
|
19
19
|
return if text.length < 20
|
20
20
|
|
21
21
|
uppercased = text.scan(/[A-Z][A-Z]+/).join.length
|
22
|
-
lowercased = text.
|
22
|
+
lowercased = text.count(/[a-z]/)
|
23
23
|
|
24
24
|
if uppercased > 0
|
25
|
-
|
25
|
+
subject.register_match!({
|
26
|
+
:score => (uppercased.to_f / (uppercased + lowercased)) * 0.5,
|
27
|
+
:filter => self
|
28
|
+
})
|
26
29
|
end
|
27
30
|
end
|
28
31
|
|