despamilator 2.0.1 → 2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +7 -0
- data/lib/despamilator.rb +38 -7
- data/lib/despamilator/filter.rb +39 -23
- data/lib/despamilator/filter/gtubs_test_filter.rb +4 -4
- data/lib/despamilator/filter/html_tags.rb +9 -7
- data/lib/despamilator/filter/ip_address_url.rb +6 -4
- data/lib/despamilator/filter/long_words.rb +7 -5
- data/lib/despamilator/filter/mixed_case.rb +21 -0
- data/lib/despamilator/filter/naughty_words.rb +5 -5
- data/lib/despamilator/filter/numbers_and_words.rb +19 -11
- data/lib/despamilator/filter/obfuscated_urls.rb +41 -0
- data/lib/despamilator/filter/prices.rb +19 -0
- data/lib/despamilator/filter/script_tag.rb +4 -4
- data/lib/despamilator/filter/shouting.rb +9 -6
- data/lib/despamilator/filter/spammy_tlds.rb +22 -0
- data/lib/despamilator/filter/square_brackets.rb +5 -5
- data/lib/despamilator/filter/trailing_number.rb +4 -4
- data/lib/despamilator/filter/unusual_characters.rb +5 -5
- data/lib/despamilator/filter/urls.rb +7 -9
- data/lib/despamilator/filter/very_long_domain_name.rb +27 -0
- data/lib/despamilator/filter/weird_punctuation.rb +44 -0
- data/lib/despamilator/subject.rb +30 -0
- data/lib/despamilator/subject/text.rb +32 -0
- data/lib/despamilator/version.rb +3 -0
- metadata +29 -75
- data/.rspec +0 -2
- data/.rvmrc +0 -1
- data/Gemfile +0 -12
- data/Gemfile.lock +0 -47
- data/Manifest.txt +0 -46
- data/PostInstall.txt +0 -1
- data/Rakefile +0 -39
- data/conf/unusual_characters.txt +0 -6674
- data/despamilator.gemspec +0 -38
- data/lib/despamilator/filter_base.rb +0 -82
- data/scripts/despamilator_score.rb +0 -25
- data/scripts/from_file.rb +0 -26
- data/spec/despamilator_spec.rb +0 -13
- data/spec/filter_base_spec.rb +0 -30
- data/spec/filters/gtubs_test_filter_spec.rb +0 -9
- data/spec/filters/html_tags_spec.rb +0 -129
- data/spec/filters/ip_address_url_spec.rb +0 -11
- data/spec/filters/long_words_spec.rb +0 -11
- data/spec/filters/naughty_words_spec.rb +0 -11
- data/spec/filters/numbers_and_words_spec.rb +0 -34
- data/spec/filters/script_tag_spec.rb +0 -22
- data/spec/filters/shouting_spec.rb +0 -45
- data/spec/filters/square_brackets_spec.rb +0 -11
- data/spec/filters/trailing_number_spec.rb +0 -10
- data/spec/filters/unusual_characters_spec.rb +0 -9
- data/spec/filters/urls_spec.rb +0 -11
- data/spec/helpers/corpus_helper.rb +0 -5
- data/spec/helpers/filter_helper.rb +0 -59
- data/spec/helpers/spec_helper.rb +0 -6
- data/tasks/test.rake +0 -6
data/History.txt
CHANGED
data/lib/despamilator.rb
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
$:.unshift(File.dirname(__FILE__)) unless $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
2
2
|
|
3
|
-
|
3
|
+
Dir.glob(File.join(File.dirname(__FILE__), 'despamilator', 'filter', '*.rb')).each do |filter_file|
|
4
|
+
require filter_file
|
5
|
+
end
|
6
|
+
|
7
|
+
require 'despamilator/subject'
|
8
|
+
require 'ostruct'
|
4
9
|
|
5
10
|
#== SYNOPSIS:
|
6
11
|
#
|
@@ -14,25 +19,39 @@ require 'despamilator/filter'
|
|
14
19
|
# dspam.matched_by #=> array of matching filters
|
15
20
|
|
16
21
|
class Despamilator
|
17
|
-
VERSION = "2.0.1"
|
18
22
|
|
19
23
|
# Constructor. Takes the text you wish to parse and score.
|
20
24
|
|
21
25
|
def initialize text
|
22
|
-
@
|
26
|
+
@subject = Despamilator::Subject.new text
|
27
|
+
run_filters @subject
|
23
28
|
end
|
24
29
|
|
25
30
|
# Returns the total score as a Float.
|
26
31
|
|
27
32
|
def score
|
28
|
-
@
|
33
|
+
@subject.score
|
34
|
+
end
|
35
|
+
|
36
|
+
def matched_by
|
37
|
+
warn 'Despamilator.matched_by is deprecated, please use Despamilator.matches by 2011-12-31.'
|
38
|
+
|
39
|
+
matches.map do |match|
|
40
|
+
filter = match[:filter]
|
41
|
+
|
42
|
+
OpenStruct.new(
|
43
|
+
:name => filter.name,
|
44
|
+
:description => filter.description,
|
45
|
+
:score => match[:score]
|
46
|
+
)
|
47
|
+
end
|
29
48
|
end
|
30
49
|
|
31
|
-
# Returns an array of filters that have matched and contributed to the score.
|
50
|
+
# Returns an array of scores and filters that have matched and contributed to the score.
|
32
51
|
# Each element is a hash holding the :score and the matching :filter (a child of the Despamilator::Filter class).
|
33
52
|
|
34
|
-
def
|
35
|
-
@
|
53
|
+
def matches
|
54
|
+
@subject.matches
|
36
55
|
end
|
37
56
|
|
38
57
|
# Generic Test for Unsolicited Bulk Submissions. Similar to SpamAssassin's GTUBE.
|
@@ -41,4 +60,16 @@ class Despamilator
|
|
41
60
|
def self.gtubs_test_string
|
42
61
|
'89913b8a065b7092721fe995877e097681683af9d3ab767146d5d6fd050fc0bda7ab99f4232d94a1'
|
43
62
|
end
|
63
|
+
|
64
|
+
private
|
65
|
+
|
66
|
+
def run_filters subject
|
67
|
+
filter_namespace = Object.const_get('DespamilatorFilter')
|
68
|
+
|
69
|
+
filter_namespace.constants.each do |filter_class|
|
70
|
+
filter = filter_namespace.const_get(filter_class).new
|
71
|
+
filter.parse(subject)
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
44
75
|
end
|
data/lib/despamilator/filter.rb
CHANGED
@@ -1,36 +1,52 @@
|
|
1
1
|
class Despamilator
|
2
|
-
class Filter
|
3
|
-
attr_accessor :matches, :score
|
4
2
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
3
|
+
#This class is the base class of all the despamilator filters.
|
4
|
+
#
|
5
|
+
#== EXAMPLE:
|
6
|
+
#
|
7
|
+
#This example is to detect the letter "a". Put the code in
|
8
|
+
#lib/despamilator/filter/detect_letter_a.rb:
|
9
|
+
#
|
10
|
+
# require 'despamilator/filter'
|
11
|
+
#
|
12
|
+
# module DespamilatorFilter
|
13
|
+
#
|
14
|
+
# class DetectLetterA < Despamilator::Filter
|
15
|
+
#
|
16
|
+
# def name
|
17
|
+
# 'Detecting the letter A'
|
18
|
+
# end
|
19
|
+
#
|
20
|
+
# def description
|
21
|
+
# 'Detects the letter "a" in a string for no reason other than a demo'
|
22
|
+
# end
|
23
|
+
#
|
24
|
+
# def parse subject
|
25
|
+
# if subject.text.downcase =~ /a/
|
26
|
+
# # add 0.1 to the score of the text
|
27
|
+
# subject.register_match!({:score => 0.1, :filter => self})
|
28
|
+
# end
|
29
|
+
# end
|
30
|
+
# end
|
10
31
|
|
11
|
-
|
32
|
+
class Filter
|
12
33
|
|
13
|
-
|
14
|
-
filter_namespace = Object.const_get('DespamilatorFilter')
|
34
|
+
# The nice description of the filter. Usually no more than a sentence.
|
15
35
|
|
16
|
-
|
17
|
-
|
18
|
-
end
|
36
|
+
def description
|
37
|
+
raise "No description defined for #{self.class}"
|
19
38
|
end
|
20
39
|
|
21
|
-
|
40
|
+
# This method parses some text. The score is assigned to the same instance.
|
22
41
|
|
23
|
-
def
|
24
|
-
|
25
|
-
|
26
|
-
if filter.matched?
|
27
|
-
@matches.push(filter)
|
28
|
-
@score += filter.score
|
29
|
-
end
|
42
|
+
def parse text
|
43
|
+
raise "No parser defined for #{self.class}"
|
30
44
|
end
|
31
45
|
|
32
|
-
|
33
|
-
|
46
|
+
# The one or two word name for the filter.
|
47
|
+
|
48
|
+
def name
|
49
|
+
raise "No name defined for #{self.class}"
|
34
50
|
end
|
35
51
|
|
36
52
|
end
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require 'despamilator/
|
1
|
+
require 'despamilator/filter'
|
2
2
|
|
3
3
|
module DespamilatorFilter
|
4
4
|
|
5
|
-
class GtubsTestFilter < Despamilator::
|
5
|
+
class GtubsTestFilter < Despamilator::Filter
|
6
6
|
|
7
7
|
def name
|
8
8
|
'GTubs Test Filter'
|
@@ -12,8 +12,8 @@ module DespamilatorFilter
|
|
12
12
|
'Detects the special test string (Despamilator.gtubs_test_string) and assigns a big score.'
|
13
13
|
end
|
14
14
|
|
15
|
-
def parse
|
16
|
-
|
15
|
+
def parse subject
|
16
|
+
subject.register_match!({:score => 100, :filter => self}) if subject.text == Despamilator.gtubs_test_string
|
17
17
|
end
|
18
18
|
|
19
19
|
end
|
@@ -1,15 +1,17 @@
|
|
1
|
-
require 'despamilator/filter_base'
|
2
|
-
|
3
1
|
module DespamilatorFilter
|
4
2
|
|
5
|
-
class HtmlTags < Despamilator::
|
3
|
+
class HtmlTags < Despamilator::Filter
|
6
4
|
|
7
|
-
def parse
|
8
|
-
text.downcase
|
5
|
+
def parse subject
|
6
|
+
text = subject.text.downcase
|
9
7
|
|
10
8
|
html_tags.each do |tag|
|
11
|
-
|
12
|
-
|
9
|
+
opening_elements = text.count(/<\s*#{tag}\W/)
|
10
|
+
closing_elements = text.count(/\W#{tag}\s*\/>/)
|
11
|
+
|
12
|
+
if opening_elements > 0 or closing_elements > 0
|
13
|
+
safest_element_count = opening_elements > closing_elements ? opening_elements : closing_elements
|
14
|
+
subject.register_match!({:score => 0.6 * safest_element_count, :filter => self})
|
13
15
|
end
|
14
16
|
end
|
15
17
|
end
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require 'despamilator/
|
1
|
+
require 'despamilator/filter'
|
2
2
|
|
3
3
|
module DespamilatorFilter
|
4
4
|
|
5
|
-
class IPAddressURL < Despamilator::
|
5
|
+
class IPAddressURL < Despamilator::Filter
|
6
6
|
|
7
7
|
def name
|
8
8
|
'IP Address URL'
|
@@ -12,8 +12,10 @@ module DespamilatorFilter
|
|
12
12
|
'Detects IP address URLs'
|
13
13
|
end
|
14
14
|
|
15
|
-
def parse
|
16
|
-
|
15
|
+
def parse subject
|
16
|
+
subject.register_match!({
|
17
|
+
:score => 0.5, :filter => self
|
18
|
+
}) if subject.text.downcase.count(/http:\/\/\d+\.\d+\.\d+\.\d+/) > 0
|
17
19
|
end
|
18
20
|
|
19
21
|
end
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require 'despamilator/
|
1
|
+
require 'despamilator/filter'
|
2
2
|
|
3
3
|
module DespamilatorFilter
|
4
4
|
|
5
|
-
class LongWords < Despamilator::
|
5
|
+
class LongWords < Despamilator::Filter
|
6
6
|
|
7
7
|
def name
|
8
8
|
'Long Words'
|
@@ -12,9 +12,11 @@ module DespamilatorFilter
|
|
12
12
|
'Detects long and unbroken strings'
|
13
13
|
end
|
14
14
|
|
15
|
-
def parse
|
16
|
-
text.
|
17
|
-
|
15
|
+
def parse subject
|
16
|
+
subject.text.without_uris.words.each do |word|
|
17
|
+
subject.register_match!({
|
18
|
+
:score => 0.1, :filter => self
|
19
|
+
}) if word.length > 20
|
18
20
|
end
|
19
21
|
end
|
20
22
|
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module DespamilatorFilter
|
2
|
+
|
3
|
+
class MixedCase < Despamilator::Filter
|
4
|
+
def name
|
5
|
+
'Mixed Case String'
|
6
|
+
end
|
7
|
+
|
8
|
+
def description
|
9
|
+
'Detects mixed case strings.'
|
10
|
+
end
|
11
|
+
|
12
|
+
def parse subject
|
13
|
+
text = subject.text.without_uris
|
14
|
+
count = text.remove_and_count!(/[a-z][A-Z]/)
|
15
|
+
count += text.remove_and_count!(/[a-z][A-Z][a-z]/)
|
16
|
+
subject.register_match!({:score => 0.1 * count, :filter => self}) if count > 0
|
17
|
+
end
|
18
|
+
|
19
|
+
end
|
20
|
+
|
21
|
+
end
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require 'despamilator/
|
1
|
+
require 'despamilator/filter'
|
2
2
|
|
3
3
|
module DespamilatorFilter
|
4
4
|
|
5
|
-
class NaughtyWords < Despamilator::
|
5
|
+
class NaughtyWords < Despamilator::Filter
|
6
6
|
|
7
7
|
def name
|
8
8
|
'Naughty Words'
|
@@ -12,11 +12,11 @@ module DespamilatorFilter
|
|
12
12
|
'Detects cheeky words'
|
13
13
|
end
|
14
14
|
|
15
|
-
def parse
|
16
|
-
text.downcase
|
15
|
+
def parse subject
|
16
|
+
text = subject.text.downcase
|
17
17
|
|
18
18
|
naughty_words.each do |word|
|
19
|
-
|
19
|
+
subject.register_match!({:score => 0.1, :filter => self}) if text =~ /\b#{word}s?\b/
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
@@ -1,17 +1,11 @@
|
|
1
|
-
require 'despamilator/
|
1
|
+
require 'despamilator/filter'
|
2
2
|
|
3
3
|
module DespamilatorFilter
|
4
4
|
|
5
|
-
class NumbersAndWords < Despamilator::
|
5
|
+
class NumbersAndWords < Despamilator::Filter
|
6
6
|
|
7
|
-
def parse
|
8
|
-
text
|
9
|
-
|
10
|
-
# strip out "good numbers"
|
11
|
-
text.gsub!(/h[1-6]/, '')
|
12
|
-
text.gsub!(/(^|\b)\d+($|\b)/, '')
|
13
|
-
text.gsub!(/(^|\b)\d+(,|\.)\d+($|\b)/, '')
|
14
|
-
text.gsub!(/(^|\b)\d+(st|nd|rd|th)($|\b)/, '')
|
7
|
+
def parse subject
|
8
|
+
text = tidy_text(subject)
|
15
9
|
|
16
10
|
[
|
17
11
|
/\w\d+/,
|
@@ -25,7 +19,7 @@ module DespamilatorFilter
|
|
25
19
|
matches.each do |to_remove|
|
26
20
|
to_remove = to_remove.to_s
|
27
21
|
text.sub!(to_remove, '') unless to_remove.empty?
|
28
|
-
|
22
|
+
subject.register_match!({:score => 0.1, :filter => self})
|
29
23
|
end
|
30
24
|
end
|
31
25
|
end
|
@@ -38,6 +32,20 @@ module DespamilatorFilter
|
|
38
32
|
'Detects unusual number/word combinations'
|
39
33
|
end
|
40
34
|
|
35
|
+
private
|
36
|
+
|
37
|
+
def tidy_text subject
|
38
|
+
text = subject.text.without_uris
|
39
|
+
text.downcase!
|
40
|
+
|
41
|
+
# strip out "good numbers"
|
42
|
+
text.gsub!(/h[1-6]/, '')
|
43
|
+
text.gsub!(/(^|\b)\d+($|\b)/, '')
|
44
|
+
text.gsub!(/(^|\b)\d+(,|\.)\d+($|\b)/, '')
|
45
|
+
text.gsub!(/(^|\b)\d+(st|nd|rd|th)($|\b)/, '')
|
46
|
+
|
47
|
+
text
|
48
|
+
end
|
41
49
|
end
|
42
50
|
|
43
51
|
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module DespamilatorFilter
|
2
|
+
|
3
|
+
class ObfuscatedURLs < Despamilator::Filter
|
4
|
+
def name
|
5
|
+
'Obfuscated URLs'
|
6
|
+
end
|
7
|
+
|
8
|
+
def description
|
9
|
+
'Finds lame attempts at obfuscating urls.'
|
10
|
+
end
|
11
|
+
|
12
|
+
def parse subject
|
13
|
+
text = subject.text.without_uris.downcase
|
14
|
+
count = find_space_separated_parts text
|
15
|
+
count += find_space_separated_characters text
|
16
|
+
|
17
|
+
# The score is computed as (4.0 * count) / 10 rather than 0.4 * count to avoid binary floating-point rounding artifacts (e.g. 0.4 * 3 gives 1.2000000000000002).
|
18
|
+
subject.register_match!({score: (4.0 * count) / 10, filter: self}) if count > 0
|
19
|
+
end
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
def find_space_separated_parts text
|
24
|
+
text.count(/www\s+\w+\s+com/)
|
25
|
+
end
|
26
|
+
|
27
|
+
def find_space_separated_characters text
|
28
|
+
count = 0
|
29
|
+
|
30
|
+
text.split(/[a-z][a-z]/).each do |candidate|
|
31
|
+
candidate.strip!
|
32
|
+
candidate.gsub!(/\s+/, '')
|
33
|
+
count += 1 if candidate =~ /\w{5,}\.\w{2,3}/
|
34
|
+
end
|
35
|
+
|
36
|
+
count
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
40
|
+
|
41
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module DespamilatorFilter
|
2
|
+
|
3
|
+
class Prices < Despamilator::Filter
|
4
|
+
def name
|
5
|
+
'Prices'
|
6
|
+
end
|
7
|
+
|
8
|
+
def description
|
9
|
+
'Detects prices in text.'
|
10
|
+
end
|
11
|
+
|
12
|
+
def parse subject
|
13
|
+
price_count = subject.text.count(/\$\s*\d+/)
|
14
|
+
subject.register_match!({:score => 0.075 * price_count, :filter => self}) if price_count > 0
|
15
|
+
end
|
16
|
+
|
17
|
+
end
|
18
|
+
|
19
|
+
end
|
@@ -1,11 +1,11 @@
|
|
1
|
-
require 'despamilator/
|
1
|
+
require 'despamilator/filter'
|
2
2
|
|
3
3
|
module DespamilatorFilter
|
4
4
|
|
5
|
-
class ScriptTag < Despamilator::
|
5
|
+
class ScriptTag < Despamilator::Filter
|
6
6
|
|
7
|
-
def parse
|
8
|
-
|
7
|
+
def parse subject
|
8
|
+
subject.register_match!({:score => 1, :filter => self}) if subject.text.downcase.match(/<\/?script(>|\s+|\n|\r)/)
|
9
9
|
end
|
10
10
|
|
11
11
|
def name
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require 'despamilator/
|
1
|
+
require 'despamilator/filter'
|
2
2
|
|
3
3
|
module DespamilatorFilter
|
4
4
|
|
5
|
-
class Shouting < Despamilator::
|
5
|
+
class Shouting < Despamilator::Filter
|
6
6
|
|
7
7
|
def name
|
8
8
|
'Shouting'
|
@@ -12,17 +12,20 @@ module DespamilatorFilter
|
|
12
12
|
'Detects and scores shouting (all caps)'
|
13
13
|
end
|
14
14
|
|
15
|
-
def parse
|
15
|
+
def parse subject
|
16
16
|
# strip HTML
|
17
|
-
text.gsub
|
17
|
+
text = subject.text.gsub(/<\/?[^>]*>/, "")
|
18
18
|
|
19
19
|
return if text.length < 20
|
20
20
|
|
21
21
|
uppercased = text.scan(/[A-Z][A-Z]+/).join.length
|
22
|
-
lowercased = text.
|
22
|
+
lowercased = text.count(/[a-z]/)
|
23
23
|
|
24
24
|
if uppercased > 0
|
25
|
-
|
25
|
+
subject.register_match!({
|
26
|
+
:score => (uppercased.to_f / (uppercased + lowercased)) * 0.5,
|
27
|
+
:filter => self
|
28
|
+
})
|
26
29
|
end
|
27
30
|
end
|
28
31
|
|