despamilator 2.0.1 → 2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +7 -0
- data/lib/despamilator.rb +38 -7
- data/lib/despamilator/filter.rb +39 -23
- data/lib/despamilator/filter/gtubs_test_filter.rb +4 -4
- data/lib/despamilator/filter/html_tags.rb +9 -7
- data/lib/despamilator/filter/ip_address_url.rb +6 -4
- data/lib/despamilator/filter/long_words.rb +7 -5
- data/lib/despamilator/filter/mixed_case.rb +21 -0
- data/lib/despamilator/filter/naughty_words.rb +5 -5
- data/lib/despamilator/filter/numbers_and_words.rb +19 -11
- data/lib/despamilator/filter/obfuscated_urls.rb +41 -0
- data/lib/despamilator/filter/prices.rb +19 -0
- data/lib/despamilator/filter/script_tag.rb +4 -4
- data/lib/despamilator/filter/shouting.rb +9 -6
- data/lib/despamilator/filter/spammy_tlds.rb +22 -0
- data/lib/despamilator/filter/square_brackets.rb +5 -5
- data/lib/despamilator/filter/trailing_number.rb +4 -4
- data/lib/despamilator/filter/unusual_characters.rb +5 -5
- data/lib/despamilator/filter/urls.rb +7 -9
- data/lib/despamilator/filter/very_long_domain_name.rb +27 -0
- data/lib/despamilator/filter/weird_punctuation.rb +44 -0
- data/lib/despamilator/subject.rb +30 -0
- data/lib/despamilator/subject/text.rb +32 -0
- data/lib/despamilator/version.rb +3 -0
- metadata +29 -75
- data/.rspec +0 -2
- data/.rvmrc +0 -1
- data/Gemfile +0 -12
- data/Gemfile.lock +0 -47
- data/Manifest.txt +0 -46
- data/PostInstall.txt +0 -1
- data/Rakefile +0 -39
- data/conf/unusual_characters.txt +0 -6674
- data/despamilator.gemspec +0 -38
- data/lib/despamilator/filter_base.rb +0 -82
- data/scripts/despamilator_score.rb +0 -25
- data/scripts/from_file.rb +0 -26
- data/spec/despamilator_spec.rb +0 -13
- data/spec/filter_base_spec.rb +0 -30
- data/spec/filters/gtubs_test_filter_spec.rb +0 -9
- data/spec/filters/html_tags_spec.rb +0 -129
- data/spec/filters/ip_address_url_spec.rb +0 -11
- data/spec/filters/long_words_spec.rb +0 -11
- data/spec/filters/naughty_words_spec.rb +0 -11
- data/spec/filters/numbers_and_words_spec.rb +0 -34
- data/spec/filters/script_tag_spec.rb +0 -22
- data/spec/filters/shouting_spec.rb +0 -45
- data/spec/filters/square_brackets_spec.rb +0 -11
- data/spec/filters/trailing_number_spec.rb +0 -10
- data/spec/filters/unusual_characters_spec.rb +0 -9
- data/spec/filters/urls_spec.rb +0 -11
- data/spec/helpers/corpus_helper.rb +0 -5
- data/spec/helpers/filter_helper.rb +0 -59
- data/spec/helpers/spec_helper.rb +0 -6
- data/tasks/test.rake +0 -6
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'despamilator/filter'
|
2
|
+
|
3
|
+
module DespamilatorFilter

  # Filter that raises the spam score of text mentioning hosts under the
  # ".info" and ".biz" top-level domains.
  class SpammyTLDs < Despamilator::Filter

    # A run of five or more word characters directly followed by ".info" or
    # ".biz" at a word boundary. The match is case-insensitive because DNS
    # names are case-insensitive, so "EXAMPLE.INFO" must count just like
    # "example.info".
    SPAMMY_TLD_PATTERN = /\w{5,}\.(info|biz)\b/i

    # Human readable name of this filter.
    def name
      'Spammy TLDs'
    end

    # One-line explanation of what this filter looks for.
    def description
      'Detects TLDs that are more commonly associated with spam.'
    end

    # Counts the spammy-TLD hosts in the subject's text and, when at least
    # one is present, registers a single match worth 0.05 per occurrence.
    #
    # subject - object exposing #text (queried with count(Regexp)) and
    #           #register_match!(Hash).
    def parse subject
      matches = subject.text.count(SPAMMY_TLD_PATTERN)
      subject.register_match!({:score => 0.05 * matches, :filter => self}) if matches > 0
    end

  end

end
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require 'despamilator/
|
1
|
+
require 'despamilator/filter'
|
2
2
|
|
3
3
|
module DespamilatorFilter
|
4
4
|
|
5
|
-
class SquareBrackets < Despamilator::
|
5
|
+
class SquareBrackets < Despamilator::Filter
|
6
6
|
|
7
7
|
def name
|
8
8
|
'Square Brackets'
|
@@ -12,9 +12,9 @@ module DespamilatorFilter
|
|
12
12
|
'Detects each square bracket in a string'
|
13
13
|
end
|
14
14
|
|
15
|
-
def parse
|
16
|
-
text.downcase.scan(/(\[|\])/).each do |match|
|
17
|
-
|
15
|
+
def parse subject
|
16
|
+
subject.text.downcase.scan(/(\[|\])/).each do |match|
|
17
|
+
subject.register_match!({:score => 0.05, :filter => self})
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require 'despamilator/
|
1
|
+
require 'despamilator/filter'
|
2
2
|
|
3
3
|
module DespamilatorFilter
|
4
4
|
|
5
|
-
class TrailingNumber < Despamilator::
|
5
|
+
class TrailingNumber < Despamilator::Filter
|
6
6
|
|
7
7
|
def name
|
8
8
|
'Trailing Number'
|
@@ -12,8 +12,8 @@ module DespamilatorFilter
|
|
12
12
|
'Detects a trailing cache busting number'
|
13
13
|
end
|
14
14
|
|
15
|
-
def parse
|
16
|
-
|
15
|
+
def parse subject
|
16
|
+
subject.register_match!({:score => 0.1, :filter => self}) if subject.text.without_uris =~ /\b\d+\s*$/
|
17
17
|
end
|
18
18
|
|
19
19
|
end
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require 'despamilator/
|
1
|
+
require 'despamilator/filter'
|
2
2
|
|
3
3
|
module DespamilatorFilter
|
4
4
|
|
5
|
-
class UnusualCharacters < Despamilator::
|
5
|
+
class UnusualCharacters < Despamilator::Filter
|
6
6
|
|
7
7
|
def name
|
8
8
|
'Unusual Characters'
|
@@ -12,10 +12,10 @@ module DespamilatorFilter
|
|
12
12
|
'Detects and scores each occurrence of an unusual 2 or 3 character combination'
|
13
13
|
end
|
14
14
|
|
15
|
-
def parse
|
15
|
+
def parse subject
|
16
16
|
initialize_combos
|
17
|
-
tokenize(text).each do |token|
|
18
|
-
|
17
|
+
tokenize(subject.text.without_uris).each do |token|
|
18
|
+
subject.register_match!({:score => 0.05, :filter => self}) if @@combos[token.to_sym]
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require 'despamilator/
|
1
|
+
require 'despamilator/filter'
|
2
2
|
|
3
3
|
module DespamilatorFilter
|
4
4
|
|
5
|
-
class URLs < Despamilator::
|
5
|
+
class URLs < Despamilator::Filter
|
6
6
|
|
7
7
|
def name
|
8
8
|
'URLs'
|
@@ -12,13 +12,11 @@ module DespamilatorFilter
|
|
12
12
|
'Detects each url in a string'
|
13
13
|
end
|
14
14
|
|
15
|
-
def parse
|
16
|
-
text.downcase
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
1.upto(text.scan(/http:\/\//).length) do
|
21
|
-
self.append_score = 0.4
|
15
|
+
def parse subject
|
16
|
+
text = subject.text.downcase.gsub(/http:\/\/\d+\.\d+\.\d+\.\d+/, '')
|
17
|
+
matches = text.count(/https?:\/\//)
|
18
|
+
1.upto(matches > 2 ? 2 : matches) do
|
19
|
+
subject.register_match!({:score => 0.4, :filter => self})
|
22
20
|
end
|
23
21
|
end
|
24
22
|
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'despamilator/filter'
|
2
|
+
require 'domainatrix'
|
3
|
+
|
4
|
+
module DespamilatorFilter

  # Filter that penalises URLs whose domain, as reported by Domainatrix, is
  # longer than 20 characters.
  class VeryLongDomainName < Despamilator::Filter

    # Human readable name of this filter.
    def name
      'Very Long Domain Name'
    end

    # One-line explanation of what this filter looks for.
    def description
      'Detects unusually long domain names.'
    end

    # Scans the subject's text with URI.regexp and registers a 0.4 match for
    # every hit whose parsed domain exceeds 20 characters.
    #
    # subject - object exposing #text and #register_match!(Hash).
    def parse subject
      subject.text.scan(URI.regexp).each do |captures|
        # Drop the capture groups that did not participate in the match; the
        # second remaining capture is the piece treated as the host.
        present = captures.compact
        candidate = present[1]

        # Only continue for something shaped like "label.tld" with a label of
        # at least five word/hyphen characters.
        next unless candidate && candidate =~ /(\w|-){5,}\.\w{2,5}/

        parsed = Domainatrix.parse("http://#{candidate}")
        next unless parsed.domain.length > 20

        subject.register_match!({:score => 0.4, :filter => self})
      end
    end

  end

end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'despamilator/filter'
|
2
|
+
|
3
|
+
module DespamilatorFilter

  # Filter that scores text by how often punctuation appears in places where
  # ordinary prose would not put it.
  class WeirdPunctuation < Despamilator::Filter

    # Human readable name of this filter.
    def name
      'Weird Punctuation'
    end

    # One-line explanation of what this filter looks for.
    def description
      'Detects unusual use of punctuation.'
    end

    # Works on a lower-cased copy of the subject's text with URIs removed.
    # Some accepted punctuation uses are neutralised first, then a series of
    # patterns is removed from the copy and counted. When the combined count
    # is positive, a single match worth 0.03 per count is registered.
    #
    # subject - object exposing #text and #register_match!(Hash).
    def parse subject
      text = subject.text.without_uris.downcase

      # Neutralise uses that should not be penalised: an ampersand between
      # two word characters, and a letter followed by one "!" or "?" and then
      # whitespace or end of line.
      text.gsub!(/\w&\w/, 'xx')
      text.gsub!(/[a-z](!|\?)(\s|$)/, 'x')
      # Runs of 20 or more punctuation characters are dropped without being
      # counted.
      text.gsub!(/(?:#{punctuation}){20,}/, '')

      # Order matters: each pattern is removed from the text before the next
      # one is tried, so later patterns only see what earlier ones left.
      suspicious = [
        /(?:\W|\s|^)(#{punctuation})/,
        /\w,\w/,
        /\w\w\.\w/,
        /\w\.\w\w/,
        /(#{punctuation})(#{punctuation})/,
        /(#{punctuation})$/,
        /(?:\W|\s|^)\d+(#{punctuation})/
      ]

      matches = suspicious.inject(0) do |total, pattern|
        total + text.remove_and_count!(pattern)
      end

      subject.register_match!({:score => 0.03 * matches, :filter => self}) if matches > 0
    end

    private

    # Regexp source fragment: an alternation of the escaped punctuation
    # characters this filter cares about. Built once and memoised.
    def punctuation
      @punctuation ||= %w{~ ` ! @ # $ % ^ & * _ - + = , / ? | \\ : ;}.map { |character|
        Regexp.escape(character)
      }.join('|')
    end

  end

end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'despamilator/subject/text'
|
2
|
+
|
3
|
+
class Despamilator
  # Holds the text under inspection together with the running spam score and
  # the per-filter contributions registered against it.
  class Subject
    # score - Float total of every registered match.
    # text  - the Despamilator::Subject::Text built from the constructor input.
    attr_reader :score, :text

    # text - the raw string to be examined.
    def initialize text
      @score = 0.0
      @matches = {}
      @text = Despamilator::Subject::Text.new(text)
    end

    # Adds a filter's contribution to the total and to that filter's tally.
    #
    # details - Hash with :score (numeric contribution) and :filter (the
    #           object that produced it). Both are required.
    #
    # Both keys are validated before any state is touched, so a rejected
    # call leaves the total score and the per-filter tallies unchanged.
    #
    # Returns the filter's accumulated score after the addition.
    # Raises RuntimeError when :score or :filter is missing.
    def register_match! details
      points = details[:score] || raise('A score must be supplied')
      filter = details[:filter] || raise('A filter must be supplied')

      @score += points
      @matches[filter] = @matches.fetch(filter, 0.0) + points
    end

    # Returns an Array of {:filter => ..., :score => ...} hashes, one per
    # filter that registered a match, ordered from highest to lowest score.
    def matches
      @matches.map do |filter, score|
        {:filter => filter, :score => score}
      end.sort do |a, b|
        b[:score] <=> a[:score]
      end
    end

  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'uri'
|
2
|
+
|
3
|
+
class Despamilator
  class Subject
    # String subclass carrying the text under inspection, with a few helpers
    # used by the filters. Instances built through .new are frozen.
    #
    # NOTE(review): #count replaces String#count with a pattern-based
    # counter, so the character-set semantics of String#count are not
    # available on this class.
    class Text < String

      # text - the raw string; the new instance is frozen immediately.
      def initialize(text)
        super(text)
        freeze
      end

      # Returns a copy with every http(s)/mailto/ftp URI removed, along with
      # the single whitespace character that terminates it (if any).
      def without_uris
        gsub(/\b(?:https?|mailto|ftp):.+?(\s|$)/i, '')
      end

      # Returns the pieces of the text separated by runs of non-word
      # characters.
      def words
        split(/\W+/)
      end

      # Returns how many non-nil items scanning for the pattern yields: one
      # per match for a pattern without capture groups, otherwise one per
      # capture group that participated in a match.
      def count(pattern)
        scan(pattern).flatten.compact.size
      end

      # Counts the pattern as #count does, then deletes every match from the
      # receiver in place. Returns the count taken before the deletion.
      def remove_and_count!(pattern)
        count(pattern).tap { gsub!(pattern, '') }
      end

    end
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: despamilator
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: '2.1'
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,102 +9,58 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-
|
13
|
-
default_executable:
|
12
|
+
date: 2011-09-11 00:00:00.000000000Z
|
14
13
|
dependencies:
|
15
14
|
- !ruby/object:Gem::Dependency
|
16
|
-
name:
|
17
|
-
requirement: &
|
15
|
+
name: domainatrix
|
16
|
+
requirement: &70312466224620 !ruby/object:Gem::Requirement
|
18
17
|
none: false
|
19
18
|
requirements:
|
20
19
|
- - ! '>='
|
21
20
|
- !ruby/object:Gem::Version
|
22
|
-
version:
|
23
|
-
type: :
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
24
23
|
prerelease: false
|
25
|
-
version_requirements: *
|
26
|
-
- !ruby/object:Gem::Dependency
|
27
|
-
name: hoe
|
28
|
-
requirement: &2730610 !ruby/object:Gem::Requirement
|
29
|
-
none: false
|
30
|
-
requirements:
|
31
|
-
- - ! '>='
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: 2.7.0
|
34
|
-
type: :development
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: *2730610
|
24
|
+
version_requirements: *70312466224620
|
37
25
|
description: ! 'Despamilator is a plugin based spam detector designed for use on your
|
38
|
-
web forms borne out of two annoyances:
|
39
|
-
|
40
|
-
|
41
|
-
will apply
|
42
|
-
|
43
|
-
some commonly used heuristics from the world of anti-spam to help you decide whether
|
44
|
-
your users are human or machine.'
|
26
|
+
web forms borne out of two annoyances: Spam being submitted in my web forms and
|
27
|
+
CAPTCHAS being intrusive. Despamilator will apply some commonly used heuristics
|
28
|
+
from the world of anti-spam to help you decide whether your users are human or machine.'
|
45
29
|
email:
|
46
30
|
- moowahaha@hotmail.com
|
47
31
|
executables: []
|
48
32
|
extensions: []
|
49
|
-
extra_rdoc_files:
|
50
|
-
- History.txt
|
51
|
-
- Manifest.txt
|
52
|
-
- PostInstall.txt
|
53
|
-
- conf/unusual_characters.txt
|
33
|
+
extra_rdoc_files: []
|
54
34
|
files:
|
55
|
-
- .rspec
|
56
|
-
- .rvmrc
|
57
|
-
- Gemfile
|
58
|
-
- Gemfile.lock
|
59
|
-
- History.txt
|
60
|
-
- Manifest.txt
|
61
|
-
- PostInstall.txt
|
62
|
-
- README.rdoc
|
63
|
-
- Rakefile
|
64
|
-
- conf/unusual_characters.txt
|
65
|
-
- despamilator.gemspec
|
66
|
-
- lib/despamilator.rb
|
67
|
-
- lib/despamilator/filter.rb
|
68
35
|
- lib/despamilator/filter/gtubs_test_filter.rb
|
69
36
|
- lib/despamilator/filter/html_tags.rb
|
70
37
|
- lib/despamilator/filter/ip_address_url.rb
|
71
38
|
- lib/despamilator/filter/long_words.rb
|
39
|
+
- lib/despamilator/filter/mixed_case.rb
|
72
40
|
- lib/despamilator/filter/naughty_words.rb
|
73
41
|
- lib/despamilator/filter/numbers_and_words.rb
|
42
|
+
- lib/despamilator/filter/obfuscated_urls.rb
|
43
|
+
- lib/despamilator/filter/prices.rb
|
74
44
|
- lib/despamilator/filter/script_tag.rb
|
75
45
|
- lib/despamilator/filter/shouting.rb
|
46
|
+
- lib/despamilator/filter/spammy_tlds.rb
|
76
47
|
- lib/despamilator/filter/square_brackets.rb
|
77
48
|
- lib/despamilator/filter/trailing_number.rb
|
78
49
|
- lib/despamilator/filter/unusual_characters.rb
|
79
50
|
- lib/despamilator/filter/urls.rb
|
80
|
-
- lib/despamilator/
|
81
|
-
-
|
82
|
-
-
|
83
|
-
-
|
84
|
-
-
|
85
|
-
-
|
86
|
-
-
|
87
|
-
-
|
88
|
-
-
|
89
|
-
- spec/filters/naughty_words_spec.rb
|
90
|
-
- spec/filters/numbers_and_words_spec.rb
|
91
|
-
- spec/filters/script_tag_spec.rb
|
92
|
-
- spec/filters/shouting_spec.rb
|
93
|
-
- spec/filters/square_brackets_spec.rb
|
94
|
-
- spec/filters/trailing_number_spec.rb
|
95
|
-
- spec/filters/unusual_characters_spec.rb
|
96
|
-
- spec/filters/urls_spec.rb
|
97
|
-
- spec/helpers/corpus_helper.rb
|
98
|
-
- spec/helpers/filter_helper.rb
|
99
|
-
- spec/helpers/spec_helper.rb
|
100
|
-
- tasks/test.rake
|
101
|
-
has_rdoc: true
|
51
|
+
- lib/despamilator/filter/very_long_domain_name.rb
|
52
|
+
- lib/despamilator/filter/weird_punctuation.rb
|
53
|
+
- lib/despamilator/filter.rb
|
54
|
+
- lib/despamilator/subject/text.rb
|
55
|
+
- lib/despamilator/subject.rb
|
56
|
+
- lib/despamilator/version.rb
|
57
|
+
- lib/despamilator.rb
|
58
|
+
- README.rdoc
|
59
|
+
- History.txt
|
102
60
|
homepage: http://github.com/moowahaha/despamilator
|
103
61
|
licenses: []
|
104
|
-
post_install_message:
|
105
|
-
rdoc_options:
|
106
|
-
- --main
|
107
|
-
- README.rdoc
|
62
|
+
post_install_message:
|
63
|
+
rdoc_options: []
|
108
64
|
require_paths:
|
109
65
|
- lib
|
110
66
|
required_ruby_version: !ruby/object:Gem::Requirement
|
@@ -118,13 +74,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
118
74
|
requirements:
|
119
75
|
- - ! '>='
|
120
76
|
- !ruby/object:Gem::Version
|
121
|
-
version:
|
77
|
+
version: 1.3.6
|
122
78
|
requirements: []
|
123
79
|
rubyforge_project: despamilator
|
124
|
-
rubygems_version: 1.
|
80
|
+
rubygems_version: 1.8.6
|
125
81
|
signing_key:
|
126
82
|
specification_version: 3
|
127
|
-
summary:
|
128
|
-
web forms borne out of two annoyances: Spam being submitted in my web forms and
|
129
|
-
CAPTCHAS being intrusive'
|
83
|
+
summary: Stop web form Spam!
|
130
84
|
test_files: []
|
data/.rspec
DELETED
data/.rvmrc
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
rvm --create use 1.9.2@despamilator
|
data/Gemfile
DELETED