despamilator 2.0.1 → 2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. data/History.txt +7 -0
  2. data/lib/despamilator.rb +38 -7
  3. data/lib/despamilator/filter.rb +39 -23
  4. data/lib/despamilator/filter/gtubs_test_filter.rb +4 -4
  5. data/lib/despamilator/filter/html_tags.rb +9 -7
  6. data/lib/despamilator/filter/ip_address_url.rb +6 -4
  7. data/lib/despamilator/filter/long_words.rb +7 -5
  8. data/lib/despamilator/filter/mixed_case.rb +21 -0
  9. data/lib/despamilator/filter/naughty_words.rb +5 -5
  10. data/lib/despamilator/filter/numbers_and_words.rb +19 -11
  11. data/lib/despamilator/filter/obfuscated_urls.rb +41 -0
  12. data/lib/despamilator/filter/prices.rb +19 -0
  13. data/lib/despamilator/filter/script_tag.rb +4 -4
  14. data/lib/despamilator/filter/shouting.rb +9 -6
  15. data/lib/despamilator/filter/spammy_tlds.rb +22 -0
  16. data/lib/despamilator/filter/square_brackets.rb +5 -5
  17. data/lib/despamilator/filter/trailing_number.rb +4 -4
  18. data/lib/despamilator/filter/unusual_characters.rb +5 -5
  19. data/lib/despamilator/filter/urls.rb +7 -9
  20. data/lib/despamilator/filter/very_long_domain_name.rb +27 -0
  21. data/lib/despamilator/filter/weird_punctuation.rb +44 -0
  22. data/lib/despamilator/subject.rb +30 -0
  23. data/lib/despamilator/subject/text.rb +32 -0
  24. data/lib/despamilator/version.rb +3 -0
  25. metadata +29 -75
  26. data/.rspec +0 -2
  27. data/.rvmrc +0 -1
  28. data/Gemfile +0 -12
  29. data/Gemfile.lock +0 -47
  30. data/Manifest.txt +0 -46
  31. data/PostInstall.txt +0 -1
  32. data/Rakefile +0 -39
  33. data/conf/unusual_characters.txt +0 -6674
  34. data/despamilator.gemspec +0 -38
  35. data/lib/despamilator/filter_base.rb +0 -82
  36. data/scripts/despamilator_score.rb +0 -25
  37. data/scripts/from_file.rb +0 -26
  38. data/spec/despamilator_spec.rb +0 -13
  39. data/spec/filter_base_spec.rb +0 -30
  40. data/spec/filters/gtubs_test_filter_spec.rb +0 -9
  41. data/spec/filters/html_tags_spec.rb +0 -129
  42. data/spec/filters/ip_address_url_spec.rb +0 -11
  43. data/spec/filters/long_words_spec.rb +0 -11
  44. data/spec/filters/naughty_words_spec.rb +0 -11
  45. data/spec/filters/numbers_and_words_spec.rb +0 -34
  46. data/spec/filters/script_tag_spec.rb +0 -22
  47. data/spec/filters/shouting_spec.rb +0 -45
  48. data/spec/filters/square_brackets_spec.rb +0 -11
  49. data/spec/filters/trailing_number_spec.rb +0 -10
  50. data/spec/filters/unusual_characters_spec.rb +0 -9
  51. data/spec/filters/urls_spec.rb +0 -11
  52. data/spec/helpers/corpus_helper.rb +0 -5
  53. data/spec/helpers/filter_helper.rb +0 -59
  54. data/spec/helpers/spec_helper.rb +0 -6
  55. data/tasks/test.rake +0 -6
@@ -0,0 +1,22 @@
1
+ require 'despamilator/filter'
2
+
3
module DespamilatorFilter

  # Filter that scores text containing host-like tokens ending in the
  # ".info" or ".biz" top-level domains.
  class SpammyTLDs < Despamilator::Filter

    # Human-readable name of this filter.
    def name
      'Spammy TLDs'
    end

    # One-line explanation of what the filter looks for.
    def description
      'Detects TLDs that are more commonly associated with spam.'
    end

    # Counts occurrences of a word of five or more characters followed by
    # ".info" or ".biz" in the subject's text and, when there is at least
    # one, registers a single match scored at 0.05 per occurrence.
    #
    # Returns nil when nothing matched, otherwise whatever
    # subject.register_match! returns.
    def parse(subject)
      hits = subject.text.count(/\w{5,}\.(info|biz)\b/)
      return unless hits > 0

      subject.register_match!({:score => 0.05 * hits, :filter => self})
    end

  end

end
@@ -1,8 +1,8 @@
1
- require 'despamilator/filter_base'
1
+ require 'despamilator/filter'
2
2
 
3
3
  module DespamilatorFilter
4
4
 
5
- class SquareBrackets < Despamilator::FilterBase
5
+ class SquareBrackets < Despamilator::Filter
6
6
 
7
7
  def name
8
8
  'Square Brackets'
@@ -12,9 +12,9 @@ module DespamilatorFilter
12
12
  'Detects each square bracket in a string'
13
13
  end
14
14
 
15
- def parse text
16
- text.downcase.scan(/(\[|\])/).each do |match|
17
- self.append_score = 0.05
15
+ def parse subject
16
+ subject.text.downcase.scan(/(\[|\])/).each do |match|
17
+ subject.register_match!({:score => 0.05, :filter => self})
18
18
  end
19
19
  end
20
20
 
@@ -1,8 +1,8 @@
1
- require 'despamilator/filter_base'
1
+ require 'despamilator/filter'
2
2
 
3
3
  module DespamilatorFilter
4
4
 
5
- class TrailingNumber < Despamilator::FilterBase
5
+ class TrailingNumber < Despamilator::Filter
6
6
 
7
7
  def name
8
8
  'Trailing Number'
@@ -12,8 +12,8 @@ module DespamilatorFilter
12
12
  'Detects a trailing cache busting number'
13
13
  end
14
14
 
15
- def parse text
16
- self.append_score = 0.1 if text =~ /\b\d+\s*$/
15
+ def parse subject
16
+ subject.register_match!({:score => 0.1, :filter => self}) if subject.text.without_uris =~ /\b\d+\s*$/
17
17
  end
18
18
 
19
19
  end
@@ -1,8 +1,8 @@
1
- require 'despamilator/filter_base'
1
+ require 'despamilator/filter'
2
2
 
3
3
  module DespamilatorFilter
4
4
 
5
- class UnusualCharacters < Despamilator::FilterBase
5
+ class UnusualCharacters < Despamilator::Filter
6
6
 
7
7
  def name
8
8
  'Unusual Characters'
@@ -12,10 +12,10 @@ module DespamilatorFilter
12
12
  'Detects and scores each occurrence of an unusual 2 or 3 character combination'
13
13
  end
14
14
 
15
- def parse text
15
+ def parse subject
16
16
  initialize_combos
17
- tokenize(text).each do |token|
18
- self.append_score = 0.05 if @@combos[token.to_sym]
17
+ tokenize(subject.text.without_uris).each do |token|
18
+ subject.register_match!({:score => 0.05, :filter => self}) if @@combos[token.to_sym]
19
19
  end
20
20
  end
21
21
 
@@ -1,8 +1,8 @@
1
- require 'despamilator/filter_base'
1
+ require 'despamilator/filter'
2
2
 
3
3
  module DespamilatorFilter
4
4
 
5
- class URLs < Despamilator::FilterBase
5
+ class URLs < Despamilator::Filter
6
6
 
7
7
  def name
8
8
  'URLs'
@@ -12,13 +12,11 @@ module DespamilatorFilter
12
12
  'Detects each url in a string'
13
13
  end
14
14
 
15
- def parse text
16
- text.downcase!
17
-
18
- text.gsub!(/http:\/\/\d+\.\d+\.\d+\.\d+/, '')
19
-
20
- 1.upto(text.scan(/http:\/\//).length) do
21
- self.append_score = 0.4
15
+ def parse subject
16
+ text = subject.text.downcase.gsub(/http:\/\/\d+\.\d+\.\d+\.\d+/, '')
17
+ matches = text.count(/https?:\/\//)
18
+ 1.upto(matches > 2 ? 2 : matches) do
19
+ subject.register_match!({:score => 0.4, :filter => self})
22
20
  end
23
21
  end
24
22
 
@@ -0,0 +1,27 @@
1
+ require 'despamilator/filter'
2
+ require 'domainatrix'
3
+
4
module DespamilatorFilter

  # Filter that registers a match for every URI in the text whose
  # Domainatrix-reported `domain` is longer than 20 characters.
  class VeryLongDomainName < Despamilator::Filter

    # Human-readable name of this filter.
    def name
      'Very Long Domain Name'
    end

    # One-line explanation of what the filter looks for.
    def description
      'Detects unusually long domain names.'
    end

    # Scans the subject's text with URI.regexp and scores 0.4 for each URI
    # whose parsed domain exceeds 20 characters.
    def parse(subject)
      subject.text.scan(URI.regexp).each do |captures|
        # Drop the capture groups that did not participate in the match;
        # the candidate host is then taken from the second remaining group.
        # NOTE(review): for URIs with userinfo or an opaque part the second
        # non-nil capture is presumably not the host - confirm this is intended.
        captures.compact!
        candidate = captures[1]

        # Only consider candidates that look like "name.tld" with a name of
        # at least five word/hyphen characters.
        next unless candidate && candidate =~ /(\w|-){5,}\.\w{2,5}/

        parsed = Domainatrix.parse('http://' + candidate)
        next unless parsed.domain.length > 20

        subject.register_match!({:score => 0.4, :filter => self})
      end
    end

  end

end
@@ -0,0 +1,44 @@
1
+ require 'despamilator/filter'
2
+
3
module DespamilatorFilter

  # Filter that scores unusual punctuation usage in the subject's text
  # (URIs are stripped first so that their punctuation is not counted).
  class WeirdPunctuation < Despamilator::Filter

    # Human-readable name of this filter.
    def name
      'Weird Punctuation'
    end

    # One-line explanation of what the filter looks for.
    def description
      'Detects unusual use of punctuation.'
    end

    # Counts suspicious punctuation patterns and registers a single match
    # scored at 0.03 per occurrence when at least one is found.
    #
    # The working copy is treated as a plain mutable String: since Ruby 3.0
    # String#gsub and String#downcase return String rather than the
    # receiver's subclass, so this method must not rely on helper methods
    # that only exist on the subject's text class.
    def parse(subject)
      text = subject.text.without_uris.downcase

      # Neutralise punctuation that is considered normal before counting:
      # "a&b", a letter followed by a sentence-ending "!" or "?", and long
      # runs (20+) of punctuation characters.
      text.gsub!(/\w&\w/, 'xx')
      text.gsub!(/[a-z](!|\?)(\s|$)/, 'x')
      text.gsub!(/(?:#{punctuation}){20,}/, '')

      # Order matters: every pattern is removed from the text once counted,
      # so later patterns only see what earlier ones left behind.
      patterns = [
        /(?:\W|\s|^)(#{punctuation})/,
        /\w,\w/,
        /\w\w\.\w/,
        /\w\.\w\w/,
        /(#{punctuation})(#{punctuation})/,
        /(#{punctuation})$/,
        /(?:\W|\s|^)\d+(#{punctuation})/
      ]

      matches = patterns.inject(0) do |total, pattern|
        total + remove_and_count!(text, pattern)
      end

      subject.register_match!({:score => 0.03 * matches, :filter => self}) if matches > 0
    end

    private

    # Returns how many results scanning +text+ for +pattern+ yields (every
    # non-nil capture counts once; a pattern without groups counts once per
    # match) and then deletes all matches from +text+ in place.
    def remove_and_count!(text, pattern)
      found = text.scan(pattern).flatten.compact.length
      text.gsub!(pattern, '')
      found
    end

    # Regexp alternation source ("a|b|c") of the escaped punctuation
    # characters this filter cares about; built once per filter instance.
    def punctuation
      @punctuation ||= %w{~ ` ! @ # $ % ^ & * _ - + = , / ? | \\ : ;}.map do |punctuation_character|
        Regexp.escape(punctuation_character)
      end.join('|')
    end

  end

end
@@ -0,0 +1,30 @@
1
+ require 'despamilator/subject/text'
2
+
3
class Despamilator

  # The piece of text being inspected together with the score and the
  # per-filter matches accumulated for it.
  class Subject
    # score - running total (Float) of every registered match.
    # text  - the inspected text wrapped in Despamilator::Subject::Text.
    attr_reader :score, :text

    # Wraps +text+ and starts with a zero score and no matches.
    def initialize(text)
      @score = 0.0
      @matches = {}
      @text = Despamilator::Subject::Text.new(text)
    end

    # Records a match.
    #
    # details - Hash with the keys :score (numeric amount to add) and
    #           :filter (the object that detected the match).
    #
    # Raises RuntimeError when either key is missing or nil. Both values
    # are validated before any state is touched, so a rejected call leaves
    # the total score and the per-filter totals unchanged.
    #
    # Returns the updated total for the given filter.
    def register_match!(details)
      score = details[:score] || raise('A score must be supplied')
      filter = details[:filter] || raise('A filter must be supplied')

      @score += score
      @matches[filter] = (@matches[filter] || 0.0) + score
    end

    # Returns an Array of {:filter => ..., :score => ...} Hashes, one per
    # filter that registered a match, ordered by score from highest to
    # lowest.
    def matches
      totals = @matches.map do |filter, score|
        {:filter => filter, :score => score}
      end

      totals.sort { |a, b| b[:score] <=> a[:score] }
    end

  end
end
@@ -0,0 +1,32 @@
1
+ require 'uri'
2
+
3
class Despamilator
  class Subject

    # String subclass wrapping the text under inspection. Instances built
    # with Text.new are frozen; without_uris and downcase hand back
    # unfrozen Text copies that can be mutated with remove_and_count!.
    class Text < String

      # Copies +text+ into the new instance and freezes it.
      def initialize(text)
        super(text)
        freeze
      end

      # Returns an unfrozen Text with every http(s)/mailto/ftp URI (up to
      # and including the whitespace that ends it) removed.
      #
      # The result is wrapped explicitly because since Ruby 3.0 String#gsub
      # returns a plain String instead of an instance of the subclass.
      def without_uris
        derive(gsub(/\b(?:https?|mailto|ftp):.+?(\s|$)/i, ''))
      end

      # Same as String#downcase but returns an unfrozen Text, so the
      # helpers below stay available after lower-casing on every Ruby
      # version.
      def downcase(*args)
        derive(super)
      end

      # Splits the text on runs of non-word characters.
      def words
        split(/\W+/)
      end

      # Returns the number of results found when scanning for +pattern+:
      # each match counts once when the pattern has no capture groups,
      # otherwise each non-nil capture counts once.
      #
      # Note that this replaces String#count's character-set semantics.
      def count(pattern)
        scan(pattern).flatten.compact.length
      end

      # Counts +pattern+ as #count does, deletes every match from the
      # receiver in place and returns the count. Must be called on an
      # unfrozen instance (e.g. the result of without_uris).
      def remove_and_count!(pattern)
        occurrences = count(pattern)
        gsub!(pattern, '')
        occurrences
      end

      private

      # Builds an unfrozen instance of this class holding +string+.
      # The constructor freezes its result; dup does not carry the frozen
      # state over.
      def derive(string)
        self.class.new(string).dup
      end

    end
  end
end
@@ -0,0 +1,3 @@
1
class Despamilator
  # Gem version. Kept as a String because a Float cannot tell "2.1" from
  # "2.10" and version numbers are not decimal fractions.
  VERSION = '2.1'.freeze
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: despamilator
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.1
4
+ version: '2.1'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,102 +9,58 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-08-11 00:00:00.000000000 -04:00
13
- default_executable:
12
+ date: 2011-09-11 00:00:00.000000000Z
14
13
  dependencies:
15
14
  - !ruby/object:Gem::Dependency
16
- name: rubyforge
17
- requirement: &2730850 !ruby/object:Gem::Requirement
15
+ name: domainatrix
16
+ requirement: &70312466224620 !ruby/object:Gem::Requirement
18
17
  none: false
19
18
  requirements:
20
19
  - - ! '>='
21
20
  - !ruby/object:Gem::Version
22
- version: 2.0.4
23
- type: :development
21
+ version: '0'
22
+ type: :runtime
24
23
  prerelease: false
25
- version_requirements: *2730850
26
- - !ruby/object:Gem::Dependency
27
- name: hoe
28
- requirement: &2730610 !ruby/object:Gem::Requirement
29
- none: false
30
- requirements:
31
- - - ! '>='
32
- - !ruby/object:Gem::Version
33
- version: 2.7.0
34
- type: :development
35
- prerelease: false
36
- version_requirements: *2730610
24
+ version_requirements: *70312466224620
37
25
  description: ! 'Despamilator is a plugin based spam detector designed for use on your
38
- web forms borne out of two annoyances:
39
-
40
- Spam being submitted in my web forms and CAPTCHAS being intrusive. Despamilator
41
- will apply
42
-
43
- some commonly used heuristics from the world of anti-spam to help you decide whether
44
- your users are human or machine.'
26
+ web forms borne out of two annoyances: Spam being submitted in my web forms and
27
+ CAPTCHAS being intrusive. Despamilator will apply some commonly used heuristics
28
+ from the world of anti-spam to help you decide whether your users are human or machine.'
45
29
  email:
46
30
  - moowahaha@hotmail.com
47
31
  executables: []
48
32
  extensions: []
49
- extra_rdoc_files:
50
- - History.txt
51
- - Manifest.txt
52
- - PostInstall.txt
53
- - conf/unusual_characters.txt
33
+ extra_rdoc_files: []
54
34
  files:
55
- - .rspec
56
- - .rvmrc
57
- - Gemfile
58
- - Gemfile.lock
59
- - History.txt
60
- - Manifest.txt
61
- - PostInstall.txt
62
- - README.rdoc
63
- - Rakefile
64
- - conf/unusual_characters.txt
65
- - despamilator.gemspec
66
- - lib/despamilator.rb
67
- - lib/despamilator/filter.rb
68
35
  - lib/despamilator/filter/gtubs_test_filter.rb
69
36
  - lib/despamilator/filter/html_tags.rb
70
37
  - lib/despamilator/filter/ip_address_url.rb
71
38
  - lib/despamilator/filter/long_words.rb
39
+ - lib/despamilator/filter/mixed_case.rb
72
40
  - lib/despamilator/filter/naughty_words.rb
73
41
  - lib/despamilator/filter/numbers_and_words.rb
42
+ - lib/despamilator/filter/obfuscated_urls.rb
43
+ - lib/despamilator/filter/prices.rb
74
44
  - lib/despamilator/filter/script_tag.rb
75
45
  - lib/despamilator/filter/shouting.rb
46
+ - lib/despamilator/filter/spammy_tlds.rb
76
47
  - lib/despamilator/filter/square_brackets.rb
77
48
  - lib/despamilator/filter/trailing_number.rb
78
49
  - lib/despamilator/filter/unusual_characters.rb
79
50
  - lib/despamilator/filter/urls.rb
80
- - lib/despamilator/filter_base.rb
81
- - scripts/despamilator_score.rb
82
- - scripts/from_file.rb
83
- - spec/despamilator_spec.rb
84
- - spec/filter_base_spec.rb
85
- - spec/filters/gtubs_test_filter_spec.rb
86
- - spec/filters/html_tags_spec.rb
87
- - spec/filters/ip_address_url_spec.rb
88
- - spec/filters/long_words_spec.rb
89
- - spec/filters/naughty_words_spec.rb
90
- - spec/filters/numbers_and_words_spec.rb
91
- - spec/filters/script_tag_spec.rb
92
- - spec/filters/shouting_spec.rb
93
- - spec/filters/square_brackets_spec.rb
94
- - spec/filters/trailing_number_spec.rb
95
- - spec/filters/unusual_characters_spec.rb
96
- - spec/filters/urls_spec.rb
97
- - spec/helpers/corpus_helper.rb
98
- - spec/helpers/filter_helper.rb
99
- - spec/helpers/spec_helper.rb
100
- - tasks/test.rake
101
- has_rdoc: true
51
+ - lib/despamilator/filter/very_long_domain_name.rb
52
+ - lib/despamilator/filter/weird_punctuation.rb
53
+ - lib/despamilator/filter.rb
54
+ - lib/despamilator/subject/text.rb
55
+ - lib/despamilator/subject.rb
56
+ - lib/despamilator/version.rb
57
+ - lib/despamilator.rb
58
+ - README.rdoc
59
+ - History.txt
102
60
  homepage: http://github.com/moowahaha/despamilator
103
61
  licenses: []
104
- post_install_message: PostInstall.txt
105
- rdoc_options:
106
- - --main
107
- - README.rdoc
62
+ post_install_message:
63
+ rdoc_options: []
108
64
  require_paths:
109
65
  - lib
110
66
  required_ruby_version: !ruby/object:Gem::Requirement
@@ -118,13 +74,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
118
74
  requirements:
119
75
  - - ! '>='
120
76
  - !ruby/object:Gem::Version
121
- version: '0'
77
+ version: 1.3.6
122
78
  requirements: []
123
79
  rubyforge_project: despamilator
124
- rubygems_version: 1.5.2
80
+ rubygems_version: 1.8.6
125
81
  signing_key:
126
82
  specification_version: 3
127
- summary: ! 'Despamilator is a plugin based spam detector designed for use on your
128
- web forms borne out of two annoyances: Spam being submitted in my web forms and
129
- CAPTCHAS being intrusive'
83
+ summary: Stop web form Spam!
130
84
  test_files: []
data/.rspec DELETED
@@ -1,2 +0,0 @@
1
- --color
2
- --require=./spec/helpers/spec_helper.rb
data/.rvmrc DELETED
@@ -1 +0,0 @@
1
- rvm --create use 1.9.2@despamilator
data/Gemfile DELETED
@@ -1,12 +0,0 @@
1
- # A sample Gemfile
2
- source "http://rubygems.org"
3
-
4
- gem 'hoe', '>= 2.7.0'
5
- gem 'newgem', '>= 1.5.3'
6
- gem 'rdoc', '>= 3.2'
7
-
8
- group :test do
9
- gem 'rspec', '>= 2.0.1'
10
- gem 'simplecov', '>= 0.3.7'
11
- gem 'one_hundred_percent_coverage'
12
- end