burnspam 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ require 'rake/testtask'
2
+
3
# Register the test task and add the test/ directory to its load path.
Rake::TestTask.new { |t| t.libs << 'test' }

# Make the test task the default, so a bare `rake` runs the tests.
desc "Run tests"
task :default => :test
data/lib/burnspam.rb ADDED
@@ -0,0 +1,38 @@
1
+ # The program takes a comment input as three
2
+ # parameters: email, name and content.
3
+ # It performs some basic checks to determine the
4
+ # "spaminess" of a comment. Higher values are better
5
+ # while negative numbers are likely spam.
6
+ #
7
+ # It also stores the most recent comments in
8
+ # memory and checks that new comments are
9
+ # not duplicates. If a new comment is found already
10
+ # existing in memory, the spaminess value goes down!
11
+ #
12
+ # Version:: 0.1.1
13
+ #
14
+ # Date:: 2011/12/18
15
+ #
16
+ # @author:: Brian Burns, x10205284
17
+ #
18
+ # ==Burnspam#spaminess return values:
19
+ # * spaminess < 0:: Obvious spam
20
+ # * spaminess 0 - 2:: Questionable quality
21
+ # * spaminess > 2:: Good quality comment
22
+
23
class Burnspam
  # The PointTracker built for this comment; exposes the per-field
  # statistics through its +name+ and +content+ readers.
  attr_reader :points

  # Analyses the comment immediately by handing all three fields to a
  # new PointTracker.
  #
  # email::   the commenter's email address
  # name::    the commenter's name
  # content:: the comment body
  def initialize(email, name, content)
    @points = PointTracker.new(email, name, content)
  end

  # Returns the spaminess score computed by the PointTracker.
  def spaminess
    points.spaminess
  end
end
37
+
38
+ require 'burnspam/point_tracker'
@@ -0,0 +1,278 @@
1
+ # PointTracker holds all the statistics of the name
2
+ # and comment inside @name and @content
3
+ #
4
+ # The name and content are passed into the relevant
5
+ # Strategy (Strategy Design Pattern).
6
+ #
7
+ # The checker interchanges these strategies. This
8
+ # allows slightly different processing for the name
9
+ # and content as well as a shared method made
10
+ # available through the GeneralStrategy class.
11
+ #
12
+ # The Checker returns the results to @name and @content
13
+ #
14
+ # Version:: 0.1.1
15
+ #
16
+ # Date:: 2011/12/18
17
+ #
18
+ # @author:: Brian Burns, x10205284
19
+ # @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
20
class Burnspam::PointTracker

  # Checker results for the name field and for the comment body.
  attr_reader :name, :content

  # Runs the name and content checkers straight away.
  # +email+ is accepted but is not analysed by this class.
  def initialize(email, name, content)
    name_strategy = NameCheckerStrategy.new(name)
    @name = Checker.new(name_strategy)
    content_strategy = ContentCheckerStrategy.new(content)
    @content = Checker.new(content_strategy)
  end

  # Adds up every individual score and returns the total
  # (also kept in @total).
  def spaminess
    body = @content
    scores = [
      body_urls(body.count_urls),
      name_urls(@name.count_urls),
      body_length(body.length, body.count_urls),
      keywordsearch(body.keyword_count),
      urlength(body.url_length),
      startswith?(body.starts_with),
      duplicate?(body.duplicate)
    ]
    @total = scores.inject(0) { |sum, score| sum + score }
  end

  # Score for the number of URLs in the body:
  # * fewer than 2 URLs: +2
  # * exactly 2 URLs: 0
  # * more than 2 URLs: -1 per URL
  def body_urls(count)
    return 2 if count < 2
    return -count if count > 2

    0
  end

  # A URL in the name costs 2 points.
  def name_urls(count)
    return 0 unless count > 0

    -2
  end

  # Score for the body length:
  # * shorter than 20 characters: -1
  # * longer than 20 characters with no URLs: +2
  # * anything else (including exactly 20 characters): 0
  def body_length(size, counturls)
    return -1 if size < 20

    size > 20 && counturls == 0 ? 2 : 0
  end

  # Every bad keyword costs 1 point.
  def keywordsearch(count)
    -count
  end

  # An average URL length above 30 characters costs 1 point.
  def urlength(size)
    if size > 30
      -1
    else
      0
    end
  end

  # A comment flagged as starting with a bad keyword loses 10 points;
  # spammy comments often start like this.
  def startswith?(word)
    if word
      -10
    else
      0
    end
  end

  # A comment flagged as a duplicate loses 5 points.
  def duplicate?(post)
    if post
      -5
    else
      0
    end
  end
end
93
+
94
+ # Checker calls the methods of the strategies and
95
+ # holds the results in accessible instance variables.
96
+ # The results are either Integer or Boolean
97
+ #
98
+ # Version:: 0.1.1
99
+ #
100
+ # Date:: 2011/12/18
101
+ #
102
+ # @author:: Brian Burns, x10205284
103
+ # @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
104
class Checker
  # Counts and lengths reported by the strategy.
  attr_reader :count_urls, :url_length, :length, :keyword_count
  # Flags reported by the strategy's predicate methods.
  attr_reader :starts_with, :duplicate
  # Value of the strategy's +test+ method.
  attr_reader :test

  # Queries the strategy once, in a fixed order, and stores every answer.
  # The strategy must respond to count_urls, url_length, length,
  # starts_with?, duplicate?, test and keywords.
  def initialize(strategy)
    @count_urls    = strategy.count_urls
    @url_length    = strategy.url_length
    @length        = strategy.length
    @starts_with   = strategy.starts_with?
    @duplicate     = strategy.duplicate?
    @test          = strategy.test
    @keyword_count = strategy.keywords
  end
end
119
+
120
+ # The GeneralStrategy class stores the analysis methods that
121
+ # can be common to both NameChecker and ContentChecker Strategy
122
+ #
123
+ # It also stores a list of bad keywords for comments
124
+ # and bad keywords for URLs
125
+ #
126
+ # Version:: 0.1.1
127
+ #
128
+ # Date:: 2011/12/18
129
+ #
130
+ # @author:: Brian Burns, x10205284
131
+ # @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
132
class GeneralStrategy

  # Words that cost the comment a point each time they appear.
  @@comparisonwords = ["feck", "bitch"]

  # URL fragments treated as suspicious; the .de, .pl and .cn entries
  # come from the referenced spam-scoring article.
  # NOTE(review): no code visible in this file reads this list yet.
  @@comparisonwordsurl = [".html", ".info", "?", "&", "free",
                          ".de", ".pl", ".cn"]

  # The only analysis currently shared by the strategies; the others
  # are implemented per strategy.
  #
  # Counts the entries of @keywords (set by the subclass constructor)
  # that equal one of the bad words. The comparison ignores case, so
  # "Feck" and "FECK" count just like "feck". An unset or empty
  # @keywords gives 0.
  #
  # Returns the Integer count (also kept in @keywords_count).
  def keywords
    @keywords_count = Array(@keywords).count do |word|
      @@comparisonwords.any? { |bad| bad.casecmp(word.to_s) == 0 }
    end
  end

  # Returns the fixed value 44; the test suite uses it to confirm that
  # subclasses inherit from this class.
  def test
    44
  end

end
163
+
164
+ # A specialized strategy for analyzing the name field.
165
+ # Analysis for name usually requires much less processing
166
+ # than comment body.
167
+ #
168
+ # Version:: 0.1.1
169
+ #
170
+ # Date:: 2011/12/18
171
+ #
172
+ # @author:: Brian Burns, x10205284
173
+ # @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
174
class NameCheckerStrategy < GeneralStrategy

  # Stores the name as a string and, for the shared keyword check, as a
  # one-element word list (the whole name is compared as a single word).
  # A nil name is treated as an empty string.
  def initialize(name)
    @name = name.to_s
    @keywords = [@name]
  end

  # Returns 1 when the name contains an http:// or https:// URL,
  # otherwise 0. A name is only ever counted as zero or one URL.
  def count_urls
    if @name =~ %r{https?://}
      1
    else
      0
    end
  end

  # Returns the length of the name when it contains a URL, otherwise 0.
  # The count is compared with zero explicitly because 0 is truthy in
  # Ruby; testing count_urls directly would always pick the first branch.
  def url_length
    count_urls > 0 ? @name.length : 0
  end

  # Returns the length of the name.
  def length
    @name.length
  end

  # Not meaningful for names; present only so Checker can call it.
  # Always false.
  def starts_with?
    false
  end

  # Not meaningful for names; present only so Checker can call it.
  # Always false.
  def duplicate?
    false
  end

end
210
+
211
+ # A specialized strategy for analyzing the content field.
212
+ #
213
+ # Version:: 0.1.1
214
+ #
215
+ # Date:: 2011/12/18
216
+ #
217
+ # @author:: Brian Burns, x10205284
218
+ # @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
219
class ContentCheckerStrategy < GeneralStrategy
  # How many recent comments are remembered for duplicate detection.
  MAX_RECENT_POSTS = 5

  # Most recent comments, newest first, shared by all instances.
  # Starts empty so that no real comment text is pre-flagged as a
  # duplicate.
  @@recent_posts = []

  # Stores the content, the URLs found in it and its individual words
  # (split on whitespace, commas and apostrophes).
  # A nil content is treated as an empty string.
  def initialize(content)
    @content = content.to_s
    @urls = URI.extract(@content)
    @keywords = @content.split(/[\s,']+/)
  end

  # Returns the number of URLs found in the content.
  def count_urls
    @urls.count
  end

  # Returns the average length of the URLs in the content, using
  # integer division, or 0 when there are none.
  def url_length
    return 0 if @urls.empty?

    @urls.inject(0) { |sum, url| sum + url.length } / @urls.size
  end

  # Returns the length of the content.
  def length
    @content.length
  end

  # Returns true when the content begins with "Cool" or "Wow"
  # (leading whitespace ignored), false otherwise. The pattern is
  # anchored at the start of the string so these words appearing
  # later in a comment are not penalised.
  def starts_with?
    !!(@content =~ /\A\s*(?:Cool|Wow)/)
  end

  # Returns true when the content equals one of the remembered recent
  # comments. Otherwise remembers it at the front of the queue, drops
  # the oldest entries beyond MAX_RECENT_POSTS, and returns false.
  #
  # Because a miss records the content, asking again for the same
  # content returns true.
  def duplicate?
    return true if @@recent_posts.include?(@content)

    @@recent_posts.unshift(@content)
    @@recent_posts.pop while @@recent_posts.size > MAX_RECENT_POSTS
    false
  end

end
277
+
278
+ require 'uri'
@@ -0,0 +1,136 @@
1
+ require 'test/unit'
2
+ require 'burnspam'
3
+
4
+ # These tests were created to test if statistics
5
+ # are correctly gathered.
6
+ #
7
+ # @points is a PointTracker instance variable within Burnspam.
8
+ # It has two accessible values:
9
+ # * @name: This contains the analysis of the name
10
+ # e.g. points.name.length, points.name.count_urls
11
+ # * @content: This contains analysis of the content
12
+ # e.g. points.content.duplicate
13
+ #
14
+ # These hold the same statistics although
15
+ # the strategy used to perform name and content analysis
16
+ # differ (based on strategy design pattern)
17
+ # See Burnspam::PointTracker for explanation.
18
+ #
19
+ # Version:: 0.1.1
20
+ #
21
+ # Date:: 2011/12/18
22
+ #
23
+ # @author:: Brian Burns, x10205284
24
+ # @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
25
+ # =Test the analysis of a comment
26
class BurnspamTest < Test::Unit::TestCase
  # A new Burnspam object can be created.
  def test_create_new
    assert Burnspam.new("email", "name", "content")
  end

  # spaminess returns an Integer.
  def test_return_integer
    score = Burnspam.new("email", "name", "content").spaminess
    assert_kind_of Integer, score
  end

  # A URL in the name is counted.
  def test_name_url
    stats = stats_for("http://www.spam.com", "content")
    assert_equal 1, stats.name.count_urls
  end

  # A name without a URL gives a count of 0.
  def test_name_clean
    stats = stats_for("Brian", "content")
    assert_equal 0, stats.name.count_urls
  end

  # URLs in the content are counted accurately.
  def test_content_urls2
    stats = stats_for("http://www.spam.com",
                      "http://www.x.com and http://www.y.com!!")
    assert_equal 2, stats.content.count_urls
  end

  # Content without URLs gives a count of 0.
  def test_content_urls0
    stats = stats_for("name", "content")
    assert_equal 0, stats.content.count_urls
  end

  # The average URL length is accurate:
  # (20 + 18) / 2 = 19
  def test_url_length19
    stats = stats_for("http://www.spam.com",
                      "http://www.spam1.com http://www.spam.ie")
    assert_equal 19, stats.content.url_length
    assert_equal 19, stats.name.url_length
  end

  # The content length is counted.
  def test_length_content
    stats = stats_for("name", "123456789")
    assert_equal 9, stats.content.length
  end

  # Bad starting keywords are picked up.
  def test_starts_with
    expectations = [
      ["Cool...", true],
      ["Wow...", true],
      ["Doesn't start with.. ", false]
    ]
    expectations.each do |text, flagged|
      assert_equal flagged, stats_for("name", text).content.starts_with
    end
  end

  # Duplicates are looked for in the 5 most recent comments.
  # The first repeat is not a duplicate because five other comments
  # were submitted in between; the second repeat is.
  def test_duplicate_comment
    submit("duplicate")
    5.times { submit_random }
    assert_equal false, stats_for("name", "duplicate").content.duplicate

    submit("duplicate")
    3.times { submit_random }
    assert_equal true, stats_for("name", "duplicate").content.duplicate
  end

  # A GeneralStrategy method reached through both the
  # NameCheckerStrategy and ContentCheckerStrategy sub-classes.
  def test_general_strategey_superclass
    stats = stats_for("name", "content")
    assert_equal 44, stats.content.test
    assert_equal 44, stats.name.test
  end

  # Bad keywords within the content are counted accurately.
  def test_for_bad_keywords
    assert_equal 1, stats_for("name", "content feck it").content.keyword_count
    assert_equal 3, stats_for("name", "feck bitch content feck it").content.keyword_count
  end

  private

  # Analyses one comment and returns its PointTracker.
  def stats_for(name, content)
    Burnspam.new("email", name, content).points
  end

  # Analyses one comment purely for its effect on the recent-comment list.
  def submit(content)
    Burnspam.new("email", "name", content)
  end

  # Submits a comment with random content.
  def submit_random
    submit("Content #{rand}")
  end
end
metadata ADDED
@@ -0,0 +1,49 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: burnspam
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Brian Burns
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-12-15 00:00:00.000000000Z
13
+ dependencies: []
14
+ description: Built based on http://snook.ca/archives/other/effective_blog_comment_spam_blocker
15
+ email: bud_weiser3@hotmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/burnspam.rb
21
+ - lib/burnspam/point_tracker.rb
22
+ - Rakefile
23
+ - test/test_burnspam.rb
24
+ homepage: http://rubygems.org/gems/burnspam
25
+ licenses: []
26
+ post_install_message:
27
+ rdoc_options: []
28
+ require_paths:
29
+ - lib
30
+ required_ruby_version: !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ! '>='
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ required_rubygems_version: !ruby/object:Gem::Requirement
37
+ none: false
38
+ requirements:
39
+ - - ! '>='
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ requirements: []
43
+ rubyforge_project:
44
+ rubygems_version: 1.8.11
45
+ signing_key:
46
+ specification_version: 3
47
+ summary: Analyses a spam comment and name and returns likelihood of spamminess. Also
48
+ checks for duplicate comments.
49
+ test_files: []