burnspam 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ require 'rake/testtask'
2
+
3
# Define the test task, adding 'test' to the task's list of library
# directories.
Rake::TestTask.new { |test_task| test_task.libs.push('test') }

# Make the test task the default, so a bare `rake` runs it.
desc "Run tests"
task :default => :test
data/lib/burnspam.rb ADDED
@@ -0,0 +1,38 @@
1
# Burnspam takes a comment as three parameters (email, name and
# content) and runs some basic checks to determine the "spaminess"
# of that comment. Higher values are better, while negative numbers
# are likely spam.
#
# The most recent comments are also kept in memory so that new
# comments can be checked against them. A comment that is already
# in memory has its spaminess value lowered.
#
# Version:: 0.1.1
#
# Date:: 2011/12/18
#
# @author:: Brian Burns, x10205284
#
# ==Burnspam#spaminess return values:
# * spaminess < 0::   Obvious spam
# * spaminess 0 - 2:: Questionable quality
# * spaminess > 2::   Good quality comment
class Burnspam
  # The PointTracker holding the analysis of this comment.
  attr_reader :points

  # Analyse the comment. The three arguments are handed to
  # PointTracker unchanged.
  def initialize(email, name, content)
    @points = PointTracker.new(email, name, content)
  end

  # Return the spaminess value computed by the PointTracker.
  def spaminess
    points.spaminess
  end
end
37
+
38
+ require 'burnspam/point_tracker'
@@ -0,0 +1,278 @@
1
# Burnspam is reopened (rather than written as Burnspam::PointTracker)
# so this definition also loads when Burnspam has not been defined yet.
class Burnspam
  # PointTracker gathers the statistics for a comment's name and
  # content (kept in @name and @content) and converts them into a
  # spaminess score.
  #
  # The name and the content are each wrapped in their own strategy
  # (Strategy Design Pattern): NameCheckerStrategy and
  # ContentCheckerStrategy. This allows slightly different processing
  # for the two fields. A Checker runs each strategy and stores the
  # results, which are readable through #name and #content.
  #
  # Version:: 0.1.1
  #
  # Date:: 2011/12/18
  #
  # @author:: Brian Burns, x10205284
  # @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
  class PointTracker
    # Checker results for the name and for the content.
    attr_reader :name, :content

    # Gather all statistics from name and content.
    # The email argument is accepted but not analysed.
    def initialize(email, name, content)
      @name = Checker.new(NameCheckerStrategy.new(name))
      @content = Checker.new(ContentCheckerStrategy.new(content))
    end

    # Return spaminess based on the gathered statistics: the sum of
    # the points awarded by every rule below. Higher is better.
    def spaminess
      [
        body_urls(@content.count_urls),
        name_urls(@name.count_urls),
        body_length(@content.length, @content.count_urls),
        keywordsearch(@content.keyword_count),
        urlength(@content.url_length),
        startswith?(@content.starts_with),
        duplicate?(@content.duplicate)
      ].inject(0) { |total, points| total + points }
    end

    # Points for the number of URLs in the body:
    # * fewer than 2 URLs (including none): gain 2 points
    # * exactly 2 URLs: 0 points
    # * more than 2 URLs: lose 1 point per URL
    def body_urls(count)
      if count < 2
        2
      elsif count > 2
        -count
      else
        0
      end
    end

    # URL in name? Lose 2 points.
    def name_urls(count)
      count > 0 ? -2 : 0
    end

    # Points for the length of the body:
    # * shorter than 20 characters: lose 1 point
    # * longer than 20 characters with no URLs: gain 2 points
    # * anything else (exactly 20 characters, or a longer body that
    #   contains URLs): 0 points
    def body_length(size, counturls)
      if size < 20
        -1
      elsif size > 20 && counturls == 0
        2
      else
        0
      end
    end

    # For every bad keyword lose 1 point.
    def keywordsearch(count)
      -count
    end

    # 1 point penalty if the average URL length is over 30 characters.
    def urlength(size)
      size > 30 ? -1 : 0
    end

    # If the comment starts with specific bad keywords it loses a
    # substantial 10 points, because spammy comments often start
    # like this. Returns points, not a boolean, despite the name.
    def startswith?(word)
      word ? -10 : 0
    end

    # If the post is a duplicate, the comment loses 5 points.
    # Returns points, not a boolean, despite the name.
    def duplicate?(post)
      post ? -5 : 0
    end
  end
end
93
+
94
# Checker runs every analysis method of the strategy it is given and
# keeps each result in a readable instance variable.
#
# Version:: 0.1.1
#
# Date:: 2011/12/18
#
# @author:: Brian Burns, x10205284
# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
class Checker
  # URL statistics reported by the strategy.
  attr_reader :count_urls, :url_length

  # Text length reported by the strategy.
  attr_reader :length

  # Results of the strategy's starts_with? and duplicate? checks.
  attr_reader :starts_with, :duplicate

  # Value returned by the strategy's test method.
  attr_reader :test

  # Number of bad keywords reported by the strategy's keywords method.
  attr_reader :keyword_count

  # Query the strategy once per statistic, in a fixed order, and store
  # the answers.
  def initialize(strategy)
    @count_urls    = strategy.count_urls
    @url_length    = strategy.url_length
    @length        = strategy.length
    @starts_with   = strategy.starts_with?
    @duplicate     = strategy.duplicate?
    @test          = strategy.test
    @keyword_count = strategy.keywords
  end
end
119
+
120
# The GeneralStrategy class stores the analysis methods that are
# common to both NameCheckerStrategy and ContentCheckerStrategy.
#
# It also stores a list of bad keywords for comments and a list of
# bad keywords for URLs.
#
# Version:: 0.1.1
#
# Date:: 2011/12/18
#
# @author:: Brian Burns, x10205284
# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
class GeneralStrategy
  # Bad keywords for comment text. Frozen so the shared list cannot
  # be modified by accident.
  COMPARISON_WORDS = %w[feck bitch].freeze

  # Bad keywords for URLs. The .de, .pl and .cn entries are taken
  # from the referenced spam solution. No method in this class reads
  # this list yet.
  COMPARISON_WORDS_URL = [".html", ".info", "?", "&", "free",
                          ".de", ".pl", ".cn"].freeze

  # The only analysis method that is currently shared; all the others
  # are implemented by the custom strategies.
  #
  # Counts how many entries of @keywords (set by the subclass) are
  # exactly equal to a bad keyword. Returns 0 when @keywords is empty
  # or was never set.
  def keywords
    Array(@keywords).count { |word| COMPARISON_WORDS.include?(word) }
  end

  # Fixed value (44) used to check that subclasses inherit from
  # this class.
  def test
    44
  end
end
163
+
164
# A specialized strategy for analysing the name field.
# Analysis of the name usually requires much less processing
# than the comment body.
#
# Version:: 0.1.1
#
# Date:: 2011/12/18
#
# @author:: Brian Burns, x10205284
# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
class NameCheckerStrategy < GeneralStrategy
  # Marks a name as containing a URL (http or https, any letter case).
  URL_SCHEME = %r{https?://}i

  # Stores the name as a string and, for the keyword check, as the
  # single element of an array.
  def initialize(name)
    @name = name
    @keywords = [name]
  end

  # We only expect to count one or zero URLs in a name:
  # 1 if the name contains an http(s) URL, otherwise 0.
  def count_urls
    @name =~ URL_SCHEME ? 1 : 0
  end

  # Length of the name when it contains a URL, otherwise 0.
  # count_urls is compared with zero explicitly because 0 is truthy
  # in Ruby.
  def url_length
    count_urls > 0 ? @name.length : 0
  end

  # Length of the name.
  def length
    @name.length
  end

  # Not used for names, but must exist to interface correctly with
  # the Checker class. Always false.
  def starts_with?
    false
  end

  # Not used for names, but must exist to interface correctly with
  # the Checker class. Always false.
  def duplicate?
    false
  end
end
210
+
211
# A specialized strategy for analysing the content field.
#
# Version:: 0.1.1
#
# Date:: 2011/12/18
#
# @author:: Brian Burns, x10205284
# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
class ContentCheckerStrategy < GeneralStrategy
  # Number of recent comments remembered for duplicate detection.
  RECENT_POSTS_LIMIT = 5

  # Opening words that we don't like (case-sensitive).
  BAD_OPENERS = %w[Cool Wow].freeze

  # Most recent non-duplicate comments, newest first. Held at class
  # level, so it is shared by every instance. Starts empty so that no
  # placeholder text can ever be mistaken for a real earlier comment.
  @@recent_posts = []

  # Stores the content, the URLs found in it and its individual words
  # (split on whitespace, commas and apostrophes).
  def initialize(content)
    @content = content
    @urls = URI.extract(content)
    @keywords = @content.split(/[\s,']+/)
  end

  # Number of URLs found in the content.
  def count_urls
    @urls.count
  end

  # Average length (integer division) of all URLs in the content,
  # or 0 when the content has no URLs.
  def url_length
    return 0 if @urls.empty?

    @urls.inject(0) { |sum, url| sum + url.length } / @urls.size
  end

  # Length of the content.
  def length
    @content.length
  end

  # True when the content begins with one of BAD_OPENERS. Only the
  # very start of the comment is checked; the same word later in the
  # text does not count.
  def starts_with?
    @content.start_with?(*BAD_OPENERS)
  end

  # Compares the comment with the recent comments and returns true if
  # a match is found. Otherwise adds the new comment to the front,
  # removes the oldest from the end once more than RECENT_POSTS_LIMIT
  # are held (First In First Out queue), and returns false.
  def duplicate?
    return true if @@recent_posts.include?(@content)

    @@recent_posts.unshift(@content)
    @@recent_posts.pop while @@recent_posts.size > RECENT_POSTS_LIMIT
    false
  end
end
277
+
278
+ require 'uri'
@@ -0,0 +1,136 @@
1
+ require 'test/unit'
2
+ require 'burnspam'
3
+
4
# These tests check that the statistics are gathered correctly.
#
# Burnspam#points is a PointTracker. It has two accessible values:
# * name::    the analysis of the name,
#             e.g. points.name.length, points.name.count_urls
# * content:: the analysis of the content,
#             e.g. points.content.duplicate
#
# Both hold the same statistics, although the strategy used to
# analyse the name differs from the one used for the content
# (Strategy Design Pattern). See Burnspam::PointTracker for an
# explanation.
#
# Version:: 0.1.1
#
# Date:: 2011/12/18
#
# @author:: Brian Burns, x10205284
# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
# =Test the analysis of a comment
class BurnspamTest < Test::Unit::TestCase
  # A new Burnspam can be created.
  def test_create_new
    assert analyse
  end

  # spaminess returns an Integer.
  def test_return_integer
    score = analyse.spaminess
    assert_kind_of Integer, score
  end

  # A URL in the name is counted.
  def test_name_url
    name_stats = analyse("content", "http://www.spam.com").points.name
    assert_equal 1, name_stats.count_urls
  end

  # A name without a URL produces a count of 0.
  def test_name_clean
    name_stats = analyse("content", "Brian").points.name
    assert_equal 0, name_stats.count_urls
  end

  # URLs in the content are counted accurately.
  def test_content_urls2
    body = "http://www.x.com and http://www.y.com!!"
    content_stats = analyse(body, "http://www.spam.com").points.content
    assert_equal 2, content_stats.count_urls
  end

  # Content without URLs produces a count of 0.
  def test_content_urls0
    assert_equal 0, analyse.points.content.count_urls
  end

  # The average URL length of the content is accurate:
  # (20 + 18) / 2 = 19. The name is a single 19-character URL.
  def test_url_length19
    body = "http://www.spam1.com http://www.spam.ie"
    stats = analyse(body, "http://www.spam.com").points
    assert_equal 19, stats.content.url_length
    assert_equal 19, stats.name.url_length
  end

  # The length of the content is counted.
  def test_length_content
    assert_equal 9, analyse("123456789").points.content.length
  end

  # Bad starting keywords are picked up.
  def test_starts_with
    assert_equal true, analyse("Cool...").points.content.starts_with
    assert_equal true, analyse("Wow...").points.content.starts_with
    assert_equal false, analyse("Doesn't start with.. ").points.content.starts_with
  end

  # Duplicates are detected among the 5 most recent comments.
  # In the first half the repeat comes after 5 other comments, so it
  # is no longer remembered and is not reported as a duplicate.
  def test_duplicate_comment
    analyse("duplicate")
    5.times { analyse("Content #{rand}") }
    assert_equal false, analyse("duplicate").points.content.duplicate

    analyse("duplicate")
    3.times { analyse("Content #{rand}") }
    assert_equal true, analyse("duplicate").points.content.duplicate
  end

  # A GeneralStrategy method is reachable from both the
  # NameCheckerStrategy and ContentCheckerStrategy sub-classes.
  def test_general_strategey_superclass
    stats = analyse.points
    assert_equal 44, stats.content.test
    assert_equal 44, stats.name.test
  end

  # Bad keywords within the content are counted accurately.
  def test_for_bad_keywords
    assert_equal 1, analyse("content feck it").points.content.keyword_count
    assert_equal 3, analyse("feck bitch content feck it").points.content.keyword_count
  end

  private

  # Build a Burnspam for the given content and name; the email is
  # always "email".
  def analyse(content = "content", name = "name")
    Burnspam.new("email", name, content)
  end
end
metadata ADDED
@@ -0,0 +1,49 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: burnspam
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Brian Burns
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-12-15 00:00:00.000000000Z
13
+ dependencies: []
14
+ description: Built based on http://snook.ca/archives/other/effective_blog_comment_spam_blocker
15
+ email: bud_weiser3@hotmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/burnspam.rb
21
+ - lib/burnspam/point_tracker.rb
22
+ - Rakefile
23
+ - test/test_burnspam.rb
24
+ homepage: http://rubygems.org/gems/burnspam
25
+ licenses: []
26
+ post_install_message:
27
+ rdoc_options: []
28
+ require_paths:
29
+ - lib
30
+ required_ruby_version: !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ! '>='
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ required_rubygems_version: !ruby/object:Gem::Requirement
37
+ none: false
38
+ requirements:
39
+ - - ! '>='
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ requirements: []
43
+ rubyforge_project:
44
+ rubygems_version: 1.8.11
45
+ signing_key:
46
+ specification_version: 3
47
+ summary: Analyses a spam comment and name and returns likelihood of spamminess. Also
48
+ checks for duplicate comments.
49
+ test_files: []