burnspam 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +8 -0
- data/lib/burnspam.rb +38 -0
- data/lib/burnspam/point_tracker.rb +278 -0
- data/test/test_burnspam.rb +136 -0
- metadata +49 -0
data/Rakefile
ADDED
data/lib/burnspam.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# The program takes a comment input as three
|
2
|
+
# parameters: email, name and content.
|
3
|
+
# It performs some basic checks to determine the
|
4
|
+
# "spaminess" of a comment. Higher values are better
|
5
|
+
# while negative numbers are likely spam.
|
6
|
+
#
|
7
|
+
# It also stores the most recent comments in
|
8
|
+
# memory and checks that new comments are
|
9
|
+
# not duplicates. If a new comment is found already
|
10
|
+
# existing in memory, the spaminess value goes down!
|
11
|
+
#
|
12
|
+
# Version:: 0.1.1
|
13
|
+
#
|
14
|
+
# Date:: 2011/12/18
|
15
|
+
#
|
16
|
+
# @author:: Brian Burns, x10205284
|
17
|
+
#
|
18
|
+
# ==Burnspam.spaminess returns values:
|
19
|
+
# * spaminess < 0:: Obvious spam
|
20
|
+
# * spaminess 0 - 2:: Questionable quality
|
21
|
+
# * spaminess > 2:: Good quality comment
|
22
|
+
|
23
|
+
# Public entry point of the gem: wraps one comment submission and exposes
# its spam score.
class Burnspam
  # The PointTracker built for this comment. Exposed so callers (and the
  # test suite) can inspect the individual statistics behind the score.
  attr_reader :points

  # Builds the tracker that analyses the submission.
  #
  # email::   forwarded to PointTracker unchanged
  # name::    the commenter's name
  # content:: the comment body
  def initialize(email, name, content)
    tracker = PointTracker.new(email, name, content)
    @points = tracker
  end

  # Returns the spaminess score computed by the tracker. Higher is better;
  # negative values indicate likely spam.
  def spaminess
    points.spaminess
  end
end
|
37
|
+
|
38
|
+
require 'burnspam/point_tracker'
|
@@ -0,0 +1,278 @@
|
|
1
|
+
# PointTracker holds all the statistics of the name
|
2
|
+
# and comment inside @name and @content
|
3
|
+
#
|
4
|
+
# The name and content are passed into the relevant
|
5
|
+
# Strategy (Strategy Design Pattern).
|
6
|
+
#
|
7
|
+
# The checker interchanges these strategies. This
|
8
|
+
# allows slightly different processing for the name
|
9
|
+
# and content as well as a shared method made
|
10
|
+
# available through the GeneralStrategy class.
|
11
|
+
#
|
12
|
+
# The Checker returns the results to @name and @content
|
13
|
+
#
|
14
|
+
# Version:: 0.1.1
|
15
|
+
#
|
16
|
+
# Date:: 2011/12/18
|
17
|
+
#
|
18
|
+
# @author:: Brian Burns, x10205284
|
19
|
+
# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
|
20
|
+
class Burnspam
  # Gathers statistics about a comment's name and body (through Checker and
  # the two strategies) and converts them into a single spaminess score.
  #
  # Defined in nested form so that this file can be loaded whether or not
  # the Burnspam class has been defined yet.
  class PointTracker
    # Checker results for the name and for the comment body.
    attr_reader :name, :content

    # Gathers all statistics from name and content.
    #
    # email::   accepted for interface compatibility; not analysed here
    # name::    the commenter's name
    # content:: the comment body
    def initialize(email, name, content)
      @name = Checker.new(NameCheckerStrategy.new(name))
      @content = Checker.new(ContentCheckerStrategy.new(content))
    end

    # Returns the spaminess score (Integer): the sum of every individual
    # rule below. The sum is computed locally, so calling this method does
    # not change the tracker's state.
    def spaminess
      [
        body_urls(@content.count_urls),
        name_urls(@name.count_urls),
        body_length(@content.length, @content.count_urls),
        keywordsearch(@content.keyword_count),
        urlength(@content.url_length),
        startswith?(@content.starts_with),
        duplicate?(@content.duplicate)
      ].inject(0) { |sum, pts| sum + pts }
    end

    # Score for the number of URLs in the comment body:
    # * fewer than 2 URLs (including none): gain 2 points
    # * exactly 2 URLs: 0 points
    # * more than 2 URLs: lose 1 point per URL
    def body_urls(count)
      return 2 if count < 2
      return 0 if count == 2

      -count
    end

    # URL in name? Lose 2 points.
    def name_urls(count)
      count > 0 ? -2 : 0
    end

    # Score for the length of the comment body:
    # * shorter than 20 characters: lose 1 point
    # * longer than 20 characters and no URLs: gain 2 points
    # * anything else (including exactly 20 characters): 0 points
    def body_length(size, counturls)
      return -1 if size < 20

      size > 20 && counturls == 0 ? 2 : 0
    end

    # For every bad keyword lose 1 point.
    def keywordsearch(count)
      -count
    end

    # 1 point penalty if the average URL length is above 30 characters.
    def urlength(size)
      size > 30 ? -1 : 0
    end

    # If the comment starts with specific bad keywords it loses a
    # substantial 10 points, because spammy comments often start like this.
    # Despite the trailing "?", this returns a score (Integer), not a boolean.
    def startswith?(word)
      word ? -10 : 0
    end

    # If the post is a duplicate, the comment loses 5 points.
    # Despite the trailing "?", this returns a score (Integer), not a boolean.
    def duplicate?(post)
      post ? -5 : 0
    end
  end
end
|
93
|
+
|
94
|
+
# Checker calls the methods of the strategies and
|
95
|
+
# holds the results in accessible instance variables.
|
96
|
+
# The results are either Integer or Boolean
|
97
|
+
#
|
98
|
+
# Version:: 0.1.1
|
99
|
+
#
|
100
|
+
# Date:: 2011/12/18
|
101
|
+
#
|
102
|
+
# @author:: Brian Burns, x10205284
|
103
|
+
# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
|
104
|
+
# Runs every analysis method of the given strategy once, at construction
# time, and keeps the answers in read-only attributes.
class Checker
  attr_reader :count_urls, :url_length, :length, :starts_with,
              :duplicate, :test, :keyword_count

  # strategy:: any object responding to count_urls, url_length, length,
  #            starts_with?, duplicate?, test and keywords
  #
  # The strategy methods are invoked in the order listed below, each
  # exactly once, and each result is stored under the attribute named by
  # the hash key.
  def initialize(strategy)
    {
      :count_urls    => :count_urls,
      :url_length    => :url_length,
      :length        => :length,
      :starts_with   => :starts_with?,
      :duplicate     => :duplicate?,
      :test          => :test,
      :keyword_count => :keywords
    }.each do |attribute, probe|
      instance_variable_set("@#{attribute}", strategy.public_send(probe))
    end
  end
end
|
119
|
+
|
120
|
+
# The GeneralStrategy class stores the analysis methods that
|
121
|
+
# can be common to both NameChecker and ContentChecker Strategy
|
122
|
+
#
|
123
|
+
# It also stores a list of bad keywords for comments
|
124
|
+
# and bad keywords for URLs
|
125
|
+
#
|
126
|
+
# Version:: 0.1.1
|
127
|
+
#
|
128
|
+
# Date:: 2011/12/18
|
129
|
+
#
|
130
|
+
# @author:: Brian Burns, x10205284
|
131
|
+
# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
|
132
|
+
# Base class for the name and content strategies. Holds the analysis that
# is common to both, plus the lists of bad keywords.
class GeneralStrategy

  # Words that each cost the comment a point when they appear as a keyword.
  @@comparisonwords = ["feck", "bitch"]

  # URL fragments considered suspicious. Why .de, .pl, .cn? They come from
  # the referenced spam solution; ask its author.
  # NOTE: no code in this file reads this list.
  @@comparisonwordsurl = [".html", ".info", "?", "&", "free",
                          ".de", ".pl", ".cn"]

  # Counts how many entries of @keywords are bad words. This is the only
  # analysis shared by the subclasses; the rest are custom per strategy.
  #
  # Matching is exact and case-sensitive. A subclass that never assigns
  # @keywords is treated as having no keywords (returns 0) rather than
  # raising NoMethodError on nil.
  #
  # Returns an Integer.
  def keywords
    Array(@keywords).count { |word| @@comparisonwords.include?(word) }
  end

  # Fixed marker value (44) used by the test suite to confirm that the
  # subclasses inherit from this class.
  def test
    44
  end

end
|
163
|
+
|
164
|
+
# A specialized strategy for analysing the name field.
|
165
|
+
# Analysis for name usually requires much less processing
|
166
|
+
# than comment body.
|
167
|
+
#
|
168
|
+
# Version:: 0.1.1
|
169
|
+
#
|
170
|
+
# Date:: 2011/12/18
|
171
|
+
#
|
172
|
+
# @author:: Brian Burns, x10205284
|
173
|
+
# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
|
174
|
+
# Strategy that analyses the commenter's name. Implements every method the
# Checker calls; keyword counting is inherited from GeneralStrategy.
class NameCheckerStrategy < GeneralStrategy

  # Stores the name as a string, and as a one-element keyword list so the
  # inherited keyword check compares the whole name against the bad words.
  def initialize(name)
    @name = name
    @keywords = [name]
  end

  # We only expect to count one or zero URLs in a name.
  # Returns 1 when the name contains "http://", otherwise 0.
  def count_urls
    @name.include?('http://') ? 1 : 0
  end

  # Length of the name when it contains a URL, otherwise 0.
  #
  # count_urls returns an Integer, and 0 is truthy in Ruby, so the count
  # must be compared explicitly; testing it directly would return the name
  # length for every name.
  def url_length
    count_urls > 0 ? @name.length : 0
  end

  # Length of the name in characters.
  def length
    @name.length
  end

  # Not meaningful for names; present only because Checker calls it.
  def starts_with?
    false
  end

  # Not meaningful for names; present only because Checker calls it.
  def duplicate?
    false
  end

end
|
210
|
+
|
211
|
+
# A specialized strategy for analysing the content field.
|
212
|
+
#
|
213
|
+
# Version:: 0.1.1
|
214
|
+
#
|
215
|
+
# Date:: 2011/12/18
|
216
|
+
#
|
217
|
+
# @author:: Brian Burns, x10205284
|
218
|
+
# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
|
219
|
+
# Strategy that analyses the comment body. Implements every method the
# Checker calls; keyword counting is inherited from GeneralStrategy.
class ContentCheckerStrategy < GeneralStrategy
  # The most recent comment bodies, newest first. Shared by all instances
  # and seeded with five placeholders so the queue always holds five items.
  @@recent_posts = ["5", "4", "3" , "2", "1"]

  # Stores the content, the URLs found in it, and its individual words
  # (split on whitespace, commas and apostrophes).
  def initialize(content)
    @content = content
    @urls = URI.extract(content)
    @keywords = @content.split(/[\s,']+/)
  end

  # Number of URLs found in the content.
  def count_urls
    @urls.count
  end

  # Average length of all URLs in the content, using integer division.
  # Returns 0 when the content has no URLs.
  def url_length
    return 0 if @urls.empty?

    @urls.inject(0) { |sum, url| sum + url.length } / @urls.size
  end

  # Length of the content in characters.
  def length
    @content.length
  end

  # True when the content begins with one of the (currently 2) openers we
  # don't like: "Cool" or "Wow". The check is case-sensitive and applies to
  # the very start of the content only, so a comment that merely contains
  # one of those words further on is not flagged.
  def starts_with?
    @content.start_with?('Cool', 'Wow')
  end

  # Compares the comment with the recent comments and returns true if a
  # match is found. Otherwise removes the oldest comment from the end of
  # the queue, adds this one to the front (First In First Out) and
  # returns false.
  def duplicate?
    return true if @@recent_posts.include?(@content)

    @@recent_posts.pop
    @@recent_posts.unshift(@content)
    false
  end

end
|
277
|
+
|
278
|
+
require 'uri'
|
@@ -0,0 +1,136 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'burnspam'
|
3
|
+
|
4
|
+
# These tests were created to test if statistics
|
5
|
+
# are correctly gathered.
|
6
|
+
#
|
7
|
+
# @points is a PointTracker instance variable within Burnspam.
|
8
|
+
# It has two accessible values:
|
9
|
+
# * @name: This contains the analysis of the name
|
10
|
+
# e.g. points.name.length, points.name.count_urls
|
11
|
+
# * @content: This contains analysis of the content
|
12
|
+
# e.g. points.content.duplicate
|
13
|
+
#
|
14
|
+
# These hold the same statistics although
|
15
|
+
# the strategy used to perform name and content analysis
|
16
|
+
# differ (based on strategy design pattern)
|
17
|
+
# See Burnspam::PointTracker for explanation.
|
18
|
+
#
|
19
|
+
# Version:: 0.1.1
|
20
|
+
#
|
21
|
+
# Date:: 2011/12/18
|
22
|
+
#
|
23
|
+
# @author:: Brian Burns, x10205284
|
24
|
+
# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
|
25
|
+
# =Test the analysis of a comment
|
26
|
+
# Verifies that the statistics gathered for a comment are correct. The
# statistics are read through Burnspam#points (name and content checkers).
class BurnspamTest < Test::Unit::TestCase
  # A new instance can be created.
  def test_create_new
    instance = Burnspam.new("email", "name", "content")
    assert instance
  end

  # spaminess returns an Integer.
  def test_return_integer
    score = Burnspam.new("email", "name", "content").spaminess
    assert_kind_of Integer, score
  end

  # A URL in the name is counted.
  def test_name_url
    tracker = Burnspam.new("email", "http://www.spam.com", "content").points
    assert_equal 1, tracker.name.count_urls
  end

  # A name without a URL produces a count of 0.
  def test_name_clean
    tracker = Burnspam.new("email", "Brian", "content").points
    assert_equal 0, tracker.name.count_urls
  end

  # URLs in the content are counted accurately.
  def test_content_urls2
    body = "http://www.x.com and http://www.y.com!!"
    tracker = Burnspam.new("email", "http://www.spam.com", body).points
    assert_equal 2, tracker.content.count_urls
  end

  # Content without URLs produces a count of 0.
  def test_content_urls0
    tracker = Burnspam.new("email", "name", "content").points
    assert_equal 0, tracker.content.count_urls
  end

  # The average URL length is accurate:
  # (20 + 18) / 2 = 19
  def test_url_length19
    body = "http://www.spam1.com http://www.spam.ie"
    tracker = Burnspam.new("email", "http://www.spam.com", body).points
    assert_equal 19, tracker.content.url_length
    assert_equal 19, tracker.name.url_length
  end

  # The length of the content is counted.
  def test_length_content
    tracker = Burnspam.new("email", "name", "123456789").points
    assert_equal 9, tracker.content.length
  end

  # Bad starting keywords are picked up, and other openings are not.
  def test_starts_with
    [
      ["Cool...", true],
      ["Wow...", true],
      ["Doesn't start with.. ", false]
    ].each do |body, flagged|
      tracker = Burnspam.new("email", "name", body).points
      assert_equal flagged, tracker.content.starts_with
    end
  end

  # Duplicate detection only looks at the 5 most recent comments.
  # The statement order matters: every Burnspam.new feeds the shared queue.
  def test_duplicate_comment
    # After five other comments the first "duplicate" has been pushed out
    # of the queue, so the repeat is NOT reported as a duplicate.
    Burnspam.new("email", "name", "duplicate")
    5.times { Burnspam.new("email", "name", "Content #{rand}") }
    evicted = Burnspam.new("email", "name", "duplicate")
    assert_equal false, evicted.points.content.duplicate

    # With only three comments in between, the repeat is still in the
    # queue and is reported as a duplicate.
    Burnspam.new("email", "name", "duplicate")
    3.times { Burnspam.new("email", "name", "Content #{rand}") }
    repeated = Burnspam.new("email", "name", "duplicate")
    assert_equal true, repeated.points.content.duplicate
  end

  # A GeneralStrategy method is reachable through both the
  # NameCheckerStrategy and ContentCheckerStrategy sub-classes.
  def test_general_strategey_superclass
    tracker = Burnspam.new("email", "name", "content").points
    assert_equal 44, tracker.content.test
    assert_equal 44, tracker.name.test
  end

  # Bad keywords within the content are counted accurately.
  def test_for_bad_keywords
    single = Burnspam.new("email", "name", "content feck it").points
    assert_equal 1, single.content.keyword_count

    several = Burnspam.new("email", "name", "feck bitch content feck it").points
    assert_equal 3, several.content.keyword_count
  end
end
|
metadata
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: burnspam
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Brian Burns
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-12-15 00:00:00.000000000Z
|
13
|
+
dependencies: []
|
14
|
+
description: Built based on http://snook.ca/archives/other/effective_blog_comment_spam_blocker
|
15
|
+
email: bud_weiser3@hotmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/burnspam.rb
|
21
|
+
- lib/burnspam/point_tracker.rb
|
22
|
+
- Rakefile
|
23
|
+
- test/test_burnspam.rb
|
24
|
+
homepage: http://rubygems.org/gems/burnspam
|
25
|
+
licenses: []
|
26
|
+
post_install_message:
|
27
|
+
rdoc_options: []
|
28
|
+
require_paths:
|
29
|
+
- lib
|
30
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
31
|
+
none: false
|
32
|
+
requirements:
|
33
|
+
- - ! '>='
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: '0'
|
36
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
37
|
+
none: false
|
38
|
+
requirements:
|
39
|
+
- - ! '>='
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '0'
|
42
|
+
requirements: []
|
43
|
+
rubyforge_project:
|
44
|
+
rubygems_version: 1.8.11
|
45
|
+
signing_key:
|
46
|
+
specification_version: 3
|
47
|
+
summary: Analyses a spam comment and name and returns likelihood of spamminess. Also
|
48
|
+
checks for duplicate comments.
|
49
|
+
test_files: []
|