burnspam 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +8 -0
- data/lib/burnspam.rb +38 -0
- data/lib/burnspam/point_tracker.rb +278 -0
- data/test/test_burnspam.rb +136 -0
- metadata +49 -0
data/Rakefile
ADDED
data/lib/burnspam.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# The program takes a comment input as three
|
2
|
+
# parameters: email, name and content.
|
3
|
+
# It performs some basic checks to determine the
|
4
|
+
# "spaminess" of a comment. Higher values are better
|
5
|
+
# while negative numbers are likely spam.
|
6
|
+
#
|
7
|
+
# It also stores the most recent comments in
|
8
|
+
# memory and checks that new comments are
|
9
|
+
# not duplicates. If a new comment is found already
|
10
|
+
# existing in memory, the spaminess value goes down!
|
11
|
+
#
|
12
|
+
# Version:: 0.1.1
|
13
|
+
#
|
14
|
+
# Date:: 2011/12/18
|
15
|
+
#
|
16
|
+
# @author:: Brian Burns, x10205284
|
17
|
+
#
|
18
|
+
# ==Burnspam.spaminess returns values:
|
19
|
+
# * spaminess < 0:: Obvious spam
|
20
|
+
# * spaminess 0 - 2:: Questionable quality
|
21
|
+
# * spaminess > 2:: Good quality comment
|
22
|
+
|
23
|
+
# Entry point of the gem: wraps one comment submission and exposes the
# spam score calculated for it.
class Burnspam
  # The PointTracker that was built for this comment.
  attr_reader :points

  # Builds the tracker for a submitted comment.
  #
  # email::   e-mail address supplied with the comment
  # name::    author name supplied with the comment
  # content:: body of the comment
  def initialize(email, name, content)
    tracker = PointTracker.new(email, name, content)
    @points = tracker
  end

  # Returns whatever the tracker reports from its own #spaminess.
  def spaminess
    points.spaminess
  end
end
|
37
|
+
|
38
|
+
require 'burnspam/point_tracker'
|
@@ -0,0 +1,278 @@
|
|
1
|
+
# PointTracker holds all the statistics of the name
|
2
|
+
# and comment inside @name and @content
|
3
|
+
#
|
4
|
+
# The name and content are passed into the relevant
|
5
|
+
# Strategy (Strategy Design Pattern).
|
6
|
+
#
|
7
|
+
# The checker interchanges these strategies. This
|
8
|
+
# allows slightly different processing for the name
|
9
|
+
# and content as well as a shared method made
|
10
|
+
# available through the GeneralStrategy class.
|
11
|
+
#
|
12
|
+
# The Checker returns the results to @name and @content
|
13
|
+
#
|
14
|
+
# Version:: 0.1.1
|
15
|
+
#
|
16
|
+
# Date:: 2011/12/18
|
17
|
+
#
|
18
|
+
# @author:: Brian Burns, x10205284
|
19
|
+
# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
|
20
|
+
# Converts the statistics gathered for a comment's author name and body
# into a single spam score.
class Burnspam::PointTracker
  # Checker results for the author name and for the comment body.
  attr_reader :name, :content

  # Runs the name and the content through their strategies, wrapping each
  # in a Checker. The email argument is accepted but not examined here.
  def initialize(email, name, content)
    name_strategy = NameCheckerStrategy.new(name)
    @name = Checker.new(name_strategy)
    content_strategy = ContentCheckerStrategy.new(content)
    @content = Checker.new(content_strategy)
  end

  # Adds up every individual rule and returns the total. The total is
  # also kept in @total.
  def spaminess
    body = @content
    scores = [
      body_urls(body.count_urls),
      name_urls(@name.count_urls),
      body_length(body.length, body.count_urls),
      keywordsearch(body.keyword_count),
      urlength(body.url_length),
      startswith?(body.starts_with),
      duplicate?(body.duplicate)
    ]
    @total = scores.inject(0) { |sum, pts| sum + pts }
  end

  # URLs in the body:
  # * fewer than 2 URLs: gain 2 points
  # * exactly 2 URLs: 0 points
  # * more than 2 URLs: lose one point per URL
  def body_urls(count)
    return 2 if count < 2
    return -count if count > 2

    0
  end

  # A URL in the name costs 2 points.
  def name_urls(count)
    return 0 unless count > 0

    -2
  end

  # Body length:
  # * shorter than 20 characters: lose 1 point
  # * longer than 20 characters with no URLs: gain 2 points
  # * anything else (including exactly 20 characters): 0 points
  def body_length(size, counturls)
    return -1 if size < 20

    (size > 20 && counturls == 0) ? 2 : 0
  end

  # Every bad keyword costs 1 point.
  def keywordsearch(count)
    -count
  end

  # An average URL length above 30 costs 1 point.
  def urlength(size)
    if size > 30
      -1
    else
      0
    end
  end

  # A body flagged as starting with a bad keyword costs 10 points,
  # because spammy comments often open that way.
  def startswith?(word)
    return -10 if word

    0
  end

  # A body flagged as a duplicate costs 5 points.
  def duplicate?(post)
    return -5 if post

    0
  end
end
|
93
|
+
|
94
|
+
# Checker calls the methods of the strategies and
|
95
|
+
# holds the results in accessible instance variables.
|
96
|
+
# The results are either Integer or Boolean
|
97
|
+
#
|
98
|
+
# Version:: 0.1.1
|
99
|
+
#
|
100
|
+
# Date:: 2011/12/18
|
101
|
+
#
|
102
|
+
# @author:: Brian Burns, x10205284
|
103
|
+
# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
|
104
|
+
# Queries a strategy once for each statistic and keeps the answers in
# read-only attributes.
class Checker
  attr_reader :count_urls, :url_length, :length, :starts_with,
              :duplicate, :test, :keyword_count

  # strategy:: any object responding to count_urls, url_length, length,
  #            starts_with?, duplicate?, test and keywords
  #
  # The strategy is queried in the order listed below and each answer is
  # stored under the matching instance variable.
  def initialize(strategy)
    [
      [:@count_urls,    :count_urls],
      [:@url_length,    :url_length],
      [:@length,        :length],
      [:@starts_with,   :starts_with?],
      [:@duplicate,     :duplicate?],
      [:@test,          :test],
      [:@keyword_count, :keywords]
    ].each do |ivar, query|
      instance_variable_set(ivar, strategy.public_send(query))
    end
  end
end
|
119
|
+
|
120
|
+
# The GeneralStrategy class stores the analysis methods that
|
121
|
+
# can be common to both NameChecker and ContentChecker Strategy
|
122
|
+
#
|
123
|
+
# It also stores a list of bad keywords for comments
|
124
|
+
# and bad keywords for URLs
|
125
|
+
#
|
126
|
+
# Version:: 0.1.1
|
127
|
+
#
|
128
|
+
# Date:: 2011/12/18
|
129
|
+
#
|
130
|
+
# @author:: Brian Burns, x10205284
|
131
|
+
# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
|
132
|
+
# Base class for the name and content strategies. Holds the bad-word
# lists and the one analysis method both strategies share.
class GeneralStrategy
  # Words that count against a comment when they appear as a keyword.
  @@comparisonwords = ["feck", "bitch"]

  # URL fragments regarded as suspicious. The .de, .pl and .cn entries
  # follow the referenced article; no method in this class reads the list.
  @@comparisonwordsurl = [".html", ".info", "?", "&", "free",
                          ".de", ".pl", ".cn"]

  # Counts how many entries of @keywords exactly equal one of the bad
  # words, remembers the count in @keywords_count and returns it.
  # Subclasses are expected to have filled @keywords beforehand.
  def keywords
    @keywords_count = @keywords.count do |candidate|
      @@comparisonwords.include?(candidate)
    end
  end

  # Fixed value the test suite uses to confirm that subclasses inherit
  # from this class.
  def test
    44
  end
end
|
163
|
+
|
164
|
+
# A specialized strategy for analyzing the name field.
|
165
|
+
# Analysis for name usually requires much less processing
|
166
|
+
# than comment body.
|
167
|
+
#
|
168
|
+
# Version:: 0.1.1
|
169
|
+
#
|
170
|
+
# Date:: 2011/12/18
|
171
|
+
#
|
172
|
+
# @author:: Brian Burns, x10205284
|
173
|
+
# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
|
174
|
+
# Strategy for analysing the author name of a comment.
class NameCheckerStrategy < GeneralStrategy
  # Keeps the name as a string and, for the inherited keyword check, as a
  # one-element keyword list.
  def initialize(name)
    @name = name
    @keywords = [name]
  end

  # Returns 1 when the name contains an http:// or https:// URL, else 0.
  # A name is only expected to carry one URL at most, so no further
  # counting is done.
  def count_urls
    @name.include?('http://') || @name.include?('https://') ? 1 : 0
  end

  # Length of the name when it contains a URL, otherwise 0.
  def url_length
    # count_urls returns 0 or 1 and both are truthy in Ruby, so the
    # number has to be compared explicitly.
    count_urls > 0 ? @name.length : 0
  end

  # Length of the name.
  def length
    @name.length
  end

  # Not meaningful for names; present only because Checker calls it on
  # every strategy.
  def starts_with?
    false
  end

  # Not meaningful for names; present only because Checker calls it on
  # every strategy.
  def duplicate?
    false
  end
end
|
210
|
+
|
211
|
+
# A specialized strategy for analyzing the content field.
|
212
|
+
#
|
213
|
+
# Version:: 0.1.1
|
214
|
+
#
|
215
|
+
# Date:: 2011/12/18
|
216
|
+
#
|
217
|
+
# @author:: Brian Burns, x10205284
|
218
|
+
# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
|
219
|
+
# Strategy for analysing the body of a comment.
class ContentCheckerStrategy < GeneralStrategy
  # The most recently seen comment bodies, newest first. This is shared
  # by every instance. It is seeded with five placeholder strings so it
  # always holds five entries; a body equal to a placeholder is reported
  # as a duplicate until that placeholder has been pushed out.
  @@recent_posts = ["5", "4", "3", "2", "1"]

  # Stores the body, the URLs found in it and its individual words
  # (split on whitespace, commas and apostrophes).
  def initialize(content)
    @content = content
    @urls = URI.extract(content)
    @keywords = @content.split(/[\s,']+/)
  end

  # Number of URLs found in the body.
  def count_urls
    @urls.size
  end

  # Average length of the URLs in the body, using integer division.
  # Returns 0 when the body has no URLs.
  def url_length
    return 0 if @urls.empty?

    @urls.inject(0) { |sum, url| sum + url.length } / @urls.size
  end

  # Length of the body.
  def length
    @content.length
  end

  # True when the body begins with one of the (currently two) opening
  # words we do not like. Only the very start of the body is examined,
  # so a "Cool" or "Wow" later in the text is not penalised.
  def starts_with?
    @content.start_with?('Cool', 'Wow')
  end

  # Returns true when the body equals one of the recent bodies.
  # Otherwise drops the oldest entry from the end of the list, puts this
  # body at the front and returns false. Note that a non-duplicate call
  # therefore changes the shared list.
  def duplicate?
    return true if @@recent_posts.include?(@content)

    @@recent_posts.pop
    @@recent_posts.unshift(@content)
    false
  end
end
|
277
|
+
|
278
|
+
require 'uri'
|
@@ -0,0 +1,136 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'burnspam'
|
3
|
+
|
4
|
+
# These tests were created to test if statistics
|
5
|
+
# are correctly gathered.
|
6
|
+
#
|
7
|
+
# @points is a PointTracker instance variable within Burnspam.
|
8
|
+
# It has two accessible values:
|
9
|
+
# * @name: This contains the analysis of the name
|
10
|
+
# e.g. points.name.length, points.name.count_urls
|
11
|
+
# * @content: This contains analysis of the content
|
12
|
+
# e.g. points.content.duplicate
|
13
|
+
#
|
14
|
+
# These hold the same statistics although
|
15
|
+
# the strategy used to perform name and content analysis
|
16
|
+
# differ (based on strategy design pattern)
|
17
|
+
# See Burnspam::PointTracker for explanation.
|
18
|
+
#
|
19
|
+
# Version:: 0.1.1
|
20
|
+
#
|
21
|
+
# Date:: 2011/12/18
|
22
|
+
#
|
23
|
+
# @author:: Brian Burns, x10205284
|
24
|
+
# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
|
25
|
+
# =Test the analysis of a comment
|
26
|
+
class BurnspamTest < Test::Unit::TestCase
  # A Burnspam object can be built from three strings.
  def test_create_new
    assert Burnspam.new("email", "name", "content")
  end

  # spaminess hands back an Integer.
  def test_return_integer
    score = Burnspam.new("email", "name", "content").spaminess
    assert_kind_of Integer, score
  end

  # A URL in the name is counted.
  def test_name_url
    stats = points_for("content", "http://www.spam.com")
    assert_equal 1, stats.name.count_urls
  end

  # A name without a URL gives a count of 0.
  def test_name_clean
    stats = points_for("content", "Brian")
    assert_equal 0, stats.name.count_urls
  end

  # URLs in the content are counted accurately.
  def test_content_urls2
    body = "http://www.x.com and http://www.y.com!!"
    stats = points_for(body, "http://www.spam.com")
    assert_equal 2, stats.content.count_urls
  end

  # Content without URLs gives a count of 0.
  def test_content_urls0
    stats = points_for("content")
    assert_equal 0, stats.content.count_urls
  end

  # The average URL length is accurate:
  # (20 + 18) / 2 = 19
  def test_url_length19
    body = "http://www.spam1.com http://www.spam.ie"
    stats = points_for(body, "http://www.spam.com")
    assert_equal 19, stats.content.url_length
    assert_equal 19, stats.name.url_length
  end

  # The length of the content is reported.
  def test_length_content
    stats = points_for("123456789")
    assert_equal 9, stats.content.length
  end

  # Bad opening keywords are picked up.
  def test_starts_with
    assert_equal true, points_for("Cool...").content.starts_with
    assert_equal true, points_for("Wow...").content.starts_with
    assert_equal false, points_for("Doesn't start with.. ").content.starts_with
  end

  # Duplicates are detected within the 5 most recent comments.
  # In the first round the repeat arrives after five other comments, so
  # it is no longer remembered; in the second round it is.
  def test_duplicate_comment
    points_for("duplicate")
    5.times { points_for("Content #{rand}") }
    assert_equal false, points_for("duplicate").content.duplicate

    points_for("duplicate")
    3.times { points_for("Content #{rand}") }
    assert_equal true, points_for("duplicate").content.duplicate
  end

  # The method defined on GeneralStrategy is reachable through both the
  # NameCheckerStrategy and ContentCheckerStrategy sub-classes.
  def test_general_strategey_superclass
    stats = points_for("content")
    assert_equal 44, stats.content.test
    assert_equal 44, stats.name.test
  end

  # Bad keywords within the content are counted accurately.
  def test_for_bad_keywords
    assert_equal 1, points_for("content feck it").content.keyword_count
    assert_equal 3, points_for("feck bitch content feck it").content.keyword_count
  end

  private

  # Builds a Burnspam for the given content (and optional name) and
  # returns its point tracker.
  def points_for(content, name = "name")
    Burnspam.new("email", name, content).points
  end
end
|
metadata
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: burnspam
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Brian Burns
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-12-15 00:00:00.000000000Z
|
13
|
+
dependencies: []
|
14
|
+
description: Built based on http://snook.ca/archives/other/effective_blog_comment_spam_blocker
|
15
|
+
email: bud_weiser3@hotmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/burnspam.rb
|
21
|
+
- lib/burnspam/point_tracker.rb
|
22
|
+
- Rakefile
|
23
|
+
- test/test_burnspam.rb
|
24
|
+
homepage: http://rubygems.org/gems/burnspam
|
25
|
+
licenses: []
|
26
|
+
post_install_message:
|
27
|
+
rdoc_options: []
|
28
|
+
require_paths:
|
29
|
+
- lib
|
30
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
31
|
+
none: false
|
32
|
+
requirements:
|
33
|
+
- - ! '>='
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: '0'
|
36
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
37
|
+
none: false
|
38
|
+
requirements:
|
39
|
+
- - ! '>='
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '0'
|
42
|
+
requirements: []
|
43
|
+
rubyforge_project:
|
44
|
+
rubygems_version: 1.8.11
|
45
|
+
signing_key:
|
46
|
+
specification_version: 3
|
47
|
+
summary: Analyses a spam comment and name and returns likelihood of spamminess. Also
|
48
|
+
checks for duplicate comments.
|
49
|
+
test_files: []
|