RubyGems - burnspam - Versions diffs - 0.1.1 - Mend

burnspam 0.1.1

Files changed (5) hide show

data/Rakefile +8 -0
data/lib/burnspam.rb +38 -0
data/lib/burnspam/point_tracker.rb +278 -0
data/test/test_burnspam.rb +136 -0
metadata +49 -0

data/Rakefile ADDED Viewed

@@ -0,0 +1,8 @@
+# Load Rake's bundled test task definition.
+require 'rake/testtask'
+# Define the test task and put the test/ directory on its load path.
+Rake::TestTask.new { |test_task| test_task.libs << 'test' }
+# Make the test task the default, so a bare `rake` runs the tests.
+desc 'Run tests'
+task :default => :test

data/lib/burnspam.rb ADDED Viewed

@@ -0,0 +1,38 @@
+# The program takes a comment input as three
+# parameters: email, name and content.
+# It performs some basic checks to determine the
+# "spaminess" of a comment. Higher values are better
+# while negative numbers are likely spam.
+#
+# It also stores the most recent comments in
+# memory and checks that new comments are
+# not duplicates. If a new comment is found already
+# existing in memory, the spaminess value goes down!
+#
+# Version:: 0.1.1
+#
+# Date:: 2011/12/18
+#
+# @author:: Brian Burns, x10205284
+#
+# ==Burnspam#spaminess return values:
+# * spaminess < 0:: Obvious spam
+# * spaminess 0 - 2:: Questionable quality
+# * spaminess > 2:: Good quality comment
+class Burnspam
+  # Hand the three comment fields to a PointTracker, which performs
+  # the analysis, and keep the tracker for later queries.
+  #
+  # email::   forwarded to PointTracker unchanged
+  # name::    the commenter's name
+  # content:: the comment body
+  def initialize(email, name, content)
+    @points = PointTracker.new(email, name, content)
+  end
+  # The PointTracker built for this comment.
+  attr_reader :points
+  # Return the spaminess score computed by the PointTracker.
+  def spaminess
+    points.spaminess
+  end
+end
+require 'burnspam/point_tracker'

data/lib/burnspam/point_tracker.rb ADDED Viewed

@@ -0,0 +1,278 @@
+# PointTracker holds all the statistics of the name
+# and comment inside @name and @content
+#
+# The name and content are passed into the relevant
+# Strategy (Strategy Design Pattern).
+#
+# The checker interchanges these strategies. This
+# allows slightly different processing for the name
+# and content as well as a shared method made
+# available through the GeneralStrategy class.
+#
+# The Checker returns the results to @name and @content
+#
+# Version:: 0.1.1
+#
+# Date:: 2011/12/18
+#
+# @author:: Brian Burns, x10205284
+# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
+class Burnspam::PointTracker
+  attr_reader :name, :content
+  # Analyse the name and the content with their own strategies and
+  # keep the resulting Checker objects in @name and @content.
+  # The email argument is accepted but not examined here.
+  def initialize(email, name, content)
+    name_strategy = NameCheckerStrategy.new(name)
+    @name = Checker.new(name_strategy)
+    content_strategy = ContentCheckerStrategy.new(content)
+    @content = Checker.new(content_strategy)
+  end
+  # Add up every scoring rule and return the total.
+  # The total is also kept in @total.
+  def spaminess
+    adjustments = [
+      body_urls(@content.count_urls),
+      name_urls(@name.count_urls),
+      body_length(@content.length, @content.count_urls),
+      keywordsearch(@content.keyword_count),
+      urlength(@content.url_length),
+      startswith?(@content.starts_with),
+      duplicate?(@content.duplicate)
+    ]
+    @total = adjustments.inject(0) { |sum, points| sum + points }
+  end
+  # URLs in the comment body:
+  # * fewer than 2 URLs: gain 2 points
+  # * exactly 2 URLs: 0 points
+  # * more than 2 URLs: lose 1 point per URL
+  def body_urls(count)
+    return 2 if count < 2
+    return count * -1 if count > 2
+    0
+  end
+  # A URL in the name costs 2 points.
+  def name_urls(count)
+    return -2 if count > 0
+    0
+  end
+  # Comment length:
+  # * shorter than 20 characters: lose 1 point
+  # * longer than 20 characters with no URLs: gain 2 points
+  # * anything else: 0 points
+  def body_length(size, counturls)
+    return -1 if size < 20
+    (size > 20 && counturls == 0) ? 2 : 0
+  end
+  # Every bad keyword costs 1 point.
+  def keywordsearch(count)
+    -count
+  end
+  # 1 point penalty if the average URL length is above 30.
+  def urlength(size)
+    if size > 30
+      -1
+    else
+      0
+    end
+  end
+  # A comment that starts with one of the disliked keywords loses
+  # a substantial 10 points, because spammy comments often start
+  # like this. +word+ is the flag produced by the content check.
+  def startswith?(word)
+    if word
+      -10
+    else
+      0
+    end
+  end
+  # A duplicate post loses 5 points.
+  def duplicate?(post)
+    if post
+      -5
+    else
+      0
+    end
+  end
+end
+# Checker calls the methods of the strategies and
+# holds the results in accessible instance variables.
+# The results are either Integer or Boolean
+#
+# Version:: 0.1.1
+#
+# Date:: 2011/12/18
+#
+# @author:: Brian Burns, x10205284
+# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
+class Checker
+  # URL statistics
+  attr_reader :count_urls, :url_length
+  # Text statistics
+  attr_reader :length, :keyword_count
+  # Boolean findings
+  attr_reader :starts_with, :duplicate
+  # Value of the strategy's shared test method
+  attr_reader :test
+  # Query the given strategy once for each statistic and store the
+  # answers. The strategy must respond to count_urls, url_length,
+  # length, starts_with?, duplicate?, test and keywords; they are
+  # called in that order.
+  def initialize(strategy)
+    @count_urls, @url_length, @length =
+      strategy.count_urls, strategy.url_length, strategy.length
+    @starts_with, @duplicate =
+      strategy.starts_with?, strategy.duplicate?
+    @test, @keyword_count =
+      strategy.test, strategy.keywords
+  end
+end
+#  The GeneralStrategy class stores the analysis methods that
+#  can be common to both NameChecker and ContentChecker Strategy
+#
+#  It also stores a list of bad keywords for comments
+#  and bad keywords for URLs
+#
+#  Version:: 0.1.1
+#
+#  Date:: 2011/12/18
+#
+#  @author:: Brian Burns, x10205284
+#  @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
+class GeneralStrategy
+  # Words that count against a comment when they appear in it.
+  @@comparisonwords = %w[feck bitch]
+  # Fragments regarded as suspicious inside URLs. The country
+  # suffixes (.de, .pl, .cn) are taken from the referenced article;
+  # ask its author for the reasoning behind them.
+  @@comparisonwordsurl = %w[.html .info ? & free .de .pl .cn]
+  # The only analysis currently shared by the sub-classes; all the
+  # others are implemented per strategy.
+  # Counts how many entries of @keywords are exactly equal to a bad
+  # keyword, stores the number in @keywords_count and returns it.
+  def keywords
+    @keywords_count = @keywords.inject(0) do |matches, word|
+      matches + @@comparisonwords.count(word)
+    end
+  end
+  # Fixed value used to verify that sub-classes inherit from here.
+  def test
+    44
+  end
+end
+# A specialized strategy for analysing the name field.
+# Analysis for name usually requires much less processing
+# than comment body.
+#
+# Version:: 0.1.1
+#
+# Date:: 2011/12/18
+#
+# @author:: Brian Burns, x10205284
+# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
+class NameCheckerStrategy < GeneralStrategy
+  # Stores the name as a string and, for the shared keyword check,
+  # as the single element of the @keywords array.
+  def initialize(name)
+    @name = name
+    @keywords = [name]
+  end
+  # We only expect to count one or zero URLs in the name.
+  # Returns 1 when the name contains "http://" or "https://",
+  # otherwise 0.
+  def count_urls
+    @name =~ %r{https?://} ? 1 : 0
+  end
+  # Length of the name when it contains a URL, otherwise 0.
+  # The count is compared with zero explicitly because 0 is a
+  # truthy value in Ruby.
+  def url_length
+    count_urls > 0 ? @name.length : 0
+  end
+  # Length of the name
+  def length
+    @name.length
+  end
+  # This method is not used but must be included to interface
+  # correctly with the Checker class. Always false.
+  def starts_with?
+    false
+  end
+  # This method is not used but must be included to interface
+  # correctly with the Checker class. Always false.
+  def duplicate?
+    false
+  end
+end
+# A specialized strategy for analysing the content field.
+#
+# Version:: 0.1.1
+#
+# Date:: 2011/12/18
+#
+# @author:: Brian Burns, x10205284
+# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
+class ContentCheckerStrategy < GeneralStrategy
+  # The most recent comments, newest first. It is shared by every
+  # instance and starts out filled with five placeholder strings,
+  # so it always holds five entries.
+  @@recent_posts = ["5", "4", "3", "2", "1"]
+  # Stores the content, the URLs found in it and its individual
+  # words (split on whitespace, commas and apostrophes).
+  def initialize(content)
+    @content = content
+    @urls = URI.extract(content)
+    @keywords = @content.split(/[\s,']+/)
+  end
+  # Number of URLs found in the content
+  def count_urls
+    @urls.count
+  end
+  # Average length of all URLs in the content (integer division),
+  # or 0 when the content has no URLs.
+  def url_length
+    return 0 if @urls.empty?
+    @urls.inject(0) { |sum, url| sum + url.length } / @urls.size
+  end
+  # Length of the content
+  def length
+    @content.length
+  end
+  # True when the content begins with one of the (currently 2)
+  # keywords that we don't like. Only the very start of the content
+  # is examined; the keywords appearing later do not count.
+  def starts_with?
+    @content.start_with?('Cool', 'Wow')
+  end
+  # Compares the comment with the recent comments and returns true
+  # if a match is found; the list is left untouched in that case.
+  # Otherwise removes the oldest comment from the end, adds the new
+  # one to the front (First In First Out queue) and returns false.
+  # Because of this, asking again for the same new comment
+  # reports it as a duplicate.
+  def duplicate?
+    return true if @@recent_posts.include?(@content)
+    @@recent_posts.pop
+    @@recent_posts.unshift(@content)
+    false
+  end
+end
+require 'uri'

data/test/test_burnspam.rb ADDED Viewed

@@ -0,0 +1,136 @@
+require 'test/unit'
+require 'burnspam'
+# These tests were created to test if statistics
+# are correctly gathered.
+#
+# @points is a PointTracker instance variable within Burnspam.
+# It has two accessible values:
+# * @name: This contains the analysis of the name
+#   e.g. points.name.length, points.name.count_urls
+# * @content: This contains analysis of the content
+#   e.g. points.content.duplicate, points.content.starts_with
+#
+# These hold the same statistics although
+# the strategy used to perform name and content analysis
+# differ (based on strategy design pattern)
+# See Burnspam::PointTracker for explanation.
+#
+# Version:: 0.1.1
+#
+# Date:: 2011/12/18
+#
+# @author:: Brian Burns, x10205284
+# @reference:: http://snook.ca/archives/other/effective_blog_comment_spam_blocker
+# =Test the analysis of a comment
+class BurnspamTest < Test::Unit::TestCase
+  # Test that a Burnspam object can be created
+  def test_create_new
+    assert Burnspam.new("email", "name", "content")
+  end
+  # Test that spaminess returns an Integer
+  def test_return_integer
+    x = Burnspam.new("email", "name", "content")
+    assert_kind_of Integer, x.spaminess
+  end
+  # Test that a url in the name is counted
+  def test_name_url
+    name = "http://www.spam.com"
+    x = Burnspam.new("email", name, "content")
+    assert_equal 1, x.points.name.count_urls
+  end
+  # Test that a name without urls produces a count of 0
+  def test_name_clean
+    name = "Brian"
+    x = Burnspam.new("email", name, "content")
+    assert_equal 0, x.points.name.count_urls
+  end
+  # Check accurate counting of urls in the content
+  # (the url in the name must not be included in the count)
+  def test_content_urls2
+    name = "http://www.spam.com"
+    content = "http://www.x.com and http://www.y.com!!"
+    x = Burnspam.new("email", name, content)
+    assert_equal 2, x.points.content.count_urls
+  end
+  # Check that no urls in content produces count of 0
+  def test_content_urls0
+    x = Burnspam.new("email", "name", "content")
+    assert_equal 0, x.points.content.count_urls
+  end
+  # Test that average url length count is accurate.
+  # Content: the two urls are 20 and 18 characters long,
+  #  (20 + 18) / 2
+  # = 19
+  # Name: the url used as the name is itself 19 characters long.
+  def test_url_length19
+    name = "http://www.spam.com"
+    content = "http://www.spam1.com  http://www.spam.ie"
+    x = Burnspam.new("email", name, content)
+    assert_equal 19, x.points.content.url_length
+    assert_equal 19, x.points.name.url_length
+  end
+  # Count length of content
+  def test_length_content
+    content = "123456789"
+    x = Burnspam.new("email", "name", content)
+    assert_equal 9, x.points.content.length
+  end
+  # Check that bad starting keywords are picked up and that
+  # other content is not flagged.
+  def test_starts_with
+    content = "Cool..."
+    x = Burnspam.new("email", "name", content)
+    assert_equal true, x.points.content.starts_with
+     content = "Wow..."
+    x = Burnspam.new("email", "name", content)
+    assert_equal true, x.points.content.starts_with
+    content = "Doesn't start with.. "
+    x = Burnspam.new("email", "name", content)
+    assert_equal false, x.points.content.starts_with
+  end
+  # Test for duplicate comments in 5 most recent comments.
+  # The recent-comment list is shared between all Burnspam objects,
+  # so every Burnspam.new below changes what later ones see.
+  #
+  # First assertion: five newer comments have pushed the original
+  # "duplicate" out of the list, so posting it again is NOT
+  # reported as a duplicate (and puts it back in the list).
+  #
+  # Second assertion: only three newer comments follow, so
+  # "duplicate" is still in the list and is reported.
+  def test_duplicate_comment
+    x = Burnspam.new("email", "name", "duplicate")
+    x = Burnspam.new("email", "name", "Content #{rand}")
+    x = Burnspam.new("email", "name", "Content #{rand}")
+    x = Burnspam.new("email", "name", "Content #{rand}")
+    x = Burnspam.new("email", "name", "Content #{rand}")
+    x = Burnspam.new("email", "name", "Content #{rand}")
+    x = Burnspam.new("email", "name", "duplicate")
+    assert_equal false, x.points.content.duplicate
+    x = Burnspam.new("email", "name", "duplicate")
+    x = Burnspam.new("email", "name", "Content #{rand}")
+    x = Burnspam.new("email", "name", "Content #{rand}")
+    x = Burnspam.new("email", "name", "Content #{rand}")
+    x = Burnspam.new("email", "name", "duplicate")
+    assert_equal true, x.points.content.duplicate
+  end
+  # GeneralStrategy#test is an instance method inherited by both
+  # NameCheckerStrategy and ContentCheckerStrategy; check that its
+  # value is reachable through each of them.
+  def test_general_strategey_superclass
+    x = Burnspam.new("email", "name", "content")
+    assert_equal 44, x.points.content.test
+    assert_equal 44, x.points.name.test
+  end
+  # Check accurate counting of bad keywords within the content.
+  # A bad keyword that occurs twice is counted twice.
+  def test_for_bad_keywords
+    x = Burnspam.new("email", "name", "content feck it")
+    assert_equal 1, x.points.content.keyword_count
+    x = Burnspam.new("email", "name", "feck bitch content feck it")
+    assert_equal 3, x.points.content.keyword_count
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,49 @@
+--- !ruby/object:Gem::Specification
+name: burnspam
+version: !ruby/object:Gem::Version
+  version: 0.1.1
+  prerelease:
+platform: ruby
+authors:
+- Brian Burns
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2011-12-15 00:00:00.000000000Z
+dependencies: []
+description: Built based on http://snook.ca/archives/other/effective_blog_comment_spam_blocker
+email: bud_weiser3@hotmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/burnspam.rb
+- lib/burnspam/point_tracker.rb
+- Rakefile
+- test/test_burnspam.rb
+homepage: http://rubygems.org/gems/burnspam
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.11
+signing_key:
+specification_version: 3
+summary: Analyses a spam comment and name and returns likelihood of spamminess. Also
+  checks for duplicate comments.
+test_files: []