RubyGems - splam - Versions diffs - 0.1.1 → 0.2.0 - Mend

splam 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

data.tar.gz.sig +0 -0
data/Gemfile +2 -0
data/Gemfile.lock +5 -1
data/{README → Readme.md} +9 -0
data/lib/splam.rb +5 -3
data/lib/splam/ngram.rb +98 -0
data/lib/splam/rule.rb +15 -3
data/lib/splam/rules/arms_race.rb +1 -1
data/lib/splam/rules/bad_words.rb +60 -19
data/lib/splam/rules/bbcode.rb +4 -2
data/lib/splam/rules/chinese.rb +24 -4
data/lib/splam/rules/fuzz.rb +6 -1
data/lib/splam/rules/good_words.rb +6 -3
data/lib/splam/rules/href.rb +10 -6
data/lib/splam/rules/httpbl.rb +62 -0
data/lib/splam/rules/line_length.rb +2 -2
data/lib/splam/rules/punctuation.rb +5 -4
data/lib/splam/rules/russian.rb +1 -1
data/lib/splam/rules/user.rb +15 -0
data/lib/splam/rules/word_length.rb +7 -8
data/splam.gemspec +1 -1
data/test/ngram_test.rb +36 -0
data/test/splam_test.rb +20 -2
metadata +8 -5
metadata.gz.sig +1 -2
data/test/fixtures/comment/spam/consent.txt +0 -1

data.tar.gz.sig CHANGED

Binary file

data/Gemfile CHANGED

@@ -3,4 +3,6 @@ gemspec
 gem 'bump'
 gem 'rake'
+gem 'redis'
 gem 'activesupport'
+gem 'system_timer', :platform => :ruby_18

data/Gemfile.lock CHANGED

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    splam (0.1.1)
+    splam (0.2.0)
 GEM
   remote: http://rubygems.org/
@@ -13,6 +13,8 @@ GEM
     i18n (0.6.1)
     multi_json (1.6.0)
     rake (10.0.3)
+    redis (3.0.2)
+    system_timer (1.2.4)
 PLATFORMS
   ruby
@@ -21,4 +23,6 @@ DEPENDENCIES
   activesupport
   bump
   rake
+  redis
   splam!
+  system_timer

data/{README → Readme.md} RENAMED

@@ -42,8 +42,17 @@ site) whether to ban the post or not.
 We recommend showing the post to the user (spambox them in) but hide it from everyone else.
+Dev
+===
+    bundle
+    redis-server
+    rake
 TODO
+====
+- fix on 1.9
 - Integrate bayesian or other clever algorithm, so that scores aren't hardcoded.
 - Switch to using a percentage (0.994) rather than a score (250)
 - Write more plugins!

data/lib/splam.rb CHANGED

@@ -29,11 +29,11 @@ module Splam
       end
     end
-    def run(record)
+    def run(record, request)
       score, reasons = 0, []
       rules.each do |rule_class, weight|
         weight ||= 1
-        worker   = rule_class.run(self, record, weight)
+        worker   = rule_class.run(self, record, weight, request)
         score   += worker.score
         reasons << worker.reasons
       end
@@ -51,6 +51,7 @@ module Splam
     Dir["#{File.dirname(__FILE__)}/splam/rules/*.rb"].each do |f|
       require f
     end
+    require "splam/ngram"
     base.send :extend, ClassMethods
   end
@@ -113,7 +114,8 @@ protected
     return false if (splam_suite.conditions && !splam_suite.conditions.call(self)) ||
                     skip_splam_check ||
                     send(splam_suite.body).nil?
-    @splam_score, @splam_reasons = splam_suite.run(self)
+    @request = splam_suite.request.call(self) if splam_suite.request
+    @splam_score, @splam_reasons = splam_suite.run(self, @request)
     instance_variable_get("@splam_#{attr_suffix}") if attr_suffix
   end

data/lib/splam/ngram.rb ADDED

@@ -0,0 +1,98 @@
+class Splam::Ngram
+  def self.trigram text
+    # this won't be utf-8 happy. Oh well!
+    words = text.gsub("'", "").split(/\W/)
+    hash = Hash.new 0
+    i = 0
+    while (i < words.length)
+      tri = []
+      count = 0
+      while ((words.length > i + count) && (tri.length < 3))
+        word = words[i + count]
+        if word && word != ""
+          tri << words[i + count]
+        end
+        count += 1
+      end
+      if tri.length == 3
+        hash[tri.join(' ')] += 1
+      end
+      i += 1
+    end
+    hash
+  end
+  def initialize site_id=nil
+    @site_id = site_id
+  end
+  # Train the temporary corpus with your data
+  def train words, spam = false, retrain = false
+    if words.is_a?(String)
+      words = self.class.trigram(words)
+    end
+    words.each do |word,value|
+      key = spam ? "spam" : "ham"
+      REDIS.hincrby key, word, value
+      REDIS.hincrby "#{key}-#{@site_id}", word, value if @site_id
+      if retrain
+        # Remove phrases from existing corpus
+        key = spam ? "ham" : "spam"
+        REDIS.hincrby key, word, -value
+        REDIS.hincrby "#{key}-#{@site_id}", word, -value if @site_id
+      end
+    end
+  end
+  def compare text
+    tri = self.class.trigram(text)
+    score = 0
+    spam = 0
+    ham_key = @site_id ? "ham-#{@site_id}" : "ham"
+    spam_key = @site_id ? "spam-#{@site_id}" : "spam"
+    @ham_tri = Hash.new 0
+    @spam_tri = Hash.new 0
+    tri.each do |key,value|
+      next if key.nil? || key.strip == ""
+      hmatch = REDIS.hget(ham_key, key).to_i #  ham_tri[key]
+      smatch = REDIS.hget(spam_key, key).to_i  # spam_tri[key]
+      if hmatch > 0 && smatch > 0
+        # tri appears in both
+        # ignore.
+        next
+      end
+      if hmatch > 0
+        score += hmatch + value
+      elsif smatch > 0
+        spam += smatch + value
+      end
+    end
+    [score, spam]
+  end
+end
+# corpus = Splam::Ngram.new 10009
+# s.comments.paginated_each(:order => "id desc") do |c|
+#   puts c.id
+#   words = Splam::Ngram.trigram(c.body.downcase)
+#   if c.author.support? || (c.user && c.user.trusted?)
+#     corpus.train words, false
+#   elsif c.spam
+#     corpus.train words, true
+#   end
+# end
+#
+# Comment.spam.paginated_each(:order => "id desc", :conditions => ['id < 12916619']) do |c|
+#   next if c.user_email == "no-reply@lighthouseapp.com"
+#   score = corpus.compare(c.body)
+#   if score[0] > score[1]
+#     puts "Not spam? #{c.id} : #{score.inspect} - #{c.body.first(100)}"
+#   else
+#     puts "Spam! #{c.id} : #{score.inspect}"
+#   end
+# end

data/lib/splam/rule.rb CHANGED

@@ -30,8 +30,9 @@ class Splam::Rule
     end
   end
-  def initialize(suite, record, weight = 1.0)
-    @suite, @weight, @score, @reasons, @body = suite, weight, 0, [], record.send(suite.body)
+  def initialize(suite, record, weight = 1.0, request = nil)
+    @suite, @weight, @score, @reasons, @body, @request = suite, weight, 0, [], record.send(suite.body), request
+    @user = record.user # todo: customize user field
   end
   def name
@@ -70,4 +71,15 @@ class Splam::Rule
       @score += points
     end
   end
-end
+  def line_safe?(string)
+    ([
+      /\.dylib\b/,
+      /\b0x[0-9a-f]{6,16}\b/i,
+      /\b\/Applications\//,
+      /\b\/System\/Library\//,
+      /\bLibrary\/Application Support\//
+    ].map {|r| r.match string }).compact.size > 0
+  end
+end

data/lib/splam/rules/arms_race.rb CHANGED

@@ -7,7 +7,7 @@ class Splam::Rules::ArmsRace < Splam::Rule
   # This is where you put banned domain names or otherwise
   def run
-    shitty_sites = ["inquisitr"]
+    shitty_sites = ["inquisitr", "beeplog"]
     shitty_sites.each do |word|
       results = @body.downcase.scan(word)
       if results && results.size > 0

data/lib/splam/rules/bad_words.rb CHANGED

@@ -1,3 +1,4 @@
+require 'active_support'
 class Splam::Rules::BadWords < Splam::Rule
   class << self
     attr_accessor :bad_word_score, :suspicious_word_score
@@ -7,28 +8,68 @@ class Splam::Rules::BadWords < Splam::Rule
   self.suspicious_word_score = 4
   def run
-    bad_words = %w( sex sexy porn gay erotica viagra erotismo porno porn lesbian amateur tit\b)
-    bad_words |= %w( gratis erotismo porno torrent bittorrent adulto )
-    bad_words |= %w( cialis viagra payday loan jihad )
-    bad_words |= %w( webcam  free-web-host rapidshare muslim)
-    bad_words << /pel?cula/ << /pornogr?fica/ << "portal porno" # srsly, spamming in spanish?
+    bad_words = {}
+    bad_words[:pornspam] = %w( sex sexy porn gay erotica erotico topless naked viagra erotismo porno porn lesbian amateur tit\b)
+    bad_words[:pornspam] |= %w( gratis erotismo porno torrent bittorrent adulto videochat  video 3dsex)
+    bad_words[:pornspam] << /pel?cula/ << /pornogr?fica/ << "portal porno" # srsly, spamming in spanish?
+    bad_words[:pornspam] |= %w( webcam  free-web-host rapidshare)
+    bad_words[:viagraspam] = %w( cialis viagra pharmacy prescription levitra kamagra)
+    bad_words[:benzospam]  = %w( ultram tramadol pharmacy prescription )
+    bad_words[:cashspam]   = %w( payday loan jihad ) << "payday loan"
+    bad_words[:pharmaspam] = %w( propecia finasteride viagra )
+    bad_words[:nigerian]   = ["million pounds sterling", "dear sirs,", "any bank account", "winning notification", "western union", "diagnosed with cancer", "bank treasury", "unclaimed inheritance"]
+    # linkspammers
+    bad_words[:linkspam] = ["increase traffic", "discovered your blog", "backlinks", "sent me a link", "more visitors to my site", "targeted traffic", "increase traffic to your website", "estore"]
+    bad_words[:beats] = %w( beats dre headphones sale cheap shipping ) << "monster beats" << "best online"
+    bad_words[:rolex] = %w( rolex watch replica watches price )
+    bad_words[:wtf] = %w( bilete avion )
+    # buying fake shitty brand stuff
+    bad_words[:bagspam]  = %w(handbag louis louisvuitton vuitton chanel coach clearance outlet hermes bag scarf sale ralphlauren)
+    bad_words[:handbags] = %w( karenmillen michaelkors kors millen bags purchase handbag chanel outlet tasche longchamp kaufen louboutin christianlouboutin)
+    bad_words[:blingspam] = %w( tiffany jewellery tiffanyco clearance outlet)
+    bad_words[:uggspam]  = %w(\buggs?\b \buggboots\b clearance outlet )
+    bad_words[:wedding]  = ["wedding", "wedding dress", "weddingdress", "strapless"]
+    bad_words[:webcamspam] = %w( live girls webcam adult singles) << "chat room"
+    bad_words[:gamereview] = %w( games-review-it.com game-reviews-online.com )
+    bad_words[:streaming]  = %w( watchmlbbaseball watchnhlhockey pspnsportstv.com )
+    bad_words[:forum_spam] = ["IMG", "url="]
     suspicious_words =  %w( free buy galleries dating gallery hard hardcore video homemade celebrity ) << "credit card" << "my friend" << "friend sent me"
-    suspicious_words |= %w( adult pharmacy overnight shipping free hot movie nylon arab ?????? xxx) << "sent me a link"
+    suspicious_words |= %w( adult overnight shipping free hot movie nylon arab ?????? seo)
     suspicious_words << "forums/member.php?u=" << "chat room" << "free chat" << "yahoo chat" << "page.php"
-    bad_words.each do |word|
-      results = @body.downcase.scan(word)
-      if results && results.size > 0
-        add_score((self.class.bad_word_score ** results.size), "nasty word: '#{word}'")
-        # Add more points if the bad word is INSIDE a link
-        @body.scan(/<a[^>]+>(.*?)<\/a>/).each do |match|
-          add_score self.class.bad_word_score * 4 * match[0].scan(word).size, "nasty word inside a link: #{word}"
-        end
-        @body.scan(/\nhttp:\/\/(.*?#{word})/).each do |match|
-          add_score self.class.bad_word_score ** 4 * match[0].scan(word).size, "nasty word inside a straight-up link: #{word}"
+    bad_words.each do |key,wordlist|
+      counter = 0
+      wordlist.each do |word|
+        results = Regexp.new("\\b(#{word})\\b").match @body
+        if results && results.size > 0
+          counter += 1
+          add_score((self.class.bad_word_score ** results.size), "nasty word: '#{word}'")
+          # Add more points if the bad word is INSIDE a link
+          @body.scan(/<a[^>]+>(.*?)<\/a>/).each do |match|
+            add_score self.class.bad_word_score * 10 * match[0].scan(word).size, "nasty word inside a link: #{word}"
+          end
+          @body.scan(/\nhttp:\/\/(.*?#{word})\//).each do |match|
+            add_score self.class.bad_word_score * 10 * match[0].scan(word).size, "nasty word inside a straight-up link: #{word}"
+          end
+          @body.scan(/<a.*?>(.*?)<\/a>/).each do |links|
+            add_score self.class.bad_word_score * 50, "nasty word is the entire link: #{word}"
+          end
+          @body.scan(/<a(.*?)>/).each do |match|
+            add_score self.class.bad_word_score * 10 * match[0].scan(word).size, "nasty word inside a URL: #{word}"
+          end
         end
-        @body.scan(/<a(.*?)>/).each do |match|
-          add_score self.class.bad_word_score * 4 * match[0].scan(word).size, "nasty word inside a URL: #{word}"
+        if counter > (wordlist.size / 2)
+          add_score 1000, "Lots of bad words from one genre (#{key}): #{counter}"
         end
       end
     end
@@ -43,4 +84,4 @@ class Splam::Rules::BadWords < Splam::Rule
       end
     end
   end
-end
+end

data/lib/splam/rules/bbcode.rb CHANGED

@@ -3,10 +3,12 @@ class Splam::Rules::Bbcode < Splam::Rule
   def run
     add_score 10 * @body.scan("showpost.php?p=").size, "Linking to a shitty forum"
     # add_score 10 * @body.scan("\r\n").size, "Poorly formed POST (\\r\\n)"
+    add_score 80 * @body.scan(/\n\[url.*?\]\n/).size, "Shitty bbcode url covers entire line"
     add_score 40 * @body.scan("[url=").size, "URL" # no URLS for you!!
     add_score 40 * @body.scan("[URL=").size, "URL" # no URLS for you!!
-    add_score 40 * @body.scan("[url=http").size, "Shitty URL/html" # another 10 points for shitty bbcode html
-    add_score 40 * @body.scan("[URL=http").size, "Shitty URL/html" # another 10 points for shitty bbcode html
+    add_score 45 * @body.scan("[url=http").size, "Shitty URL/html" # another 10 points for shitty bbcode html
+    add_score 45 * @body.scan("[URL=http").size, "Shitty URL/html" # another 10 points for shitty bbcode html
+    add_score 30 * @body.scan("[/CODE").size, "Forum codes?"
     add_score 10 * @body.scan(/\[[bai]/).size, "b/a/i tag"
   end
 end

data/lib/splam/rules/chinese.rb CHANGED

@@ -3,22 +3,42 @@ class Splam::Rules::Chinese < Splam::Rule
   class << self
     attr_accessor :base_score
   end
-  self.base_score = 3
+  self.base_score = 5
   def run
     banned_words =[ # various chinese characters
       "\350\263\207",
       "\351\207\221",
       "\357\274\222", # number 2 in weird unicode
-      "\357\274\224", # number 4
+      "\357\274\224", # number 4
       "\357\274\225", # number 5
       "\357\274\231", # number 9
+      "\345\260\232",
+      "\345\256\266",
+      "\345\274\267",
+      "\345\240\261",
+      "\345\260\216",
+      "\345\217\260",
+      "\345\215\227",
+      "\346\235\261",
+      "\345\270\202",
+      "\345\240\264",
+      "\345\202\263",
+      "\346\216\250",
+      "\346\231\202",
+      "\347\203\210",
+      "\347\216\251",
+      "\350\226\246",
+      "\350\217\234",
+      "\350\216\216",
       "\357\274\215", # hyphen
-      # /\\357\2\d\d\\\d{3}/, # TODO does not work on 1.9
+      # /\\357\2\d\d\\\d{3}/, # TODO SyntaxError on 1.9
       # "\357", # ugh, these don't work .. because they're only part of a character.
       # "\351",
       "\35"
-    ]
+    ].compact
     banned_words.each do |word|
       hits = (self.class.base_score * @body.scan(word).size) # 1 point for every banned word
       add_score hits, "Banned character: #{word}"

data/lib/splam/rules/fuzz.rb CHANGED

@@ -3,11 +3,16 @@ class Splam::Rules::Fuzz < Splam::Rule
     attr_accessor :bad_word_score
   end
-  self.bad_word_score       = 10
+  self.bad_word_score = 10
   def run
     patterns = [/^(\d[a-z])/, /(\d[a-z][A-Z]\w+)/, /(\b\w+\d\.txt)/, /(;\d+;)/ ]
+    ignore_if = [%r{vendor/rails}, /EXC_BAD_ACCESS/, /JavaAppLauncher/, %r{Contents/MacOS}, %r{/Library/}]
     matches = 0
+    # looks like a stack trace
+    ignore_if.each do |pattern|
+      return if @body.scan(pattern)
+    end
     patterns.each do |pattern|
       results = @body.scan(pattern)
       if results && results.size > 0

data/lib/splam/rules/good_words.rb CHANGED

@@ -2,18 +2,21 @@ class Splam::Rules::GoodWords < Splam::Rule
   def run
     good_words = [ /I\'having a problem/, ]
-    good_words |= %w( lighthouse activereload  warehouse install eclipse settings assigned user ticket tickets token api number query request)
+    good_words |= %w( lighthouse lighthouseapp activereload  warehouse install eclipse settings assigned user ticket tickets token api number query request)
     good_words |= %w( browser feed firefox safari skitch vendor rails action_controller railties )
     good_words |= %w( redirect login diff dreamhost setup subversion git  wildcard domain subdomain ssh database )
     good_words |= %w( project billing tags description comment milestone saving happening feature mac implement report)
     good_words |= %w( rss notification subscribe calendar chart note task gantt search service ownership application communicate )
-    good_words |= %w( pattern template web integer status xml activereload html state page)
+    good_words |= %w( interaction API tickets hosted domain skitch )
+    good_words |= %w( pattern template web integer status xml activereload html state page rack diff )
     good_words << "project management"
     good_words << "/usr/local/lib" << "gems"
     body = @body.downcase
     good_words.each { |rule|
-      add_score -5 * body.scan(rule).size, "relevant word match: #{rule}"
+      results = Regexp.new("\\b(#{rule})\\b","i").match(body)
+      add_score -50 * results.size, "relevant word match: #{rule}" if results
     }
   end
 end

data/lib/splam/rules/href.rb CHANGED

@@ -4,8 +4,8 @@ class Splam::Rules::Href < Splam::Rule
   def run
     # add_score 3 * @body.scan("href=http").size, "Shitty html 'href=http'" # 3 points for shitty html
-    add_score 15 * @body.scan(/href\=\s*http/).size, "Shitty html 'href=http'" # 15 points for shitty html
-    add_score 15 * @body.scan(/href\="\s+http/).size, "Shitty html 'href=\" http'" # 15 points for shitty html
+    add_score 35 * @body.scan(/href\=\s*http/).size, "Shitty html 'href=http'" # 15 points for shitty html
+    add_score 35 * @body.scan(/href\="\s+http/).size, "Shitty html 'href=\" http'" # 15 points for shitty html
     add_score 50 * @body.scan(/\A<a.*?<\/a>\Z/).size, "Single link post'"      # 50 points for shitty
     link_count = @body.scan("http://").size
@@ -13,7 +13,7 @@ class Splam::Rules::Href < Splam::Rule
     add_score 50, "More than 10 links" if link_count > 10  # more than 10 links? spam.
     add_score 100, "More than 20 links" if link_count > 20 # more than 20 links? definitely spam.
     add_score 1000, "More than 50 links" if link_count > 50 # more than 20 links? definitely spam.
     # Modify these scores to weight certain problematic domains.
     # You may need to modify these for your application
     suspicious_top_level_domains = {
@@ -22,6 +22,7 @@ class Splam::Rules::Href < Splam::Rule
       'us' => 8,   # .us ? possibly spam
       'it' => 5,
       'tk' => 20,
+      'eu' => 20,
       'pl' => 8,
       'info' => 20,
       'biz'  => 40 # no-one uses these for reals
@@ -33,11 +34,12 @@ class Splam::Rules::Href < Splam::Rule
     tokens = @body.split(" ")
     if tokens[-1] =~ /^http:\/\//
-      add_score 10, "Text ends in a http token"
-      add_score 50, "Text ends in a http token and only has one token" if link_count == 1
+      add_score 20, "Text ends in a http token"
+      add_score 150, "Text ends in a http token and only has one token" if link_count == 1
+      add_score 150, "Text ends in a http token with a shitty domain " if tokens[-1].match(/http:\/\/#{suspicious_sites.keys.join("|")}\./)
     end
-    @body.scan(/http:\/\/(.*?)[\/\]?]/) do |match|
+    @body.scan(/http:\/\/(.*?)[\/\>\]?]/) do |match|
       # $stderr.puts "checking #{match}"
       if domain = match.to_s.split(".")
         tld = domain[-1]
@@ -48,7 +50,9 @@ class Splam::Rules::Href < Splam::Rule
         if found = suspicious_sites[domain[-2]]
           add_score found, "Suspicious hostname: '#{domain[-2]}'"
+          add_score found * 5, "..document ends in suspicious hostname" if tokens[-1] =~ /^http:\/\//
         end
       end
     end
   end

data/lib/splam/rules/httpbl.rb ADDED

@@ -0,0 +1,62 @@
+require 'resolv'
+# Liberally copied from https://github.com/bpalmen/httpbl/blob/master/lib/httpbl.rb
+class Splam::Rules::Httpbl < Splam::Rule
+  if RUBY_VERSION < "1.9"
+    require 'system_timer'
+  else
+    require 'timeout'
+    SystemTimer = Timeout
+  end
+  class << self
+    attr_accessor :api_key
+  end
+  def run
+    return unless @request # no ip available
+    return unless @request[:remote_ip] # no ip available
+    ip = @request[:remote_ip]
+    if result = self.class.check_blacklist(ip)
+      add_score 250, "IP address (#{ip}) appears in ProjectHoneypot blacklist. (#{result.inspect})"
+    end
+  end
+  def self.check_blacklist(ip)
+    # @cache = REDIS if defined?(REDIS)
+    # result = @cache && @cache["ip.#{ip}"]
+    # result ||= resolve(ip)
+    # if @cache
+    #   @cache.set "ip.#{ip}", result if @cache
+    #   @cache.expire "ip.#{ip}", 1.week
+    # end
+    result = resolve(ip)
+    response = result.split(".").collect!(&:to_i)
+    # responses:
+    # a, b, c, d
+    # a = 127 if success
+    # b = days since last activity
+    # c = threat score, 0..255 (0 is not threat)
+    # d = type of visitor
+    raise "Bad httpbl request format!" if response[0] != 127
+    return response[3] > 0 || response[2] > 100
+  end
+  def self.resolve(ip)
+    query = "#{@@api_key}.#{ip.split('.').reverse.join('.')}.dnsbl.httpbl.org"
+    SystemTimer::timeout(0.5) do
+      begin
+        Resolv::DNS.new.getaddress(query).to_s
+      rescue Resolv::ResolvError
+        "127.0.0.0"
+      end
+    end
+  rescue Errno::ECONNREFUSED
+    # derp
+  end
+end

data/lib/splam/rules/line_length.rb CHANGED

@@ -10,8 +10,8 @@ class Splam::Rules::LineLength < Splam::Rule
     lines.each do |line|
       next if line =~ /\A\s{4,}/ # ignore code blocks
-      multiplier = (lines.size == 1) ? 10 : 1 # one line? fail.
+      # multiplier = (lines.size == 1) ? 10 : 1 # one line? fail.
+      multiplier = 1
       # 1 point for each 40 chars in a line.
       hits = (line.size / 40) * multiplier

data/lib/splam/rules/punctuation.rb CHANGED

@@ -1,5 +1,4 @@
 class Splam::Rules::Punctuation < Splam::Rule
   def run
     punctuation = @body.scan(/[.,] /)
     add_score 10, "Text has no punctuation" if punctuation.size == 0
@@ -7,8 +6,10 @@ class Splam::Rules::Punctuation < Splam::Rule
     @body.split(/[.,]/).each do |sentence|
       words = sentence.split(" ")
       # long sentence, add a point.
-      add_score 1, "Sentence has more than 10 words" if words.size > 10
-      add_score 10, "Sentence has more than 30 words" if words.size > 30
+      unless line_safe?(sentence)
+        add_score 1, "Sentence has more than 10 words" if words.size > 10
+        add_score 10, "Sentence has more than 30 words" if words.size > 30
+      end
     end
   end
-end
+end

data/lib/splam/rules/russian.rb CHANGED

@@ -3,7 +3,7 @@ class Splam::Rules::Russian < Splam::Rule
   def run
     banned_words =[ # various russian characters
-      "\320\241", "\320\220", "\320\234", "\320\257", "\320\233", "\320\243",
+      "\320\241", "\320\220", "\320\234", "\320\257", "\320\233", "\320\243",
       "с", "м", "о", "т", "р", "е", "т", "ь", "п", "о", "р", "н", "о", "р", "л", "и", "к"
       # unicode char
 #      "\320"

data/lib/splam/rules/user.rb ADDED

@@ -0,0 +1,15 @@
+class Splam::Rules::User < Splam::Rule
+  def run
+    bad_words = ["qq.com", "yahoo.cn", "126.com"]
+    bad_words |= %w( mortgage )
+    bad_words.each do |word|
+      add_score 50, "User's email address has suspicious parts: #{word}" if @user.email.include?(word)
+    end
+    add_score "20", "User has lots and lots of dots" if @user.email.split("@")[0].scan(/\./).size > 5
+    add_score 5, "User is untrusted" if !@user.trusted?
+  end
+end

data/lib/splam/rules/word_length.rb CHANGED

@@ -16,17 +16,16 @@ class Splam::Rules::WordLength < Splam::Rule
   def run
     words = []
-    words = @body.split(/\s/).map do |word|
-      word.size
-    end
-    words.delete_if { |w| w =~ /^http\:\/\//}
+    words = @body.split(/\s/)
+    words.delete_if { |w| w =~ /^https?\:\/\// }
+    words.collect! { |word| word.size }
     # Only count word lengths over 10
     if words.size > 5
-      add_score 20, "Average word length over 5"  if average(words) > 5
-      add_score 50, "Average word length over 10" if average(words) > 10
-      add_score 10, "Median word length over 5"   if median(words) > 5
-      add_score 50, "Median word length over 10"  if median(words) > 10
+      add_score 5, "Average word length over 5"  if average(words) > 5
+      add_score 10, "Average word length over 10" if average(words) > 10
+      add_score 5, "Median word length over 5"   if median(words) > 5
+      add_score 10, "Median word length over 10"  if median(words) > 10
     end
   end
 end

data/splam.gemspec CHANGED

@@ -1,6 +1,6 @@
 name = "splam"
-Gem::Specification.new name, "0.1.1" do |s|
+Gem::Specification.new name, "0.2.0" do |s|
   s.summary = "Run any kind of code in parallel processes"
   s.authors = ["ENTP"]
   s.email = "ENTP@example.com"

data/test/ngram_test.rb ADDED

@@ -0,0 +1,36 @@
+require File.join(File.dirname(__FILE__), 'test_helper')
+require "splam/ngram"
+require "redis"
+REDIS = Redis.new :db => "12"
+class NgramTest < Test::Unit::TestCase
+  def setup
+    @corpus = Splam::Ngram.new
+    # Will only try to stuff this into redis if it appears empty. Hacky, but works
+    if REDIS.hlen("spam") < 50
+      REDIS.expire "spam", 0
+      REDIS.expire "ham", 0
+      Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "comment", "spam", "*.txt")).each do |f|
+        spam = File.open(f).read
+        @corpus.train spam, true
+      end
+      puts "loading ham"
+      Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "comment", "ham", "*.txt")).each do |f|
+        ham = File.open(f).read
+        @corpus.train ham, false
+      end
+    end
+  end
+  def test_learns_spam
+    score = @corpus.compare("Bienvenido a nuestro nuevo portal porno")
+    assert score[1] > score[0] * 2
+  end
+  def test_learns_ham
+    score = @corpus.compare("Is this a known issue?")
+    assert score[0] > score[1] * 2
+  end
+end

data/test/splam_test.rb CHANGED

@@ -7,6 +7,16 @@ class SplamTest < Test::Unit::TestCase
     end
   end
+  class User
+    attr_accessor :trusted
+    def trusted?
+      trusted
+    end
+    def email
+      "test@test.com"
+    end
+  end
   class Foo
     include ::Splam
     splammable :body
@@ -14,6 +24,9 @@ class SplamTest < Test::Unit::TestCase
     def body
       @body || "This is body\320\224 \320\199"
     end
+    def user
+      User.new
+    end
   end
   class FooCond
@@ -27,7 +40,9 @@ class SplamTest < Test::Unit::TestCase
     splammable :body do |s|
       s.rules = [:fixed_rule, FixedRule]
     end
+    def user
+      User.new
+    end
     def body
       'lol wut'
     end
@@ -38,6 +53,9 @@ class SplamTest < Test::Unit::TestCase
     splammable :body do |s|
       s.rules = {:fixed_rule => 3}
     end
+    def user
+      User.new
+    end
     def body
       'lol wut'
@@ -47,7 +65,7 @@ class SplamTest < Test::Unit::TestCase
   def test_runs_plugins
     f = Foo.new
     assert ! f.splam?
-    assert_equal 35, f.splam_score
+    assert_equal 40, f.splam_score
   end
   def test_runs_plugins_with_specified_rules

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: splam
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.2.0
   prerelease:
 platform: ruby
 authors:
@@ -47,10 +47,11 @@ files:
 - Gemfile
 - Gemfile.lock
 - MIT-LICENSE
-- README
 - Rakefile
+- Readme.md
 - gem-public_cert.pem
 - lib/splam.rb
+- lib/splam/ngram.rb
 - lib/splam/rule.rb
 - lib/splam/rules.rb
 - lib/splam/rules/arms_race.rb
@@ -61,9 +62,11 @@ files:
 - lib/splam/rules/good_words.rb
 - lib/splam/rules/href.rb
 - lib/splam/rules/html.rb
+- lib/splam/rules/httpbl.rb
 - lib/splam/rules/line_length.rb
 - lib/splam/rules/punctuation.rb
 - lib/splam/rules/russian.rb
+- lib/splam/rules/user.rb
 - lib/splam/rules/word_length.rb
 - splam.gemspec
 - test/fixtures/comment/ham/api-1.txt
@@ -101,7 +104,6 @@ files:
 - test/fixtures/comment/spam/comment_cnn.txt
 - test/fixtures/comment/spam/comment_randi.txt
 - test/fixtures/comment/spam/comment_wordy.txt
-- test/fixtures/comment/spam/consent.txt
 - test/fixtures/comment/spam/december.txt
 - test/fixtures/comment/spam/digital_rights.txt
 - test/fixtures/comment/spam/dyed_wool.txt
@@ -139,6 +141,7 @@ files:
 - test/fixtures/comment/spam/troubles.txt
 - test/fixtures/comment/spam/url_only_idiot.txt
 - test/fixtures/comment/spam/webcam.txt
+- test/ngram_test.rb
 - test/splam_rule_test.rb
 - test/splam_test.rb
 - test/test_helper.rb
@@ -157,7 +160,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: 3919635157433099912
+      hash: 2317839165020925326
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
@@ -166,7 +169,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: 3919635157433099912
+      hash: 2317839165020925326
 requirements: []
 rubyforge_project:
 rubygems_version: 1.8.24

metadata.gz.sig CHANGED

Binary file

data/test/fixtures/comment/spam/consent.txt DELETED

@@ -1 +0,0 @@

- Metro: Faidherbe-ChalignyHappy Eating! There is one reason and one reason only that conflict occurs in marriage. Epistemology- or why we believe something is irrelevant in each democratic voice. A female under age 16 and a male under age 18 cannot secure a marriage license in the State of Rhode Island without the approval of the Family Court. Great article on CatholicCulture.org:There are a lot of social reasons why well-constructed families are important to our society. Oklahoma: If you are under 18, your parents must appear at the courthouse with you to sign a consent form.