splam 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data.tar.gz.sig CHANGED
Binary file
data/Gemfile CHANGED
@@ -3,4 +3,6 @@ gemspec
3
3
 
4
4
  gem 'bump'
5
5
  gem 'rake'
6
+ gem 'redis'
6
7
  gem 'activesupport'
8
+ gem 'system_timer', :platform => :ruby_18
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- splam (0.1.1)
4
+ splam (0.2.0)
5
5
 
6
6
  GEM
7
7
  remote: http://rubygems.org/
@@ -13,6 +13,8 @@ GEM
13
13
  i18n (0.6.1)
14
14
  multi_json (1.6.0)
15
15
  rake (10.0.3)
16
+ redis (3.0.2)
17
+ system_timer (1.2.4)
16
18
 
17
19
  PLATFORMS
18
20
  ruby
@@ -21,4 +23,6 @@ DEPENDENCIES
21
23
  activesupport
22
24
  bump
23
25
  rake
26
+ redis
24
27
  splam!
28
+ system_timer
@@ -42,8 +42,17 @@ site) whether to ban the post or not.
42
42
 
43
43
  We recommend showing the post to the user (spambox them in) but hide it from everyone else.
44
44
 
45
+ Dev
46
+ ===
47
+
48
+ bundle
49
+ redis-server
50
+ rake
51
+
45
52
  TODO
53
+ ====
46
54
 
55
+ - fix on 1.9
47
56
  - Integrate bayesian or other clever algorithm, so that scores aren't hardcoded.
48
57
  - Switch to using a percentage (0.994) rather than a score (250)
49
58
  - Write more plugins!
@@ -29,11 +29,11 @@ module Splam
29
29
  end
30
30
  end
31
31
 
32
- def run(record)
32
+ def run(record, request)
33
33
  score, reasons = 0, []
34
34
  rules.each do |rule_class, weight|
35
35
  weight ||= 1
36
- worker = rule_class.run(self, record, weight)
36
+ worker = rule_class.run(self, record, weight, request)
37
37
  score += worker.score
38
38
  reasons << worker.reasons
39
39
  end
@@ -51,6 +51,7 @@ module Splam
51
51
  Dir["#{File.dirname(__FILE__)}/splam/rules/*.rb"].each do |f|
52
52
  require f
53
53
  end
54
+ require "splam/ngram"
54
55
  base.send :extend, ClassMethods
55
56
  end
56
57
 
@@ -113,7 +114,8 @@ protected
113
114
  return false if (splam_suite.conditions && !splam_suite.conditions.call(self)) ||
114
115
  skip_splam_check ||
115
116
  send(splam_suite.body).nil?
116
- @splam_score, @splam_reasons = splam_suite.run(self)
117
+ @request = splam_suite.request.call(self) if splam_suite.request
118
+ @splam_score, @splam_reasons = splam_suite.run(self, @request)
117
119
  instance_variable_get("@splam_#{attr_suffix}") if attr_suffix
118
120
  end
119
121
 
@@ -0,0 +1,98 @@
1
+ class Splam::Ngram
2
+
3
+ def self.trigram text
4
+ # this won't be utf-8 happy. Oh well!
5
+ words = text.gsub("'", "").split(/\W/)
6
+ hash = Hash.new 0
7
+ i = 0
8
+ while (i < words.length)
9
+ tri = []
10
+ count = 0
11
+ while ((words.length > i + count) && (tri.length < 3))
12
+ word = words[i + count]
13
+ if word && word != ""
14
+ tri << words[i + count]
15
+ end
16
+ count += 1
17
+ end
18
+ if tri.length == 3
19
+ hash[tri.join(' ')] += 1
20
+ end
21
+ i += 1
22
+ end
23
+ hash
24
+ end
25
+
26
+ def initialize site_id=nil
27
+ @site_id = site_id
28
+ end
29
+
30
+ # Train the temporary corpus with your data
31
+ def train words, spam = false, retrain = false
32
+ if words.is_a?(String)
33
+ words = self.class.trigram(words)
34
+ end
35
+ words.each do |word,value|
36
+ key = spam ? "spam" : "ham"
37
+ REDIS.hincrby key, word, value
38
+ REDIS.hincrby "#{key}-#{@site_id}", word, value if @site_id
39
+ if retrain
40
+ # Remove phrases from existing corpus
41
+ key = spam ? "ham" : "spam"
42
+ REDIS.hincrby key, word, -value
43
+ REDIS.hincrby "#{key}-#{@site_id}", word, -value if @site_id
44
+ end
45
+ end
46
+ end
47
+
48
+ def compare text
49
+ tri = self.class.trigram(text)
50
+ score = 0
51
+ spam = 0
52
+
53
+ ham_key = @site_id ? "ham-#{@site_id}" : "ham"
54
+ spam_key = @site_id ? "spam-#{@site_id}" : "spam"
55
+
56
+ @ham_tri = Hash.new 0
57
+ @spam_tri = Hash.new 0
58
+
59
+ tri.each do |key,value|
60
+ next if key.nil? || key.strip == ""
61
+ hmatch = REDIS.hget(ham_key, key).to_i # ham_tri[key]
62
+ smatch = REDIS.hget(spam_key, key).to_i # spam_tri[key]
63
+
64
+ if hmatch > 0 && smatch > 0
65
+ # tri appears in both
66
+ # ignore.
67
+ next
68
+ end
69
+ if hmatch > 0
70
+ score += hmatch + value
71
+ elsif smatch > 0
72
+ spam += smatch + value
73
+ end
74
+ end
75
+ [score, spam]
76
+ end
77
+ end
78
+
79
+ # corpus = Splam::Ngram.new 10009
80
+ # s.comments.paginated_each(:order => "id desc") do |c|
81
+ # puts c.id
82
+ # words = Splam::Ngram.trigram(c.body.downcase)
83
+ # if c.author.support? || (c.user && c.user.trusted?)
84
+ # corpus.train words, false
85
+ # elsif c.spam
86
+ # corpus.train words, true
87
+ # end
88
+ # end
89
+ #
90
+ # Comment.spam.paginated_each(:order => "id desc", :conditions => ['id < 12916619']) do |c|
91
+ # next if c.user_email == "no-reply@lighthouseapp.com"
92
+ # score = corpus.compare(c.body)
93
+ # if score[0] > score[1]
94
+ # puts "Not spam? #{c.id} : #{score.inspect} - #{c.body.first(100)}"
95
+ # else
96
+ # puts "Spam! #{c.id} : #{score.inspect}"
97
+ # end
98
+ # end
@@ -30,8 +30,9 @@ class Splam::Rule
30
30
  end
31
31
  end
32
32
 
33
- def initialize(suite, record, weight = 1.0)
34
- @suite, @weight, @score, @reasons, @body = suite, weight, 0, [], record.send(suite.body)
33
+ def initialize(suite, record, weight = 1.0, request = nil)
34
+ @suite, @weight, @score, @reasons, @body, @request = suite, weight, 0, [], record.send(suite.body), request
35
+ @user = record.user # todo: customize user field
35
36
  end
36
37
 
37
38
  def name
@@ -70,4 +71,15 @@ class Splam::Rule
70
71
  @score += points
71
72
  end
72
73
  end
73
- end
74
+
75
+ def line_safe?(string)
76
+ ([
77
+ /\.dylib\b/,
78
+ /\b0x[0-9a-f]{6,16}\b/i,
79
+ /\b\/Applications\//,
80
+ /\b\/System\/Library\//,
81
+ /\bLibrary\/Application Support\//
82
+ ].map {|r| r.match string }).compact.size > 0
83
+ end
84
+
85
+ end
@@ -7,7 +7,7 @@ class Splam::Rules::ArmsRace < Splam::Rule
7
7
 
8
8
  # This is where you put banned domain names or otherwise
9
9
  def run
10
- shitty_sites = ["inquisitr"]
10
+ shitty_sites = ["inquisitr", "beeplog"]
11
11
  shitty_sites.each do |word|
12
12
  results = @body.downcase.scan(word)
13
13
  if results && results.size > 0
@@ -1,3 +1,4 @@
1
+ require 'active_support'
1
2
  class Splam::Rules::BadWords < Splam::Rule
2
3
  class << self
3
4
  attr_accessor :bad_word_score, :suspicious_word_score
@@ -7,28 +8,68 @@ class Splam::Rules::BadWords < Splam::Rule
7
8
  self.suspicious_word_score = 4
8
9
 
9
10
  def run
10
- bad_words = %w( sex sexy porn gay erotica viagra erotismo porno porn lesbian amateur tit\b)
11
- bad_words |= %w( gratis erotismo porno torrent bittorrent adulto )
12
- bad_words |= %w( cialis viagra payday loan jihad )
13
- bad_words |= %w( webcam free-web-host rapidshare muslim)
14
- bad_words << /pel?cula/ << /pornogr?fica/ << "portal porno" # srsly, spamming in spanish?
11
+ bad_words = {}
12
+ bad_words[:pornspam] = %w( sex sexy porn gay erotica erotico topless naked viagra erotismo porno porn lesbian amateur tit\b)
13
+ bad_words[:pornspam] |= %w( gratis erotismo porno torrent bittorrent adulto videochat video 3dsex)
14
+ bad_words[:pornspam] << /pel?cula/ << /pornogr?fica/ << "portal porno" # srsly, spamming in spanish?
15
+ bad_words[:pornspam] |= %w( webcam free-web-host rapidshare)
16
+
17
+ bad_words[:viagraspam] = %w( cialis viagra pharmacy prescription levitra kamagra)
18
+ bad_words[:benzospam] = %w( ultram tramadol pharmacy prescription )
19
+ bad_words[:cashspam] = %w( payday loan jihad ) << "payday loan"
20
+ bad_words[:pharmaspam] = %w( propecia finasteride viagra )
21
+
22
+ bad_words[:nigerian] = ["million pounds sterling", "dear sirs,", "any bank account", "winning notification", "western union", "diagnosed with cancer", "bank treasury", "unclaimed inheritance"]
23
+
24
+ # linkspammers
25
+ bad_words[:linkspam] = ["increase traffic", "discovered your blog", "backlinks", "sent me a link", "more visitors to my site", "targeted traffic", "increase traffic to your website", "estore"]
26
+
27
+ bad_words[:beats] = %w( beats dre headphones sale cheap shipping ) << "monster beats" << "best online"
28
+ bad_words[:rolex] = %w( rolex watch replica watches price )
29
+ bad_words[:wtf] = %w( bilete avion )
30
+
31
+ # buying fake shitty brand stuff
32
+ bad_words[:bagspam] = %w(handbag louis louisvuitton vuitton chanel coach clearance outlet hermes bag scarf sale ralphlauren)
33
+ bad_words[:handbags] = %w( karenmillen michaelkors kors millen bags purchase handbag chanel outlet tasche longchamp kaufen louboutin christianlouboutin)
34
+ bad_words[:blingspam] = %w( tiffany jewellery tiffanyco clearance outlet)
35
+ bad_words[:uggspam] = %w(\buggs?\b \buggboots\b clearance outlet )
36
+ bad_words[:wedding] = ["wedding", "wedding dress", "weddingdress", "strapless"]
37
+
38
+ bad_words[:webcamspam] = %w( live girls webcam adult singles) << "chat room"
39
+ bad_words[:gamereview] = %w( games-review-it.com game-reviews-online.com )
40
+ bad_words[:streaming] = %w( watchmlbbaseball watchnhlhockey pspnsportstv.com )
41
+
42
+ bad_words[:forum_spam] = ["IMG", "url="]
15
43
 
16
44
  suspicious_words = %w( free buy galleries dating gallery hard hardcore video homemade celebrity ) << "credit card" << "my friend" << "friend sent me"
17
- suspicious_words |= %w( adult pharmacy overnight shipping free hot movie nylon arab ?????? xxx) << "sent me a link"
45
+ suspicious_words |= %w( adult overnight shipping free hot movie nylon arab ?????? seo)
18
46
  suspicious_words << "forums/member.php?u=" << "chat room" << "free chat" << "yahoo chat" << "page.php"
19
- bad_words.each do |word|
20
- results = @body.downcase.scan(word)
21
- if results && results.size > 0
22
- add_score((self.class.bad_word_score ** results.size), "nasty word: '#{word}'")
23
- # Add more points if the bad word is INSIDE a link
24
- @body.scan(/<a[^>]+>(.*?)<\/a>/).each do |match|
25
- add_score self.class.bad_word_score * 4 * match[0].scan(word).size, "nasty word inside a link: #{word}"
26
- end
27
- @body.scan(/\nhttp:\/\/(.*?#{word})/).each do |match|
28
- add_score self.class.bad_word_score ** 4 * match[0].scan(word).size, "nasty word inside a straight-up link: #{word}"
47
+
48
+ bad_words.each do |key,wordlist|
49
+ counter = 0
50
+ wordlist.each do |word|
51
+ results = Regexp.new("\\b(#{word})\\b").match @body
52
+ if results && results.size > 0
53
+ counter += 1
54
+ add_score((self.class.bad_word_score ** results.size), "nasty word: '#{word}'")
55
+
56
+ # Add more points if the bad word is INSIDE a link
57
+ @body.scan(/<a[^>]+>(.*?)<\/a>/).each do |match|
58
+ add_score self.class.bad_word_score * 10 * match[0].scan(word).size, "nasty word inside a link: #{word}"
59
+ end
60
+ @body.scan(/\nhttp:\/\/(.*?#{word})\//).each do |match|
61
+ add_score self.class.bad_word_score * 10 * match[0].scan(word).size, "nasty word inside a straight-up link: #{word}"
62
+ end
63
+ @body.scan(/<a.*?>(.*?)<\/a>/).each do |links|
64
+ add_score self.class.bad_word_score * 50, "nasty word is the entire link: #{word}"
65
+ end
66
+ @body.scan(/<a(.*?)>/).each do |match|
67
+ add_score self.class.bad_word_score * 10 * match[0].scan(word).size, "nasty word inside a URL: #{word}"
68
+ end
69
+
29
70
  end
30
- @body.scan(/<a(.*?)>/).each do |match|
31
- add_score self.class.bad_word_score * 4 * match[0].scan(word).size, "nasty word inside a URL: #{word}"
71
+ if counter > (wordlist.size / 2)
72
+ add_score 1000, "Lots of bad words from one genre (#{key}): #{counter}"
32
73
  end
33
74
  end
34
75
  end
@@ -43,4 +84,4 @@ class Splam::Rules::BadWords < Splam::Rule
43
84
  end
44
85
  end
45
86
  end
46
- end
87
+ end
@@ -3,10 +3,12 @@ class Splam::Rules::Bbcode < Splam::Rule
3
3
  def run
4
4
  add_score 10 * @body.scan("showpost.php?p=").size, "Linking to a shitty forum"
5
5
  # add_score 10 * @body.scan("\r\n").size, "Poorly formed POST (\\r\\n)"
6
+ add_score 80 * @body.scan(/\n\[url.*?\]\n/).size, "Shitty bbcode url covers entire line"
6
7
  add_score 40 * @body.scan("[url=").size, "URL" # no URLS for you!!
7
8
  add_score 40 * @body.scan("[URL=").size, "URL" # no URLS for you!!
8
- add_score 40 * @body.scan("[url=http").size, "Shitty URL/html" # another 10 points for shitty bbcode html
9
- add_score 40 * @body.scan("[URL=http").size, "Shitty URL/html" # another 10 points for shitty bbcode html
9
+ add_score 45 * @body.scan("[url=http").size, "Shitty URL/html" # another 10 points for shitty bbcode html
10
+ add_score 45 * @body.scan("[URL=http").size, "Shitty URL/html" # another 10 points for shitty bbcode html
11
+ add_score 30 * @body.scan("[/CODE").size, "Forum codes?"
10
12
  add_score 10 * @body.scan(/\[[bai]/).size, "b/a/i tag"
11
13
  end
12
14
  end
@@ -3,22 +3,42 @@ class Splam::Rules::Chinese < Splam::Rule
3
3
  class << self
4
4
  attr_accessor :base_score
5
5
  end
6
- self.base_score = 3
6
+ self.base_score = 5
7
7
 
8
8
  def run
9
9
  banned_words =[ # various chinese characters
10
10
  "\350\263\207",
11
11
  "\351\207\221",
12
12
  "\357\274\222", # number 2 in weird unicode
13
- "\357\274\224", # number 4
13
+ "\357\274\224", # number 4
14
14
  "\357\274\225", # number 5
15
15
  "\357\274\231", # number 9
16
+
17
+ "\345\260\232",
18
+ "\345\256\266",
19
+ "\345\274\267",
20
+ "\345\240\261",
21
+ "\345\260\216",
22
+ "\345\217\260",
23
+ "\345\215\227",
24
+ "\346\235\261",
25
+ "\345\270\202",
26
+ "\345\240\264",
27
+ "\345\202\263",
28
+ "\346\216\250",
29
+ "\346\231\202",
30
+ "\347\203\210",
31
+ "\347\216\251",
32
+ "\350\226\246",
33
+ "\350\217\234",
34
+ "\350\216\216",
35
+
16
36
  "\357\274\215", # hyphen
17
- # /\\357\2\d\d\\\d{3}/, # TODO does not work on 1.9
37
+ # /\\357\2\d\d\\\d{3}/, # TODO SyntaxError on 1.9
18
38
  # "\357", # ugh, these don't work .. because they're only part of a character.
19
39
  # "\351",
20
40
  "\35"
21
- ]
41
+ ].compact
22
42
  banned_words.each do |word|
23
43
  hits = (self.class.base_score * @body.scan(word).size) # 1 point for every banned word
24
44
  add_score hits, "Banned character: #{word}"
@@ -3,11 +3,16 @@ class Splam::Rules::Fuzz < Splam::Rule
3
3
  attr_accessor :bad_word_score
4
4
  end
5
5
 
6
- self.bad_word_score = 10
6
+ self.bad_word_score = 10
7
7
 
8
8
  def run
9
9
  patterns = [/^(\d[a-z])/, /(\d[a-z][A-Z]\w+)/, /(\b\w+\d\.txt)/, /(;\d+;)/ ]
10
+ ignore_if = [%r{vendor/rails}, /EXC_BAD_ACCESS/, /JavaAppLauncher/, %r{Contents/MacOS}, %r{/Library/}]
10
11
  matches = 0
12
+ # looks like a stack trace
13
+ ignore_if.each do |pattern|
14
+ return if @body.scan(pattern)
15
+ end
11
16
  patterns.each do |pattern|
12
17
  results = @body.scan(pattern)
13
18
  if results && results.size > 0
@@ -2,18 +2,21 @@ class Splam::Rules::GoodWords < Splam::Rule
2
2
 
3
3
  def run
4
4
  good_words = [ /I\'having a problem/, ]
5
- good_words |= %w( lighthouse activereload warehouse install eclipse settings assigned user ticket tickets token api number query request)
5
+ good_words |= %w( lighthouse lighthouseapp activereload warehouse install eclipse settings assigned user ticket tickets token api number query request)
6
6
  good_words |= %w( browser feed firefox safari skitch vendor rails action_controller railties )
7
7
  good_words |= %w( redirect login diff dreamhost setup subversion git wildcard domain subdomain ssh database )
8
8
  good_words |= %w( project billing tags description comment milestone saving happening feature mac implement report)
9
9
  good_words |= %w( rss notification subscribe calendar chart note task gantt search service ownership application communicate )
10
- good_words |= %w( pattern template web integer status xml activereload html state page)
10
+ good_words |= %w( interaction API tickets hosted domain skitch )
11
+ good_words |= %w( pattern template web integer status xml activereload html state page rack diff )
11
12
  good_words << "project management"
12
13
  good_words << "/usr/local/lib" << "gems"
13
14
 
14
15
  body = @body.downcase
15
16
  good_words.each { |rule|
16
- add_score -5 * body.scan(rule).size, "relevant word match: #{rule}"
17
+
18
+ results = Regexp.new("\\b(#{rule})\\b","i").match(body)
19
+ add_score -50 * results.size, "relevant word match: #{rule}" if results
17
20
  }
18
21
  end
19
22
  end
@@ -4,8 +4,8 @@ class Splam::Rules::Href < Splam::Rule
4
4
 
5
5
  def run
6
6
  # add_score 3 * @body.scan("href=http").size, "Shitty html 'href=http'" # 3 points for shitty html
7
- add_score 15 * @body.scan(/href\=\s*http/).size, "Shitty html 'href=http'" # 15 points for shitty html
8
- add_score 15 * @body.scan(/href\="\s+http/).size, "Shitty html 'href=\" http'" # 15 points for shitty html
7
+ add_score 35 * @body.scan(/href\=\s*http/).size, "Shitty html 'href=http'" # 15 points for shitty html
8
+ add_score 35 * @body.scan(/href\="\s+http/).size, "Shitty html 'href=\" http'" # 15 points for shitty html
9
9
  add_score 50 * @body.scan(/\A<a.*?<\/a>\Z/).size, "Single link post'" # 50 points for shitty
10
10
 
11
11
  link_count = @body.scan("http://").size
@@ -13,7 +13,7 @@ class Splam::Rules::Href < Splam::Rule
13
13
  add_score 50, "More than 10 links" if link_count > 10 # more than 10 links? spam.
14
14
  add_score 100, "More than 20 links" if link_count > 20 # more than 20 links? definitely spam.
15
15
  add_score 1000, "More than 50 links" if link_count > 50 # more than 20 links? definitely spam.
16
-
16
+
17
17
  # Modify these scores to weight certain problematic domains.
18
18
  # You may need to modify these for your application
19
19
  suspicious_top_level_domains = {
@@ -22,6 +22,7 @@ class Splam::Rules::Href < Splam::Rule
22
22
  'us' => 8, # .us ? possibly spam
23
23
  'it' => 5,
24
24
  'tk' => 20,
25
+ 'eu' => 20,
25
26
  'pl' => 8,
26
27
  'info' => 20,
27
28
  'biz' => 40 # no-one uses these for reals
@@ -33,11 +34,12 @@ class Splam::Rules::Href < Splam::Rule
33
34
 
34
35
  tokens = @body.split(" ")
35
36
  if tokens[-1] =~ /^http:\/\//
36
- add_score 10, "Text ends in a http token"
37
- add_score 50, "Text ends in a http token and only has one token" if link_count == 1
37
+ add_score 20, "Text ends in a http token"
38
+ add_score 150, "Text ends in a http token and only has one token" if link_count == 1
39
+ add_score 150, "Text ends in a http token with a shitty domain " if tokens[-1].match(/http:\/\/#{suspicious_sites.keys.join("|")}\./)
38
40
  end
39
41
 
40
- @body.scan(/http:\/\/(.*?)[\/\]?]/) do |match|
42
+ @body.scan(/http:\/\/(.*?)[\/\>\]?]/) do |match|
41
43
  # $stderr.puts "checking #{match}"
42
44
  if domain = match.to_s.split(".")
43
45
  tld = domain[-1]
@@ -48,7 +50,9 @@ class Splam::Rules::Href < Splam::Rule
48
50
 
49
51
  if found = suspicious_sites[domain[-2]]
50
52
  add_score found, "Suspicious hostname: '#{domain[-2]}'"
53
+ add_score found * 5, "..document ends in suspicious hostname" if tokens[-1] =~ /^http:\/\//
51
54
  end
55
+
52
56
  end
53
57
  end
54
58
  end
@@ -0,0 +1,62 @@
1
+ require 'resolv'
2
+ # Liberally copied from https://github.com/bpalmen/httpbl/blob/master/lib/httpbl.rb
3
+
4
+ class Splam::Rules::Httpbl < Splam::Rule
5
+ if RUBY_VERSION < "1.9"
6
+ require 'system_timer'
7
+ else
8
+ require 'timeout'
9
+ SystemTimer = Timeout
10
+ end
11
+
12
+
13
+ class << self
14
+ attr_accessor :api_key
15
+ end
16
+
17
+ def run
18
+ return unless @request # no ip available
19
+ return unless @request[:remote_ip] # no ip available
20
+
21
+ ip = @request[:remote_ip]
22
+
23
+ if result = self.class.check_blacklist(ip)
24
+ add_score 250, "IP address (#{ip}) appears in ProjectHoneypot blacklist. (#{result.inspect})"
25
+ end
26
+ end
27
+
28
+ def self.check_blacklist(ip)
29
+ # @cache = REDIS if defined?(REDIS)
30
+ # result = @cache && @cache["ip.#{ip}"]
31
+ # result ||= resolve(ip)
32
+ # if @cache
33
+ # @cache.set "ip.#{ip}", result if @cache
34
+ # @cache.expire "ip.#{ip}", 1.week
35
+ # end
36
+ result = resolve(ip)
37
+ response = result.split(".").collect!(&:to_i)
38
+
39
+ # responses:
40
+ # a, b, c, d
41
+ # a = 127 if success
42
+ # b = days since last activity
43
+ # c = threat score, 0..255 (0 is not threat)
44
+ # d = type of visitor
45
+ raise "Bad httpbl request format!" if response[0] != 127
46
+ return response[3] > 0 || response[2] > 100
47
+ end
48
+
49
+ def self.resolve(ip)
50
+ query = "#{@@api_key}.#{ip.split('.').reverse.join('.')}.dnsbl.httpbl.org"
51
+ SystemTimer::timeout(0.5) do
52
+ begin
53
+ Resolv::DNS.new.getaddress(query).to_s
54
+ rescue Resolv::ResolvError
55
+ "127.0.0.0"
56
+ end
57
+ end
58
+ rescue Errno::ECONNREFUSED
59
+ # derp
60
+ end
61
+
62
+ end
@@ -10,8 +10,8 @@ class Splam::Rules::LineLength < Splam::Rule
10
10
  lines.each do |line|
11
11
  next if line =~ /\A\s{4,}/ # ignore code blocks
12
12
 
13
- multiplier = (lines.size == 1) ? 10 : 1 # one line? fail.
14
-
13
+ # multiplier = (lines.size == 1) ? 10 : 1 # one line? fail.
14
+ multiplier = 1
15
15
 
16
16
  # 1 point for each 40 chars in a line.
17
17
  hits = (line.size / 40) * multiplier
@@ -1,5 +1,4 @@
1
1
  class Splam::Rules::Punctuation < Splam::Rule
2
-
3
2
  def run
4
3
  punctuation = @body.scan(/[.,] /)
5
4
  add_score 10, "Text has no punctuation" if punctuation.size == 0
@@ -7,8 +6,10 @@ class Splam::Rules::Punctuation < Splam::Rule
7
6
  @body.split(/[.,]/).each do |sentence|
8
7
  words = sentence.split(" ")
9
8
  # long sentence, add a point.
10
- add_score 1, "Sentence has more than 10 words" if words.size > 10
11
- add_score 10, "Sentence has more than 30 words" if words.size > 30
9
+ unless line_safe?(sentence)
10
+ add_score 1, "Sentence has more than 10 words" if words.size > 10
11
+ add_score 10, "Sentence has more than 30 words" if words.size > 30
12
+ end
12
13
  end
13
14
  end
14
- end
15
+ end
@@ -3,7 +3,7 @@ class Splam::Rules::Russian < Splam::Rule
3
3
 
4
4
  def run
5
5
  banned_words =[ # various russian characters
6
- "\320\241", "\320\220", "\320\234", "\320\257", "\320\233", "\320\243",
6
+ "\320\241", "\320\220", "\320\234", "\320\257", "\320\233", "\320\243",
7
7
  "с", "м", "о", "т", "р", "е", "т", "ь", "п", "о", "р", "н", "о", "р", "л", "и", "к"
8
8
  # unicode char
9
9
  # "\320"
@@ -0,0 +1,15 @@
1
+ class Splam::Rules::User < Splam::Rule
2
+
3
+ def run
4
+ bad_words = ["qq.com", "yahoo.cn", "126.com"]
5
+ bad_words |= %w( mortgage )
6
+
7
+ bad_words.each do |word|
8
+ add_score 50, "User's email address has suspicious parts: #{word}" if @user.email.include?(word)
9
+ end
10
+
11
+ add_score "20", "User has lots and lots of dots" if @user.email.split("@")[0].scan(/\./).size > 5
12
+
13
+ add_score 5, "User is untrusted" if !@user.trusted?
14
+ end
15
+ end
@@ -16,17 +16,16 @@ class Splam::Rules::WordLength < Splam::Rule
16
16
 
17
17
  def run
18
18
  words = []
19
- words = @body.split(/\s/).map do |word|
20
- word.size
21
- end
22
- words.delete_if { |w| w =~ /^http\:\/\//}
19
+ words = @body.split(/\s/)
20
+ words.delete_if { |w| w =~ /^https?\:\/\// }
21
+ words.collect! { |word| word.size }
23
22
 
24
23
  # Only count word lengths over 10
25
24
  if words.size > 5
26
- add_score 20, "Average word length over 5" if average(words) > 5
27
- add_score 50, "Average word length over 10" if average(words) > 10
28
- add_score 10, "Median word length over 5" if median(words) > 5
29
- add_score 50, "Median word length over 10" if median(words) > 10
25
+ add_score 5, "Average word length over 5" if average(words) > 5
26
+ add_score 10, "Average word length over 10" if average(words) > 10
27
+ add_score 5, "Median word length over 5" if median(words) > 5
28
+ add_score 10, "Median word length over 10" if median(words) > 10
30
29
  end
31
30
  end
32
31
  end
@@ -1,6 +1,6 @@
1
1
  name = "splam"
2
2
 
3
- Gem::Specification.new name, "0.1.1" do |s|
3
+ Gem::Specification.new name, "0.2.0" do |s|
4
4
  s.summary = "Run any kind of code in parallel processes"
5
5
  s.authors = ["ENTP"]
6
6
  s.email = "ENTP@example.com"
@@ -0,0 +1,36 @@
1
+ require File.join(File.dirname(__FILE__), 'test_helper')
2
+ require "splam/ngram"
3
+ require "redis"
4
+ REDIS = Redis.new :db => "12"
5
+ class NgramTest < Test::Unit::TestCase
6
+
7
+ def setup
8
+ @corpus = Splam::Ngram.new
9
+
10
+ # Will only try to stuff this into redis if it appears empty. Hacky, but works
11
+ if REDIS.hlen("spam") < 50
12
+ REDIS.expire "spam", 0
13
+ REDIS.expire "ham", 0
14
+
15
+ Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "comment", "spam", "*.txt")).each do |f|
16
+ spam = File.open(f).read
17
+ @corpus.train spam, true
18
+ end
19
+ puts "loading ham"
20
+ Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "comment", "ham", "*.txt")).each do |f|
21
+ ham = File.open(f).read
22
+ @corpus.train ham, false
23
+ end
24
+ end
25
+ end
26
+
27
+ def test_learns_spam
28
+ score = @corpus.compare("Bienvenido a nuestro nuevo portal porno")
29
+ assert score[1] > score[0] * 2
30
+ end
31
+
32
+ def test_learns_ham
33
+ score = @corpus.compare("Is this a known issue?")
34
+ assert score[0] > score[1] * 2
35
+ end
36
+ end
@@ -7,6 +7,16 @@ class SplamTest < Test::Unit::TestCase
7
7
  end
8
8
  end
9
9
 
10
+ class User
11
+ attr_accessor :trusted
12
+ def trusted?
13
+ trusted
14
+ end
15
+ def email
16
+ "test@test.com"
17
+ end
18
+ end
19
+
10
20
  class Foo
11
21
  include ::Splam
12
22
  splammable :body
@@ -14,6 +24,9 @@ class SplamTest < Test::Unit::TestCase
14
24
  def body
15
25
  @body || "This is body\320\224 \320\199"
16
26
  end
27
+ def user
28
+ User.new
29
+ end
17
30
  end
18
31
 
19
32
  class FooCond
@@ -27,7 +40,9 @@ class SplamTest < Test::Unit::TestCase
27
40
  splammable :body do |s|
28
41
  s.rules = [:fixed_rule, FixedRule]
29
42
  end
30
-
43
+ def user
44
+ User.new
45
+ end
31
46
  def body
32
47
  'lol wut'
33
48
  end
@@ -38,6 +53,9 @@ class SplamTest < Test::Unit::TestCase
38
53
  splammable :body do |s|
39
54
  s.rules = {:fixed_rule => 3}
40
55
  end
56
+ def user
57
+ User.new
58
+ end
41
59
 
42
60
  def body
43
61
  'lol wut'
@@ -47,7 +65,7 @@ class SplamTest < Test::Unit::TestCase
47
65
  def test_runs_plugins
48
66
  f = Foo.new
49
67
  assert ! f.splam?
50
- assert_equal 35, f.splam_score
68
+ assert_equal 40, f.splam_score
51
69
  end
52
70
 
53
71
  def test_runs_plugins_with_specified_rules
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: splam
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -47,10 +47,11 @@ files:
47
47
  - Gemfile
48
48
  - Gemfile.lock
49
49
  - MIT-LICENSE
50
- - README
51
50
  - Rakefile
51
+ - Readme.md
52
52
  - gem-public_cert.pem
53
53
  - lib/splam.rb
54
+ - lib/splam/ngram.rb
54
55
  - lib/splam/rule.rb
55
56
  - lib/splam/rules.rb
56
57
  - lib/splam/rules/arms_race.rb
@@ -61,9 +62,11 @@ files:
61
62
  - lib/splam/rules/good_words.rb
62
63
  - lib/splam/rules/href.rb
63
64
  - lib/splam/rules/html.rb
65
+ - lib/splam/rules/httpbl.rb
64
66
  - lib/splam/rules/line_length.rb
65
67
  - lib/splam/rules/punctuation.rb
66
68
  - lib/splam/rules/russian.rb
69
+ - lib/splam/rules/user.rb
67
70
  - lib/splam/rules/word_length.rb
68
71
  - splam.gemspec
69
72
  - test/fixtures/comment/ham/api-1.txt
@@ -101,7 +104,6 @@ files:
101
104
  - test/fixtures/comment/spam/comment_cnn.txt
102
105
  - test/fixtures/comment/spam/comment_randi.txt
103
106
  - test/fixtures/comment/spam/comment_wordy.txt
104
- - test/fixtures/comment/spam/consent.txt
105
107
  - test/fixtures/comment/spam/december.txt
106
108
  - test/fixtures/comment/spam/digital_rights.txt
107
109
  - test/fixtures/comment/spam/dyed_wool.txt
@@ -139,6 +141,7 @@ files:
139
141
  - test/fixtures/comment/spam/troubles.txt
140
142
  - test/fixtures/comment/spam/url_only_idiot.txt
141
143
  - test/fixtures/comment/spam/webcam.txt
144
+ - test/ngram_test.rb
142
145
  - test/splam_rule_test.rb
143
146
  - test/splam_test.rb
144
147
  - test/test_helper.rb
@@ -157,7 +160,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
157
160
  version: '0'
158
161
  segments:
159
162
  - 0
160
- hash: 3919635157433099912
163
+ hash: 2317839165020925326
161
164
  required_rubygems_version: !ruby/object:Gem::Requirement
162
165
  none: false
163
166
  requirements:
@@ -166,7 +169,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
166
169
  version: '0'
167
170
  segments:
168
171
  - 0
169
- hash: 3919635157433099912
172
+ hash: 2317839165020925326
170
173
  requirements: []
171
174
  rubyforge_project:
172
175
  rubygems_version: 1.8.24
metadata.gz.sig CHANGED
@@ -1,2 +1 @@
1
- Q +]���eji�� �Y\�NN2d~W�Al�ٽ;�x��*��Z ��,k"�-�3&ݒG��^9Q&� ~�����z�9�
2
- [�z�Q�~�o��#T��5�OiZýh�x��!�M�Yk�S�c�xO�/v���#߼�������C�g�JYP<6��3�b���\^�`6O�Ils+4�j� kU���#q�X��p�5C�n�fy�t5O��V`�ξ�Y�����J�����k O�)[�|.
1
+ Hy7�t�[�>USWH�E�#����ӝ��r���;E8R'���([qq��nfv� ���E(�՗ c ��ojnw��su��ޒ�᪜���]�~�C�E�|Z7߱���ɫ�c94�NCi_�W$@� ��b 7� �ٷ�
@@ -1 +0,0 @@
1
- Metro: Faidherbe-ChalignyHappy Eating! There is one reason and one reason only that conflict occurs in marriage. Epistemology- or why we believe something is irrelevant in each democratic voice. A female under age 16 and a male under age 18 cannot secure a marriage license in the State of Rhode Island without the approval of the Family Court. Great article on CatholicCulture.org:There are a lot of social reasons why well-constructed families are important to our society. Oklahoma: If you are under 18, your parents must appear at the courthouse with you to sign a consent form.