splam 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data.tar.gz.sig +0 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +5 -1
- data/{README → Readme.md} +9 -0
- data/lib/splam.rb +5 -3
- data/lib/splam/ngram.rb +98 -0
- data/lib/splam/rule.rb +15 -3
- data/lib/splam/rules/arms_race.rb +1 -1
- data/lib/splam/rules/bad_words.rb +60 -19
- data/lib/splam/rules/bbcode.rb +4 -2
- data/lib/splam/rules/chinese.rb +24 -4
- data/lib/splam/rules/fuzz.rb +6 -1
- data/lib/splam/rules/good_words.rb +6 -3
- data/lib/splam/rules/href.rb +10 -6
- data/lib/splam/rules/httpbl.rb +62 -0
- data/lib/splam/rules/line_length.rb +2 -2
- data/lib/splam/rules/punctuation.rb +5 -4
- data/lib/splam/rules/russian.rb +1 -1
- data/lib/splam/rules/user.rb +15 -0
- data/lib/splam/rules/word_length.rb +7 -8
- data/splam.gemspec +1 -1
- data/test/ngram_test.rb +36 -0
- data/test/splam_test.rb +20 -2
- metadata +8 -5
- metadata.gz.sig +1 -2
- data/test/fixtures/comment/spam/consent.txt +0 -1
data.tar.gz.sig
CHANGED
Binary file
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
splam (0.1.1)
|
4
|
+
splam (0.2.0)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: http://rubygems.org/
|
@@ -13,6 +13,8 @@ GEM
|
|
13
13
|
i18n (0.6.1)
|
14
14
|
multi_json (1.6.0)
|
15
15
|
rake (10.0.3)
|
16
|
+
redis (3.0.2)
|
17
|
+
system_timer (1.2.4)
|
16
18
|
|
17
19
|
PLATFORMS
|
18
20
|
ruby
|
@@ -21,4 +23,6 @@ DEPENDENCIES
|
|
21
23
|
activesupport
|
22
24
|
bump
|
23
25
|
rake
|
26
|
+
redis
|
24
27
|
splam!
|
28
|
+
system_timer
|
data/{README → Readme.md}
RENAMED
@@ -42,8 +42,17 @@ site) whether to ban the post or not.
|
|
42
42
|
|
43
43
|
We recommend showing the post to the user (spambox them in) but hide it from everyone else.
|
44
44
|
|
45
|
+
Dev
|
46
|
+
===
|
47
|
+
|
48
|
+
bundle
|
49
|
+
redis-server
|
50
|
+
rake
|
51
|
+
|
45
52
|
TODO
|
53
|
+
====
|
46
54
|
|
55
|
+
- fix on 1.9
|
47
56
|
- Integrate bayesian or other clever algorithm, so that scores aren't hardcoded.
|
48
57
|
- Switch to using a percentage (0.994) rather than a score (250)
|
49
58
|
- Write more plugins!
|
data/lib/splam.rb
CHANGED
@@ -29,11 +29,11 @@ module Splam
|
|
29
29
|
end
|
30
30
|
end
|
31
31
|
|
32
|
-
def run(record)
|
32
|
+
def run(record, request)
|
33
33
|
score, reasons = 0, []
|
34
34
|
rules.each do |rule_class, weight|
|
35
35
|
weight ||= 1
|
36
|
-
worker = rule_class.run(self, record, weight)
|
36
|
+
worker = rule_class.run(self, record, weight, request)
|
37
37
|
score += worker.score
|
38
38
|
reasons << worker.reasons
|
39
39
|
end
|
@@ -51,6 +51,7 @@ module Splam
|
|
51
51
|
Dir["#{File.dirname(__FILE__)}/splam/rules/*.rb"].each do |f|
|
52
52
|
require f
|
53
53
|
end
|
54
|
+
require "splam/ngram"
|
54
55
|
base.send :extend, ClassMethods
|
55
56
|
end
|
56
57
|
|
@@ -113,7 +114,8 @@ protected
|
|
113
114
|
return false if (splam_suite.conditions && !splam_suite.conditions.call(self)) ||
|
114
115
|
skip_splam_check ||
|
115
116
|
send(splam_suite.body).nil?
|
116
|
-
@
|
117
|
+
@request = splam_suite.request.call(self) if splam_suite.request
|
118
|
+
@splam_score, @splam_reasons = splam_suite.run(self, @request)
|
117
119
|
instance_variable_get("@splam_#{attr_suffix}") if attr_suffix
|
118
120
|
end
|
119
121
|
|
data/lib/splam/ngram.rb
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
class Splam::Ngram
|
2
|
+
|
3
|
+
def self.trigram text
|
4
|
+
# this won't be utf-8 happy. Oh well!
|
5
|
+
words = text.gsub("'", "").split(/\W/)
|
6
|
+
hash = Hash.new 0
|
7
|
+
i = 0
|
8
|
+
while (i < words.length)
|
9
|
+
tri = []
|
10
|
+
count = 0
|
11
|
+
while ((words.length > i + count) && (tri.length < 3))
|
12
|
+
word = words[i + count]
|
13
|
+
if word && word != ""
|
14
|
+
tri << words[i + count]
|
15
|
+
end
|
16
|
+
count += 1
|
17
|
+
end
|
18
|
+
if tri.length == 3
|
19
|
+
hash[tri.join(' ')] += 1
|
20
|
+
end
|
21
|
+
i += 1
|
22
|
+
end
|
23
|
+
hash
|
24
|
+
end
|
25
|
+
|
26
|
+
def initialize site_id=nil
|
27
|
+
@site_id = site_id
|
28
|
+
end
|
29
|
+
|
30
|
+
# Train the temporary corpus with your data
|
31
|
+
def train words, spam = false, retrain = false
|
32
|
+
if words.is_a?(String)
|
33
|
+
words = self.class.trigram(words)
|
34
|
+
end
|
35
|
+
words.each do |word,value|
|
36
|
+
key = spam ? "spam" : "ham"
|
37
|
+
REDIS.hincrby key, word, value
|
38
|
+
REDIS.hincrby "#{key}-#{@site_id}", word, value if @site_id
|
39
|
+
if retrain
|
40
|
+
# Remove phrases from existing corpus
|
41
|
+
key = spam ? "ham" : "spam"
|
42
|
+
REDIS.hincrby key, word, -value
|
43
|
+
REDIS.hincrby "#{key}-#{@site_id}", word, -value if @site_id
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
def compare text
|
49
|
+
tri = self.class.trigram(text)
|
50
|
+
score = 0
|
51
|
+
spam = 0
|
52
|
+
|
53
|
+
ham_key = @site_id ? "ham-#{@site_id}" : "ham"
|
54
|
+
spam_key = @site_id ? "spam-#{@site_id}" : "spam"
|
55
|
+
|
56
|
+
@ham_tri = Hash.new 0
|
57
|
+
@spam_tri = Hash.new 0
|
58
|
+
|
59
|
+
tri.each do |key,value|
|
60
|
+
next if key.nil? || key.strip == ""
|
61
|
+
hmatch = REDIS.hget(ham_key, key).to_i # ham_tri[key]
|
62
|
+
smatch = REDIS.hget(spam_key, key).to_i # spam_tri[key]
|
63
|
+
|
64
|
+
if hmatch > 0 && smatch > 0
|
65
|
+
# tri appears in both
|
66
|
+
# ignore.
|
67
|
+
next
|
68
|
+
end
|
69
|
+
if hmatch > 0
|
70
|
+
score += hmatch + value
|
71
|
+
elsif smatch > 0
|
72
|
+
spam += smatch + value
|
73
|
+
end
|
74
|
+
end
|
75
|
+
[score, spam]
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
# corpus = Splam::Ngram.new 10009
|
80
|
+
# s.comments.paginated_each(:order => "id desc") do |c|
|
81
|
+
# puts c.id
|
82
|
+
# words = Splam::Ngram.trigram(c.body.downcase)
|
83
|
+
# if c.author.support? || (c.user && c.user.trusted?)
|
84
|
+
# corpus.train words, false
|
85
|
+
# elsif c.spam
|
86
|
+
# corpus.train words, true
|
87
|
+
# end
|
88
|
+
# end
|
89
|
+
#
|
90
|
+
# Comment.spam.paginated_each(:order => "id desc", :conditions => ['id < 12916619']) do |c|
|
91
|
+
# next if c.user_email == "no-reply@lighthouseapp.com"
|
92
|
+
# score = corpus.compare(c.body)
|
93
|
+
# if score[0] > score[1]
|
94
|
+
# puts "Not spam? #{c.id} : #{score.inspect} - #{c.body.first(100)}"
|
95
|
+
# else
|
96
|
+
# puts "Spam! #{c.id} : #{score.inspect}"
|
97
|
+
# end
|
98
|
+
# end
|
data/lib/splam/rule.rb
CHANGED
@@ -30,8 +30,9 @@ class Splam::Rule
|
|
30
30
|
end
|
31
31
|
end
|
32
32
|
|
33
|
-
def initialize(suite, record, weight = 1.0)
|
34
|
-
@suite, @weight, @score, @reasons, @body = suite, weight, 0, [], record.send(suite.body)
|
33
|
+
def initialize(suite, record, weight = 1.0, request = nil)
|
34
|
+
@suite, @weight, @score, @reasons, @body, @request = suite, weight, 0, [], record.send(suite.body), request
|
35
|
+
@user = record.user # todo: customize user field
|
35
36
|
end
|
36
37
|
|
37
38
|
def name
|
@@ -70,4 +71,15 @@ class Splam::Rule
|
|
70
71
|
@score += points
|
71
72
|
end
|
72
73
|
end
|
73
|
-
|
74
|
+
|
75
|
+
def line_safe?(string)
|
76
|
+
([
|
77
|
+
/\.dylib\b/,
|
78
|
+
/\b0x[0-9a-f]{6,16}\b/i,
|
79
|
+
/\b\/Applications\//,
|
80
|
+
/\b\/System\/Library\//,
|
81
|
+
/\bLibrary\/Application Support\//
|
82
|
+
].map {|r| r.match string }).compact.size > 0
|
83
|
+
end
|
84
|
+
|
85
|
+
end
|
@@ -7,7 +7,7 @@ class Splam::Rules::ArmsRace < Splam::Rule
|
|
7
7
|
|
8
8
|
# This is where you put banned domain names or otherwise
|
9
9
|
def run
|
10
|
-
shitty_sites = ["inquisitr"]
|
10
|
+
shitty_sites = ["inquisitr", "beeplog"]
|
11
11
|
shitty_sites.each do |word|
|
12
12
|
results = @body.downcase.scan(word)
|
13
13
|
if results && results.size > 0
|
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'active_support'
|
1
2
|
class Splam::Rules::BadWords < Splam::Rule
|
2
3
|
class << self
|
3
4
|
attr_accessor :bad_word_score, :suspicious_word_score
|
@@ -7,28 +8,68 @@ class Splam::Rules::BadWords < Splam::Rule
|
|
7
8
|
self.suspicious_word_score = 4
|
8
9
|
|
9
10
|
def run
|
10
|
-
bad_words =
|
11
|
-
bad_words
|
12
|
-
bad_words |= %w(
|
13
|
-
bad_words
|
14
|
-
bad_words
|
11
|
+
bad_words = {}
|
12
|
+
bad_words[:pornspam] = %w( sex sexy porn gay erotica erotico topless naked viagra erotismo porno porn lesbian amateur tit\b)
|
13
|
+
bad_words[:pornspam] |= %w( gratis erotismo porno torrent bittorrent adulto videochat video 3dsex)
|
14
|
+
bad_words[:pornspam] << /pel?cula/ << /pornogr?fica/ << "portal porno" # srsly, spamming in spanish?
|
15
|
+
bad_words[:pornspam] |= %w( webcam free-web-host rapidshare)
|
16
|
+
|
17
|
+
bad_words[:viagraspam] = %w( cialis viagra pharmacy prescription levitra kamagra)
|
18
|
+
bad_words[:benzospam] = %w( ultram tramadol pharmacy prescription )
|
19
|
+
bad_words[:cashspam] = %w( payday loan jihad ) << "payday loan"
|
20
|
+
bad_words[:pharmaspam] = %w( propecia finasteride viagra )
|
21
|
+
|
22
|
+
bad_words[:nigerian] = ["million pounds sterling", "dear sirs,", "any bank account", "winning notification", "western union", "diagnosed with cancer", "bank treasury", "unclaimed inheritance"]
|
23
|
+
|
24
|
+
# linkspammers
|
25
|
+
bad_words[:linkspam] = ["increase traffic", "discovered your blog", "backlinks", "sent me a link", "more visitors to my site", "targeted traffic", "increase traffic to your website", "estore"]
|
26
|
+
|
27
|
+
bad_words[:beats] = %w( beats dre headphones sale cheap shipping ) << "monster beats" << "best online"
|
28
|
+
bad_words[:rolex] = %w( rolex watch replica watches price )
|
29
|
+
bad_words[:wtf] = %w( bilete avion )
|
30
|
+
|
31
|
+
# buying fake shitty brand stuff
|
32
|
+
bad_words[:bagspam] = %w(handbag louis louisvuitton vuitton chanel coach clearance outlet hermes bag scarf sale ralphlauren)
|
33
|
+
bad_words[:handbags] = %w( karenmillen michaelkors kors millen bags purchase handbag chanel outlet tasche longchamp kaufen louboutin christianlouboutin)
|
34
|
+
bad_words[:blingspam] = %w( tiffany jewellery tiffanyco clearance outlet)
|
35
|
+
bad_words[:uggspam] = %w(\buggs?\b \buggboots\b clearance outlet )
|
36
|
+
bad_words[:wedding] = ["wedding", "wedding dress", "weddingdress", "strapless"]
|
37
|
+
|
38
|
+
bad_words[:webcamspam] = %w( live girls webcam adult singles) << "chat room"
|
39
|
+
bad_words[:gamereview] = %w( games-review-it.com game-reviews-online.com )
|
40
|
+
bad_words[:streaming] = %w( watchmlbbaseball watchnhlhockey pspnsportstv.com )
|
41
|
+
|
42
|
+
bad_words[:forum_spam] = ["IMG", "url="]
|
15
43
|
|
16
44
|
suspicious_words = %w( free buy galleries dating gallery hard hardcore video homemade celebrity ) << "credit card" << "my friend" << "friend sent me"
|
17
|
-
suspicious_words |= %w( adult
|
45
|
+
suspicious_words |= %w( adult overnight shipping free hot movie nylon arab ?????? seo)
|
18
46
|
suspicious_words << "forums/member.php?u=" << "chat room" << "free chat" << "yahoo chat" << "page.php"
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
47
|
+
|
48
|
+
bad_words.each do |key,wordlist|
|
49
|
+
counter = 0
|
50
|
+
wordlist.each do |word|
|
51
|
+
results = Regexp.new("\\b(#{word})\\b").match @body
|
52
|
+
if results && results.size > 0
|
53
|
+
counter += 1
|
54
|
+
add_score((self.class.bad_word_score ** results.size), "nasty word: '#{word}'")
|
55
|
+
|
56
|
+
# Add more points if the bad word is INSIDE a link
|
57
|
+
@body.scan(/<a[^>]+>(.*?)<\/a>/).each do |match|
|
58
|
+
add_score self.class.bad_word_score * 10 * match[0].scan(word).size, "nasty word inside a link: #{word}"
|
59
|
+
end
|
60
|
+
@body.scan(/\nhttp:\/\/(.*?#{word})\//).each do |match|
|
61
|
+
add_score self.class.bad_word_score * 10 * match[0].scan(word).size, "nasty word inside a straight-up link: #{word}"
|
62
|
+
end
|
63
|
+
@body.scan(/<a.*?>(.*?)<\/a>/).each do |links|
|
64
|
+
add_score self.class.bad_word_score * 50, "nasty word is the entire link: #{word}"
|
65
|
+
end
|
66
|
+
@body.scan(/<a(.*?)>/).each do |match|
|
67
|
+
add_score self.class.bad_word_score * 10 * match[0].scan(word).size, "nasty word inside a URL: #{word}"
|
68
|
+
end
|
69
|
+
|
29
70
|
end
|
30
|
-
|
31
|
-
add_score
|
71
|
+
if counter > (wordlist.size / 2)
|
72
|
+
add_score 1000, "Lots of bad words from one genre (#{key}): #{counter}"
|
32
73
|
end
|
33
74
|
end
|
34
75
|
end
|
@@ -43,4 +84,4 @@ class Splam::Rules::BadWords < Splam::Rule
|
|
43
84
|
end
|
44
85
|
end
|
45
86
|
end
|
46
|
-
end
|
87
|
+
end
|
data/lib/splam/rules/bbcode.rb
CHANGED
@@ -3,10 +3,12 @@ class Splam::Rules::Bbcode < Splam::Rule
|
|
3
3
|
def run
|
4
4
|
add_score 10 * @body.scan("showpost.php?p=").size, "Linking to a shitty forum"
|
5
5
|
# add_score 10 * @body.scan("\r\n").size, "Poorly formed POST (\\r\\n)"
|
6
|
+
add_score 80 * @body.scan(/\n\[url.*?\]\n/).size, "Shitty bbcode url covers entire line"
|
6
7
|
add_score 40 * @body.scan("[url=").size, "URL" # no URLS for you!!
|
7
8
|
add_score 40 * @body.scan("[URL=").size, "URL" # no URLS for you!!
|
8
|
-
add_score
|
9
|
-
add_score
|
9
|
+
add_score 45 * @body.scan("[url=http").size, "Shitty URL/html" # another 10 points for shitty bbcode html
|
10
|
+
add_score 45 * @body.scan("[URL=http").size, "Shitty URL/html" # another 10 points for shitty bbcode html
|
11
|
+
add_score 30 * @body.scan("[/CODE").size, "Forum codes?"
|
10
12
|
add_score 10 * @body.scan(/\[[bai]/).size, "b/a/i tag"
|
11
13
|
end
|
12
14
|
end
|
data/lib/splam/rules/chinese.rb
CHANGED
@@ -3,22 +3,42 @@ class Splam::Rules::Chinese < Splam::Rule
|
|
3
3
|
class << self
|
4
4
|
attr_accessor :base_score
|
5
5
|
end
|
6
|
-
self.base_score =
|
6
|
+
self.base_score = 5
|
7
7
|
|
8
8
|
def run
|
9
9
|
banned_words =[ # various chinese characters
|
10
10
|
"\350\263\207",
|
11
11
|
"\351\207\221",
|
12
12
|
"\357\274\222", # number 2 in weird unicode
|
13
|
-
"\357\274\224", # number 4
|
13
|
+
"\357\274\224", # number 4
|
14
14
|
"\357\274\225", # number 5
|
15
15
|
"\357\274\231", # number 9
|
16
|
+
|
17
|
+
"\345\260\232",
|
18
|
+
"\345\256\266",
|
19
|
+
"\345\274\267",
|
20
|
+
"\345\240\261",
|
21
|
+
"\345\260\216",
|
22
|
+
"\345\217\260",
|
23
|
+
"\345\215\227",
|
24
|
+
"\346\235\261",
|
25
|
+
"\345\270\202",
|
26
|
+
"\345\240\264",
|
27
|
+
"\345\202\263",
|
28
|
+
"\346\216\250",
|
29
|
+
"\346\231\202",
|
30
|
+
"\347\203\210",
|
31
|
+
"\347\216\251",
|
32
|
+
"\350\226\246",
|
33
|
+
"\350\217\234",
|
34
|
+
"\350\216\216",
|
35
|
+
|
16
36
|
"\357\274\215", # hyphen
|
17
|
-
# /\\357\2\d\d\\\d{3}/, # TODO
|
37
|
+
# /\\357\2\d\d\\\d{3}/, # TODO SyntaxError on 1.9
|
18
38
|
# "\357", # ugh, these don't work .. because they're only part of a character.
|
19
39
|
# "\351",
|
20
40
|
"\35"
|
21
|
-
]
|
41
|
+
].compact
|
22
42
|
banned_words.each do |word|
|
23
43
|
hits = (self.class.base_score * @body.scan(word).size) # 1 point for every banned word
|
24
44
|
add_score hits, "Banned character: #{word}"
|
data/lib/splam/rules/fuzz.rb
CHANGED
@@ -3,11 +3,16 @@ class Splam::Rules::Fuzz < Splam::Rule
|
|
3
3
|
attr_accessor :bad_word_score
|
4
4
|
end
|
5
5
|
|
6
|
-
self.bad_word_score
|
6
|
+
self.bad_word_score = 10
|
7
7
|
|
8
8
|
def run
|
9
9
|
patterns = [/^(\d[a-z])/, /(\d[a-z][A-Z]\w+)/, /(\b\w+\d\.txt)/, /(;\d+;)/ ]
|
10
|
+
ignore_if = [%r{vendor/rails}, /EXC_BAD_ACCESS/, /JavaAppLauncher/, %r{Contents/MacOS}, %r{/Library/}]
|
10
11
|
matches = 0
|
12
|
+
# looks like a stack trace
|
13
|
+
ignore_if.each do |pattern|
|
14
|
+
return if @body.scan(pattern)
|
15
|
+
end
|
11
16
|
patterns.each do |pattern|
|
12
17
|
results = @body.scan(pattern)
|
13
18
|
if results && results.size > 0
|
@@ -2,18 +2,21 @@ class Splam::Rules::GoodWords < Splam::Rule
|
|
2
2
|
|
3
3
|
def run
|
4
4
|
good_words = [ /I\'having a problem/, ]
|
5
|
-
good_words |= %w( lighthouse activereload warehouse install eclipse settings assigned user ticket tickets token api number query request)
|
5
|
+
good_words |= %w( lighthouse lighthouseapp activereload warehouse install eclipse settings assigned user ticket tickets token api number query request)
|
6
6
|
good_words |= %w( browser feed firefox safari skitch vendor rails action_controller railties )
|
7
7
|
good_words |= %w( redirect login diff dreamhost setup subversion git wildcard domain subdomain ssh database )
|
8
8
|
good_words |= %w( project billing tags description comment milestone saving happening feature mac implement report)
|
9
9
|
good_words |= %w( rss notification subscribe calendar chart note task gantt search service ownership application communicate )
|
10
|
-
good_words |= %w(
|
10
|
+
good_words |= %w( interaction API tickets hosted domain skitch )
|
11
|
+
good_words |= %w( pattern template web integer status xml activereload html state page rack diff )
|
11
12
|
good_words << "project management"
|
12
13
|
good_words << "/usr/local/lib" << "gems"
|
13
14
|
|
14
15
|
body = @body.downcase
|
15
16
|
good_words.each { |rule|
|
16
|
-
|
17
|
+
|
18
|
+
results = Regexp.new("\\b(#{rule})\\b","i").match(body)
|
19
|
+
add_score -50 * results.size, "relevant word match: #{rule}" if results
|
17
20
|
}
|
18
21
|
end
|
19
22
|
end
|
data/lib/splam/rules/href.rb
CHANGED
@@ -4,8 +4,8 @@ class Splam::Rules::Href < Splam::Rule
|
|
4
4
|
|
5
5
|
def run
|
6
6
|
# add_score 3 * @body.scan("href=http").size, "Shitty html 'href=http'" # 3 points for shitty html
|
7
|
-
add_score
|
8
|
-
add_score
|
7
|
+
add_score 35 * @body.scan(/href\=\s*http/).size, "Shitty html 'href=http'" # 15 points for shitty html
|
8
|
+
add_score 35 * @body.scan(/href\="\s+http/).size, "Shitty html 'href=\" http'" # 15 points for shitty html
|
9
9
|
add_score 50 * @body.scan(/\A<a.*?<\/a>\Z/).size, "Single link post'" # 50 points for shitty
|
10
10
|
|
11
11
|
link_count = @body.scan("http://").size
|
@@ -13,7 +13,7 @@ class Splam::Rules::Href < Splam::Rule
|
|
13
13
|
add_score 50, "More than 10 links" if link_count > 10 # more than 10 links? spam.
|
14
14
|
add_score 100, "More than 20 links" if link_count > 20 # more than 20 links? definitely spam.
|
15
15
|
add_score 1000, "More than 50 links" if link_count > 50 # more than 20 links? definitely spam.
|
16
|
-
|
16
|
+
|
17
17
|
# Modify these scores to weight certain problematic domains.
|
18
18
|
# You may need to modify these for your application
|
19
19
|
suspicious_top_level_domains = {
|
@@ -22,6 +22,7 @@ class Splam::Rules::Href < Splam::Rule
|
|
22
22
|
'us' => 8, # .us ? possibly spam
|
23
23
|
'it' => 5,
|
24
24
|
'tk' => 20,
|
25
|
+
'eu' => 20,
|
25
26
|
'pl' => 8,
|
26
27
|
'info' => 20,
|
27
28
|
'biz' => 40 # no-one uses these for reals
|
@@ -33,11 +34,12 @@ class Splam::Rules::Href < Splam::Rule
|
|
33
34
|
|
34
35
|
tokens = @body.split(" ")
|
35
36
|
if tokens[-1] =~ /^http:\/\//
|
36
|
-
add_score
|
37
|
-
add_score
|
37
|
+
add_score 20, "Text ends in a http token"
|
38
|
+
add_score 150, "Text ends in a http token and only has one token" if link_count == 1
|
39
|
+
add_score 150, "Text ends in a http token with a shitty domain " if tokens[-1].match(/http:\/\/#{suspicious_sites.keys.join("|")}\./)
|
38
40
|
end
|
39
41
|
|
40
|
-
@body.scan(/http:\/\/(.*?)[
|
42
|
+
@body.scan(/http:\/\/(.*?)[\/\>\]?]/) do |match|
|
41
43
|
# $stderr.puts "checking #{match}"
|
42
44
|
if domain = match.to_s.split(".")
|
43
45
|
tld = domain[-1]
|
@@ -48,7 +50,9 @@ class Splam::Rules::Href < Splam::Rule
|
|
48
50
|
|
49
51
|
if found = suspicious_sites[domain[-2]]
|
50
52
|
add_score found, "Suspicious hostname: '#{domain[-2]}'"
|
53
|
+
add_score found * 5, "..document ends in suspicious hostname" if tokens[-1] =~ /^http:\/\//
|
51
54
|
end
|
55
|
+
|
52
56
|
end
|
53
57
|
end
|
54
58
|
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'resolv'
|
2
|
+
# Liberally copied from https://github.com/bpalmen/httpbl/blob/master/lib/httpbl.rb
|
3
|
+
|
4
|
+
class Splam::Rules::Httpbl < Splam::Rule
|
5
|
+
if RUBY_VERSION < "1.9"
|
6
|
+
require 'system_timer'
|
7
|
+
else
|
8
|
+
require 'timeout'
|
9
|
+
SystemTimer = Timeout
|
10
|
+
end
|
11
|
+
|
12
|
+
|
13
|
+
class << self
|
14
|
+
attr_accessor :api_key
|
15
|
+
end
|
16
|
+
|
17
|
+
def run
|
18
|
+
return unless @request # no ip available
|
19
|
+
return unless @request[:remote_ip] # no ip available
|
20
|
+
|
21
|
+
ip = @request[:remote_ip]
|
22
|
+
|
23
|
+
if result = self.class.check_blacklist(ip)
|
24
|
+
add_score 250, "IP address (#{ip}) appears in ProjectHoneypot blacklist. (#{result.inspect})"
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
def self.check_blacklist(ip)
|
29
|
+
# @cache = REDIS if defined?(REDIS)
|
30
|
+
# result = @cache && @cache["ip.#{ip}"]
|
31
|
+
# result ||= resolve(ip)
|
32
|
+
# if @cache
|
33
|
+
# @cache.set "ip.#{ip}", result if @cache
|
34
|
+
# @cache.expire "ip.#{ip}", 1.week
|
35
|
+
# end
|
36
|
+
result = resolve(ip)
|
37
|
+
response = result.split(".").collect!(&:to_i)
|
38
|
+
|
39
|
+
# responses:
|
40
|
+
# a, b, c, d
|
41
|
+
# a = 127 if success
|
42
|
+
# b = days since last activity
|
43
|
+
# c = threat score, 0..255 (0 is not threat)
|
44
|
+
# d = type of visitor
|
45
|
+
raise "Bad httpbl request format!" if response[0] != 127
|
46
|
+
return response[3] > 0 || response[2] > 100
|
47
|
+
end
|
48
|
+
|
49
|
+
def self.resolve(ip)
|
50
|
+
query = "#{@@api_key}.#{ip.split('.').reverse.join('.')}.dnsbl.httpbl.org"
|
51
|
+
SystemTimer::timeout(0.5) do
|
52
|
+
begin
|
53
|
+
Resolv::DNS.new.getaddress(query).to_s
|
54
|
+
rescue Resolv::ResolvError
|
55
|
+
"127.0.0.0"
|
56
|
+
end
|
57
|
+
end
|
58
|
+
rescue Errno::ECONNREFUSED
|
59
|
+
# derp
|
60
|
+
end
|
61
|
+
|
62
|
+
end
|
@@ -10,8 +10,8 @@ class Splam::Rules::LineLength < Splam::Rule
|
|
10
10
|
lines.each do |line|
|
11
11
|
next if line =~ /\A\s{4,}/ # ignore code blocks
|
12
12
|
|
13
|
-
multiplier = (lines.size == 1) ? 10 : 1 # one line? fail.
|
14
|
-
|
13
|
+
# multiplier = (lines.size == 1) ? 10 : 1 # one line? fail.
|
14
|
+
multiplier = 1
|
15
15
|
|
16
16
|
# 1 point for each 40 chars in a line.
|
17
17
|
hits = (line.size / 40) * multiplier
|
@@ -1,5 +1,4 @@
|
|
1
1
|
class Splam::Rules::Punctuation < Splam::Rule
|
2
|
-
|
3
2
|
def run
|
4
3
|
punctuation = @body.scan(/[.,] /)
|
5
4
|
add_score 10, "Text has no punctuation" if punctuation.size == 0
|
@@ -7,8 +6,10 @@ class Splam::Rules::Punctuation < Splam::Rule
|
|
7
6
|
@body.split(/[.,]/).each do |sentence|
|
8
7
|
words = sentence.split(" ")
|
9
8
|
# long sentence, add a point.
|
10
|
-
|
11
|
-
|
9
|
+
unless line_safe?(sentence)
|
10
|
+
add_score 1, "Sentence has more than 10 words" if words.size > 10
|
11
|
+
add_score 10, "Sentence has more than 30 words" if words.size > 30
|
12
|
+
end
|
12
13
|
end
|
13
14
|
end
|
14
|
-
end
|
15
|
+
end
|
data/lib/splam/rules/russian.rb
CHANGED
@@ -3,7 +3,7 @@ class Splam::Rules::Russian < Splam::Rule
|
|
3
3
|
|
4
4
|
def run
|
5
5
|
banned_words =[ # various russian characters
|
6
|
-
"\320\241", "\320\220", "\320\234", "\320\257", "\320\233", "\320\243",
|
6
|
+
"\320\241", "\320\220", "\320\234", "\320\257", "\320\233", "\320\243",
|
7
7
|
"с", "м", "о", "т", "р", "е", "т", "ь", "п", "о", "р", "н", "о", "р", "л", "и", "к"
|
8
8
|
# unicode char
|
9
9
|
# "\320"
|
@@ -0,0 +1,15 @@
|
|
1
|
+
class Splam::Rules::User < Splam::Rule
|
2
|
+
|
3
|
+
def run
|
4
|
+
bad_words = ["qq.com", "yahoo.cn", "126.com"]
|
5
|
+
bad_words |= %w( mortgage )
|
6
|
+
|
7
|
+
bad_words.each do |word|
|
8
|
+
add_score 50, "User's email address has suspicious parts: #{word}" if @user.email.include?(word)
|
9
|
+
end
|
10
|
+
|
11
|
+
add_score "20", "User has lots and lots of dots" if @user.email.split("@")[0].scan(/\./).size > 5
|
12
|
+
|
13
|
+
add_score 5, "User is untrusted" if !@user.trusted?
|
14
|
+
end
|
15
|
+
end
|
@@ -16,17 +16,16 @@ class Splam::Rules::WordLength < Splam::Rule
|
|
16
16
|
|
17
17
|
def run
|
18
18
|
words = []
|
19
|
-
words = @body.split(/\s/)
|
20
|
-
|
21
|
-
|
22
|
-
words.delete_if { |w| w =~ /^http\:\/\//}
|
19
|
+
words = @body.split(/\s/)
|
20
|
+
words.delete_if { |w| w =~ /^https?\:\/\// }
|
21
|
+
words.collect! { |word| word.size }
|
23
22
|
|
24
23
|
# Only count word lengths over 10
|
25
24
|
if words.size > 5
|
26
|
-
add_score
|
27
|
-
add_score
|
28
|
-
add_score
|
29
|
-
add_score
|
25
|
+
add_score 5, "Average word length over 5" if average(words) > 5
|
26
|
+
add_score 10, "Average word length over 10" if average(words) > 10
|
27
|
+
add_score 5, "Median word length over 5" if median(words) > 5
|
28
|
+
add_score 10, "Median word length over 10" if median(words) > 10
|
30
29
|
end
|
31
30
|
end
|
32
31
|
end
|
data/splam.gemspec
CHANGED
data/test/ngram_test.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'test_helper')
|
2
|
+
require "splam/ngram"
|
3
|
+
require "redis"
|
4
|
+
REDIS = Redis.new :db => "12"
|
5
|
+
class NgramTest < Test::Unit::TestCase
|
6
|
+
|
7
|
+
def setup
|
8
|
+
@corpus = Splam::Ngram.new
|
9
|
+
|
10
|
+
# Will only try to stuff this into redis if it appears empty. Hacky, but works
|
11
|
+
if REDIS.hlen("spam") < 50
|
12
|
+
REDIS.expire "spam", 0
|
13
|
+
REDIS.expire "ham", 0
|
14
|
+
|
15
|
+
Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "comment", "spam", "*.txt")).each do |f|
|
16
|
+
spam = File.open(f).read
|
17
|
+
@corpus.train spam, true
|
18
|
+
end
|
19
|
+
puts "loading ham"
|
20
|
+
Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "comment", "ham", "*.txt")).each do |f|
|
21
|
+
ham = File.open(f).read
|
22
|
+
@corpus.train ham, false
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def test_learns_spam
|
28
|
+
score = @corpus.compare("Bienvenido a nuestro nuevo portal porno")
|
29
|
+
assert score[1] > score[0] * 2
|
30
|
+
end
|
31
|
+
|
32
|
+
def test_learns_ham
|
33
|
+
score = @corpus.compare("Is this a known issue?")
|
34
|
+
assert score[0] > score[1] * 2
|
35
|
+
end
|
36
|
+
end
|
data/test/splam_test.rb
CHANGED
@@ -7,6 +7,16 @@ class SplamTest < Test::Unit::TestCase
|
|
7
7
|
end
|
8
8
|
end
|
9
9
|
|
10
|
+
class User
|
11
|
+
attr_accessor :trusted
|
12
|
+
def trusted?
|
13
|
+
trusted
|
14
|
+
end
|
15
|
+
def email
|
16
|
+
"test@test.com"
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
10
20
|
class Foo
|
11
21
|
include ::Splam
|
12
22
|
splammable :body
|
@@ -14,6 +24,9 @@ class SplamTest < Test::Unit::TestCase
|
|
14
24
|
def body
|
15
25
|
@body || "This is body\320\224 \320\199"
|
16
26
|
end
|
27
|
+
def user
|
28
|
+
User.new
|
29
|
+
end
|
17
30
|
end
|
18
31
|
|
19
32
|
class FooCond
|
@@ -27,7 +40,9 @@ class SplamTest < Test::Unit::TestCase
|
|
27
40
|
splammable :body do |s|
|
28
41
|
s.rules = [:fixed_rule, FixedRule]
|
29
42
|
end
|
30
|
-
|
43
|
+
def user
|
44
|
+
User.new
|
45
|
+
end
|
31
46
|
def body
|
32
47
|
'lol wut'
|
33
48
|
end
|
@@ -38,6 +53,9 @@ class SplamTest < Test::Unit::TestCase
|
|
38
53
|
splammable :body do |s|
|
39
54
|
s.rules = {:fixed_rule => 3}
|
40
55
|
end
|
56
|
+
def user
|
57
|
+
User.new
|
58
|
+
end
|
41
59
|
|
42
60
|
def body
|
43
61
|
'lol wut'
|
@@ -47,7 +65,7 @@ class SplamTest < Test::Unit::TestCase
|
|
47
65
|
def test_runs_plugins
|
48
66
|
f = Foo.new
|
49
67
|
assert ! f.splam?
|
50
|
-
assert_equal
|
68
|
+
assert_equal 40, f.splam_score
|
51
69
|
end
|
52
70
|
|
53
71
|
def test_runs_plugins_with_specified_rules
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: splam
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -47,10 +47,11 @@ files:
|
|
47
47
|
- Gemfile
|
48
48
|
- Gemfile.lock
|
49
49
|
- MIT-LICENSE
|
50
|
-
- README
|
51
50
|
- Rakefile
|
51
|
+
- Readme.md
|
52
52
|
- gem-public_cert.pem
|
53
53
|
- lib/splam.rb
|
54
|
+
- lib/splam/ngram.rb
|
54
55
|
- lib/splam/rule.rb
|
55
56
|
- lib/splam/rules.rb
|
56
57
|
- lib/splam/rules/arms_race.rb
|
@@ -61,9 +62,11 @@ files:
|
|
61
62
|
- lib/splam/rules/good_words.rb
|
62
63
|
- lib/splam/rules/href.rb
|
63
64
|
- lib/splam/rules/html.rb
|
65
|
+
- lib/splam/rules/httpbl.rb
|
64
66
|
- lib/splam/rules/line_length.rb
|
65
67
|
- lib/splam/rules/punctuation.rb
|
66
68
|
- lib/splam/rules/russian.rb
|
69
|
+
- lib/splam/rules/user.rb
|
67
70
|
- lib/splam/rules/word_length.rb
|
68
71
|
- splam.gemspec
|
69
72
|
- test/fixtures/comment/ham/api-1.txt
|
@@ -101,7 +104,6 @@ files:
|
|
101
104
|
- test/fixtures/comment/spam/comment_cnn.txt
|
102
105
|
- test/fixtures/comment/spam/comment_randi.txt
|
103
106
|
- test/fixtures/comment/spam/comment_wordy.txt
|
104
|
-
- test/fixtures/comment/spam/consent.txt
|
105
107
|
- test/fixtures/comment/spam/december.txt
|
106
108
|
- test/fixtures/comment/spam/digital_rights.txt
|
107
109
|
- test/fixtures/comment/spam/dyed_wool.txt
|
@@ -139,6 +141,7 @@ files:
|
|
139
141
|
- test/fixtures/comment/spam/troubles.txt
|
140
142
|
- test/fixtures/comment/spam/url_only_idiot.txt
|
141
143
|
- test/fixtures/comment/spam/webcam.txt
|
144
|
+
- test/ngram_test.rb
|
142
145
|
- test/splam_rule_test.rb
|
143
146
|
- test/splam_test.rb
|
144
147
|
- test/test_helper.rb
|
@@ -157,7 +160,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
157
160
|
version: '0'
|
158
161
|
segments:
|
159
162
|
- 0
|
160
|
-
hash:
|
163
|
+
hash: 2317839165020925326
|
161
164
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
162
165
|
none: false
|
163
166
|
requirements:
|
@@ -166,7 +169,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
166
169
|
version: '0'
|
167
170
|
segments:
|
168
171
|
- 0
|
169
|
-
hash:
|
172
|
+
hash: 2317839165020925326
|
170
173
|
requirements: []
|
171
174
|
rubyforge_project:
|
172
175
|
rubygems_version: 1.8.24
|
metadata.gz.sig
CHANGED
@@ -1,2 +1 @@
|
|
1
|
-
|
2
|
-
[�z�Q�~�o��#T��5�OiZýh�x��!�M�Yk�S�c�xO�/v���#�������C�g�JYP<6��3�b���\^�`6O�Ils+4�j� kU���#q�X��p�5C�n�fy�t5O��V`�ξ�Y�����J�����kO�)[�|.
|
1
|
+
H�y7�t�[�>USWH�E�#����ӝ��r���;E8R'���([qq��nfv� ���E(� c ��ojn�w��su��ޒ����]�~�C�E�|Z7߱���ɫ�c94�NCi_�W$@���b�7� �ٷ�
|
@@ -1 +0,0 @@
|
|
1
|
-
Metro: Faidherbe-ChalignyHappy Eating! There is one reason and one reason only that conflict occurs in marriage. Epistemology- or why we believe something is irrelevant in each democratic voice. A female under age 16 and a male under age 18 cannot secure a marriage license in the State of Rhode Island without the approval of the Family Court. Great article on CatholicCulture.org:There are a lot of social reasons why well-constructed families are important to our society. Oklahoma: If you are under 18, your parents must appear at the courthouse with you to sign a consent form.
|