splam 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +5 -1
- data/{README → Readme.md} +9 -0
- data/lib/splam.rb +5 -3
- data/lib/splam/ngram.rb +98 -0
- data/lib/splam/rule.rb +15 -3
- data/lib/splam/rules/arms_race.rb +1 -1
- data/lib/splam/rules/bad_words.rb +60 -19
- data/lib/splam/rules/bbcode.rb +4 -2
- data/lib/splam/rules/chinese.rb +24 -4
- data/lib/splam/rules/fuzz.rb +6 -1
- data/lib/splam/rules/good_words.rb +6 -3
- data/lib/splam/rules/href.rb +10 -6
- data/lib/splam/rules/httpbl.rb +62 -0
- data/lib/splam/rules/line_length.rb +2 -2
- data/lib/splam/rules/punctuation.rb +5 -4
- data/lib/splam/rules/russian.rb +1 -1
- data/lib/splam/rules/user.rb +15 -0
- data/lib/splam/rules/word_length.rb +7 -8
- data/splam.gemspec +1 -1
- data/test/ngram_test.rb +36 -0
- data/test/splam_test.rb +20 -2
- metadata +8 -5
- metadata.gz.sig +1 -2
- data/test/fixtures/comment/spam/consent.txt +0 -1
data.tar.gz.sig
CHANGED
Binary file
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
splam (0.1.1)
|
4
|
+
splam (0.2.0)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: http://rubygems.org/
|
@@ -13,6 +13,8 @@ GEM
|
|
13
13
|
i18n (0.6.1)
|
14
14
|
multi_json (1.6.0)
|
15
15
|
rake (10.0.3)
|
16
|
+
redis (3.0.2)
|
17
|
+
system_timer (1.2.4)
|
16
18
|
|
17
19
|
PLATFORMS
|
18
20
|
ruby
|
@@ -21,4 +23,6 @@ DEPENDENCIES
|
|
21
23
|
activesupport
|
22
24
|
bump
|
23
25
|
rake
|
26
|
+
redis
|
24
27
|
splam!
|
28
|
+
system_timer
|
data/{README → Readme.md}
RENAMED
@@ -42,8 +42,17 @@ site) whether to ban the post or not.
|
|
42
42
|
|
43
43
|
We recommend showing the post to the user (spambox them in) but hide it from everyone else.
|
44
44
|
|
45
|
+
Dev
|
46
|
+
===
|
47
|
+
|
48
|
+
bundle
|
49
|
+
redis-server
|
50
|
+
rake
|
51
|
+
|
45
52
|
TODO
|
53
|
+
====
|
46
54
|
|
55
|
+
- fix on 1.9
|
47
56
|
- Integrate bayesian or other clever algorithm, so that scores aren't hardcoded.
|
48
57
|
- Switch to using a percentage (0.994) rather than a score (250)
|
49
58
|
- Write more plugins!
|
data/lib/splam.rb
CHANGED
@@ -29,11 +29,11 @@ module Splam
|
|
29
29
|
end
|
30
30
|
end
|
31
31
|
|
32
|
-
def run(record)
|
32
|
+
def run(record, request)
|
33
33
|
score, reasons = 0, []
|
34
34
|
rules.each do |rule_class, weight|
|
35
35
|
weight ||= 1
|
36
|
-
worker = rule_class.run(self, record, weight)
|
36
|
+
worker = rule_class.run(self, record, weight, request)
|
37
37
|
score += worker.score
|
38
38
|
reasons << worker.reasons
|
39
39
|
end
|
@@ -51,6 +51,7 @@ module Splam
|
|
51
51
|
Dir["#{File.dirname(__FILE__)}/splam/rules/*.rb"].each do |f|
|
52
52
|
require f
|
53
53
|
end
|
54
|
+
require "splam/ngram"
|
54
55
|
base.send :extend, ClassMethods
|
55
56
|
end
|
56
57
|
|
@@ -113,7 +114,8 @@ protected
|
|
113
114
|
return false if (splam_suite.conditions && !splam_suite.conditions.call(self)) ||
|
114
115
|
skip_splam_check ||
|
115
116
|
send(splam_suite.body).nil?
|
116
|
-
@
|
117
|
+
@request = splam_suite.request.call(self) if splam_suite.request
|
118
|
+
@splam_score, @splam_reasons = splam_suite.run(self, @request)
|
117
119
|
instance_variable_get("@splam_#{attr_suffix}") if attr_suffix
|
118
120
|
end
|
119
121
|
|
data/lib/splam/ngram.rb
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
class Splam::Ngram
|
2
|
+
|
3
|
+
def self.trigram text
|
4
|
+
# this won't be utf-8 happy. Oh well!
|
5
|
+
words = text.gsub("'", "").split(/\W/)
|
6
|
+
hash = Hash.new 0
|
7
|
+
i = 0
|
8
|
+
while (i < words.length)
|
9
|
+
tri = []
|
10
|
+
count = 0
|
11
|
+
while ((words.length > i + count) && (tri.length < 3))
|
12
|
+
word = words[i + count]
|
13
|
+
if word && word != ""
|
14
|
+
tri << words[i + count]
|
15
|
+
end
|
16
|
+
count += 1
|
17
|
+
end
|
18
|
+
if tri.length == 3
|
19
|
+
hash[tri.join(' ')] += 1
|
20
|
+
end
|
21
|
+
i += 1
|
22
|
+
end
|
23
|
+
hash
|
24
|
+
end
|
25
|
+
|
26
|
+
def initialize site_id=nil
|
27
|
+
@site_id = site_id
|
28
|
+
end
|
29
|
+
|
30
|
+
# Train the temporary corpus with your data
|
31
|
+
def train words, spam = false, retrain = false
|
32
|
+
if words.is_a?(String)
|
33
|
+
words = self.class.trigram(words)
|
34
|
+
end
|
35
|
+
words.each do |word,value|
|
36
|
+
key = spam ? "spam" : "ham"
|
37
|
+
REDIS.hincrby key, word, value
|
38
|
+
REDIS.hincrby "#{key}-#{@site_id}", word, value if @site_id
|
39
|
+
if retrain
|
40
|
+
# Remove phrases from existing corpus
|
41
|
+
key = spam ? "ham" : "spam"
|
42
|
+
REDIS.hincrby key, word, -value
|
43
|
+
REDIS.hincrby "#{key}-#{@site_id}", word, -value if @site_id
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
def compare text
|
49
|
+
tri = self.class.trigram(text)
|
50
|
+
score = 0
|
51
|
+
spam = 0
|
52
|
+
|
53
|
+
ham_key = @site_id ? "ham-#{@site_id}" : "ham"
|
54
|
+
spam_key = @site_id ? "spam-#{@site_id}" : "spam"
|
55
|
+
|
56
|
+
@ham_tri = Hash.new 0
|
57
|
+
@spam_tri = Hash.new 0
|
58
|
+
|
59
|
+
tri.each do |key,value|
|
60
|
+
next if key.nil? || key.strip == ""
|
61
|
+
hmatch = REDIS.hget(ham_key, key).to_i # ham_tri[key]
|
62
|
+
smatch = REDIS.hget(spam_key, key).to_i # spam_tri[key]
|
63
|
+
|
64
|
+
if hmatch > 0 && smatch > 0
|
65
|
+
# tri appears in both
|
66
|
+
# ignore.
|
67
|
+
next
|
68
|
+
end
|
69
|
+
if hmatch > 0
|
70
|
+
score += hmatch + value
|
71
|
+
elsif smatch > 0
|
72
|
+
spam += smatch + value
|
73
|
+
end
|
74
|
+
end
|
75
|
+
[score, spam]
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
# corpus = Splam::Ngram.new 10009
|
80
|
+
# s.comments.paginated_each(:order => "id desc") do |c|
|
81
|
+
# puts c.id
|
82
|
+
# words = Splam::Ngram.trigram(c.body.downcase)
|
83
|
+
# if c.author.support? || (c.user && c.user.trusted?)
|
84
|
+
# corpus.train words, false
|
85
|
+
# elsif c.spam
|
86
|
+
# corpus.train words, true
|
87
|
+
# end
|
88
|
+
# end
|
89
|
+
#
|
90
|
+
# Comment.spam.paginated_each(:order => "id desc", :conditions => ['id < 12916619']) do |c|
|
91
|
+
# next if c.user_email == "no-reply@lighthouseapp.com"
|
92
|
+
# score = corpus.compare(c.body)
|
93
|
+
# if score[0] > score[1]
|
94
|
+
# puts "Not spam? #{c.id} : #{score.inspect} - #{c.body.first(100)}"
|
95
|
+
# else
|
96
|
+
# puts "Spam! #{c.id} : #{score.inspect}"
|
97
|
+
# end
|
98
|
+
# end
|
data/lib/splam/rule.rb
CHANGED
@@ -30,8 +30,9 @@ class Splam::Rule
|
|
30
30
|
end
|
31
31
|
end
|
32
32
|
|
33
|
-
def initialize(suite, record, weight = 1.0)
|
34
|
-
@suite, @weight, @score, @reasons, @body = suite, weight, 0, [], record.send(suite.body)
|
33
|
+
def initialize(suite, record, weight = 1.0, request = nil)
|
34
|
+
@suite, @weight, @score, @reasons, @body, @request = suite, weight, 0, [], record.send(suite.body), request
|
35
|
+
@user = record.user # todo: customize user field
|
35
36
|
end
|
36
37
|
|
37
38
|
def name
|
@@ -70,4 +71,15 @@ class Splam::Rule
|
|
70
71
|
@score += points
|
71
72
|
end
|
72
73
|
end
|
73
|
-
|
74
|
+
|
75
|
+
def line_safe?(string)
|
76
|
+
([
|
77
|
+
/\.dylib\b/,
|
78
|
+
/\b0x[0-9a-f]{6,16}\b/i,
|
79
|
+
/\b\/Applications\//,
|
80
|
+
/\b\/System\/Library\//,
|
81
|
+
/\bLibrary\/Application Support\//
|
82
|
+
].map {|r| r.match string }).compact.size > 0
|
83
|
+
end
|
84
|
+
|
85
|
+
end
|
@@ -7,7 +7,7 @@ class Splam::Rules::ArmsRace < Splam::Rule
|
|
7
7
|
|
8
8
|
# This is where you put banned domain names or otherwise
|
9
9
|
def run
|
10
|
-
shitty_sites = ["inquisitr"]
|
10
|
+
shitty_sites = ["inquisitr", "beeplog"]
|
11
11
|
shitty_sites.each do |word|
|
12
12
|
results = @body.downcase.scan(word)
|
13
13
|
if results && results.size > 0
|
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'active_support'
|
1
2
|
class Splam::Rules::BadWords < Splam::Rule
|
2
3
|
class << self
|
3
4
|
attr_accessor :bad_word_score, :suspicious_word_score
|
@@ -7,28 +8,68 @@ class Splam::Rules::BadWords < Splam::Rule
|
|
7
8
|
self.suspicious_word_score = 4
|
8
9
|
|
9
10
|
def run
|
10
|
-
bad_words =
|
11
|
-
bad_words
|
12
|
-
bad_words |= %w(
|
13
|
-
bad_words
|
14
|
-
bad_words
|
11
|
+
bad_words = {}
|
12
|
+
bad_words[:pornspam] = %w( sex sexy porn gay erotica erotico topless naked viagra erotismo porno porn lesbian amateur tit\b)
|
13
|
+
bad_words[:pornspam] |= %w( gratis erotismo porno torrent bittorrent adulto videochat video 3dsex)
|
14
|
+
bad_words[:pornspam] << /pel?cula/ << /pornogr?fica/ << "portal porno" # srsly, spamming in spanish?
|
15
|
+
bad_words[:pornspam] |= %w( webcam free-web-host rapidshare)
|
16
|
+
|
17
|
+
bad_words[:viagraspam] = %w( cialis viagra pharmacy prescription levitra kamagra)
|
18
|
+
bad_words[:benzospam] = %w( ultram tramadol pharmacy prescription )
|
19
|
+
bad_words[:cashspam] = %w( payday loan jihad ) << "payday loan"
|
20
|
+
bad_words[:pharmaspam] = %w( propecia finasteride viagra )
|
21
|
+
|
22
|
+
bad_words[:nigerian] = ["million pounds sterling", "dear sirs,", "any bank account", "winning notification", "western union", "diagnosed with cancer", "bank treasury", "unclaimed inheritance"]
|
23
|
+
|
24
|
+
# linkspammers
|
25
|
+
bad_words[:linkspam] = ["increase traffic", "discovered your blog", "backlinks", "sent me a link", "more visitors to my site", "targeted traffic", "increase traffic to your website", "estore"]
|
26
|
+
|
27
|
+
bad_words[:beats] = %w( beats dre headphones sale cheap shipping ) << "monster beats" << "best online"
|
28
|
+
bad_words[:rolex] = %w( rolex watch replica watches price )
|
29
|
+
bad_words[:wtf] = %w( bilete avion )
|
30
|
+
|
31
|
+
# buying fake shitty brand stuff
|
32
|
+
bad_words[:bagspam] = %w(handbag louis louisvuitton vuitton chanel coach clearance outlet hermes bag scarf sale ralphlauren)
|
33
|
+
bad_words[:handbags] = %w( karenmillen michaelkors kors millen bags purchase handbag chanel outlet tasche longchamp kaufen louboutin christianlouboutin)
|
34
|
+
bad_words[:blingspam] = %w( tiffany jewellery tiffanyco clearance outlet)
|
35
|
+
bad_words[:uggspam] = %w(\buggs?\b \buggboots\b clearance outlet )
|
36
|
+
bad_words[:wedding] = ["wedding", "wedding dress", "weddingdress", "strapless"]
|
37
|
+
|
38
|
+
bad_words[:webcamspam] = %w( live girls webcam adult singles) << "chat room"
|
39
|
+
bad_words[:gamereview] = %w( games-review-it.com game-reviews-online.com )
|
40
|
+
bad_words[:streaming] = %w( watchmlbbaseball watchnhlhockey pspnsportstv.com )
|
41
|
+
|
42
|
+
bad_words[:forum_spam] = ["IMG", "url="]
|
15
43
|
|
16
44
|
suspicious_words = %w( free buy galleries dating gallery hard hardcore video homemade celebrity ) << "credit card" << "my friend" << "friend sent me"
|
17
|
-
suspicious_words |= %w( adult
|
45
|
+
suspicious_words |= %w( adult overnight shipping free hot movie nylon arab ?????? seo)
|
18
46
|
suspicious_words << "forums/member.php?u=" << "chat room" << "free chat" << "yahoo chat" << "page.php"
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
47
|
+
|
48
|
+
bad_words.each do |key,wordlist|
|
49
|
+
counter = 0
|
50
|
+
wordlist.each do |word|
|
51
|
+
results = Regexp.new("\\b(#{word})\\b").match @body
|
52
|
+
if results && results.size > 0
|
53
|
+
counter += 1
|
54
|
+
add_score((self.class.bad_word_score ** results.size), "nasty word: '#{word}'")
|
55
|
+
|
56
|
+
# Add more points if the bad word is INSIDE a link
|
57
|
+
@body.scan(/<a[^>]+>(.*?)<\/a>/).each do |match|
|
58
|
+
add_score self.class.bad_word_score * 10 * match[0].scan(word).size, "nasty word inside a link: #{word}"
|
59
|
+
end
|
60
|
+
@body.scan(/\nhttp:\/\/(.*?#{word})\//).each do |match|
|
61
|
+
add_score self.class.bad_word_score * 10 * match[0].scan(word).size, "nasty word inside a straight-up link: #{word}"
|
62
|
+
end
|
63
|
+
@body.scan(/<a.*?>(.*?)<\/a>/).each do |links|
|
64
|
+
add_score self.class.bad_word_score * 50, "nasty word is the entire link: #{word}"
|
65
|
+
end
|
66
|
+
@body.scan(/<a(.*?)>/).each do |match|
|
67
|
+
add_score self.class.bad_word_score * 10 * match[0].scan(word).size, "nasty word inside a URL: #{word}"
|
68
|
+
end
|
69
|
+
|
29
70
|
end
|
30
|
-
|
31
|
-
add_score
|
71
|
+
if counter > (wordlist.size / 2)
|
72
|
+
add_score 1000, "Lots of bad words from one genre (#{key}): #{counter}"
|
32
73
|
end
|
33
74
|
end
|
34
75
|
end
|
@@ -43,4 +84,4 @@ class Splam::Rules::BadWords < Splam::Rule
|
|
43
84
|
end
|
44
85
|
end
|
45
86
|
end
|
46
|
-
end
|
87
|
+
end
|
data/lib/splam/rules/bbcode.rb
CHANGED
@@ -3,10 +3,12 @@ class Splam::Rules::Bbcode < Splam::Rule
|
|
3
3
|
def run
|
4
4
|
add_score 10 * @body.scan("showpost.php?p=").size, "Linking to a shitty forum"
|
5
5
|
# add_score 10 * @body.scan("\r\n").size, "Poorly formed POST (\\r\\n)"
|
6
|
+
add_score 80 * @body.scan(/\n\[url.*?\]\n/).size, "Shitty bbcode url covers entire line"
|
6
7
|
add_score 40 * @body.scan("[url=").size, "URL" # no URLS for you!!
|
7
8
|
add_score 40 * @body.scan("[URL=").size, "URL" # no URLS for you!!
|
8
|
-
add_score
|
9
|
-
add_score
|
9
|
+
add_score 45 * @body.scan("[url=http").size, "Shitty URL/html" # another 10 points for shitty bbcode html
|
10
|
+
add_score 45 * @body.scan("[URL=http").size, "Shitty URL/html" # another 10 points for shitty bbcode html
|
11
|
+
add_score 30 * @body.scan("[/CODE").size, "Forum codes?"
|
10
12
|
add_score 10 * @body.scan(/\[[bai]/).size, "b/a/i tag"
|
11
13
|
end
|
12
14
|
end
|
data/lib/splam/rules/chinese.rb
CHANGED
@@ -3,22 +3,42 @@ class Splam::Rules::Chinese < Splam::Rule
|
|
3
3
|
class << self
|
4
4
|
attr_accessor :base_score
|
5
5
|
end
|
6
|
-
self.base_score =
|
6
|
+
self.base_score = 5
|
7
7
|
|
8
8
|
def run
|
9
9
|
banned_words =[ # various chinese characters
|
10
10
|
"\350\263\207",
|
11
11
|
"\351\207\221",
|
12
12
|
"\357\274\222", # number 2 in weird unicode
|
13
|
-
"\357\274\224", # number 4
|
13
|
+
"\357\274\224", # number 4
|
14
14
|
"\357\274\225", # number 5
|
15
15
|
"\357\274\231", # number 9
|
16
|
+
|
17
|
+
"\345\260\232",
|
18
|
+
"\345\256\266",
|
19
|
+
"\345\274\267",
|
20
|
+
"\345\240\261",
|
21
|
+
"\345\260\216",
|
22
|
+
"\345\217\260",
|
23
|
+
"\345\215\227",
|
24
|
+
"\346\235\261",
|
25
|
+
"\345\270\202",
|
26
|
+
"\345\240\264",
|
27
|
+
"\345\202\263",
|
28
|
+
"\346\216\250",
|
29
|
+
"\346\231\202",
|
30
|
+
"\347\203\210",
|
31
|
+
"\347\216\251",
|
32
|
+
"\350\226\246",
|
33
|
+
"\350\217\234",
|
34
|
+
"\350\216\216",
|
35
|
+
|
16
36
|
"\357\274\215", # hyphen
|
17
|
-
# /\\357\2\d\d\\\d{3}/, # TODO
|
37
|
+
# /\\357\2\d\d\\\d{3}/, # TODO SyntaxError on 1.9
|
18
38
|
# "\357", # ugh, these don't work .. because they're only part of a character.
|
19
39
|
# "\351",
|
20
40
|
"\35"
|
21
|
-
]
|
41
|
+
].compact
|
22
42
|
banned_words.each do |word|
|
23
43
|
hits = (self.class.base_score * @body.scan(word).size) # 1 point for every banned word
|
24
44
|
add_score hits, "Banned character: #{word}"
|
data/lib/splam/rules/fuzz.rb
CHANGED
@@ -3,11 +3,16 @@ class Splam::Rules::Fuzz < Splam::Rule
|
|
3
3
|
attr_accessor :bad_word_score
|
4
4
|
end
|
5
5
|
|
6
|
-
self.bad_word_score
|
6
|
+
self.bad_word_score = 10
|
7
7
|
|
8
8
|
def run
|
9
9
|
patterns = [/^(\d[a-z])/, /(\d[a-z][A-Z]\w+)/, /(\b\w+\d\.txt)/, /(;\d+;)/ ]
|
10
|
+
ignore_if = [%r{vendor/rails}, /EXC_BAD_ACCESS/, /JavaAppLauncher/, %r{Contents/MacOS}, %r{/Library/}]
|
10
11
|
matches = 0
|
12
|
+
# looks like a stack trace
|
13
|
+
ignore_if.each do |pattern|
|
14
|
+
return if @body.scan(pattern)
|
15
|
+
end
|
11
16
|
patterns.each do |pattern|
|
12
17
|
results = @body.scan(pattern)
|
13
18
|
if results && results.size > 0
|
@@ -2,18 +2,21 @@ class Splam::Rules::GoodWords < Splam::Rule
|
|
2
2
|
|
3
3
|
def run
|
4
4
|
good_words = [ /I\'having a problem/, ]
|
5
|
-
good_words |= %w( lighthouse activereload warehouse install eclipse settings assigned user ticket tickets token api number query request)
|
5
|
+
good_words |= %w( lighthouse lighthouseapp activereload warehouse install eclipse settings assigned user ticket tickets token api number query request)
|
6
6
|
good_words |= %w( browser feed firefox safari skitch vendor rails action_controller railties )
|
7
7
|
good_words |= %w( redirect login diff dreamhost setup subversion git wildcard domain subdomain ssh database )
|
8
8
|
good_words |= %w( project billing tags description comment milestone saving happening feature mac implement report)
|
9
9
|
good_words |= %w( rss notification subscribe calendar chart note task gantt search service ownership application communicate )
|
10
|
-
good_words |= %w(
|
10
|
+
good_words |= %w( interaction API tickets hosted domain skitch )
|
11
|
+
good_words |= %w( pattern template web integer status xml activereload html state page rack diff )
|
11
12
|
good_words << "project management"
|
12
13
|
good_words << "/usr/local/lib" << "gems"
|
13
14
|
|
14
15
|
body = @body.downcase
|
15
16
|
good_words.each { |rule|
|
16
|
-
|
17
|
+
|
18
|
+
results = Regexp.new("\\b(#{rule})\\b","i").match(body)
|
19
|
+
add_score -50 * results.size, "relevant word match: #{rule}" if results
|
17
20
|
}
|
18
21
|
end
|
19
22
|
end
|
data/lib/splam/rules/href.rb
CHANGED
@@ -4,8 +4,8 @@ class Splam::Rules::Href < Splam::Rule
|
|
4
4
|
|
5
5
|
def run
|
6
6
|
# add_score 3 * @body.scan("href=http").size, "Shitty html 'href=http'" # 3 points for shitty html
|
7
|
-
add_score
|
8
|
-
add_score
|
7
|
+
add_score 35 * @body.scan(/href\=\s*http/).size, "Shitty html 'href=http'" # 15 points for shitty html
|
8
|
+
add_score 35 * @body.scan(/href\="\s+http/).size, "Shitty html 'href=\" http'" # 15 points for shitty html
|
9
9
|
add_score 50 * @body.scan(/\A<a.*?<\/a>\Z/).size, "Single link post'" # 50 points for shitty
|
10
10
|
|
11
11
|
link_count = @body.scan("http://").size
|
@@ -13,7 +13,7 @@ class Splam::Rules::Href < Splam::Rule
|
|
13
13
|
add_score 50, "More than 10 links" if link_count > 10 # more than 10 links? spam.
|
14
14
|
add_score 100, "More than 20 links" if link_count > 20 # more than 20 links? definitely spam.
|
15
15
|
add_score 1000, "More than 50 links" if link_count > 50 # more than 20 links? definitely spam.
|
16
|
-
|
16
|
+
|
17
17
|
# Modify these scores to weight certain problematic domains.
|
18
18
|
# You may need to modify these for your application
|
19
19
|
suspicious_top_level_domains = {
|
@@ -22,6 +22,7 @@ class Splam::Rules::Href < Splam::Rule
|
|
22
22
|
'us' => 8, # .us ? possibly spam
|
23
23
|
'it' => 5,
|
24
24
|
'tk' => 20,
|
25
|
+
'eu' => 20,
|
25
26
|
'pl' => 8,
|
26
27
|
'info' => 20,
|
27
28
|
'biz' => 40 # no-one uses these for reals
|
@@ -33,11 +34,12 @@ class Splam::Rules::Href < Splam::Rule
|
|
33
34
|
|
34
35
|
tokens = @body.split(" ")
|
35
36
|
if tokens[-1] =~ /^http:\/\//
|
36
|
-
add_score
|
37
|
-
add_score
|
37
|
+
add_score 20, "Text ends in a http token"
|
38
|
+
add_score 150, "Text ends in a http token and only has one token" if link_count == 1
|
39
|
+
add_score 150, "Text ends in a http token with a shitty domain " if tokens[-1].match(/http:\/\/#{suspicious_sites.keys.join("|")}\./)
|
38
40
|
end
|
39
41
|
|
40
|
-
@body.scan(/http:\/\/(.*?)[
|
42
|
+
@body.scan(/http:\/\/(.*?)[\/\>\]?]/) do |match|
|
41
43
|
# $stderr.puts "checking #{match}"
|
42
44
|
if domain = match.to_s.split(".")
|
43
45
|
tld = domain[-1]
|
@@ -48,7 +50,9 @@ class Splam::Rules::Href < Splam::Rule
|
|
48
50
|
|
49
51
|
if found = suspicious_sites[domain[-2]]
|
50
52
|
add_score found, "Suspicious hostname: '#{domain[-2]}'"
|
53
|
+
add_score found * 5, "..document ends in suspicious hostname" if tokens[-1] =~ /^http:\/\//
|
51
54
|
end
|
55
|
+
|
52
56
|
end
|
53
57
|
end
|
54
58
|
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'resolv'
|
2
|
+
# Liberally copied from https://github.com/bpalmen/httpbl/blob/master/lib/httpbl.rb
|
3
|
+
|
4
|
+
class Splam::Rules::Httpbl < Splam::Rule
|
5
|
+
if RUBY_VERSION < "1.9"
|
6
|
+
require 'system_timer'
|
7
|
+
else
|
8
|
+
require 'timeout'
|
9
|
+
SystemTimer = Timeout
|
10
|
+
end
|
11
|
+
|
12
|
+
|
13
|
+
class << self
|
14
|
+
attr_accessor :api_key
|
15
|
+
end
|
16
|
+
|
17
|
+
def run
|
18
|
+
return unless @request # no ip available
|
19
|
+
return unless @request[:remote_ip] # no ip available
|
20
|
+
|
21
|
+
ip = @request[:remote_ip]
|
22
|
+
|
23
|
+
if result = self.class.check_blacklist(ip)
|
24
|
+
add_score 250, "IP address (#{ip}) appears in ProjectHoneypot blacklist. (#{result.inspect})"
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
def self.check_blacklist(ip)
|
29
|
+
# @cache = REDIS if defined?(REDIS)
|
30
|
+
# result = @cache && @cache["ip.#{ip}"]
|
31
|
+
# result ||= resolve(ip)
|
32
|
+
# if @cache
|
33
|
+
# @cache.set "ip.#{ip}", result if @cache
|
34
|
+
# @cache.expire "ip.#{ip}", 1.week
|
35
|
+
# end
|
36
|
+
result = resolve(ip)
|
37
|
+
response = result.split(".").collect!(&:to_i)
|
38
|
+
|
39
|
+
# responses:
|
40
|
+
# a, b, c, d
|
41
|
+
# a = 127 if success
|
42
|
+
# b = days since last activity
|
43
|
+
# c = threat score, 0..255 (0 is not threat)
|
44
|
+
# d = type of visitor
|
45
|
+
raise "Bad httpbl request format!" if response[0] != 127
|
46
|
+
return response[3] > 0 || response[2] > 100
|
47
|
+
end
|
48
|
+
|
49
|
+
def self.resolve(ip)
|
50
|
+
query = "#{@@api_key}.#{ip.split('.').reverse.join('.')}.dnsbl.httpbl.org"
|
51
|
+
SystemTimer::timeout(0.5) do
|
52
|
+
begin
|
53
|
+
Resolv::DNS.new.getaddress(query).to_s
|
54
|
+
rescue Resolv::ResolvError
|
55
|
+
"127.0.0.0"
|
56
|
+
end
|
57
|
+
end
|
58
|
+
rescue Errno::ECONNREFUSED
|
59
|
+
# derp
|
60
|
+
end
|
61
|
+
|
62
|
+
end
|
@@ -10,8 +10,8 @@ class Splam::Rules::LineLength < Splam::Rule
|
|
10
10
|
lines.each do |line|
|
11
11
|
next if line =~ /\A\s{4,}/ # ignore code blocks
|
12
12
|
|
13
|
-
multiplier = (lines.size == 1) ? 10 : 1 # one line? fail.
|
14
|
-
|
13
|
+
# multiplier = (lines.size == 1) ? 10 : 1 # one line? fail.
|
14
|
+
multiplier = 1
|
15
15
|
|
16
16
|
# 1 point for each 40 chars in a line.
|
17
17
|
hits = (line.size / 40) * multiplier
|
@@ -1,5 +1,4 @@
|
|
1
1
|
class Splam::Rules::Punctuation < Splam::Rule
|
2
|
-
|
3
2
|
def run
|
4
3
|
punctuation = @body.scan(/[.,] /)
|
5
4
|
add_score 10, "Text has no punctuation" if punctuation.size == 0
|
@@ -7,8 +6,10 @@ class Splam::Rules::Punctuation < Splam::Rule
|
|
7
6
|
@body.split(/[.,]/).each do |sentence|
|
8
7
|
words = sentence.split(" ")
|
9
8
|
# long sentence, add a point.
|
10
|
-
|
11
|
-
|
9
|
+
unless line_safe?(sentence)
|
10
|
+
add_score 1, "Sentence has more than 10 words" if words.size > 10
|
11
|
+
add_score 10, "Sentence has more than 30 words" if words.size > 30
|
12
|
+
end
|
12
13
|
end
|
13
14
|
end
|
14
|
-
end
|
15
|
+
end
|
data/lib/splam/rules/russian.rb
CHANGED
@@ -3,7 +3,7 @@ class Splam::Rules::Russian < Splam::Rule
|
|
3
3
|
|
4
4
|
def run
|
5
5
|
banned_words =[ # various russian characters
|
6
|
-
"\320\241", "\320\220", "\320\234", "\320\257", "\320\233", "\320\243",
|
6
|
+
"\320\241", "\320\220", "\320\234", "\320\257", "\320\233", "\320\243",
|
7
7
|
"с", "м", "о", "т", "р", "е", "т", "ь", "п", "о", "р", "н", "о", "р", "л", "и", "к"
|
8
8
|
# unicode char
|
9
9
|
# "\320"
|
@@ -0,0 +1,15 @@
|
|
1
|
+
class Splam::Rules::User < Splam::Rule
|
2
|
+
|
3
|
+
def run
|
4
|
+
bad_words = ["qq.com", "yahoo.cn", "126.com"]
|
5
|
+
bad_words |= %w( mortgage )
|
6
|
+
|
7
|
+
bad_words.each do |word|
|
8
|
+
add_score 50, "User's email address has suspicious parts: #{word}" if @user.email.include?(word)
|
9
|
+
end
|
10
|
+
|
11
|
+
add_score "20", "User has lots and lots of dots" if @user.email.split("@")[0].scan(/\./).size > 5
|
12
|
+
|
13
|
+
add_score 5, "User is untrusted" if !@user.trusted?
|
14
|
+
end
|
15
|
+
end
|
@@ -16,17 +16,16 @@ class Splam::Rules::WordLength < Splam::Rule
|
|
16
16
|
|
17
17
|
def run
|
18
18
|
words = []
|
19
|
-
words = @body.split(/\s/)
|
20
|
-
|
21
|
-
|
22
|
-
words.delete_if { |w| w =~ /^http\:\/\//}
|
19
|
+
words = @body.split(/\s/)
|
20
|
+
words.delete_if { |w| w =~ /^https?\:\/\// }
|
21
|
+
words.collect! { |word| word.size }
|
23
22
|
|
24
23
|
# Only count word lengths over 10
|
25
24
|
if words.size > 5
|
26
|
-
add_score
|
27
|
-
add_score
|
28
|
-
add_score
|
29
|
-
add_score
|
25
|
+
add_score 5, "Average word length over 5" if average(words) > 5
|
26
|
+
add_score 10, "Average word length over 10" if average(words) > 10
|
27
|
+
add_score 5, "Median word length over 5" if median(words) > 5
|
28
|
+
add_score 10, "Median word length over 10" if median(words) > 10
|
30
29
|
end
|
31
30
|
end
|
32
31
|
end
|
data/splam.gemspec
CHANGED
data/test/ngram_test.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'test_helper')
|
2
|
+
require "splam/ngram"
|
3
|
+
require "redis"
|
4
|
+
REDIS = Redis.new :db => "12"
|
5
|
+
class NgramTest < Test::Unit::TestCase
|
6
|
+
|
7
|
+
def setup
|
8
|
+
@corpus = Splam::Ngram.new
|
9
|
+
|
10
|
+
# Will only try to stuff this into redis if it appears empty. Hacky, but works
|
11
|
+
if REDIS.hlen("spam") < 50
|
12
|
+
REDIS.expire "spam", 0
|
13
|
+
REDIS.expire "ham", 0
|
14
|
+
|
15
|
+
Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "comment", "spam", "*.txt")).each do |f|
|
16
|
+
spam = File.open(f).read
|
17
|
+
@corpus.train spam, true
|
18
|
+
end
|
19
|
+
puts "loading ham"
|
20
|
+
Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "comment", "ham", "*.txt")).each do |f|
|
21
|
+
ham = File.open(f).read
|
22
|
+
@corpus.train ham, false
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def test_learns_spam
|
28
|
+
score = @corpus.compare("Bienvenido a nuestro nuevo portal porno")
|
29
|
+
assert score[1] > score[0] * 2
|
30
|
+
end
|
31
|
+
|
32
|
+
def test_learns_ham
|
33
|
+
score = @corpus.compare("Is this a known issue?")
|
34
|
+
assert score[0] > score[1] * 2
|
35
|
+
end
|
36
|
+
end
|
data/test/splam_test.rb
CHANGED
@@ -7,6 +7,16 @@ class SplamTest < Test::Unit::TestCase
|
|
7
7
|
end
|
8
8
|
end
|
9
9
|
|
10
|
+
class User
|
11
|
+
attr_accessor :trusted
|
12
|
+
def trusted?
|
13
|
+
trusted
|
14
|
+
end
|
15
|
+
def email
|
16
|
+
"test@test.com"
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
10
20
|
class Foo
|
11
21
|
include ::Splam
|
12
22
|
splammable :body
|
@@ -14,6 +24,9 @@ class SplamTest < Test::Unit::TestCase
|
|
14
24
|
def body
|
15
25
|
@body || "This is body\320\224 \320\199"
|
16
26
|
end
|
27
|
+
def user
|
28
|
+
User.new
|
29
|
+
end
|
17
30
|
end
|
18
31
|
|
19
32
|
class FooCond
|
@@ -27,7 +40,9 @@ class SplamTest < Test::Unit::TestCase
|
|
27
40
|
splammable :body do |s|
|
28
41
|
s.rules = [:fixed_rule, FixedRule]
|
29
42
|
end
|
30
|
-
|
43
|
+
def user
|
44
|
+
User.new
|
45
|
+
end
|
31
46
|
def body
|
32
47
|
'lol wut'
|
33
48
|
end
|
@@ -38,6 +53,9 @@ class SplamTest < Test::Unit::TestCase
|
|
38
53
|
splammable :body do |s|
|
39
54
|
s.rules = {:fixed_rule => 3}
|
40
55
|
end
|
56
|
+
def user
|
57
|
+
User.new
|
58
|
+
end
|
41
59
|
|
42
60
|
def body
|
43
61
|
'lol wut'
|
@@ -47,7 +65,7 @@ class SplamTest < Test::Unit::TestCase
|
|
47
65
|
def test_runs_plugins
|
48
66
|
f = Foo.new
|
49
67
|
assert ! f.splam?
|
50
|
-
assert_equal
|
68
|
+
assert_equal 40, f.splam_score
|
51
69
|
end
|
52
70
|
|
53
71
|
def test_runs_plugins_with_specified_rules
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: splam
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -47,10 +47,11 @@ files:
|
|
47
47
|
- Gemfile
|
48
48
|
- Gemfile.lock
|
49
49
|
- MIT-LICENSE
|
50
|
-
- README
|
51
50
|
- Rakefile
|
51
|
+
- Readme.md
|
52
52
|
- gem-public_cert.pem
|
53
53
|
- lib/splam.rb
|
54
|
+
- lib/splam/ngram.rb
|
54
55
|
- lib/splam/rule.rb
|
55
56
|
- lib/splam/rules.rb
|
56
57
|
- lib/splam/rules/arms_race.rb
|
@@ -61,9 +62,11 @@ files:
|
|
61
62
|
- lib/splam/rules/good_words.rb
|
62
63
|
- lib/splam/rules/href.rb
|
63
64
|
- lib/splam/rules/html.rb
|
65
|
+
- lib/splam/rules/httpbl.rb
|
64
66
|
- lib/splam/rules/line_length.rb
|
65
67
|
- lib/splam/rules/punctuation.rb
|
66
68
|
- lib/splam/rules/russian.rb
|
69
|
+
- lib/splam/rules/user.rb
|
67
70
|
- lib/splam/rules/word_length.rb
|
68
71
|
- splam.gemspec
|
69
72
|
- test/fixtures/comment/ham/api-1.txt
|
@@ -101,7 +104,6 @@ files:
|
|
101
104
|
- test/fixtures/comment/spam/comment_cnn.txt
|
102
105
|
- test/fixtures/comment/spam/comment_randi.txt
|
103
106
|
- test/fixtures/comment/spam/comment_wordy.txt
|
104
|
-
- test/fixtures/comment/spam/consent.txt
|
105
107
|
- test/fixtures/comment/spam/december.txt
|
106
108
|
- test/fixtures/comment/spam/digital_rights.txt
|
107
109
|
- test/fixtures/comment/spam/dyed_wool.txt
|
@@ -139,6 +141,7 @@ files:
|
|
139
141
|
- test/fixtures/comment/spam/troubles.txt
|
140
142
|
- test/fixtures/comment/spam/url_only_idiot.txt
|
141
143
|
- test/fixtures/comment/spam/webcam.txt
|
144
|
+
- test/ngram_test.rb
|
142
145
|
- test/splam_rule_test.rb
|
143
146
|
- test/splam_test.rb
|
144
147
|
- test/test_helper.rb
|
@@ -157,7 +160,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
157
160
|
version: '0'
|
158
161
|
segments:
|
159
162
|
- 0
|
160
|
-
hash:
|
163
|
+
hash: 2317839165020925326
|
161
164
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
162
165
|
none: false
|
163
166
|
requirements:
|
@@ -166,7 +169,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
166
169
|
version: '0'
|
167
170
|
segments:
|
168
171
|
- 0
|
169
|
-
hash:
|
172
|
+
hash: 2317839165020925326
|
170
173
|
requirements: []
|
171
174
|
rubyforge_project:
|
172
175
|
rubygems_version: 1.8.24
|
metadata.gz.sig
CHANGED
@@ -1,2 +1 @@
|
|
1
|
-
|
2
|
-
[�z�Q�~�o��#T��5�OiZýh�x��!�M�Yk�S�c�xO�/v���#�������C�g�JYP<6��3�b���\^�`6O�Ils+4�j� kU���#q�X��p�5C�n�fy�t5O��V`�ξ�Y�����J�����kO�)[�|.
|
1
|
+
H�y7�t�[�>USWH�E�#����ӝ��r���;E8R'���([qq��nfv� ���E(� c ��ojn�w��su��ޒ����]�~�C�E�|Z7߱���ɫ�c94�NCi_�W$@���b�7� �ٷ�
|
@@ -1 +0,0 @@
|
|
1
|
-
Metro: Faidherbe-ChalignyHappy Eating! There is one reason and one reason only that conflict occurs in marriage. Epistemology- or why we believe something is irrelevant in each democratic voice. A female under age 16 and a male under age 18 cannot secure a marriage license in the State of Rhode Island without the approval of the Family Court. Great article on CatholicCulture.org:There are a lot of social reasons why well-constructed families are important to our society. Oklahoma: If you are under 18, your parents must appear at the courthouse with you to sign a consent form.
|