rbayes 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/Manifest.txt +7 -0
- data/README.txt +20 -0
- data/Rakefile +15 -0
- data/bin/rbayes_check +42 -0
- data/bin/rbayes_dump +27 -0
- data/lib/rbayes.rb +231 -0
- metadata +61 -0
data/History.txt
ADDED
data/Manifest.txt
ADDED
data/README.txt
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'hoe'
|
2
|
+
|
3
|
+
require './lib/rbayes'
|
4
|
+
|
5
|
+
# Gem specification for rbayes, declared through Hoe. The version comes from
# RBayes::VERSION so the library and the packaged gem cannot drift apart.
Hoe.new 'rbayes', RBayes::VERSION do |p|
  p.summary = 'An email-focused bayesian filter'
  p.description = 'A Bayesian filter fed by a tokenizer that throws out the crap you\'d find in emails. Originally by Dan Peterson'
  p.author = 'Eric Hodel'
  p.email = 'drbrain@segment7.net'
  p.url = 'http://seattlerb.rubyforge.org/rbayes/'

  # Release notes: the first "="-headed section of History.txt, i.e.
  # everything from the leading "=" up to the next "=" or the end of the file.
  p.changes = File.read('History.txt').scan(/\A(=.*?)(=|\Z)/m).first.first

  p.rubyforge_name = 'seattlerb'
end
|
15
|
+
|
data/bin/rbayes_check
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
#!/usr/local/bin/ruby
|
2
|
+
#
|
3
|
+
# check.rb - part of rbayes
|
4
|
+
# Dan Peterson <danp@danp.net>
|
5
|
+
# you can do whatever you want with this file but i appreciate credit
|
6
|
+
#
|
7
|
+
# given a message on stdin, find tokens and determine the message's spam
|
8
|
+
# probability based on token ratings as described at
|
9
|
+
# http://www.paulgraham.com/spam.html
|
10
|
+
#
|
11
|
+
# options:
|
12
|
+
# -c turn on case sensitivity (default: off)
|
13
|
+
# -d debug
|
14
|
+
# -f database file
|
15
|
+
#
|
16
|
+
|
17
|
+
require 'getoptlong'
|
18
|
+
require 'rbayes'
|
19
|
+
|
20
|
+
# Defaults, overridden by the command-line switches parsed below.
case_sensitive = false
debug = false
dbfile = nil

opts = GetoptLong.new(
  ['-c', GetoptLong::NO_ARGUMENT],
  ['-d', GetoptLong::NO_ARGUMENT],
  ['-f', GetoptLong::REQUIRED_ARGUMENT]
)

opts.each do |opt, arg|
  case opt
  when '-c' then case_sensitive = true
  when '-d' then debug = true
  when '-f' then dbfile = arg
  end
end

# RBayes.new takes (token_file, case_sensitive, test, debug). The third
# positional argument is +test+, so +debug+ has to be passed fourth;
# otherwise -d silently toggles test mode and never enables logging.
rbayes = RBayes.new dbfile, case_sensitive, false, debug

# The whole message is read from stdin and rated in one go.
prob = rbayes.rate $stdin.read

puts '%0.4f' % prob
|
42
|
+
|
data/bin/rbayes_dump
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
#!/usr/local/bin/ruby
|
2
|
+
#
|
3
|
+
# dump.rb - part of rbayes
|
4
|
+
# Dan Peterson <danp@danp.net>
|
5
|
+
# you can do whatever you want with this file but i appreciate credit
|
6
|
+
#
|
7
|
+
# prints token information for each token in the database used by gen.rb and
|
8
|
+
# check.rb
|
9
|
+
#
|
10
|
+
|
11
|
+
require 'rbayes'
|
12
|
+
|
13
|
+
# The database file is the first command-line argument.
rb = RBayes.new ARGV.shift

count_bland = rb.count_bland
count_tasty = rb.count_tasty

# Width of the wider of the two totals, so both are printed right-aligned.
length = [count_bland, count_tasty].map { |c| c.to_s.length }.max

puts "%#{length}d tasty tokens" % count_tasty
puts "%#{length}d bland tokens" % count_bland

rb.database.each do |token, value|
  # Keys starting with whitespace are bookkeeping entries (RBayes stores its
  # master counts under " count_bland " and " count_tasty "), not tokens.
  next if token =~ /^\s/

  # Token and value are passed as format arguments rather than interpolated
  # into the format string, so a stored key containing "%" cannot break it.
  puts "rating = %0.4f (%s: %s)" % [rb.rate_token(token), token, value]
end
|
27
|
+
|
data/lib/rbayes.rb
ADDED
@@ -0,0 +1,231 @@
|
|
1
|
+
#!/usr/bin/env ruby -w
|
2
|
+
|
3
|
+
require 'bdb1'
|
4
|
+
|
5
|
+
# Dan Peterson <danp@danp.net>
|
6
|
+
# you can do whatever you want with this file but i appreciate credit
|
7
|
+
#
|
8
|
+
# Refactored by Eric Hodel <drbrain@segment7.net>
|
9
|
+
|
10
|
+
class RBayes

  ##
  # The version of RBayes you are using.

  VERSION = '1.0.0'

  # :stopdoc:

  # Database keys holding the master counts. The surrounding spaces keep them
  # from colliding with real tokens, which never contain whitespace.
  COUNT_BLAND = " count_bland "
  COUNT_TASTY = " count_tasty "

  # A line matching any of these patterns is skipped entirely by
  # #read_tokens_in. They are matched before any downcasing happens.
  BORING_LINES = [
    /^\.?Date:/i,
    /^\.?Message-ID:/i,
    /^\.?In-Reply-To:/i,
    /^\.?References:/i,
    /^\.?[A-Za-z0-9\/\+]+$/, # a single run of base64-alphabet characters
    /SMTP id/i,
    /boundary=/,
    /name=\"/,
    /filename=\"/,
    /^--[^\s\n]*$/           # a "--something" line with no whitespace
  ].freeze

  # :startdoc:

  ##
  # Master bland count. Incremented once per #update_db_with call made with
  # <tt>:add_bland</tt> and decremented once per <tt>:remove_bland</tt>.

  attr_reader :count_bland

  ##
  # Master tasty count. Incremented once per #update_db_with call made with
  # <tt>:add_tasty</tt> and decremented once per <tt>:remove_tasty</tt>.

  attr_reader :count_tasty

  ##
  # The BDB1 DB holding the token information.

  attr_reader :database

  ##
  # Creates a new RBayes object using the database +token_file+. If
  # +case_sensitive+ is false, lines are downcased before tokenizing. If
  # +test+ is true no writes are performed. If +debug+ is true stuff gets
  # logged to $stderr.

  def initialize(token_file, case_sensitive = false, test = false,
                 debug = false)
    @case_sensitive = case_sensitive
    @test = test
    @debug = debug

    @database = BDB1::Hash.open token_file, 'a+'

    # A missing key reads back as nil and nil.to_i is 0, so a fresh database
    # starts both counts at zero without any extra fallback.
    @count_tasty = @database[COUNT_TASTY].to_i
    @count_bland = @database[COUNT_BLAND].to_i

    log "ham tokens: #{@count_tasty} bland tokens: #{@count_bland}"
  end

  ##
  # Logs +s+ to $stderr if debugging is on.

  def log(s)
    $stderr.puts s if @debug
  end

  ##
  # Yields tokens in +message+ ignoring the boring headers and such.
  #
  # Lines matching one of BORING_LINES are dropped. Remaining lines are
  # downcased (unless case sensitive) and split on anything that is not a
  # word character or one of <tt>. ? ' @ : $ / + -</tt>. Tokens shorter than
  # three characters and all-digit tokens are not yielded.

  def read_tokens_in(message)
    message.each_line do |raw|
      # Default chomp strips "\n", "\r\n" and a lone "\r". Leaving a trailing
      # "\r" behind (as happens with CRLF mail when only "\r\n" is chomped
      # after splitting on "\n") keeps the "$"-anchored patterns in
      # BORING_LINES from ever matching.
      line = raw.chomp

      next if BORING_LINES.any? { |pattern| line =~ pattern }

      line = line.downcase unless @case_sensitive

      line.split(/(?:[^\w.?'@:$\/+-]+)/).each do |token|
        next if token.length < 3
        next if token =~ /^\d+$/

        yield token
      end
    end
  end

  ##
  # Returns a Hash mapping tokens to the number of occurrences in +message+.

  def count_tokens_in(message)
    counts = Hash.new 0

    read_tokens_in message do |tok|
      counts[tok] += 1
    end

    counts
  end

  ##
  # Rates +message+. Returns a Float between 0 and 1; values near 1 mean
  # bland, values near 0 mean tasty. A message without tokens rates 0.5.

  def rate(message)
    ratings = {}

    read_tokens_in message do |tok|
      ratings[tok] = rate_token(tok) unless ratings.has_key? tok
    end

    # Only the 15 tokens whose rating lies furthest from the neutral 0.5
    # take part in the combined score.
    interesting = ratings.sort_by { |_tok, rating| -(0.5 - rating).abs }[0..14]

    bland_product = 1.0
    tasty_product = 1.0

    interesting.each do |tok, rating|
      log "token #{tok} is %0.2f bland" % rating
      bland_product *= rating
      tasty_product *= 1.0 - rating
    end

    bland_product / (bland_product + tasty_product)
  end

  ##
  # Updates the database with tokens from +message+.
  #
  # +mode+ may be:
  #
  # <tt>:add_bland</tt>:: increases blandness of found tokens
  # <tt>:add_tasty</tt>:: increases tastiness of found tokens
  # <tt>:remove_bland</tt>:: decreases blandness of found tokens
  # <tt>:remove_tasty</tt>:: decreases tastiness of found tokens
  #
  # Raises ArgumentError for any other +mode+. In test mode nothing is
  # written to the database, but the in-memory master counts still change.

  def update_db_with(message, mode)
    unless [:add_bland, :remove_bland, :add_tasty, :remove_tasty].include? mode
      raise ArgumentError, 'invalid mode'
    end

    log "updating db: #{mode}"

    count_tokens_in(message).each do |tok, cnt|
      tnum, bnum = counts_for tok
      log "found: #{tok} #{cnt} times, tasty: #{tnum}, bland: #{bnum}"

      unless @test
        case mode
        when :add_tasty    then tnum += cnt
        when :add_bland    then bnum += cnt
        when :remove_tasty then tnum -= cnt
        when :remove_bland then bnum -= cnt
        end
      end

      # Removing more than was ever added must not leave negative counts.
      tnum = 0 if tnum < 0
      bnum = 0 if bnum < 0

      if tnum == 0 && bnum == 0
        # token not needed any more, don't waste space
        @database.delete tok unless @test
        log "probs: #{tok} deleted"
      else
        # update probability database
        @database[tok] = [tnum, bnum].join(" ") unless @test
        log "update: #{tok}, tasty: #{tnum}, bland: #{bnum}"
      end
    end

    # for master count
    case mode
    when :add_tasty    then @count_tasty += 1
    when :add_bland    then @count_bland += 1
    when :remove_tasty then @count_tasty -= 1
    when :remove_bland then @count_bland -= 1
    end

    @count_tasty = 0 if @count_tasty < 0
    @count_bland = 0 if @count_bland < 0

    unless @test
      @database[COUNT_TASTY] = @count_tasty
      @database[COUNT_BLAND] = @count_bland
    end
  end

  ##
  # Rates token +tok+ for blandness. Returns a probability between 0.01 and
  # 0.99; higher means more bland.
  #
  # Tokens seen only in bland mail rate 0.99, tokens seen only in tasty mail
  # rate 0.01 and unknown tokens rate 0.4.

  def rate_token(tok)
    tnum, bnum = counts_for tok

    return 0.99 if tnum == 0 && bnum > 0
    return 0.01 if bnum == 0 && tnum > 0
    return 0.4  if tnum == 0 && bnum == 0

    # Tasty occurrences are counted double. Each frequency is relative to its
    # master count and capped at 1.0.
    tasty = 2.0 * tnum
    bland = bnum.to_f

    tasty /= @count_tasty.to_f
    tasty = 1.0 if tasty > 1.0
    bland /= @count_bland.to_f
    bland = 1.0 if bland > 1.0

    t = bland / (tasty + bland)
    t = 0.99 if t > 0.99
    t = 0.01 if t < 0.01

    t
  end

  private

  ##
  # Returns the stored <tt>[tasty, bland]</tt> occurrence counts for +tok+ as
  # Integers. Entries are stored as "tasty bland" strings; a token without an
  # entry counts as <tt>[0, 0]</tt>.

  def counts_for(tok)
    tnum, bnum = (@database[tok] || "0 0").split(/\s+/)
    [tnum.to_i, bnum.to_i]
  end

end
|
231
|
+
|
metadata
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.0.8
|
3
|
+
specification_version: 1
|
4
|
+
name: rbayes
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 1.0.0
|
7
|
+
date: 2007-01-06 00:00:00 -08:00
|
8
|
+
summary: An email-focused bayesian filter
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: drbrain@segment7.net
|
12
|
+
homepage: http://seattlerb.rubyforge.org/rbayes/
|
13
|
+
rubyforge_project: seattlerb
|
14
|
+
description: A Bayesian filter fed by a tokenizer that throws out the crap you'd find in emails. Originally by Dan Peterson
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
post_install_message:
|
29
|
+
authors:
|
30
|
+
- Eric Hodel
|
31
|
+
files:
|
32
|
+
- History.txt
|
33
|
+
- Manifest.txt
|
34
|
+
- README.txt
|
35
|
+
- Rakefile
|
36
|
+
- bin/rbayes_check
|
37
|
+
- bin/rbayes_dump
|
38
|
+
- lib/rbayes.rb
|
39
|
+
test_files: []
|
40
|
+
|
41
|
+
rdoc_options: []
|
42
|
+
|
43
|
+
extra_rdoc_files: []
|
44
|
+
|
45
|
+
executables:
|
46
|
+
- rbayes_check
|
47
|
+
- rbayes_dump
|
48
|
+
extensions: []
|
49
|
+
|
50
|
+
requirements: []
|
51
|
+
|
52
|
+
dependencies:
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
name: hoe
|
55
|
+
version_requirement:
|
56
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: 1.1.6
|
61
|
+
version:
|