rbayes 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ = 1.0.0 / 2007-01-06
2
+
3
+ * Birthday!
4
+ * Fully refactored from Dan Peterson's original to be in a single class.
5
+
@@ -0,0 +1,7 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README.txt
4
+ Rakefile
5
+ bin/rbayes_check
6
+ bin/rbayes_dump
7
+ lib/rbayes.rb
@@ -0,0 +1,20 @@
1
+ = rbayes
2
+
3
+ Rubyforge Project:
4
+
5
+ http://rubyforge.org/projects/seattlerb
6
+
7
+ Documentation:
8
+
9
+ FILL ME IN
10
+
11
+ == About
12
+
13
+ == Installing rbayes
14
+
15
+ Just install the gem:
16
+
17
+ $ sudo gem install rbayes
18
+
19
+ == Using rbayes
20
+
@@ -0,0 +1,15 @@
1
require 'hoe'

require './lib/rbayes'

# Gem packaging for rbayes, driven by Hoe.  The version number comes straight
# from RBayes::VERSION so there is a single place to bump it.
Hoe.new('rbayes', RBayes::VERSION) do |spec|
  spec.rubyforge_name = 'seattlerb'

  spec.author = 'Eric Hodel'
  spec.email = 'drbrain@segment7.net'
  spec.url = 'http://seattlerb.rubyforge.org/rbayes/'

  spec.summary = 'An email-focused bayesian filter'
  spec.description = 'An bayesian filter fed by a tokenizer that throws crap out you\'d find in emails. Originally by Dan Peterson'

  # The change log for this release is the first "= ..." section of
  # History.txt: everything from the leading "=" up to the next "=" (or the
  # end of the file when there is only one section).
  history = File.read('History.txt')
  newest_section = history.scan(/\A(=.*?)(=|\Z)/m).first
  spec.changes = newest_section.first
end
15
+
@@ -0,0 +1,42 @@
1
#!/usr/local/bin/ruby
#
# check.rb - part of rbayes
# Dan Peterson <danp@danp.net>
# you can do whatever you want with this file but i appreciate credit
#
# given a message on stdin, find tokens and determine the message's spam
# probability based on token ratings as described at
# http://www.paulgraham.com/spam.html
#
# options:
# -c turn on case sensitivity (default: off)
# -d debug
# -f database file
#

require 'getoptlong'
require 'rbayes'

case_sensitive = false
debug = false
dbfile = nil

opts = GetoptLong.new(
  ['-c', GetoptLong::NO_ARGUMENT],
  ['-d', GetoptLong::NO_ARGUMENT],
  ['-f', GetoptLong::REQUIRED_ARGUMENT]
)

opts.each do |opt, arg|
  case opt
  when '-c' then case_sensitive = true
  when '-d' then debug = true
  when '-f' then dbfile = arg
  end
end

# RBayes.new takes (token_file, case_sensitive, test, debug).  The third
# positional argument is +test+, so it must be given explicitly; otherwise
# the -d flag would land in +test+ and debugging output would never be
# enabled.  Rating a message performs no writes, so +test+ stays false.
rbayes = RBayes.new dbfile, case_sensitive, false, debug
prob = rbayes.rate $stdin.read

puts '%0.4f' % prob
42
+
@@ -0,0 +1,27 @@
1
#!/usr/local/bin/ruby
#
# dump.rb - part of rbayes
# Dan Peterson <danp@danp.net>
# you can do whatever you want with this file but i appreciate credit
#
# prints token information for each token in the database used by gen.rb and
# check.rb
#
# usage: rbayes_dump DATABASE_FILE
#

require 'rbayes'

rb = RBayes.new ARGV.shift

count_bland = rb.count_bland
count_tasty = rb.count_tasty

# Right-align both totals to the width of the wider number.
length = [count_bland, count_tasty].map { |c| c.to_s.length }.max

puts "%#{length}d tasty tokens" % count_tasty
puts "%#{length}d bland tokens" % count_bland

rb.database.each do |token, value|
  # Keys starting with whitespace are bookkeeping entries (the master
  # counts), not tokens.
  next if token =~ /^\s/

  # The token and its stored value are passed as format arguments rather
  # than interpolated into the format string, so a "%" in either one cannot
  # be misread as a format directive.
  puts "rating = %0.4f (%s: %s)" % [rb.rate_token(token), token, value]
end
27
+
@@ -0,0 +1,231 @@
1
+ #!/usr/bin/env ruby -w
2
+
3
+ require 'bdb1'
4
+
5
+ # Dan Peterson <danp@danp.net>
6
+ # you can do whatever you want with this file but i appreciate credit
7
+ #
8
+ # Refactored by Eric Hodel <drbrain@segment7.net>
9
+
10
class RBayes

  ##
  # The version of RBayes you are using.

  VERSION = '1.0.0'

  # :stopdoc:

  # Database keys holding the master counts.  The surrounding spaces keep
  # them apart from real tokens, which are split on whitespace and so can
  # never contain any.
  COUNT_BLAND = " count_bland "
  COUNT_TASTY = " count_tasty "

  # Modes accepted by #update_db_with.
  MODES = [:add_bland, :remove_bland, :add_tasty, :remove_tasty].freeze

  # Lines matching any of these carry no useful tokens (volatile headers,
  # base64-looking data, MIME boundaries and attachment names) and are
  # skipped by #read_tokens_in.
  BORING_LINES = [
    /^\.?Date:/i,
    /^\.?Message-ID:/i,
    /^\.?In-Reply-To:/i,
    /^\.?References:/i,
    /^\.?[A-Za-z0-9\/\+]+$/,
    /SMTP id/i,
    /boundary=/,
    /name=\"/,
    /filename=\"/,
    /^--[^\s\n]*$/,
  ].freeze

  # :startdoc:

  ##
  # Bland tokens

  attr_reader :count_bland

  ##
  # Tasty tokens

  attr_reader :count_tasty

  ##
  # The BDB1 DB holding the token information.

  attr_reader :database

  ##
  # Creates a new RBayes object using the database +token_file+.  If +test+ is
  # true no writes are performed.  If +debug+ is true stuff gets logged to
  # $stderr.  +case_sensitive+ should be obvious.

  def initialize(token_file, case_sensitive = false, test = false,
                 debug = false)
    @case_sensitive = case_sensitive
    @test = test
    @debug = debug

    @database = BDB1::Hash.open token_file, 'a+'

    # A missing key reads back as nil, and nil.to_i is already 0.
    @count_tasty = @database[COUNT_TASTY].to_i
    @count_bland = @database[COUNT_BLAND].to_i

    log "tasty tokens: #{@count_tasty} bland tokens: #{@count_bland}"
  end

  ##
  # Logs +s+ to $stderr if debugging is on.

  def log(s)
    $stderr.puts s if @debug
  end

  ##
  # Yields tokens in +message+ ignoring the boring headers and such.
  #
  # Lines are lowercased first unless the filter is case sensitive.  Tokens
  # shorter than three characters and tokens made only of digits are dropped.

  def read_tokens_in(message)
    message.each_line do |raw|
      # chomp with no argument strips "\n", "\r\n" and a lone "\r".  The
      # trailing "\r" of CRLF mail must go, or the $-anchored patterns in
      # BORING_LINES (base64 data, MIME boundaries) never match.
      line = raw.chomp

      next if BORING_LINES.any? { |pattern| line =~ pattern }

      line = line.downcase unless @case_sensitive

      line.split(/(?:[^\w.?'@:$\/+-]+)/).each do |token|
        next if token.length < 3
        next if token =~ /^\d+$/

        yield token
      end
    end
  end

  ##
  # Returns a Hash mapping tokens to the number of occurrences in +message+.

  def count_tokens_in(message)
    counts = Hash.new 0

    read_tokens_in message do |tok|
      counts[tok] += 1
    end

    return counts
  end

  ##
  # Rates +message+ as tasty or bland.
  #
  # Returns the combined blandness of the (at most) 15 tokens whose individual
  # rating is furthest from the neutral 0.5: close to 1 means bland, close to
  # 0 means tasty.  A message with no usable tokens rates 0.5.

  def rate(message)
    ratings = {}

    # Rate each distinct token once and keep the rating for the combining
    # step below.
    read_tokens_in message do |tok|
      ratings[tok] = rate_token(tok) unless ratings.has_key? tok
    end

    interesting = ratings.sort_by { |_, rating| -(0.5 - rating).abs }[0..14]

    p = 1.0
    m1p = 1.0

    interesting.each do |tok, y|
      log "token %s is %0.2f bland" % [tok, y]
      p *= y
      m1p *= 1.0 - y
    end

    return p / (p + m1p)
  end

  ##
  # Updates the database with tokens from +message+.
  #
  # +mode+ may be:
  #
  # <tt>:add_bland</tt>:: increases blandness of found tokens
  # <tt>:add_tasty</tt>:: increases tastiness of found tokens
  # <tt>:remove_bland</tt>:: decreases blandness of found tokens
  # <tt>:remove_tasty</tt>:: decreases tastiness of found tokens
  #
  # Raises ArgumentError for any other +mode+.
  #
  # Token counts never drop below zero, and a token whose counts are both zero
  # is removed from the database.  In test mode nothing is written to the
  # database, but the in-memory master counts are still adjusted.

  def update_db_with(message, mode)
    raise ArgumentError, 'invalid mode' unless MODES.include? mode

    log "updating db: #{mode}"

    count_tokens_in(message).each do |tok, cnt|
      tnum, bnum = token_counts tok
      log "found: #{tok} #{cnt} times, tasty: #{tnum}, bland: #{bnum}"

      unless @test then
        case mode
        when :add_tasty    then tnum += cnt
        when :add_bland    then bnum += cnt
        when :remove_tasty then tnum -= cnt
        when :remove_bland then bnum -= cnt
        end
      end

      tnum = 0 if tnum < 0
      bnum = 0 if bnum < 0

      if tnum == 0 && bnum == 0 then
        # token not needed any more, don't waste space
        @database.delete tok unless @test
        log "probs: #{tok} deleted"
      else
        # update probability database
        @database[tok] = [tnum, bnum].join(" ") unless @test
        log "update: #{tok}, tasty: #{tnum}, bland: #{bnum}"
      end
    end

    # for master count: one per message, not one per token
    case mode
    when :add_tasty    then @count_tasty += 1
    when :add_bland    then @count_bland += 1
    when :remove_tasty then @count_tasty -= 1
    when :remove_bland then @count_bland -= 1
    end

    @count_tasty = 0 if @count_tasty < 0
    @count_bland = 0 if @count_bland < 0

    unless @test then
      # Stored as strings, like the per-token records; #initialize reads
      # them back with to_i.
      @database[COUNT_TASTY] = @count_tasty.to_s
      @database[COUNT_BLAND] = @count_bland.to_s
    end
  end

  ##
  # Rates token +tok+ for blandness.  Returns a probability between 0.01 and
  # 0.99: 0.99 for a token only ever seen as bland, 0.01 for a token only ever
  # seen as tasty and 0.4 for a token that has never been seen.

  def rate_token(tok)
    tnum, bnum = token_counts tok

    return 0.99 if tnum == 0 && bnum > 0
    return 0.01 if bnum == 0 && tnum > 0
    return 0.4  if tnum == 0 && bnum == 0

    # Tasty occurrences are counted double.  Each frequency is capped at 1.0,
    # which also absorbs the Infinity produced when a master count is zero.
    tasty = [2.0 * tnum / @count_tasty.to_f, 1.0].min
    bland = [bnum.to_f / @count_bland.to_f, 1.0].min

    t = bland / (tasty + bland)
    t = 0.99 if t > 0.99
    t = 0.01 if t < 0.01

    return t
  end

  private

  ##
  # Returns the [tasty, bland] counts stored for +tok+, [0, 0] when the token
  # is not in the database.  Records are stored as "tasty bland" strings.

  def token_counts(tok)
    tasty, bland = (@database[tok] || "0 0").split(/\s+/)
    [tasty.to_i, bland.to_i]
  end

end
231
+
metadata ADDED
@@ -0,0 +1,61 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0.8
3
+ specification_version: 1
4
+ name: rbayes
5
+ version: !ruby/object:Gem::Version
6
+ version: 1.0.0
7
+ date: 2007-01-06 00:00:00 -08:00
8
+ summary: An email-focused bayesian filter
9
+ require_paths:
10
+ - lib
11
+ email: drbrain@segment7.net
12
+ homepage: http://seattlerb.rubyforge.org/rbayes/
13
+ rubyforge_project: seattlerb
14
+ description: An bayesian filter fed by a tokenizer that throws crap out you'd find in emails. Originally by Dan Peterson
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Eric Hodel
31
+ files:
32
+ - History.txt
33
+ - Manifest.txt
34
+ - README.txt
35
+ - Rakefile
36
+ - bin/rbayes_check
37
+ - bin/rbayes_dump
38
+ - lib/rbayes.rb
39
+ test_files: []
40
+
41
+ rdoc_options: []
42
+
43
+ extra_rdoc_files: []
44
+
45
+ executables:
46
+ - rbayes_check
47
+ - rbayes_dump
48
+ extensions: []
49
+
50
+ requirements: []
51
+
52
+ dependencies:
53
+ - !ruby/object:Gem::Dependency
54
+ name: hoe
55
+ version_requirement:
56
+ version_requirements: !ruby/object:Gem::Version::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: 1.1.6
61
+ version: