rbayes 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ = 1.0.0 / 2007-01-06
2
+
3
+ * Birthday!
4
+ * Fully refactored from Dan Peterson's original to be in a single class.
5
+
@@ -0,0 +1,7 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README.txt
4
+ Rakefile
5
+ bin/rbayes_check
6
+ bin/rbayes_dump
7
+ lib/rbayes.rb
@@ -0,0 +1,20 @@
1
+ = rbayes
2
+
3
+ Rubyforge Project:
4
+
5
+ http://rubyforge.org/projects/seattlerb
6
+
7
+ Documentation:
8
+
9
+ http://seattlerb.rubyforge.org/rbayes/
10
+
11
+ == About
12
+
13
+ == Installing rbayes
14
+
15
+ Just install the gem:
16
+
17
+ $ sudo gem install rbayes
18
+
19
+ == Using rbayes
20
+
@@ -0,0 +1,15 @@
1
+ require 'hoe'
2
+
3
+ require './lib/rbayes'
4
+
5
+ Hoe.new 'rbayes', RBayes::VERSION do |p|
6
+ p.summary = 'An email-focused bayesian filter'
7
+ p.description = 'An bayesian filter fed by a tokenizer that throws crap out you\'d find in emails. Originally by Dan Peterson'
8
+ p.author = 'Eric Hodel'
9
+ p.email = 'drbrain@segment7.net'
10
+ p.url = 'http://seattlerb.rubyforge.org/rbayes/'
11
+ p.changes = File.read('History.txt').scan(/\A(=.*?)(=|\Z)/m).first.first
12
+
13
+ p.rubyforge_name = 'seattlerb'
14
+ end
15
+
@@ -0,0 +1,42 @@
1
+ #!/usr/local/bin/ruby
2
+ #
3
+ # check.rb - part of rbayes
4
+ # Dan Peterson <danp@danp.net>
5
+ # you can do whatever you want with this file but i appreciate credit
6
+ #
7
+ # given a message on stdin, find tokens and determine the message's spam
8
+ # probability based on token ratings as described at
9
+ # http://www.paulgraham.com/spam.html
10
+ #
11
+ # options:
12
+ # -c turn on case sensitivity (default: off)
13
+ # -d debug
14
+ # -f database file
15
+ #
16
+
17
+ require 'getoptlong'
18
+ require 'rbayes'
19
+
20
+ case_sensitive = false
21
+ debug = false
22
+ dbfile = nil
23
+
24
+ opts = GetoptLong.new(
25
+ ['-c', GetoptLong::NO_ARGUMENT],
26
+ ['-d', GetoptLong::NO_ARGUMENT],
27
+ ['-f', GetoptLong::REQUIRED_ARGUMENT]
28
+ )
29
+
30
+ opts.each do |opt, arg|
31
+ case opt
32
+ when '-c' then case_sensitive = true
33
+ when '-d' then debug = true
34
+ when '-f' then dbfile = arg
35
+ end
36
+ end
37
+
38
+ rbayes = RBayes.new dbfile, case_sensitive, debug
39
+ prob = rbayes.rate $stdin.read
40
+
41
+ puts '%0.4f' % prob
42
+
@@ -0,0 +1,27 @@
1
+ #!/usr/local/bin/ruby
2
+ #
3
+ # dump.rb - part of rbayes
4
+ # Dan Peterson <danp@danp.net>
5
+ # you can do whatever you want with this file but i appreciate credit
6
+ #
7
+ # prints token information for each token in the database used by gen.rb and
8
+ # check.rb
9
+ #
10
+
11
+ require 'rbayes'
12
+
13
+ rb = RBayes.new ARGV.shift
14
+
15
+ count_bland = rb.count_bland
16
+ count_tasty = rb.count_tasty
17
+
18
+ length = [count_bland, count_tasty].map { |c| c.to_s.length }.max
19
+
20
+ puts "%#{length}d tasty tokens" % count_tasty
21
+ puts "%#{length}d bland tokens" % count_bland
22
+
23
+ rb.database.each do |token, value|
24
+ next if token =~ /^\s/
25
+ puts "rating = %0.4f (#{token}: #{value})" % [rb.rate_token(token)]
26
+ end
27
+
@@ -0,0 +1,231 @@
1
+ #!/usr/bin/env ruby -w
2
+
3
+ require 'bdb1'
4
+
5
+ # Dan Peterson <danp@danp.net>
6
+ # you can do whatever you want with this file but i appreciate credit
7
+ #
8
+ # Refactored by Eric Hodel <drbrain@segment7.net>
9
+
10
+ class RBayes
11
+
12
+ ##
13
+ # The version of RBayes you are using.
14
+
15
+ VERSION = '1.0.0'
16
+
17
+ # :stopdoc:
18
+ COUNT_BLAND = " count_bland "
19
+ COUNT_TASTY = " count_tasty "
20
+ # :startdoc:
21
+
22
+ ##
23
+ # Bland master count: raised by one per :add_bland update and lowered by one per :remove_bland update (never below zero).
24
+
25
+ attr_reader :count_bland
26
+
27
+ ##
28
+ # Tasty master count: raised by one per :add_tasty update and lowered by one per :remove_tasty update (never below zero).
29
+
30
+ attr_reader :count_tasty
31
+
32
+ ##
33
+ # The BDB1 DB holding the token information.
34
+
35
+ attr_reader :database
36
+
37
+ ##
38
+ # Creates a new RBayes object using the database +token_file+. If +test+ is
39
+ # true no writes are performed. If +debug+ is true stuff gets logged to
40
+ # $stderr. +case_sensitive+ should be obvious.
41
+
42
+ def initialize(token_file, case_sensitive = false, test = false,
43
+ debug = false)
44
+ @case_sensitive = case_sensitive
45
+ @test = test
46
+ @debug = debug
47
+
48
+ @database = BDB1::Hash.open token_file, 'a+'
49
+
50
+ @count_tasty = @database[COUNT_TASTY].to_i || 0
51
+ @count_bland = @database[COUNT_BLAND].to_i || 0
52
+
53
+ log "ham tokens: #{@count_tasty} bland tokens: #{@count_bland}"
54
+ end
55
+
56
+ ##
57
+ # Writes +s+ to $stderr when the +debug+ flag given to the constructor is true; otherwise does nothing.
58
+
59
+ def log(s)
60
+ $stderr.puts s if @debug
61
+ end
62
+
63
+ ##
64
+ # Yields tokens in +message+ ignoring the boring headers and such.
65
+
66
+ def read_tokens_in(message)
67
+ message.split($/).each do |line|
68
+ line.chomp! "\r\n"
69
+
70
+ next if line =~ /^\.?Date:/i
71
+ next if line =~ /^\.?Message-ID:/i
72
+ next if line =~ /^\.?In-Reply-To:/i
73
+ next if line =~ /^\.?References:/i
74
+ next if line =~ /^\.?[A-Za-z0-9\/\+]+$/
75
+ next if line =~ /SMTP id/i
76
+ next if line =~ /boundary=/
77
+ next if line =~ /name=\"/
78
+ next if line =~ /filename=\"/
79
+ next if line =~ /^--[^\s\n]*$/
80
+
81
+ line.downcase! unless @case_sensitive
82
+
83
+ #log "Tokenizing #{line.inspect}"
84
+ line.split(/(?:[^\w.?'@:$\/+-]+)/).each do |token|
85
+ next if token.length < 3
86
+ next if token =~ /^\d+$/
87
+
88
+ yield token
89
+ end
90
+ end
91
+ end
92
+
93
+ ##
94
+ # Returns a Hash mapping each token yielded by #read_tokens_in to its number of occurrences in +message+.
95
+
96
+ def count_tokens_in(message)
97
+ counts = Hash.new 0
98
+
99
+ read_tokens_in message do |tok|
100
+ counts[tok] += 1
101
+ end
102
+
103
+ return counts
104
+ end
105
+
106
+ ##
107
+ # Rates +message+ as tasty or bland. Returns a Float between 0 and 1 combining the #rate_token values of the (at most) 15 distinct tokens whose rating is furthest from 0.5; higher means more bland.
108
+
109
+ def rate(message)
110
+ ratings = {}
111
+
112
+ read_tokens_in message do |tok|
113
+ unless ratings.has_key? tok then
114
+ ratings[tok] = (0.5 - rate_token(tok)).abs
115
+ end
116
+ end
117
+
118
+ inttok = ratings.sort_by { |v| -v[1] }[0..14]
119
+
120
+ p = 1.0
121
+ m1p = 1.0
122
+
123
+ inttok.each do |tok, blandness|
124
+ y = rate_token tok
125
+ log "token #{tok} is %0.2f bland" % y
126
+ p *= y
127
+ m1p *= 1.0 - y
128
+ end
129
+
130
+ return p / (p + m1p)
131
+ end
132
+
133
+ ##
134
+ # Updates the database with tokens from +message+. In test mode nothing is written to the database. Raises ArgumentError for an unknown +mode+.
135
+ #
136
+ # +mode+ may be:
137
+ #
138
+ # <tt>:add_bland</tt>:: increases blandness of found tokens
139
+ # <tt>:add_tasty</tt>:: increases tastiness of found tokens
140
+ # <tt>:remove_bland</tt>:: decreases blandness of found tokens
141
+ # <tt>:remove_tasty</tt>:: decreases tastiness of found tokens
142
+
143
+ def update_db_with(message, mode)
144
+ unless [:add_bland, :remove_bland, :add_tasty, :remove_tasty].include? mode
145
+ raise ArgumentError, 'invalid mode'
146
+ end
147
+ log "updating db: #{mode}"
148
+
149
+ counts = count_tokens_in message
150
+
151
+ counts.each do |tok, cnt|
152
+ tnum, bnum = (@database[tok] || "0 0").split(/\s+/)
153
+ tnum, bnum = tnum.to_i, bnum.to_i
154
+ log "found: #{tok} #{cnt} times, tasty: #{tnum}, bland: #{bnum}"
155
+
156
+ unless @test then
157
+ case mode
158
+ when :add_tasty then tnum += cnt
159
+ when :add_bland then bnum += cnt
160
+ when :remove_tasty then tnum -= cnt
161
+ when :remove_bland then bnum -= cnt
162
+ end
163
+ end
164
+
165
+ tnum = 0 if tnum < 0
166
+ bnum = 0 if bnum < 0
167
+
168
+ # both counts are zero: the token is not needed any more, so drop its record instead of storing "0 0"
169
+ if tnum == 0 && bnum == 0 then
170
+ @database.delete tok unless @test
171
+ log "probs: #{tok} deleted"
172
+
173
+ # otherwise store the new counts as the string "<tasty> <bland>"
174
+ else
175
+ @database[tok] = [tnum, bnum].join(" ") unless @test
176
+ log "update: #{tok}, tasty: #{tnum}, bland: #{bnum}"
177
+ end
178
+ end
179
+
180
+ # master counts change by one per call; the in-memory counters are adjusted even in test mode
181
+ case mode
182
+ when :add_tasty then @count_tasty += 1
183
+ when :add_bland then @count_bland += 1
184
+ when :remove_tasty then @count_tasty -= 1
185
+ when :remove_bland then @count_bland -= 1
186
+ end
187
+
188
+ @count_tasty = 0 if @count_tasty < 0
189
+ @count_bland = 0 if @count_bland < 0
190
+
191
+ unless @test then
192
+ @database[COUNT_TASTY] = @count_tasty
193
+ @database[COUNT_BLAND] = @count_bland
194
+ end
195
+ end
196
+
197
+ ##
198
+ # Rates token +tok+ for blandness. Returns a probability between 0.01 and 0.99 (higher means more bland): 0.99 for bland-only tokens, 0.01 for tasty-only tokens and 0.4 when the token has no recorded occurrences.
199
+
200
+ def rate_token(tok)
201
+ tnum, bnum = (@database[tok] || "0 0").split(/\s+/)
202
+ tnum, bnum = tnum.to_i, bnum.to_i
203
+
204
+ if tnum == 0 && bnum > 0 then
205
+ return 0.99
206
+
207
+ elsif bnum == 0 && tnum > 0 then
208
+ return 0.01
209
+
210
+ elsif tnum == 0 && bnum == 0 then
211
+ return 0.4
212
+
213
+ end
214
+
215
+ tasty = 2.0 * tnum
216
+ bland = bnum.to_f
217
+
218
+ tasty /= @count_tasty.to_f
219
+ tasty = 1.0 if tasty > 1.0
220
+ bland /= @count_bland.to_f
221
+ bland = 1.0 if bland > 1.0
222
+
223
+ t = bland / (tasty + bland)
224
+ t = 0.99 if t > 0.99
225
+ t = 0.01 if t < 0.01
226
+
227
+ return t
228
+ end
229
+
230
+ end
231
+
metadata ADDED
@@ -0,0 +1,61 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0.8
3
+ specification_version: 1
4
+ name: rbayes
5
+ version: !ruby/object:Gem::Version
6
+ version: 1.0.0
7
+ date: 2007-01-06 00:00:00 -08:00
8
+ summary: An email-focused bayesian filter
9
+ require_paths:
10
+ - lib
11
+ email: drbrain@segment7.net
12
+ homepage: http://seattlerb.rubyforge.org/rbayes/
13
+ rubyforge_project: seattlerb
14
+ description: An bayesian filter fed by a tokenizer that throws crap out you'd find in emails. Originally by Dan Peterson
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Eric Hodel
31
+ files:
32
+ - History.txt
33
+ - Manifest.txt
34
+ - README.txt
35
+ - Rakefile
36
+ - bin/rbayes_check
37
+ - bin/rbayes_dump
38
+ - lib/rbayes.rb
39
+ test_files: []
40
+
41
+ rdoc_options: []
42
+
43
+ extra_rdoc_files: []
44
+
45
+ executables:
46
+ - rbayes_check
47
+ - rbayes_dump
48
+ extensions: []
49
+
50
+ requirements: []
51
+
52
+ dependencies:
53
+ - !ruby/object:Gem::Dependency
54
+ name: hoe
55
+ version_requirement:
56
+ version_requirements: !ruby/object:Gem::Version::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: 1.1.6
61
+ version: