bishop 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/bishop.rb +317 -0
- data/test/test_bishop.rb +21 -0
- metadata +38 -0
data/lib/bishop.rb
ADDED
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
#
|
|
2
|
+
# This module is a port to the Ruby language of the Reverend Bayesian classifier distributed
|
|
3
|
+
# as part of the Divmod project (which is Copyright 2003 Amir Bakhtiar <amir@divmod.org>
|
|
4
|
+
#
|
|
5
|
+
# This Ruby port is Copyright 2005 Matt Mower <self@mattmower.com> and is free software;
|
|
6
|
+
# you can distribute it and/or modify it under the terms of version 2.1 of the GNU
|
|
7
|
+
# Lesser General Public License as published by the Free Software Foundation.
|
|
8
|
+
#
|
|
9
|
+
|
|
10
|
+
require 'yaml'
|
|
11
|
+
|
|
12
|
+
module Bishop
|
|
13
|
+
|
|
14
|
+
#
|
|
15
|
+
# Ruby's YAML persists Hashes using special processing rather than by
|
|
16
|
+
# dumping it's instance variables, hence no instance variables in a Hash
|
|
17
|
+
# subclass get dumped either <sigh>
|
|
18
|
+
#
|
|
19
|
+
class BayesData

  # token_count - total number of tokens trained into this pool
  # train_count - number of training operations applied to this pool
  # name        - the pool's name
  attr_accessor :token_count, :train_count, :name

  # training - array of uids of items this pool has been trained on
  # data     - hash of token => occurrence count (missing tokens read as 0.0)
  attr_reader :training, :data

  def initialize( name = '', pool = nil )
    @name = name
    @training = []
    @pool = pool
    # NOTE: YAML does not serialize a Hash's default value; callers reloading
    # persisted data must restore data.default = 0.0 themselves.
    @data = Hash.new( 0.0 )
    self.token_count = 0
    self.train_count = 0
  end

  # True if this pool was trained with an item identified by +item+ (a uid).
  def trained_on?( item )
    self.training.include? item
  end

  def to_s
    # Fixed: 'noname' was a bare identifier, raising NameError whenever
    # @name was nil; it is now the literal fallback string.
    "<BayesDict: #{self.name || 'noname'}, #{self.token_count} tokens>"
  end

end
|
|
42
|
+
|
|
43
|
+
class Tokenizer
  # Split +item+ on whitespace and normalize each word: lower-case it,
  # strip non-word characters from each hyphen-separated part, and rejoin
  # the parts with hyphens. Words that reduce to "" or "-" are dropped.
  #
  # Fixed idiom: the original assigned the mapped value to an unused local
  # `token` that also shadowed its own block parameter.
  def tokenize( item )
    item.split( /\s+/ ).map do |word|
      word.split( /\-/ ).map { |part| part.downcase.gsub( /\W/, "" ) }.join( "-" )
    end.reject { |t| t == "" || t == "-" }
  end
end
|
|
50
|
+
|
|
51
|
+
class Bayes

  # dirty       - true when training has changed since the cache was built
  # train_count - number of training operations performed
  # pools       - hash of pool name => data_class instance (incl. '__Corpus__')
  # tokenizer   - object responding to #tokenize( item ) => array of tokens
  # data_class  - class used for per-pool token storage (default BayesData)
  # corpus      - the aggregate '__Corpus__' pool spanning all training data
  # cache       - per-pool probability metrics built lazily by #build_cache
  # combiner    - proc combining per-token probabilities into a single score
  attr_accessor :dirty, :train_count, :pools, :tokenizer, :data_class, :corpus, :cache, :combiner
  # (Removed a redundant `attr_reader :data_class`; attr_accessor above
  # already defines the reader.)

  # tokenizer  - optional tokenizer, defaults to Tokenizer.new
  # data_class - storage class for pools, defaults to BayesData
  # combiner   - optional block combining probabilities; defaults to
  #              Bishop.robinson
  def initialize( tokenizer = nil, data_class = BayesData, &combiner )
    @tokenizer = tokenizer || Tokenizer.new
    @combiner = combiner || Proc.new { |probs,ignore| Bishop.robinson( probs, ignore ) }
    @data_class = data_class
    @pools = {}
    @corpus = new_pool( '__Corpus__' )
    @pools['__Corpus__'] = @corpus
    @train_count = 0
    @dirty = true
  end

  # Persist the current pools to the default data file.
  def commit
    self.save
  end

  # True when training has changed since the cache was last rebuilt.
  def dirty?
    self.dirty
  end

  # Return the named pool, creating a new, empty, pool (without training)
  # if it does not already exist. Always marks the classifier dirty.
  def new_pool( pool_name )
    self.dirty = true
    self.pools[ pool_name ] ||= @data_class.new( pool_name )
  end

  def remove_pool( pool_name )
    self.pools.delete( pool_name )
  end

  def rename_pool( pool_name, new_name )
    self.pools[new_name] = self.pools[pool_name]
    self.pools[new_name].name = new_name
    self.pools.delete( pool_name )
    self.dirty = true
  end

  # Merge the contents of the source pool into the destination pool.
  # NOTE(review): assumes both pools exist; an unknown dest_name raises
  # NoMethodError on nil — confirm callers guarantee existence.
  def merge_pools( dest_name, source_name )
    dest_pool = self.pools[dest_name]
    self.pools[source_name].data.each do |token,count|
      if dest_pool.data.has_key?( token )
        dest_pool.data[token] += count
      else
        dest_pool.data[token] = count
        dest_pool.token_count += 1
      end
    end
    self.dirty = true
  end

  # Return an array of [token, count] pairs for the specified pool.
  def pool_data( pool_name )
    self.pools[pool_name].data.to_a
  end

  # Return an array of tokens trained in the specified pool.
  def pool_tokens( pool_name )
    self.pools[pool_name].data.keys
  end

  # Write all pools to +file+ as YAML.
  def save( file = 'bayesdata.yml' )
    File.open( file, 'w' ) { |f| YAML.dump( self.pools, f ) }
  end

  # Return the pools serialized as a YAML string.
  def export
    self.pools.to_yaml
  end

  # Load pools from +file+; silently does nothing if the file is absent.
  def load( file = 'bayesdata.yml' )
    begin
      File.open( file ) { |f| load_data( f ) }
    rescue Errno::ENOENT
      # File does not exist
    end
  end

  # Replace the current pools with those deserialized from +source+.
  # NOTE(review): YAML.load can instantiate arbitrary objects — only load
  # trusted data files.
  def load_data( source )
    self.pools = YAML.load( source )
    # The Hash default value is not serialized by YAML, so restore it.
    self.pools.each { |pool_name,pool| pool.data.default = 0.0 }
    self.corpus = self.pools['__Corpus__']
    self.dirty = true
  end

  # All user-defined pool names (excludes the internal corpus), sorted.
  def pool_names
    self.pools.keys.sort.reject { |name| name == '__Corpus__' }
  end

  # Create a cache of the metrics for each pool (the corpus itself is
  # skipped). For each token present in a pool, compute a probability-like
  # metric of how strongly the token indicates membership of that pool;
  # only tokens whose metric deviates from 0.5 by at least 0.1 are kept,
  # clamped to [0.0001, 0.9999].
  def build_cache
    self.cache = {}

    self.pools.each do |name,pool|
      unless name == '__Corpus__'

        pool_count = pool.token_count
        # Tokens in the rest of the corpus; floored at 1 to avoid a
        # division by zero below.
        them_count = [ 1, self.corpus.token_count - pool_count ].max
        cache_dict = self.cache[ name ] ||= @data_class.new( name )

        self.corpus.data.each do |token,tot_count|
          this_count = pool.data[token]

          unless this_count == 0.0
            other_count = tot_count - this_count

            if pool_count > 0
              good_metric = [ 1.0, other_count / pool_count ].min
            else
              good_metric = 1.0
            end

            bad_metric = [ 1.0, this_count / them_count ].min

            f = bad_metric / ( good_metric + bad_metric )

            # Keep only tokens carrying real signal (away from 0.5).
            if ( f - 0.5 ).abs >= 0.1
              cache_dict.data[token] = [ 0.0001, [ 0.9999, f ].min ].max
            end
          end
        end
      end
    end
  end

  # Get the probabilities for each pool, recreating the cached information
  # if any token information for any of the pools has changed.
  def pool_probs
    if self.dirty?
      self.build_cache
      self.dirty = false
    end
    self.cache
  end

  # Create a token array from the specified input.
  def get_tokens( input )
    self.tokenizer.tokenize( input )
  end

  # For each word trained in the pool, collect its occurrence data in the
  # pool into a sorted array of [word, probability] pairs.
  def get_probs( pool, words )
    words.find_all { |word| pool.data.has_key? word }.map { |word| [word,pool.data[word]] }.sort
  end

  # Train the named pool (created on demand) with the tokens of +item+.
  # An optional +uid+ records the item so trained_on? can answer for it.
  def train( pool_name, item, uid = nil )
    tokens = get_tokens( item )
    pool = new_pool( pool_name )
    train_( pool, tokens )
    self.corpus.train_count += 1
    pool.train_count += 1
    if uid
      pool.training.push( uid )
    end
    self.dirty = true
  end

  # Add each token to the pool and the corpus, keeping token counts in step.
  def train_( pool, tokens )
    wc = 0
    tokens.each do |token|
      pool.data[token] += 1
      self.corpus.data[token] += 1
      wc += 1
    end
    pool.token_count += wc
    self.corpus.token_count += wc
  end

  # Remove a previous training of +item+ from the named pool.
  # NOTE(review): train_count is *incremented* here, mirroring #train —
  # preserved as-is since it appears to count training operations, but
  # confirm this is intended rather than a copy-paste slip.
  def untrain( pool_name, item, uid = nil )
    tokens = get_tokens( item )
    pool = new_pool( pool_name )
    untrain_( pool, tokens )
    self.corpus.train_count += 1
    pool.train_count += 1
    if uid
      pool.training.delete( uid )
    end
    self.dirty = true
  end

  # Remove each token from the pool and the corpus, deleting entries whose
  # count reaches zero.
  def untrain_( pool, tokens )
    tokens.each do |token|
      if pool.data.has_key? token
        if pool.data[token] == 1
          pool.data.delete( token )
        else
          pool.data[token] -= 1
        end
        pool.token_count -= 1
      end

      # Fixed: was `self.corpus.has_key? token`, but the data class keeps
      # tokens in its .data hash (BayesData itself has no has_key?), so
      # every untrain raised NoMethodError.
      if self.corpus.data.has_key? token
        if self.corpus.data[token] == 1
          self.corpus.data.delete( token )
        else
          self.corpus.data[token] -= 1
        end
        self.corpus.token_count -= 1
      end
    end
  end

  # True if any cached pool was trained on an item with uid +msg+.
  def trained_on?( msg )
    self.cache.values.any? { |v| v.trained_on? msg }
  end

  # Call this method to classify a "message". The return value will be
  # an array containing tuples (pool, probability) for each pool which
  # is a likely match for the message.
  def guess( msg )
    tokens = get_tokens( msg )
    res = {}

    pool_probs.each do |pool_name,pool|
      p = get_probs( pool, tokens )
      if p.length != 0
        res[pool_name] = self.combiner.call( p, pool_name )
      end
    end

    res.sort
  end

  private :train_, :untrain_
end
|
|
280
|
+
|
|
281
|
+
# Combine a non-empty set of per-token probabilities into a single score in
# [0,1] using Gary Robinson's geometric-mean technique:
#   P = 1 - (prod(1 - p_i))^(1/n),  Q = 1 - (prod p_i)^(1/n)
#   S = (P - Q) / (P + Q);  result = (1 + S) / 2
#
# probs  - array of [token, probability] pairs (must be non-empty)
# ignore - unused; present so all combiners share the same interface
def self.robinson( probs, ignore )
  nth = 1.0/probs.length
  what_is_p = 1.0 - probs.map { |p| 1.0 - p[1] }.inject( 1.0 ) { |s,v| s * v } ** nth
  # Consistency fix: this inject previously had no initial value; seeding
  # with 1.0 matches the P computation above (same result for non-empty
  # input, which #guess guarantees).
  what_is_q = 1.0 - probs.map { |p| p[1] }.inject( 1.0 ) { |s,v| s * v } ** nth
  what_is_s = ( what_is_p - what_is_q ) / ( what_is_p + what_is_q )
  ( 1 + what_is_s ) / 2
end
|
|
288
|
+
|
|
289
|
+
# Combine per-token probabilities into a single score in [0,1] using the
# Robinson-Fisher inverse chi-square technique.
#
# probs  - array of [token, probability] pairs
# ignore - unused; present so all combiners share the same interface
def self.robinson_fisher( probs, ignore )
  n = probs.length
  values = probs.map { |pair| pair[1] }

  # Math.log raises on a zero product; fall back to 0.0 in that case.
  begin
    h = chi2p( -2.0 * Math.log( values.inject( 1.0 ) { |acc,v| acc * v } ), 2*n )
  rescue
    h = 0.0
  end

  begin
    s = chi2p( -2.0 * Math.log( values.inject( 1.0 ) { |acc,v| acc * ( 1.0 - v ) } ), 2*n )
  rescue
    s = 0.0
  end

  ( 1 + h - s ) / 2
end

# Inverse chi-square: probability of observing a chi-square statistic at
# least as large as +chi+ with +df+ degrees of freedom (+df+ must be even).
def self.chi2p( chi, df )
  m = chi / 2
  term = Math.exp( -m )
  sum = term
  1.upto( df/2 ) do |i|
    term *= m/i
    sum += term
  end
  # The series can exceed 1.0 through rounding; clamp it.
  [1.0, sum].min
end
|
|
316
|
+
|
|
317
|
+
end
|
data/test/test_bishop.rb
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
$:.unshift File.join( File.dirname( __FILE__ ), "..", "lib" )
|
|
2
|
+
require 'bishop'
|
|
3
|
+
require 'test/unit'
|
|
4
|
+
|
|
5
|
+
class TestBishop < Test::Unit::TestCase

  # After training one "interesting" and one "uninteresting" pool, a message
  # sharing tokens with the latter should be classified as uninteresting
  # under the Robinson combiner.
  def test_robinson
    classifier = Bishop::Bayes.new { |probs, ignore| Bishop::robinson( probs, ignore ) }
    classifier.train( "interesting", "here are some good words. I hope you love them" )
    classifier.train( "uninteresting", "here are some bad words, I hate you" )
    assert_equal( [[ "uninteresting", 0.9999 ]], classifier.guess( "I hate bad words and you" ) )
  end

  # Same scenario, scored with the Robinson-Fisher combiner.
  def test_robinson_fisher
    classifier = Bishop::Bayes.new { |probs, ignore| Bishop::robinson_fisher( probs, ignore ) }
    classifier.train( "interesting", "here are some good words. I hope you love them" )
    classifier.train( "uninteresting", "here are some bad words, I hate you" )
    assert_equal( [["uninteresting", 0.999999054592232]], classifier.guess( "I hate bad words and you" ) )
  end

end
|
metadata
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
rubygems_version: 0.8.10
|
|
3
|
+
specification_version: 1
|
|
4
|
+
name: bishop
|
|
5
|
+
version: !ruby/object:Gem::Version
|
|
6
|
+
version: 0.3.0
|
|
7
|
+
date: 2005-04-13
|
|
8
|
+
summary: A port of the Reverend Bayesian classification library.
|
|
9
|
+
require_paths:
|
|
10
|
+
- lib
|
|
11
|
+
email: self@mattmower.com
|
|
12
|
+
homepage: http://rubyforge.org/projects/bishop/
|
|
13
|
+
rubyforge_project:
|
|
14
|
+
description:
|
|
15
|
+
autorequire: bishop
|
|
16
|
+
default_executable:
|
|
17
|
+
bindir: bin
|
|
18
|
+
has_rdoc: false
|
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
|
20
|
+
requirements:
|
|
21
|
+
-
|
|
22
|
+
- ">"
|
|
23
|
+
- !ruby/object:Gem::Version
|
|
24
|
+
version: 0.0.0
|
|
25
|
+
version:
|
|
26
|
+
platform: ruby
|
|
27
|
+
authors:
|
|
28
|
+
- Matt Mower
|
|
29
|
+
files:
|
|
30
|
+
- lib/bishop.rb
|
|
31
|
+
- test/test_bishop.rb
|
|
32
|
+
test_files: []
|
|
33
|
+
rdoc_options: []
|
|
34
|
+
extra_rdoc_files: []
|
|
35
|
+
executables: []
|
|
36
|
+
extensions: []
|
|
37
|
+
requirements: []
|
|
38
|
+
dependencies: []
|