bishop 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. data/lib/bishop.rb +317 -0
  2. data/test/test_bishop.rb +21 -0
  3. metadata +38 -0
@@ -0,0 +1,317 @@
1
+ #
2
+ # This module is a port to the Ruby language of the Reverend Bayesian classifier distributed
3
+ # as part of the Divmod project (which is Copyright 2003 Amir Bakhtiar <amir@divmod.org>)
4
+ #
5
+ # This Ruby port is Copyright 2005 Matt Mower <self@mattmower.com> and is free software;
6
+ # you can distribute it and/or modify it under the terms of version 2.1 of the GNU
7
+ # Lesser General Public License as published by the Free Software Foundation.
8
+ #
9
+
10
+ require 'yaml'
11
+
12
+ module Bishop
13
+
14
+ #
15
+ # Ruby's YAML persists Hashes using special processing rather than by
16
+ # dumping its instance variables, hence no instance variables in a Hash
17
+ # subclass get dumped either <sigh>
18
+ #
19
#
# Holds the token counts and training history for a single classification
# pool.  The token hash is kept in an instance variable (@data) rather than
# by subclassing Hash, because Ruby's YAML persists Hashes with special
# processing that skips a subclass's instance variables.
#
class BayesData

  attr_accessor :token_count, :train_count, :name
  attr_reader :training, :data

  # name - label for this pool (e.g. the category name).
  # pool - optional parent reference (stored for compatibility; unused here).
  def initialize( name = '', pool = nil )
    @name = name
    @training = []           # uids of the items this pool was trained on
    @pool = pool
    @data = Hash.new( 0.0 )  # token => count, defaulting to 0.0
    self.token_count = 0
    self.train_count = 0
  end

  # True if this pool was trained with the given uid.
  def trained_on?( item )
    self.training.include? item
  end

  def to_s
    # Fix: the original interpolated the undefined method +noname+ when
    # @name was nil (NoMethodError); use a string literal placeholder.
    "<BayesDict: #{self.name || 'noname'}, #{self.token_count} tokens>"
  end

end
42
+
43
# Splits free text into normalized tokens: each whitespace-separated word
# is lowercased and stripped of non-word characters, while intra-word
# hyphens are preserved.  Tokens that normalize to "" or "-" are dropped.
class Tokenizer
  def tokenize( item )
    normalized = item.split( /\s+/ ).collect do |word|
      parts = word.split( /\-/ )
      parts.collect { |part| part.downcase.gsub( /\W/, "" ) }.join( "-" )
    end
    normalized.reject { |t| t == "" || t == "-" }
  end
end
50
+
51
#
# A Bayesian classifier.  Text is tokenized and trained into named pools;
# every token is also counted in a shared '__Corpus__' pool.  Guesses
# compare per-pool token probabilities using a pluggable combiner
# (Bishop.robinson by default).
#
class Bayes

  # (The original also declared a redundant attr_reader :data_class,
  # which this attr_accessor already provides.)
  attr_accessor :dirty, :train_count, :pools, :tokenizer, :data_class, :corpus, :cache, :combiner

  # tokenizer  - any object responding to tokenize( item ); defaults to Tokenizer.
  # data_class - storage class for per-pool token data; defaults to BayesData.
  # combiner   - optional block combining [token,prob] pairs into one score.
  def initialize( tokenizer = nil, data_class = BayesData, &combiner )
    @tokenizer = tokenizer || Tokenizer.new
    @combiner = combiner || Proc.new { |probs,ignore| Bishop.robinson( probs, ignore ) }
    @data_class = data_class
    @pools = {}
    @corpus = new_pool( '__Corpus__' )
    @pools['__Corpus__'] = @corpus
    @train_count = 0
    @dirty = true
  end

  # Persist the pools to the default data file.
  def commit
    self.save
  end

  # True when training has changed since the probability cache was built.
  def dirty?
    self.dirty
  end

  # Create a new, empty, pool without training.  Returns the existing
  # pool unchanged if one with this name already exists.
  def new_pool( pool_name )
    self.dirty = true
    self.pools[ pool_name ] ||= @data_class.new( pool_name )
  end

  def remove_pool( pool_name )
    self.pools.delete( pool_name )
  end

  def rename_pool( pool_name, new_name )
    self.pools[new_name] = self.pools[pool_name]
    self.pools[new_name].name = new_name
    self.pools.delete( pool_name )
    self.dirty = true
  end

  # Merge the contents of the source pool into the destination pool.
  def merge_pools( dest_name, source_name )
    dest_pool = self.pools[dest_name]
    self.pools[source_name].data.each do |token,count|
      if dest_pool.data.has_key?( token )
        dest_pool.data[token] += count
      else
        dest_pool.data[token] = count
        dest_pool.token_count += 1
      end
    end
    self.dirty = true
  end

  # Return an array of [token,count] pairs for the specified pool.
  def pool_data( pool_name )
    self.pools[pool_name].data.to_a
  end

  # Return an array of tokens trained in the specified pool.
  def pool_tokens( pool_name )
    self.pools[pool_name].data.keys
  end

  def save( file = 'bayesdata.yml' )
    File.open( file, 'w' ) { |f| YAML.dump( self.pools, f ) }
  end

  def export
    self.pools.to_yaml
  end

  # Load pools from a YAML file; a missing file is silently ignored so a
  # new classifier can start from scratch.
  def load( file = 'bayesdata.yml' )
    begin
      File.open( file ) { |f| load_data( f ) }
    rescue Errno::ENOENT
      # File does not exist
    end
  end

  # NOTE(review): YAML.load instantiates arbitrary Ruby objects -- only
  # load data files from trusted sources (newer Psych versions restrict
  # deserialization of custom classes by default).
  def load_data( source )
    self.pools = YAML.load( source )
    # Hash default values are not serialized, so restore the 0.0 default.
    self.pools.each { |pool_name,pool| pool.data.default = 0.0 }
    self.corpus = self.pools['__Corpus__']
    self.dirty = true
  end

  def pool_names
    self.pools.keys.sort.reject { |name| name == '__Corpus__' }
  end

  # Create a cache of the metrics for each pool.  For every token which is
  # distinctive for a pool (its f-value deviates from 0.5 by at least 0.1)
  # store that value clamped into [0.0001, 0.9999].
  def build_cache
    self.cache = {}

    self.pools.each do |name,pool|
      unless name == '__Corpus__'

        pool_count = pool.token_count
        them_count = [ 1, self.corpus.token_count - pool_count ].max
        cache_dict = self.cache[ name ] ||= @data_class.new( name )

        self.corpus.data.each do |token,tot_count|
          this_count = pool.data[token]

          unless this_count == 0.0
            other_count = tot_count - this_count

            if pool_count > 0
              good_metric = [ 1.0, other_count / pool_count ].min
            else
              good_metric = 1.0
            end

            bad_metric = [ 1.0, this_count / them_count ].min

            f = bad_metric / ( good_metric + bad_metric )

            if ( f - 0.5 ).abs >= 0.1
              cache_dict.data[token] = [ 0.0001, [ 0.9999, f ].min ].max
            end
          end
        end
      end
    end
  end

  # Get the probabilities for each pool, recreating the cached information
  # if any token information for any of the pools has changed.
  def pool_probs
    if self.dirty?
      self.build_cache
      self.dirty = false
    end
    self.cache
  end

  # Create a token array from the specified input.
  def get_tokens( input )
    self.tokenizer.tokenize( input )
  end

  # For each word trained in the pool, collect its occurrence data in the
  # pool into a sorted array of [word, probability] pairs.
  def get_probs( pool, words )
    words.find_all { |word| pool.data.has_key? word }.map { |word| [word,pool.data[word]] }.sort
  end

  # Train the named pool (created on demand) with the tokens of item.
  # uid, when given, is recorded so trained_on? can answer for it.
  def train( pool_name, item, uid = nil )
    tokens = get_tokens( item )
    pool = new_pool( pool_name )
    train_( pool, tokens )
    self.corpus.train_count += 1
    pool.train_count += 1
    if uid
      pool.training.push( uid )
    end
    self.dirty = true
  end

  def train_( pool, tokens )
    wc = 0
    tokens.each do |token|
      pool.data[token] += 1
      self.corpus.data[token] += 1
      wc += 1
    end
    pool.token_count += wc
    self.corpus.token_count += wc
  end

  # Reverse a previous training of item in the named pool.
  def untrain( pool_name, item, uid = nil )
    pool = self.pools[ pool_name ]
    # Fix: the original called new_pool here, so untraining an unknown
    # pool created an empty one as a side effect.
    return unless pool
    tokens = get_tokens( item )
    untrain_( pool, tokens )
    # Fix: untraining must decrement the train counters; the original
    # incremented them.
    self.corpus.train_count -= 1
    pool.train_count -= 1
    if uid
      pool.training.delete( uid )
    end
    self.dirty = true
  end

  def untrain_( pool, tokens )
    tokens.each do |token|
      if pool.data.has_key? token
        if pool.data[token] == 1
          pool.data.delete( token )
        else
          pool.data[token] -= 1
        end
        pool.token_count -= 1
      end

      # Fix: the original called has_key? on the corpus object itself
      # (NoMethodError, BayesData defines no has_key?); the token hash
      # lives in corpus.data.
      if self.corpus.data.has_key? token
        if self.corpus.data[token] == 1
          self.corpus.data.delete( token )
        else
          self.corpus.data[token] -= 1
        end
        self.corpus.token_count -= 1
      end
    end
  end

  # True if any pool was trained with the given uid.
  # Fix: the original consulted self.cache, whose entries are rebuilt by
  # build_cache without any training uids (and which is nil before the
  # first guess), so it could never return true.  Check the pools.
  def trained_on?( msg )
    self.pools.values.any? { |v| v.trained_on? msg }
  end

  # Call this method to classify a "message". The return value will be
  # an array containing tuples (pool, probability) for each pool which
  # is a likely match for the message.
  def guess( msg )
    tokens = get_tokens( msg )
    res = {}

    pool_probs.each do |pool_name,pool|
      p = get_probs( pool, tokens )
      if p.length != 0
        res[pool_name] = self.combiner.call( p, pool_name )
      end
    end

    res.sort
  end

  private :train_, :untrain_
end
280
+
281
#
# Combine a set of [token, probability] pairs into a single score in (0,1)
# using Gary Robinson's geometric-mean method.  +ignore+ is unused; it is
# kept so all combiners share the same (probs, pool_name) interface.
#
def self.robinson( probs, ignore )
  inv_n = 1.0 / probs.length
  fail_product = probs.inject( 1.0 ) { |acc,pair| acc * ( 1.0 - pair[1] ) }
  succ_product = probs.map { |pair| pair[1] }.inject { |acc,v| acc * v }
  p_val = 1.0 - fail_product ** inv_n
  q_val = 1.0 - succ_product ** inv_n
  s_val = ( p_val - q_val ) / ( p_val + q_val )
  ( 1 + s_val ) / 2
end
288
+
289
#
# Combine a set of [token, probability] pairs into a single score using the
# Robinson-Fisher method: the products of the probabilities (and of their
# complements) are pushed through the inverse chi-squared function chi2p.
#
# NOTE(review): the rescues below were bare in the original; they guard
# against numeric failures (e.g. the log of an underflowed zero product).
# Rescuing StandardError keeps that behavior but makes it explicit; it
# still hides any unrelated failure inside chi2p.
#
def self.robinson_fisher( probs, ignore )
  n = probs.length

  begin
    h = chi2p( -2.0 * Math.log( probs.map { |p| p[1] }.inject( 1.0 ) { |s,v| s*v } ), 2*n )
  rescue StandardError
    h = 0.0
  end

  begin
    s = chi2p( -2.0 * Math.log( probs.map { |p| 1.0 - p[1] }.inject( 1.0 ) { |s,v| s*v } ), 2*n )
  rescue StandardError
    s = 0.0
  end

  ( 1 + h - s ) / 2
end
306
+
307
#
# Return the one-tailed chi-squared probability P(X >= chi) for an even
# number df of degrees of freedom, clamped to at most 1.0:
#
#   P = exp(-m) * sum( m**i / i!, i = 0 .. df/2 - 1 )   with m = chi/2
#
def self.chi2p( chi, df )
  m = chi / 2
  sum = term = Math.exp( -m )
  # Fix: the Python original (Reverend's chi2P) iterates range(1, df//2),
  # i.e. i < df/2; the Ruby port used the inclusive range (1..df/2) and
  # summed one series term too many.  With df == 2 the result must be
  # exactly exp(-chi/2).
  (1 ... df/2).each do |i|
    term *= m/i
    sum += term
  end
  [1.0, sum].min
end
316
+
317
+ end
@@ -0,0 +1,21 @@
1
+ $:.unshift File.join( File.dirname( __FILE__ ), "..", "lib" )
2
+ require 'bishop'
3
+ require 'test/unit'
4
+
5
# Exercises the two probability combiners against a tiny training set.
class TestBishop < Test::Unit::TestCase

  def test_robinson
    classifier = Bishop::Bayes.new { |p,i| Bishop::robinson( p, i ) }
    train_fixtures( classifier )
    assert_equal( [[ "uninteresting", 0.9999 ]], classifier.guess( "I hate bad words and you" ) )
  end

  def test_robinson_fisher
    classifier = Bishop::Bayes.new { |p,i| Bishop::robinson_fisher( p, i ) }
    train_fixtures( classifier )
    assert_equal( [["uninteresting", 0.999999054592232]], classifier.guess( "I hate bad words and you" ) )
  end

  private

  # Train one agreeable and one disagreeable pool with the shared fixtures.
  def train_fixtures( classifier )
    classifier.train( "interesting", "here are some good words. I hope you love them" )
    classifier.train( "uninteresting", "here are some bad words, I hate you" )
  end

end
metadata ADDED
@@ -0,0 +1,38 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.10
3
+ specification_version: 1
4
+ name: bishop
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.3.0
7
+ date: 2005-04-13
8
+ summary: A port of the Reverend Bayesian classification library.
9
+ require_paths:
10
+ - lib
11
+ email: self@mattmower.com
12
+ homepage: http://rubyforge.org/projects/bishop/
13
+ rubyforge_project:
14
+ description:
15
+ autorequire: bishop
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: false
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ -
22
+ - ">"
23
+ - !ruby/object:Gem::Version
24
+ version: 0.0.0
25
+ version:
26
+ platform: ruby
27
+ authors:
28
+ - Matt Mower
29
+ files:
30
+ - lib/bishop.rb
31
+ - test/test_bishop.rb
32
+ test_files: []
33
+ rdoc_options: []
34
+ extra_rdoc_files: []
35
+ executables: []
36
+ extensions: []
37
+ requirements: []
38
+ dependencies: []