rb_prob 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,25 @@
+ Copyright (c) 2010, Steffen Siering
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+     * Redistributions of source code must retain the above copyright
+       notice, this list of conditions and the following disclaimer.
+     * Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+     * Neither the name of Steffen Siering nor the
+       names of its contributors may be used to endorse or promote products
+       derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
data/examples/alarm.rb ADDED
@@ -0,0 +1,283 @@
+
+ require 'rubygems'
+
+ require 'prob'
+ include Probably
+
+ # Alarm example from "Artificial Intelligence - A Modern Approach" by Russell
+ # and Norvig, page 493 ff.
+ #
+ # Suppose you have a new, fairly reliable burglar alarm at home, but occasionally
+ # it responds to minor earthquakes. You also have two neighbors, John and Mary,
+ # who have promised to call you at work when they hear the alarm. John always
+ # calls when he hears the alarm, but sometimes confuses the telephone ringing
+ # with the alarm and calls then, too. Mary, on the other hand, likes rather
+ # loud music and sometimes misses the alarm altogether.
+ #
+ # So the bayesian network is:
+ #
+ #    B     E
+ #     \   /
+ #    _\| |/_
+ #       A
+ #      / \
+ #   |/_   _\|
+ #   J       M
+ #
+ # with probabilities:
+ #   P(B) = 0.001
+ #   P(E) = 0.002
+ #
+ #   P(A| B=true,  E=true)  = 0.95
+ #   P(A| B=true,  E=false) = 0.94
+ #   P(A| B=false, E=true)  = 0.29
+ #   P(A| B=false, E=false) = 0.001
+ #
+ #   P(J| A=true)  = 0.9
+ #   P(J| A=false) = 0.05
+ #
+ #   P(M| A=true)  = 0.7
+ #   P(M| A=false) = 0.01
+ #
+ # where B = burglar, E = earthquake, A = alarm, J = John calls and
+ # M = Mary calls
+ #
+ # ----------------------------------------------------------------------------
+ #
+ # Next we want to develop some 'equivalent' functions for querying that
+ # network and run some benchmarks.
+ #
+
+ # first let's encode the probabilities from the network
+ # P(B)
+ PBurglary = choose(0.001, :B, :notB)
+
+ # P(E)
+ PEarthquake = choose(0.002, :E, :notE)
+
+ # P(A| B = b, E = e)
+ def p_alarm(b, e)
+   pAlarmTable = {
+     [:B,    :E]    => 0.95,
+     [:B,    :notE] => 0.94,
+     [:notB, :E]    => 0.29,
+     [:notB, :notE] => 0.001
+   }
+
+   choose(pAlarmTable[[b, e]], :A, :notA)
+ end
+
+ # P(J| A = a)
+ def p_john(a)
+   choose(a == :A ? 0.9 : 0.05, :J, :notJ)
+ end
+
+ # P(M| A = a)
+ def p_mary(a)
+   choose(a == :A ? 0.7 : 0.01, :M, :notM)
+ end
+
+ # computes the joint probability and transforms the result using the given
+ # block (if any), allowing marginalization over random variables by
+ # "leaving them out"
+ #
+ # for example:
+ #   mk_joint_p {|b,e,a,j,m| [b,e,a]} will find P(b,e,a) = Sum(j,m) { P(b,e,a,j,m) }
+ #
+ def mk_joint_p(&blk)
+   PBurglary.dep { |b|
+     PEarthquake.dep { |e|
+       p_alarm(b, e).dep { |a|
+         p_john(a).dep { |j|
+           p_mary(a).dep { |m|
+             mkState(if blk then blk.call([b,e,a,j,m])
+                     else [b,e,a,j,m] end)
+           }
+         }
+       }
+     }
+   }
+ end
+
+ # computes the (optionally conditional) joint probability of the (free)
+ # random variables, like mk_joint_p.
+ #
+ # To compute a conditional probability, set random variables to a known state.
+ # For example
+ #   mk_joint_p2( {:john => :J, :mary => :M} )
+ # will compute
+ #   P(B,E,A| J = true, M = true)
+ #
+ # or
+ #   mk_joint_p2({:john => :J, :mary => :M}) {|b,e,a,j,m| b} will find
+ #   P(B | J = true, M = true)
+ def mk_joint_p2(tsts = {}, &blk)
+   PBurglary.dep { |b|
+     condition(!tsts[:burglary] || tsts[:burglary] == b) {
+       PEarthquake.dep { |e|
+         condition(!tsts[:earthquake] || tsts[:earthquake] == e) {
+           p_alarm(b,e).dep { |a|
+             condition(!tsts[:alarm] || tsts[:alarm] == a) {
+               p_john(a).dep { |j|
+                 condition(!tsts[:john] || tsts[:john] == j) {
+                   p_mary(a).dep { |m|
+                     condition(!tsts[:mary] || tsts[:mary] == m) {
+                       mkState(if blk then blk.call [b,e,a,j,m] else [b,e,a,j,m] end)
+                     }}
+                 }}
+             }}
+         }}
+     }}.normalize
+ end
+
+ # like mk_joint_p2, but using event_dep directly instead of mixing in
+ # condition-statements
+ def mk_joint_p3(tsts = {}, &blk)
+   tst_b = ifJust tsts[:burglary]
+   tst_e = ifJust tsts[:earthquake]
+   tst_a = ifJust tsts[:alarm]
+   tst_j = ifJust tsts[:john]
+   tst_m = ifJust tsts[:mary]
+
+   PBurglary.event_dep(tst_b) { |b|
+     PEarthquake.event_dep(tst_e) { |e|
+       p_alarm(b,e).event_dep(tst_a) { |a|
+         p_john(a).event_dep(tst_j) { |j|
+           p_mary(a).event_dep(tst_m) { |m|
+             mkState(if blk then blk.call [b,e,a,j,m] else [b,e,a,j,m] end)
+           }
+         }
+       }
+     }
+   }.normalize
+ end
+
+ # precompute the joint probability, so bayesian inference can be done using
+ # filter, map and query?
+ PJoint = mk_joint_p
+
+ puts 'P(B|M=true, J=true) :'
+ puts mk_joint_p3({:mary => :M, :john => :J}) {|b,e,a,j,m| b }
+
+ # puts "\njoint probability:"
+ # puts "=================="
+ # puts PJoint
+
+ # compute P(B | M=true, J=true, E=false, A=true) using all 3 different
+ # functions mk_joint_p, mk_joint_p2 and mk_joint_p3:
+ puts "\nP(B | M=true, J=true, E=false, A=true)"
+ puts "====================================="
+ puts mk_joint_p2({:mary => :M, :john => :J, :earthquake => :notE, :alarm => :A}) { |b,e,a,j,m| b }.query?(&just(:B))
+ puts mk_joint_p3({:mary => :M, :john => :J, :earthquake => :notE, :alarm => :A}) { |b,e,a,j,m| b }.probability(:B)
+ puts PJoint.filter {|b,e,a,j,m| e == :notE && j == :J && m == :M && a == :A }.query? {|b,e,a,j,m| b == :B }
+
+ # do some benchmarking:
+
+ require 'benchmark'
+
+ Benchmark.bmbm { |x|
+   i = 1000
+   x.report('joint probability:') {
+     (1..i).each {
+       mk_joint_p.filter {|b,e,a,j,m| e == :notE && j == :J && m == :M && a == :A }.query? {|b,e,a,j,m| b == :B }
+     }
+   }
+
+   x.report('joint probability precomputed:') {
+     (1..i).each {
+       PJoint.filter {|b,e,a,j,m| e == :notE && j == :J && m == :M && a == :A}.query? {|b,e,a,j,m| b == :B}
+     }
+   }
+
+   x.report('direct:') {
+     (1..i).each {
+       mk_joint_p {|b,e,a,j,m|
+         if e == :notE && j == :J && m == :M && a == :A
+           [b,a]
+         else
+           nil
+         end
+       }.query? {|b,a| b == :B}
+     }
+   }
+
+   x.report('direct with conditions:') {
+     (1..i).each {
+       mk_joint_p2({:mary => :M, :john => :J, :earthquake => :notE, :alarm => :A}) { |b,e,a,j,m| b }.query?(&just(:B))
+     }
+   }
+
+   x.report('direct with event condition:') {
+     (1..i).each {
+       mk_joint_p3({:mary => :M, :john => :J, :earthquake => :notE, :alarm => :A}) { |b,e,a,j,m| b }.query?(&just(:B))
+     }
+   }
+ }
+
+ # I'm too lazy to write up an interpretation of the benchmarks,
+ # but I guess you can make up your own mind...
+ # In short: it's always a trade-off between space and time usage, and MacRuby
+ # must improve its floating point performance...
+ #
+ # my results (on unibody MacBook 2GHz with Snow Leopard):
+ #
+ # ===========================================================================
+ #
+ # $ ruby -version
+ # ruby 1.8.7 (2008-08-11 patchlevel 72) [universal-darwin10.0]
+ #
+ # Rehearsal ------------------------------------------------------------------
+ # joint probability:              3.080000   0.190000   3.270000 (  3.273073)
+ # joint probability precomputed:  0.170000   0.000000   0.170000 (  0.171786)
+ # direct:                         2.450000   0.180000   2.630000 (  2.638515)
+ # direct with conditions:         0.780000   0.050000   0.830000 (  0.829055)
+ # direct with event condition:    0.960000   0.070000   1.030000 (  1.024606)
+ # --------------------------------------------------------- total: 7.930000sec
+ #
+ #                                     user     system      total        real
+ # joint probability:              3.010000   0.110000   3.120000 (  3.132044)
+ # joint probability precomputed:  0.170000   0.000000   0.170000 (  0.165960)
+ # direct:                         2.470000   0.150000   2.620000 (  2.634326)
+ # direct with conditions:         0.770000   0.050000   0.820000 (  0.810167)
+ # direct with event condition:    0.930000   0.050000   0.980000 (  0.995371)
+ #
+ # ===========================================================================
+ #
+ # $ jruby -version
+ # jruby 1.4.0 (ruby 1.8.7 patchlevel 174) (2009-11-02 69fbfa3) (Java HotSpot(TM) 64-Bit Server VM 1.6.0_17) [x86_64-java]
+ #
+ # Rehearsal ------------------------------------------------------------------
+ # joint probability:              3.100000   0.000000   3.100000 (  3.100000)
+ # joint probability precomputed:  0.148000   0.000000   0.148000 (  0.148000)
+ # direct:                         0.988000   0.000000   0.988000 (  0.988000)
+ # direct with conditions:         0.424000   0.000000   0.424000 (  0.424000)
+ # direct with event condition:    0.558000   0.000000   0.558000 (  0.558000)
+ # --------------------------------------------------------- total: 5.217999sec
+ #
+ #                                     user     system      total        real
+ # joint probability:              0.992000   0.000000   0.992000 (  0.992000)
+ # joint probability precomputed:  0.087000   0.000000   0.087000 (  0.087000)
+ # direct:                         0.621000   0.000000   0.621000 (  0.621000)
+ # direct with conditions:         0.321000   0.000000   0.321000 (  0.321000)
+ # direct with event condition:    0.327000   0.000000   0.327000 (  0.327000)
+ #
+ # ===========================================================================
+ #
+ # $ macruby -version
+ # MacRuby version 0.5 (ruby 1.9.0) [universal-darwin10.0, x86_64]
+ #
+ # Rehearsal ------------------------------------------------------------------
+ # joint probability:              7.710000   0.220000   7.930000 (  6.988403)
+ # joint probability precomputed:  0.140000   0.000000   0.140000 (  0.135137)
+ # direct:                         5.550000   0.170000   5.720000 (  5.117666)
+ # direct with conditions:         1.740000   0.060000   1.800000 (  1.490908)
+ # direct with event condition:    1.750000   0.060000   1.810000 (  1.526937)
+ # -------------------------------------------------------- total: 17.400000sec
+ #
+ #                                     user     system      total        real
+ # joint probability:              7.610000   0.230000   7.840000 (  6.693219)
+ # joint probability precomputed:  0.120000   0.010000   0.130000 (  0.118537)
+ # direct:                         5.600000   0.190000   5.790000 (  4.846050)
+ # direct with conditions:         1.720000   0.070000   1.790000 (  1.484840)
+ # direct with event condition:    1.750000   0.060000   1.810000 (  1.507850)
+ #
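The combinators used above compose the same way on any network. Below is a minimal sketch of the core pattern (choose, dep, filter, query?), assuming prob.rb is on the load path; the two-node rain/wet-grass network and its numbers are made up purely for illustration:

require 'prob'
include Probably

# P(R): made-up prior, it rains with probability 0.2
p_rain = choose(0.2, :Rain, :Dry)

# P(W|R): made-up conditional, grass is wet with probability 0.9 given rain
def p_wet(r)
  choose(r == :Rain ? 0.9 : 0.1, :Wet, :DryGrass)
end

# joint P(R,W), built exactly like mk_joint_p above
p_joint = p_rain.dep { |r| p_wet(r).dep { |w| mkState([r, w]) } }

# P(R = :Rain | W = :Wet): 0.18 / (0.18 + 0.08), roughly 0.692
puts p_joint.filter { |r, w| w == :Wet }.query? { |r, w| r == :Rain }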
data/examples/diagnosis.rb ADDED
@@ -0,0 +1,87 @@
+
+ require 'rubygems'
+
+ require 'prob'
+ include Probably
+
+ #
+ # Problem:
+ # Given a positive or negative test for a specific illness, we want to know
+ # the probability of being ill or healthy.
+ #
+ # Suppose the random variables I and T are given, with I = {Ill, Healthy}
+ # being the health status and T = {Negative, Positive} the test result.
+ #
+ # It is known that the probability of being 'ill' is 1 in a 1000,
+ # thus:
+ #   P(I = Ill) = 0.001 and P(I = Healthy) = 0.999
+ #
+ # Furthermore we know that the test has an accuracy of 99%, thus
+ #   P(T = Positive | I = Ill )     = 0.99
+ #   P(T = Negative | I = Ill )     = 0.01
+ #   P(T = Positive | I = Healthy ) = 0.01
+ #   P(T = Negative | I = Healthy ) = 0.99
+ #
+ # Task:
+ # compute the probability of being 'ill', given the test was positive.
+ # Using bayes rule:
+ #
+ #   P(T, I) = P(T|I) * P(I) = P(I|T) * P(T)
+ #
+ # =>
+ #
+ #             P(T|I) * P(I)
+ #   P(I|T) = --------------- = < P(T|I) * P(I) >
+ #                 P(T)
+ #
+ #
+
+ PFalseNegative = 0.01 # constant for P( T = Negative | I = Ill )
+ PFalsePositive = 0.01 # constant for P( T = Positive | I = Healthy )
+
+ # define: P(I)
+ PDisease = choose 0.001, :ILL, :HEALTHY
+
+ # P(T|I)
+ def pTest(i)
+   choose(i == :ILL ? PFalseNegative : 1 - PFalsePositive,
+          :Negative, :Positive)
+ end
+
+
+ # P(I,T) = P(T|I) * P(I)
+ # combine states and save the joint distribution in a constant
+ PTest = PDisease.dep { |i|
+   pTest(i).dep { |t| mkState([i,t]) }
+ }
+
+ testpred = Proc.new { |disease, test| disease == :ILL }
+
+ p PTest
+
+ # using filter on PTest, which is P(I,T), we find
+ # P( I | T = Positive )
+ p "probability of I if test is Positive:"
+ p PTest.filter { |disease, test| test == :Positive }
+
+ # using the testpred function and query? we can find the probability of all
+ # events testpred returns true for. In this case P( I = Ill | T = Positive )
+ p "probability of being ill"
+ p PTest.filter { |disease, test| test == :Positive }.query?(&testpred)
+
+ # next find the most probable explanation if the test was Positive:
+ p "most probable"
+ p PTest.filter { |disease, test| test == :Positive }.most_probable
+
+ # alternatively, using condition on the monadic computation directly and
+ # normalizing the result, the multiplications and memory needed may be reduced:
+ # event_dep is like 'dep {|var| condition(var == :Positive) { ... } }'
+ p "another way of finding P(I|T=Positive)"
+ p PDisease.dep { |i|
+   # event_dep will execute the block only if
+   # the test was :Positive, and return 'nil' otherwise
+   pTest(i).event_dep(just :Positive) {
+     mkState(i)
+   }
+ }.normalize
+
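As a cross-check, the posterior printed above can be computed by hand from Bayes' rule using the same constants; a small plain-Ruby sketch:

p_ill     = 0.001  # P(I = Ill)
p_pos_ill = 0.99   # P(T = Positive | I = Ill), i.e. 1 - PFalseNegative
p_pos_ok  = 0.01   # P(T = Positive | I = Healthy), i.e. PFalsePositive

# P(I = Ill | T = Positive) = P(T|I) * P(I) / P(T)
p_pos = p_pos_ill * p_ill + p_pos_ok * (1 - p_ill)
puts p_pos_ill * p_ill / p_pos  # => ~0.0902, matching the filtered distribution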
data/examples/drugtest.rb ADDED
@@ -0,0 +1,44 @@
+
+ require 'rubygems'
+
+ require 'prob'
+ include Probably
+
+ # same problem as in diagnosis.rb, but with drug users and a drug test;
+ # just using some different methods to implement the same queries...
+
+ def drugTest(puser = 0.001, p_posifuser = 0.99, p_posifclean = 0.01)
+   choose(puser, :User, :Clean).dep { |user|
+     choose(if user == :User then p_posifuser else p_posifclean end,
+            :Pos, :Neg).dep { |test|
+       mkState([user, test])
+     }
+   }
+ end
+
+ def drugTest2
+   drugTest.dep { |u,t|
+     if t == :Pos then mkState(u) else nil end
+   }
+ end
+
+ def drugTest3(puser = 0.001, p_posifuser = 0.99, p_posifclean = 0.01)
+   choose(puser, :User, :Clean).dep { |user|
+     choose(if user == :User then p_posifuser else p_posifclean end,
+            :Pos, :Neg).dep { |test|
+       condition(test == :Pos) {
+         mkState user
+       }
+     }
+   }.normalize
+ end
+
+ #p drugTest2
+
+ p drugTest
+ p drugTest.filter { |u,t| t == :Pos }
+ p drugTest(0.5).filter { |u,t| t == :Pos }
+
+ p drugTest3
+ # p drugTest3(0.5)
+
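The drugTest(0.5) call above shows how strongly the posterior depends on the prior; the closed form makes this explicit. A plain-Ruby sketch (no library needed), with the helper name chosen here just for illustration:

def p_user_given_pos(p_user, p_pos_if_user = 0.99, p_pos_if_clean = 0.01)
  # P(User | Pos) = P(Pos|User) * P(User) / P(Pos)
  p_pos = p_pos_if_user * p_user + p_pos_if_clean * (1 - p_user)
  p_pos_if_user * p_user / p_pos
end

puts p_user_given_pos(0.001) # => ~0.09: with rare users, most positives are false alarms
puts p_user_given_pos(0.5)   # => 0.99:  with an even prior, the test dominates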
data/examples/montyhall.rb ADDED
@@ -0,0 +1,106 @@
+
+ require 'rubygems'
+
+ require 'prob'
+ include Probably
+
+ # the monty hall problem is a simple game-show based probability puzzle with
+ # a puzzling outcome :)
+ #
+ # Suppose you are on a game show and you are given the choice of 3 doors.
+ # Behind one of these doors is the prize and behind the others a goat. Only
+ # the host knows behind which door the prize is, and will open one door with
+ # a goat after you made your first choice. Next you can choose if you want to
+ # switch doors or not.
+ #
+ # Question:
+ # What is the best strategy? Stay or switch?
+ # What are the probabilities of winning for each of these strategies?
+ #
+
+ # first we want to encode our state.
+ #
+ # these are the doors one can choose from:
+ $doors = [:A, :B, :C]
+
+ # the final state is a hashmap with keys:
+ #   :open     => door opened by the host
+ #   :prize    => door the prize is behind
+ #   :selected => door selected by the player
+
+ # testing function on a state to find out if we win or lose
+ $testWinner = proc do |s|
+   if s[:prize] == s[:selected]
+     :Winner
+   else
+     :Loser
+   end
+ end
+
+ # apply event function $testWinner on
+ # each possible state
+ def winnerProb(prob)
+   prob.map(&$testWinner)
+ end
+
+ # Let us encode the problem with random variables:
+ #
+ # P  = doors : door the prize was put behind
+ # C1 = doors : the door chosen in the first round by the player
+ # O  = doors : the door opened by the show's host
+ #
+
+ # first step: let's hide the prize
+ # P(P = A) = 1/3
+ # P(P = B) = 1/3
+ # P(P = C) = 1/3
+ hide = uniform( $doors.map { |d| {:prize => d} } )
+
+ # and then let the player choose one door:
+ # P(C1 = A) = 1/3
+ # P(C1 = B) = 1/3
+ # P(C1 = C) = 1/3
+ choose = uniform( $doors.map { |d| {:selected => d} } )
+
+ # combine events P and C1 and create the state representation:
+ # P(C1,P) = P(C1) * P(P)  <- because events P and C1 are independent
+ hideThenChoose = hide.mult(choose) { |p,s|
+   {:prize => p[:prize], :selected => s[:selected]}
+ }
+
+ # compute the probability distribution of the host opening a specific door
+ # given the events P and C1:
+ # P(O|C1,P)
+ # with O != C1 and O != P
+ opened = hideThenChoose.dep do |s|
+   s_ = ($doors - [s[:prize], s[:selected]]).map do |d|
+     {:open => d, :prize => s[:prize], :selected => s[:selected]}
+   end
+   uniform s_
+ end
+ #p opened
+
+ # finally implement strategy 'stay'
+ def stay(prob)
+   prob
+ end
+
+ # and strategy 'switch', choosing a door C2 with
+ # C2 != O and C2 != C1.
+ # find P(C2|O, C1, P)
+ def switch(prob)
+   prob.dep do |s|
+     s_ = ($doors - [s[:selected], s[:open]]).map do |d|
+       {:open => s[:open], :selected => d, :prize => s[:prize]}
+     end
+     uniform s_
+   end
+ end
+
+ # print some results
+ puts 'if stay, most probable result: ', winnerProb(stay(opened)).most_probable
+ puts 'if switch, most probable result: ', winnerProb(switch(opened)).most_probable
+ puts ''
+ puts 'if stay, probability of winning: ', winnerProb(stay(opened)).probability(:Winner)
+ puts 'if switch, probability of winning: ', winnerProb(switch(opened)).probability(:Winner)
+
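The exact 1/3 vs. 2/3 result above can be cross-checked with a quick Monte Carlo simulation; a sketch in plain Ruby (Array#sample assumes Ruby >= 1.9):

doors = [:A, :B, :C]
n = 100_000
stay_wins = switch_wins = 0

n.times do
  prize    = doors.sample                        # prize is hidden
  selected = doors.sample                        # player picks a door
  opened   = (doors - [prize, selected]).sample  # host opens a goat door
  stay_wins   += 1 if selected == prize
  switch_wins += 1 if (doors - [selected, opened]).first == prize
end

puts "stay:   #{stay_wins.to_f / n}"   # => ~0.333
puts "switch: #{switch_wins.to_f / n}" # => ~0.667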
data/examples/spamplan.rb ADDED
@@ -0,0 +1,326 @@
+ #!/usr/bin/env ruby
+
+ #require 'rubygems'
+
+ require '../lib/prob'
+ include Probably
+
+ # Bayesian spam filter example.
+ # We try to find the probability of a message's classification being spam
+ # or ham, using a naive bayesian filter and a second filter using fisher's
+ # method to analyse the plausibility of the first filter's result.
+ #
+ # In essence, the bayesian filter tries to find the probability of the message
+ # being spam using the message's features and previously seen messages.
+ #
+ # Suppose we have the random variables:
+ #   S        = {:Spam, :Ham}
+ #   Document = set of words/features = {Wi ... Wn}
+ #   Wi       = word Wi present or not present {true, false}
+ #
+ # then
+ #
+ # P(S|Document) = P(S|W1) * P(S|W2) * ... * P(S|Wn)
+ #
+ # meaning we assume all features/words to be statistically independent (hence
+ # naive bayesian filter).
+ #
+ # Counting the words in previously seen messages and their spam/ham
+ # occurrences, we can train the filter.
+ #
+ # Next let's find the probability for spam given a word, P(S|Wi):
+ #
+ #            P(Wi|S) * P(S)
+ # P(S|Wi) = ---------------
+ #                P(Wi)
+ #
+ # But to minimize computational effort, a classifier for each word assuming a
+ # uniform prior distribution P(S) is precomputed, and the true prior is used
+ # later during inference. So we can store the classifiers directly in our
+ # database instead of recomputing them over and over again.
+ #
+ # P(S|Document) = < P(S|W1) * P(S|W2) * ... >
+ #               = < P(W1|S) * prior * P(W2|S) * prior * ... >
+ #
+ # here < P(...) > stands for "alpha * P(...)" and expresses normalization, which
+ # is done automatically by our library. Thus
+ #
+ #            P(Wi|S) * P(S)
+ # P(S|Wi) = ---------------- = < P(Wi|S) * P(S) >
+ #                P(Wi)
+ #
+ # First we need to explain how the classifiers are precomputed and how these
+ # precomputed classifiers are used to do the classification:
+ #
+ # Suppose P_uni is the uniform distribution for spam/ham, thus P_uni(spam) = 0.5
+ # and P_uni(ham) = 0.5. Then
+ #
+ #                 P(Wi | S) * P_uni(S)       P(Wi | S) * P_uni(S)
+ # P_uni(S | Wi) = -------------------- = ------------------------------------
+ #                        P(Wi)           Sum(s={spam,ham}) P(Wi|s) * P_uni(s)
+ #
+ #               = < P(Wi|S) * P_uni(S) >
+ #
+ # Now suppose the real prior is given, thus with the new prior:
+ #
+ # P_prior(S|Wi) = < P(Wi|S) * P_prior(S) >
+ #
+ #                 P(Wi|S) * P_prior(S)   P_uni(S|Wi) * P_prior(S)
+ #               = -------------------- = ------------------------
+ #                        P(Wi)                   P_uni(S)
+ #
+ #               = < P_uni(S|Wi) * P_prior(S) >
+ #
+ #               = P(S|Wi)
+ #
+ # P(S|Document) = < P(S|W1) * P(S|W2) * ... >
+ #               = < P(W1|S) * P_prior(S) * P(W2|S) * P_prior(S) * ... >
+ #               = < P_uni(S|W1) * P_prior(S) * P_uni(S|W2) * P_prior(S) * ... >
+ #
+ # Using these, the classifiers to store in the database are P_uni(S|Wi) for
+ # each word found during learning. So when learning from new messages, not all
+ # classifiers need to be recomputed. Alternatively one may want to store
+ # P_prior(S|Wi) in the database, but then all classifiers need to be updated
+ # when learning from new messages. One may even assume the prior to always
+ # be uniformly distributed. In that case P(S|Document) becomes
+ # P(S|Document) = < P_uni(S|W1) * P_uni(S|W2) ... >
+ #
+ # Instead of using the classifiers for all words found, only a subset is used.
+ # This subset is found by scoring the classifiers and using the ones with the
+ # highest scores for the words found in the document.
+ #
+ # Scoring is done by computing the 'quadratic distance' of a classifier to the
+ # uniform distribution:
+ # score = ( 0.5 - P_uni(S=spam|Wi) )^2 + ( 0.5 - P_uni(S=ham|Wi) )^2
+ #
+ # Furthermore, if a classifier assumes P_uni(S=spam|Wi) = 0 or P_uni(S=ham|Wi) = 0,
+ # the probability will be adjusted to 0.01.
+ #
+
+ S = [:Spam, :Ham]
+
+ # module to be mixed into a 'spam feature database' to compute probabilities
+ # from the database.
+ #
+ # It's assumed that the 'spam feature database' provides the following
+ # functions:
+ #
+ # countWord(word:String, type:{:Spam, :Ham}) => Int # occurrences of word given
+ #                                                   # Spam/Ham messages
+ #
+ # countType(type:{:Spam, :Ham}) => Int # number of Spam/Ham messages learned
+ #
+ module SpamDatabaseProbabilities
+   # probabilities
+   #
+   # S = {:Spam, :Ham} ; set of possible message types
+   # P(S)   <- prior probability
+   #
+   # W = {set of known words}
+   # P(W|S) <- likelihood
+
+   def pMsgType # P(S)
+     enumDist types, @msgCounts
+   end
+
+   def pWord(word, type) # P(W == word | S == type)
+     n     = countWord(word, type).to_f
+     total = countType(type).to_f
+     choose n / total, true, false
+   end
+
+   # P(S | W == word) = < P(W == word | S) * prior >
+   def pHasWord(word, prior = pMsgType)
+     prior.dep { |t|
+       pWord(word, t).event_dep(just true) {
+         mkState(t)
+       }
+     }.normalize
+   end
+
+   # P(S | W1 == word1, W2 == word2, ...) = < P(S|W1) * P(S|W2) * ... >
+   def pHasWords(words, prior = pMsgType)
+     words.reduce(prior) { |p,w| pHasWord(w, p) }
+   end
+ end
+
+ # our test database
+ class SpamBaseKnowledge
+   include SpamDatabaseProbabilities
+
+   def initialize
+     @msgCounts = [103, 57]
+     @wordCountTable = block1({
+       "the"    => [1, 2],
+       "quick"  => [1, 1],
+       "brown"  => [0, 1],
+       "fox"    => [0, 1],
+       "jumps"  => [0, 1],
+       "over"   => [0, 1],
+       "lazy"   => [0, 1],
+       "dog"    => [0, 1],
+       "make"   => [1, 0],
+       "money"  => [1, 0],
+       "in"     => [1, 0],
+       "online" => [1, 0],
+       "casino" => [1, 0],
+       "free"   => [57, 6],
+       "bayes"  => [1, 10],
+       "monad"  => [0, 22],
+       "hello"  => [30, 32],
+       "asdf"   => [40, 2]
+     }) { |h| h.default = [0,0] }
+   end
+
+   def types
+     S
+   end
+
+   def knownWords
+     @wordCountTable.keys
+   end
+
+   def countType(type)
+     if type != :Spam && type != :Ham
+       return 0
+     else
+       @msgCounts[ type2Index type ]
+     end
+   end
+
+   def countWord(word, type)
+     @wordCountTable[word][ type2Index type ]
+   end
+
+   private
+   def type2Index(type)
+     if type == :Spam then 0 else 1 end
+   end
+ end
+
+ # The naive bayesian classifier.
+ BayesianStrategy = proc { |classifiers, prior, _, _|
+   classifiers.map { |c|
+     # compute < P_uni(S|Wi) * P_prior(S) >
+     # and use nil for invalid cases when doing bayesian inference (it is
+     # important to keep nil for invalid cases until the very end, for
+     # normalization).
+     prior.dep { |t|
+       c.map { |t_c| t == t_c ? t : nil }
+     }
+   }.inject { |da, db| # multiply all probabilities (the naive bayesian part)
+     da.dep { |t|
+       db.map { |t_b| t == t_b ? t : nil }
+     }
+   }.normalize
+ }
+
+ # use the bayesian classifier and analyse the result using fisher's method
+ FisherStrategy = proc { |classifiers, prior, n, words|
+   hypothesis = BayesianStrategy.call(classifiers, prior, n, words)
+   dof = classifiers.length # dof / 2
+   map = Hash.new(0)
+
+   for p,k in hypothesis
+     # chi_square = -2.0 * sum(i) { log(p_i) }
+     #            = -2.0 * log(p)
+     #
+     # compute the p-value by solving
+     #
+     # integral( x^(n-1) * exp(-x/2) / (gamma(n) * 2^n), -2 log(p), inf, dx)
+     #
+     #   integral( x^(n-1) * exp(-x/2), -2 log(p), inf, dx)
+     # = --------------------------------------------------
+     #                  gamma(n) * 2^n
+     #
+     # = p * Sum(i = 1 to n) { (-log(p))^(n - i) / (n - i)! }
+     #
+     # = p + p * Sum(i = 1 to n-1) { (-log(p))^(n - i) / (n - i)! }
+     #
+     # with n = dof
+
+     m = -Math.log(p) # 0.5 chi
+     t = p            # exp(-m) = exp(log(p)) = p
+
+     # compute p value
+     tmp = 1.upto(dof-1).reduce(t) { |sum,i|
+       t *= m / i.to_f
+       sum + t
+     }
+
+     map[k] = if tmp < 1.0 then tmp else 1.0 end
+   end
+   map
+ }
+
+ # other part of the database: computing, scoring and storing the classifiers
+ # P_uni(S|Wi)
+ class SpamClassifier
+
+   def initialize(knowledge, strategie)
+     @knowledge   = knowledge # our database
+     @classifiers = {}        # the classifiers
+     @strategie   = strategie # the strategy to use: naive bayesian or fisher's method
+
+     buildClassifiers { |w,s,probs|
+       @classifiers[w] = [s,probs]
+     }
+   end
+
+   def pMsgTypeByWords(words, n = 15, prior = @knowledge.pMsgType)
+     @strategie.call(findClassifiers(words, n), prior, n, words)
+   end
+
+   # classify a message using the n most prominent classifiers
+   def classify(words, n = 15)
+     pMsgTypeByWords(words, n).most_probable
+   end
+
+   private
+   def characteristic(f)
+     f.call uniform(@knowledge.types)
+   end
+
+   def score(f = nil, &blk)
+     pDistance( characteristic(f || blk), uniform(@knowledge.types) )
+   end
+
+   def buildClassifiers
+     @knowledge.knownWords.each { |w|
+       s     = score { |prior| @knowledge.pHasWord(w, prior) }
+       probs = adjustMinimums(@knowledge.pHasWord(w, uniform(S)))
+       yield w, s, probs
+     }
+   end
+
+   def findClassifiers(words, n)
+     classifiers = words.map { |w| [w, @classifiers[w]] }.delete_if { |w,c| c == nil }
+     classifiers.sort! { |x,y| y[1][0] <=> x[1][0] } # highest scores first
+     classifiers[0,n].map { |w,(s,prob)|
+       prob
+     }
+   end
+ end
+
+ # run some tests using the test database, some key words and the different
+ # strategies
+ classifiers = [ ["bayesian",        SpamClassifier.new(SpamBaseKnowledge.new, BayesianStrategy)],
+                 ["fisher's method", SpamClassifier.new(SpamBaseKnowledge.new, FisherStrategy)] ]
+
+ testCorpus = [ ["free"],
+                ["monad"],
+                ["free", "asdf", "bayes", "quick", "jump", "test"],
+                ["free", "monad", "asdf", "bayes", "quick", "jump", "test"]
+              ]
+
+ puts "\ntest classifier"
+ testCorpus.each do |data|
+   printf "use corpus: #{data}\n"
+   classifiers.each do |n, c|
+     puts n
+     puts c.pMsgTypeByWords(data)
+     puts ""
+   end
+ end
+
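For a two-word message, the normalized product that BayesianStrategy computes can be reproduced with plain arithmetic; a sketch where the per-word classifier values P_uni(S|Wi) are made up for illustration (the prior matches @msgCounts = [103, 57] above):

# hypothetical per-word classifiers P_uni(S|Wi)
p_spam_w1, p_ham_w1 = 0.90, 0.10 # word 1 looks spammy
p_spam_w2, p_ham_w2 = 0.30, 0.70 # word 2 looks hammy

prior_spam = 103.0 / 160         # P_prior(:Spam)
prior_ham  =  57.0 / 160         # P_prior(:Ham)

# < P_uni(S|W1) * P_prior(S) * P_uni(S|W2) * P_prior(S) >, as derived above
spam = (p_spam_w1 * prior_spam) * (p_spam_w2 * prior_spam)
ham  = (p_ham_w1  * prior_ham)  * (p_ham_w2  * prior_ham)
z    = spam + ham
puts "spam: #{spam / z}, ham: #{ham / z}"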
data/lib/prob.rb ADDED
@@ -0,0 +1,328 @@
+
+ # The Probably module provides functions and a discrete Distribution class for
+ # monadic functional probabilistic programming in ruby.
+
+ puts 'loading rb_prob'
+
+ module Probably
+
+   # simple helper: runs the given block with its first argument and
+   # returns that first argument
+   def block1(x, &blk)
+     blk.call(x)
+     x
+   end
+
+   # given a block, return a new Proc defined on the range [0..1]
+   def mkShapeFunction
+     proc { |x|
+       if x < 0 || x > 1.0 then 0 else yield x end
+     }
+   end
+
+   # creates a Proc computing a gaussian distribution
+   # in range [0..1] given a mean and deviation
+   def normalDistShape(mean, dev)
+     mkShapeFunction { |x|
+       u = (x - mean) / dev
+       Math.exp(-0.5 * u * u) / Math.sqrt(2 * Math::PI)
+     }
+   end
+
+   # The discrete Distribution representation class
+   class Distribution
+     include Enumerable
+
+     protected
+     def initializeLists(data, shape)
+       @map = Hash.new(0)
+       count = data.length
+       data.each_with_index { |val, i|
+         @map[val] += shape.call( Float(i + 1) / count )
+       }
+     end
+
+     def initializeMap(m)
+       @map = Hash.new(0)
+       m.each { |k,v| @map[k] = v }
+       self.normalizeProbabilities
+     end
+
+     def normalizeProbabilities
+       sum = Float( @map.values.inject(:+) )
+       @map.keys.each { |k| @map[k] /= sum } if sum != 1.0
+     end
+
+     public
+
+     # Creates a new discrete Distribution with
+     # said constructor type (init_type) and initial data;
+     # upon construction the data are automatically normalized.
+     # if init_type is:
+     # - :MAP     then the given map is used directly and should not
+     #            be used anymore by anyone else but the current
+     #            distribution class
+     # - :MAPCOPY then the given map is copied for further use
+     # - :LISTS   then the second parameter is the list of values and the
+     #            third parameter a shape function assigning each value its
+     #            (relative) probability
+     def initialize(init_type, *data)
+       case init_type
+       when :MAP
+         @map = data[0]
+       when :MAPCOPY
+         initializeMap(data[0])
+       when :LISTS
+         initializeLists(data[0], data[1])
+       else
+         raise "unable to create probability distribution"
+       end
+       self.normalizeProbabilities
+     end
+
+     # set of keys in the distribution
+     def keys
+       @map.keys
+     end
+
+     # returns the normalized distribution, removing
+     # all nil values.
+     # In combination with condition, normalize must be used
+     # to compute the normalization of bayes' theorem
+     def normalize
+       if @map[nil] > 0.0
+         filter { |v| v != nil }
+       else
+         self
+       end
+     end
+
+     # returns the probability of event val from
+     # the distribution
+     def probability(val)
+       @map[val]
+     end
+
+     # use most_probable to retrieve the most probable event and
+     # its probability from the given distribution
+     def most_probable
+       @map.reduce { |best, value|
+         if best[1] < value[1] then value else best end
+       }
+     end
+
+     # randomly pick a key-value pair with respect to its probability
+     # in the given distribution
+     def pick
+       r = rand
+       sum = 0
+       for k,p in @map
+         sum += p
+         return k,p if r < sum
+       end
+       return nil
+     end
+
+     # iterates over (probability, value) pairs
+     def each
+       @map.each { |k, p| yield p, k }
+     end
+
+     # transforms each value with the given block, merging values
+     # that map to the same result
+     def map
+       tmp = Hash.new(0)
+       for k,p in @map
+         tmp[yield(k)] += p
+       end
+       Distribution.new(:MAP, tmp)
+     end
+
+     # keeps only the values the block accepts; the result is renormalized
+     def filter
+       Distribution.new :MAP, @map.reject { |k,v|
+         !(yield k)
+       }
+     end
+
+     # sums the probabilities of all values the block accepts
+     def query?
+       @map.reduce(0) { |probability, (dat,dp)|
+         if yield dat then probability + dp
+         else probability end
+       }
+     end
+
+     # flattens a distribution of distributions
+     def join
+       tmp = Hash.new(0)
+
+       for dist,p1 in @map
+         for p2, k in dist
+           tmp[k] += p1 * p2
+         end
+       end
+       Distribution.new(:MAP, tmp)
+     end
+
+     # monadic bind: the block maps each value to a distribution (or nil),
+     # and the results are combined into one distribution
+     def dep
+       m = Hash.new(0)
+       for k1,p1 in @map
+         tmp = yield k1
+         if tmp != nil
+           for p2, k in tmp
+             m[k] += p1 * p2
+           end
+         end
+       end
+       Distribution.new(:MAP, m)
+     end
+
+     # like dep, but the block only runs for values accepted by pred;
+     # all other values are mapped to the nil state
+     def event_dep(pred)
+       self.dep { |x|
+         if !pred.call x
+           mkState nil
+         else
+           yield x
+         end
+       }
+     end
+
+     # product distribution of two independent distributions; an optional
+     # block combines the value pairs
+     def mult(dist2)
+       self.dep do |k|
+         if block_given? then dist2.map { |k2| yield(k, k2) }
+         else dist2.map { |k2| [k, k2] }
+         end
+       end
+     end
+
+     def * (dist2)
+       self.mult dist2
+     end
+
+     # computes the expectation, given that the keys in the distribution
+     # are numeric
+     def expectation
+       @map.reduce(0) { |sum, (k,p)| sum + k.to_f * p }
+     end
+
+     # computes the variance, given that the keys in the distribution
+     # are numeric
+     def variance
+       expected = self.expectation
+       @map.reduce(0) { |sum, (k,p)|
+         tmp = (k.to_f - expected)
+         sum + tmp * tmp * p
+       }
+     end
+
+     # computes the standard deviation, given that the keys in the
+     # distribution are numeric
+     def std_dev
+       Math.sqrt( self.variance )
+     end
+
+     def to_s
+       @map.reduce("") { |str,(k,p)|
+         str + "#{k} : #{p * 100} %\n"
+       }
+     end
+   end
+
+   # creates a uniformly distributed Distribution from an array of values
+   def uniform(data)
+     Distribution.new :LISTS, data, mkShapeFunction { |x| 1 }
+   end
+
+   # creates a linearly distributed Distribution from an array of values
+   def linear(data)
+     Distribution.new :LISTS, data, mkShapeFunction { |x| x }
+   end
+
+   # creates an exp(-x) distributed Distribution from an array of values
+   def negExp(data)
+     Distribution.new :LISTS, data, mkShapeFunction { |x| Math.exp(-x) }
+   end
+
+   # creates a Distribution from an array of values using a gaussian distribution
+   def normal(data, mean = 0.5, dev = 0.5)
+     Distribution.new :LISTS, data, normalDistShape(mean, dev)
+   end
+
+   # creates a distribution from two arrays, the first holding the
+   # distribution's values and the second the corresponding probabilities
+   # (to be normalized)
+   # - data: array of input values
+   # - dist: array of probabilities
+   def enumDist(data, dist)
+     if data.length != dist.length
+       raise "data and distribution length must be equal"
+     end
+
+     # the shape function is called with x = (i+1)/length, so round back to
+     # the integer index i
+     Distribution.new :LISTS, data, mkShapeFunction { |x| dist[(x * dist.length).round - 1] }
+   end
+
+   # Creates a new probability distribution from a given map:
+   # m = { key1 => probability1, key2 => probability2, key3 => ... }
+   def mapDist(m)
+     Distribution.new :MAPCOPY, m
+   end
+
+   def distWithShape(data, &blk)
+     Distribution.new :LISTS, data, mkShapeFunction(&blk)
+   end
+
+   # binary distribution: elem1 with probability p, elem2 with 1 - p
+   def choose(p, elem1, elem2)
+     tmp = Hash.new(0)
+     tmp[elem1] = p
+     tmp[elem2] = 1 - p
+     Distribution.new :MAP, tmp
+   end
+
+   # the monadic unit: a distribution with a single certain value
+   def mkState(a)
+     tmp = Hash.new(0)
+     tmp[a] = 1
+     Distribution.new :MAP, tmp
+   end
+
+   def histogram(a)
+     block1(Hash.new(0)) do |r|
+       for x in a
+         r[x] += 1
+       end
+     end
+   end
+
+   # returns the block's distribution if b holds, else the nil state
+   # (to be pruned later by normalize)
+   def condition(b)
+     if b then yield else mkState nil end
+   end
+
+   # events
+   def mkEvent(&f)
+     f
+   end
+
+   def just(x)
+     mkEvent { |y| x == y }
+   end
+
+   def ifJust(x)
+     if x == nil then proc { |y| true }
+     else proc { |y| x == y } end
+   end
+
+   def oneOf(*elems)
+     proc { |y| elems.include? y }
+   end
+
+   # squared euclidean distance between two distributions
+   def pDistance(dist1, dist2)
+     (dist1.keys | dist2.keys).reduce(0) { |sum,k|
+       tmp = dist1.probability(k) - dist2.probability(k)
+       sum + tmp * tmp
+     }
+   end
+
+   # raises all probabilities below newMin to newMin (result is renormalized)
+   def adjustMinimums(dist, newMin = 0.01)
+     tmp = Hash.new(0)
+     dist.each do |p,k|
+       tmp[k] = if p > newMin then p else newMin end
+     end
+     Distribution.new :MAP, tmp
+   end
+
+ end
+
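A minimal usage sketch of the Distribution API defined above (uniform, mult with a block, probability, expectation, most_probable), using the sum of two dice:

require 'prob'
include Probably

die = uniform([1, 2, 3, 4, 5, 6])

# joint distribution of two independent dice, mapped to their sum
sum = die.mult(die) { |a, b| a + b }

puts sum.probability(7) # => ~0.1667 (6 of the 36 outcomes)
puts sum.expectation    # => 7.0
p    sum.most_probable  # => [7, 0.1666...]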
metadata ADDED
@@ -0,0 +1,61 @@
+ --- !ruby/object:Gem::Specification
+ name: rb_prob
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Steffen Siering
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2010-03-21 00:00:00 +01:00
+ default_executable:
+ dependencies: []
+
+ description: "monadic probabilistic programming library for ruby. for examples see github repository: http://github.com/urso/rb_prob"
+ email: steffen <dot> siering -> gmail <dot> com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - lib/prob.rb
+ - examples/alarm.rb
+ - examples/diagnosis.rb
+ - examples/drugtest.rb
+ - examples/montyhall.rb
+ - examples/spamplan.rb
+ - LICENSE
+ has_rdoc: true
+ homepage: http://github.com/urso/rb_prob
+ licenses:
+ - BSD3
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.3.5
+ signing_key:
+ specification_version: 3
+ summary: monadic probabilistic programming for ruby
+ test_files: []
+