rb_probdsl 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/examples/alarm.rb CHANGED
@@ -88,11 +88,8 @@ puts(norm_prob do
   b = p_burglary
   e = p_earthquake
   a = p_alarm(b,e)
-  if (p_john(a) == :J && p_mary(a) == :M)
-    b
-  else
-    nil
-  end
+  guard p_john(a) == :J && p_mary(a) == :M
+  b
 end)

 puts "\nP(A|John=true, Mary=true)"
@@ -100,11 +97,8 @@ puts(norm_prob do
   b = p_burglary
   e = p_earthquake
   a = p_alarm(b, e)
-  if p_john(a) == :J && p_mary(a) == :M
-    a
-  else
-    nil
-  end
+  guard p_john(a) == :J && p_mary(a) == :M
+  a
 end)

 # john and mary tell us for sure, the alarm went off and we know
@@ -114,11 +108,8 @@ puts(norm_prob do
   b = p_burglary
   e = p_earthquake
   a = p_alarm(b,e)
-  if (a == :A && p_john(a) == :J && p_mary(a) == :M)
-    b
-  else
-    nil
-  end
+  guard a == :A && p_john(a) == :J && p_mary(a) == :M
+  b
 end)

 # what is the probability john will call, if mary called?
@@ -127,11 +118,8 @@ puts(norm_prob do
   b = p_burglary
   e = p_earthquake
   a = p_alarm(b,e)
-  if (p_mary(a) == :M)
-    p_john(a)
-  else
-    nil
-  end
+  guard p_mary(a) == :M
+  p_john(a)
 end)

 # and probability mary will call, if john did
@@ -140,10 +128,7 @@ puts(norm_prob do
   b = p_burglary
   e = p_earthquake
   a = p_alarm(b,e)
-  if (p_john(a) == :J)
-    p_mary(a)
-  else
-    nil
-  end
+  guard p_john(a) == :J
+  p_mary(a)
 end)

data/examples/diagnosis.rb CHANGED
@@ -53,10 +53,7 @@ end
 p "P(I|T=Positive)"
 puts norm_prob {
   i = p_disease
-  if p_test(i) == :Positive
-    i
-  else
-    nil
-  end
+  guard p_test(i) == :Positive
+  i
 }

data/examples/paradox.rb CHANGED
@@ -20,11 +20,8 @@ HERE

 puts norm_prob {
   d1 = die; d2 = die
-  if d1 == 4 || d2 == 4
-    d1 + d2 == 7
-  else
-    nil
-  end
+  guard d1 == 4 || d2 == 4
+  d1 + d2 == 7
 }.probability(true)

 puts <<HERE
@@ -33,10 +30,7 @@ The same experiment using a simulation (t = 10s):
 HERE
 puts collecting(loop_t 10) {
   d1 = die; d2 = die
-  if d1 == 4 || d2 == 4
-    d1 + d2 == 7
-  else
-    nil
-  end
+  guard d1 == 4 || d2 == 4
+  d1 + d2 == 7
 }.normalize.probability(true)

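For reference, the quantity both queries above estimate, P(d1 + d2 == 7 | d1 == 4 or d2 == 4), can be cross-checked by brute-force enumeration in plain Ruby. This is only a sketch for the reader; it is not part of the gem and does not use probdsl:

# Brute-force cross-check: enumerate all 36 dice outcomes, keep those where
# a 4 shows, and count the ones summing to 7.
outcomes = (1..6).to_a.product((1..6).to_a)
given    = outcomes.select { |d1, d2| d1 == 4 || d2 == 4 }
sevens   = given.select { |d1, d2| d1 + d2 == 7 }
puts sevens.length.to_f / given.length   # => 0.1818... (2 of 11 outcomes)
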
data/examples/spamplan.rb ADDED
@@ -0,0 +1,326 @@
+ #!/usr/bin/env ruby
+
+ require 'rubygems'
+
+ require 'probdsl'
+ include ProbDSL
+
+ # Bayesian spam filter example.
+ # We try to find the probability of a message's classification being spam
+ # or ham using a naive Bayesian filter, and a second filter using Fisher's
+ # method to analyse the plausibility of the first filter's result.
+ #
+ # In essence the Bayesian filter tries to find the probability of the message
+ # being spam using the message's features and previously seen messages.
+ #
+ # Suppose we have the random variables:
+ #   S = {:Spam, :Ham}
+ #   Document = set of words/features = {Wi ... Wn}
+ #   Wi = word Wi present or not present {true, false}
+ #
+ # then
+ #
+ #   P(S|Document) = P(S|W1) * P(S|W2) * ... * P(S|Wn)
+ #
+ # meaning we assume all features/words to be statistically independent (hence
+ # naive Bayesian filter).
+ #
+ # By finding words in old messages and their spam/ham counts we can drive the
+ # filter.
+ #
+ # Next let's find the probability of spam given a word, P(S|Wi):
+ #
+ #             P(Wi|S) * P(S)
+ #   P(S|Wi) = --------------
+ #                  P(Wi)
+ #
+ # But to minimize computational effort a classifier for each word assuming a
+ # uniform prior distribution P(S) is precomputed, and the true prior is used
+ # later during inference. So we can store the classifiers directly in our
+ # database instead of recomputing them over and over again.
+ #
+ #   P(S|Document) = < P(S|W1) * P(S|W2) * ... >
+ #                 = < P(W1|S) * prior * P(W2|S) * prior * ... >
+ #
+ # Here < P(...) > stands for "alpha * P(...)" and expresses normalization, which
+ # is done automatically by our library. Thus
+ #
+ #             P(Wi|S) * P(S)
+ #   P(S|Wi) = -------------- = < P(Wi|S) * P(S) >
+ #                  P(Wi)
+ #
+ # First we need to explain how the classifiers are precomputed and how these
+ # precomputed classifiers are used to do the classification:
+ #
+ # Suppose P_uni is the uniform distribution for spam/ham, thus P_uni(spam) = 0.5
+ # and P_uni(ham) = 0.5. Then
+ #
+ #                   P(Wi | S) * P_uni(S)      P(Wi | S) * P_uni(S)
+ #   P_uni(S | Wi) = -------------------- = ------------------------------------
+ #                          P(Wi)           Sum(s={spam,ham}) P(Wi|s) * P_uni(s)
+ #
+ #                 = < P(Wi|S) * P_uni(S) >
+ #
+ # Now suppose the real prior is given, thus with the new prior:
+ #
+ #   P_prior(S|Wi) = < P(Wi|S) * P_prior(S) >
+ #
+ #                   P(Wi|S) * P_prior(S)   P_uni(S|Wi) * P_prior(S)
+ #                 = -------------------- = ------------------------
+ #                          P(Wi)                   P_uni(S)
+ #
+ #                 = < P_uni(S|Wi) * P_prior(S) >
+ #
+ #                 = P(S|Wi)
+ #
+ #   P(S|Document) = < P(S|W1) * P(S|W2) * ... >
+ #                 = < P(W1|S) * P_prior(S) * P(W2|S) * P_prior(S) * ... >
+ #                 = < P_uni(S|W1) * P_prior(S) * P_uni(S|W2) * P_prior(S) * ... >
+ #
+ # Using these, the classifiers stored in the database are P_uni(S|Wi) for
+ # each word found during learning. So when learning from new messages not all
+ # classifiers need to be recomputed. Alternatively one may want to store
+ # P_prior(S|Wi) in the database, but then all classifiers need to be updated
+ # when learning from new messages. One may even assume the prior to always be
+ # uniformly distributed. In that case P(S|Document) becomes
+ #   P(S|Document) = < P_uni(S|W1) * P_uni(S|W2) ... >
+ #
+ # Instead of using the classifiers for all words found, only a subset is used.
+ # This subset is found by scoring the classifiers and using the classifiers
+ # with the highest scores for the words found in the document.
+ #
+ # Scoring is done by computing the 'quadratic distance' of a classifier to the
+ # uniform distribution:
+ #   score = ( 0.5 - P_uni(S=spam|Wi) )^2 + ( 0.5 - P_uni(S=ham|Wi) )^2
+ #
+ # Furthermore, if a classifier assumes P_uni(S=spam|Wi) = 0 or P_uni(S=ham|Wi) = 0,
+ # the probability is adjusted to 0.01.
+ #
+
+ S = [:Spam, :Ham]
+
+ # Module to be mixed into a 'Spam Feature Database' to compute probabilities
+ # from the database.
+ #
+ # It's assumed that the 'Spam Feature Database' provides the following
+ # functions:
+ #
+ #   countWord(word:String, type:{:Spam, :Ham}) => Int  # occurrences of word
+ #                                                       # given Spam/Ham messages
+ #
+ #   countType(type:{:Spam, :Ham}) => Int  # number of Spam/Ham messages learned
+ #
+ module SpamDatabaseProbabilities
+   # probabilities
+   #
+   # S = {:Spam, :Ham} ; set of possible message types
+   # P(S)   <- prior probability
+   #
+   # W = {set of known words}
+   # P(W|S) <- likelihood
+
+   def pMsgType # P(S)
+     prob do
+       dist @msgCounts.zip(types)
+     end
+   end
+
+   def pWord(word, type) # P(W == word | S == type)
+     n = countWord(word, type).to_f
+     total = countType(type).to_f
+     flip n / total, true, false
+   end
+
+   # P(S | W == word) = < P(W == word | S) * prior >
+   def pHasWord(word, clazz)
+     guard( pWord(word, clazz) )
+     clazz
+   end
+ end
+
+ # our test database
+ class SpamBaseKnowledge
+   include SpamDatabaseProbabilities
+
+   def initialize
+     @msgCounts = [103, 57]
+     @wordCountTable = block1({
+       "the"    => [1, 2],
+       "quick"  => [1, 1],
+       "brown"  => [0, 1],
+       "fox"    => [0, 1],
+       "jumps"  => [0, 1],
+       "over"   => [0, 1],
+       "lazy"   => [0, 1],
+       "dog"    => [0, 1],
+       "make"   => [1, 0],
+       "money"  => [1, 0],
+       "in"     => [1, 0],
+       "online" => [1, 0],
+       "casino" => [1, 0],
+       "free"   => [57, 6],
+       "bayes"  => [1, 10],
+       "monad"  => [0, 22],
+       "hello"  => [30, 32],
+       "asdf"   => [40, 2]
+     }) { |h| h.default = [0,0] }
+   end
+
+   def types
+     S
+   end
+
+   def knownWords
+     @wordCountTable.keys
+   end
+
+   def countType(type)
+     if type != :Spam && type != :Ham
+       return 0
+     else
+       @msgCounts[ type2Index type ]
+     end
+   end
+
+   def countWord(word, type)
+     @wordCountTable[word][ type2Index type ]
+   end
+
+   private
+   def type2Index(type)
+     if type == :Spam then 0 else 1 end
+   end
+ end
+
+ # The naive Bayesian classifier.
+ BayesianStrategy = proc {|classifiers, prior, _, _|
+   classifiers.map { |c|
+     # compute < P_uni(S|Wi) * P_prior(S) > and use nil for invalid cases
+     # when doing Bayesian inference (it is important to keep nil for
+     # invalid cases until the very end, so normalization is correct).
+     prior.dep { |t|
+       c.map { |t_c| t == t_c ? t : nil }
+     }
+   }.inject { |da, db| # multiply all probabilities (naive Bayesian part)
+     da.dep { |t|
+       db.map { |t_b| t == t_b ? t : nil }
+     }
+   }.normalize
+ }
+
+ # use the Bayesian classifier and analyse its result using Fisher's method
+ FisherStrategy = proc {|classifiers, prior, n, words|
+   hypothesis = BayesianStrategy.call(classifiers, prior, n, words)
+   dof = classifiers.length # dof / 2
+   map = Hash.new(0)
+
+   for p,k in hypothesis
+     # chi_square = -2.0 * sum(i) { log(p_i) }
+     #            = -2.0 * log(p)
+     #
+     # compute the p-value by solving
+     #
+     #   integral( x^(n-1) * exp(-x/2) / (gamma(n) * 2^n), -2 log(p), inf, dx)
+     #
+     #     integral( x^(n-1) * exp(-x/2), -2 log(p), inf, dx)
+     #   = --------------------------------------------------
+     #                     gamma(n) * 2^n
+     #
+     #   = p * Sum(i = 1 to n) { (-log(p))^(n - i) / (n - i)! }
+     #
+     #   = p + p * Sum(i = 1 to n-1) { (-log(p))^(n - i) / (n - i)! }
+     #
+     # with n = dof
+
+     m = -Math.log(p) # 0.5 * chi_square
+     t = p            # exp(-m) = exp(log(p)) = p
+
+     # compute the p-value
+     tmp = 1.upto(dof-1).reduce(t) {|sum,i|
+       t *= m / i.to_f
+       sum + t
+     }
+
+     map[k] = if tmp < 1.0 then tmp else 1.0 end
+   end
+   map
+ }
+
+ # The other part of the database: computes, scores and stores the
+ # classifiers P_uni(S|Wi).
+ class SpamClassifier
+
+   def initialize(knowledge, strategie)
+     @knowledge = knowledge   # our database
+     @classifiers = {}        # the classifiers
+     @strategie = strategie   # the strategy to use, naive Bayesian or Fisher's method
+
+     buildClassifiers {|w,s,probs|
+       @classifiers[w] = [s,probs]
+     }
+   end
+
+   def pMsgTypeByWords(words, n = 15, prior = @knowledge.pMsgType)
+     @strategie.call(findClassifiers(words, n), prior, n, words)
+   end
+
+   # classify a message using the n most prominent classifiers
+   def classify(words, n = 15)
+     pMsgTypeByWords(words, n).most_probable
+   end
+
+   private
+   def characteristic(f)
+     norm_prob do
+       f.call uniform(@knowledge.types)
+     end
+   end
+
+   def score(&blk)
+     characteristic(blk).distance prob{ uniform(@knowledge.types) }
+   end
+
+   def buildClassifiers
+     @knowledge.knownWords.each {|w,types|
+       s = score do |prior|
+         @knowledge.pHasWord(w,prior)
+       end
+       probs = norm_prob do
+         @knowledge.pHasWord(w, uniform(@knowledge.types))
+       end
+       yield w, s, probs.adjust_min
+     }
+   end
+
+   def findClassifiers(words, n)
+     classifiers = words.map {|w| [w, @classifiers[w]] }.delete_if {|w,c| c == nil}
+     classifiers.sort! {|x,y| x[1][0] <=> y[1][0]}
+     classifiers[0,n].map {|w,(s,prob)|
+       prob
+     }
+   end
+ end
+
+ # run some tests using the test database, some key words and the different
+ # strategies
+ classifiers = [ ["bayesian", SpamClassifier.new(SpamBaseKnowledge.new, BayesianStrategy)],
+                 ["fisher's method", SpamClassifier.new(SpamBaseKnowledge.new, FisherStrategy)] ]
+
+ testCorpus = [["free"],
+               ["monad"],
+               ["free", "asdf", "bayes", "quick", "jump", "test"],
+               ["free", "monad", "asdf", "bayes", "quick", "jump", "test"]
+              ]
+
+ puts "\ntest classifier"
+ testCorpus.each do |data|
+   printf "use corpus: #{data}\n"
+   classifiers.each do |n, c|
+     puts n
+     puts c.pMsgTypeByWords(data)
+     puts ""
+   end
+ end
+
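The prior-reweighting identity the spamplan.rb header relies on, P_prior(S|Wi) = < P_uni(S|Wi) * P_prior(S) >, is easy to verify numerically. The following is a minimal standalone sketch with made-up likelihoods and prior; it does not use the gem and is not part of the release:

# Check that re-weighting the stored uniform-prior classifier P_uni(S|Wi) by
# the true prior gives the same posterior as applying Bayes' rule directly.
# All numbers below are assumed, purely for illustration.
p_w_given = { :Spam => 0.8, :Ham => 0.2 }   # assumed likelihoods P(Wi|S)
prior     = { :Spam => 0.3, :Ham => 0.7 }   # assumed true prior P(S)

def normalize(h)
  total = h.values.inject(0.0) { |sum, v| sum + v }
  h.inject({}) { |acc, (k, v)| acc[k] = v / total; acc }
end

# classifier precomputed under a uniform prior: P_uni(S|Wi) = < P(Wi|S) * 0.5 >
p_uni = normalize(p_w_given.inject({}) { |acc, (k, v)| acc[k] = v * 0.5; acc })

direct   = normalize(p_w_given.inject({}) { |acc, (k, v)| acc[k] = v * prior[k]; acc })
reweight = normalize(p_uni.inject({})     { |acc, (k, v)| acc[k] = v * prior[k]; acc })

p direct     # {:Spam=>0.6315..., :Ham=>0.3684...}
p reweight   # identical, as the derivation in the file header claims
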
data/lib/probdsl.rb CHANGED
@@ -112,6 +112,14 @@ module ProbDSL
     }
   end

+  def guard(bool)
+    if !bool
+      shift do |cont|
+        PNil
+      end
+    end
+  end
+
   def dist(data)
     shift { |cont|
       PChoice.new do
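The new guard is what the rewritten examples above rely on: inside a prob/norm_prob block it aborts the current branch (yielding PNil) when its condition is false, replacing the old if/else-nil idiom. A minimal usage sketch, assuming uniform accepts an array of values as in examples/spamplan.rb:

require 'rubygems'
require 'probdsl'
include ProbDSL

# P(both coins are heads | at least one is heads) -- prints 1/3.
puts norm_prob {
  c1 = uniform [:H, :T]
  c2 = uniform [:H, :T]
  guard c1 == :H || c2 == :H   # drop branches that contradict the evidence
  c1 == :H && c2 == :H
}.probability(true)
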
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
  segments:
  - 0
  - 0
- - 2
- version: 0.0.2
+ - 3
+ version: 0.0.3
  platform: ruby
  authors:
  - Steffen Siering
@@ -14,7 +14,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2010-03-24 00:00:00 +01:00
+ date: 2010-03-26 00:00:00 +01:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -57,6 +57,7 @@ files:
  - examples/diagnosis.rb
  - examples/montyhall.rb
  - examples/paradox.rb
+ - examples/spamplan.rb
  - examples/test.rb
  - LICENSE
  has_rdoc: true