rb_probdsl 0.0.2 → 0.0.3
- data/examples/alarm.rb +10 -25
- data/examples/diagnosis.rb +2 -5
- data/examples/paradox.rb +4 -10
- data/examples/spamplan.rb +326 -0
- data/lib/probdsl.rb +8 -0
- metadata +4 -3
data/examples/alarm.rb
CHANGED
@@ -88,11 +88,8 @@ puts(norm_prob do
     b = p_burglary
     e = p_earthquake
     a = p_alarm(b,e)
-    if p_john(a) == :J && p_mary(a) == :M
-        b
-    else
-        nil
-    end
+    guard p_john(a) == :J && p_mary(a) == :M
+    b
 end)

 puts "\nP(A|John=true, Mary=true)"
@@ -100,11 +97,8 @@ puts(norm_prob do
     b = p_burglary
     e = p_earthquake
     a = p_alarm(b, e)
-    if p_john(a) == :J && p_mary(a) == :M
-        a
-    else
-        nil
-    end
+    guard p_john(a) == :J && p_mary(a) == :M
+    a
 end)

 # john and mary tell us for sure, the alarm went off and we know
@@ -114,11 +108,8 @@ puts(norm_prob do
     b = p_burglary
     e = p_earthquake
     a = p_alarm(b,e)
-    if a == :A && p_john(a) == :J && p_mary(a) == :M
-        b
-    else
-        nil
-    end
+    guard a == :A && p_john(a) == :J && p_mary(a) == :M
+    b
 end)

 # what is the probability john will call, if mary called?
@@ -127,11 +118,8 @@ puts(norm_prob do
     b = p_burglary
     e = p_earthquake
     a = p_alarm(b,e)
-    if p_mary(a) == :M
-        p_john(a)
-    else
-        nil
-    end
+    guard p_mary(a) == :M
+    p_john(a)
 end)

 # and probability mary will call, if john did
@@ -140,10 +128,7 @@ puts(norm_prob do
     b = p_burglary
     e = p_earthquake
     a = p_alarm(b,e)
-    if p_john(a) == :J
-        p_mary(a)
-    else
-        nil
-    end
+    guard p_john(a) == :J
+    p_mary(a)
 end)

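Every hunk in alarm.rb applies the same refactoring: an explicit `if <condition> ... else nil end` branch, which previously marked the rejected worlds with nil, is collapsed into a single `guard <condition>` statement that prunes those worlds before normalization. A minimal plain-Ruby sketch of that conditioning semantics (the `condition` helper and the joint numbers are made up for illustration; probdsl's actual implementation differs):

    # Rejection-style conditioning over a toy distribution represented as
    # [probability, value] pairs: drop the worlds failing the predicate,
    # then renormalize the survivors -- the effect `guard` has in norm_prob.
    def condition(dist)
      kept  = dist.select { |_p, v| yield v }
      total = kept.sum { |p, _v| p }
      kept.map { |p, v| [p / total, v] }
    end

    # P(Burglary | Mary calls) over a made-up joint distribution:
    joint = [
      [0.70, [:no_burglary, :no_call]],
      [0.20, [:no_burglary, :call]],
      [0.10, [:burglary,    :call]],
    ]
    p condition(joint) { |_b, call| call == :call }
    # => [[0.666..., [:no_burglary, :call]], [0.333..., [:burglary, :call]]]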
data/examples/diagnosis.rb
CHANGED
data/examples/paradox.rb
CHANGED
@@ -20,11 +20,8 @@ HERE

 puts norm_prob {
     d1 = die; d2 = die
-    if d1 == 4 || d2 == 4
-        d1 + d2 == 7
-    else
-        nil
-    end
+    guard d1 == 4 || d2 == 4
+    d1 + d2 == 7
 }.probability(true)

 puts <<HERE
@@ -33,10 +30,7 @@ The same experiment using a simulation (t = 10s):
 HERE
 puts collecting(loop_t 10) {
     d1 = die; d2 = die
-    if d1 == 4 || d2 == 4
-        d1 + d2 == 7
-    else
-        nil
-    end
+    guard d1 == 4 || d2 == 4
+    d1 + d2 == 7
 }.normalize.probability(true)

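As a sanity check on what the rewritten example should now print: conditioning on at least one die showing a 4 leaves 11 equally likely rolls, and only (3,4) and (4,3) of them sum to 7, so both the exact query and the 10-second simulation should approach 2/11, about 0.18. A brute-force enumeration in plain Ruby (no probdsl required) confirms this:

    # Enumerate P(d1 + d2 == 7 | d1 == 4 or d2 == 4) by counting outcomes.
    rolls = (1..6).flat_map { |d1| (1..6).map { |d2| [d1, d2] } }
    kept  = rolls.select { |d1, d2| d1 == 4 || d2 == 4 }  # 11 conditioned rolls
    hits  = kept.count { |d1, d2| d1 + d2 == 7 }          # (3,4) and (4,3)
    puts Rational(hits, kept.size)                        # => 2/11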
data/examples/spamplan.rb
ADDED
@@ -0,0 +1,326 @@
+#!/usr/bin/env ruby
+
+require 'rubygems'
+
+require 'probdsl'
+include ProbDSL
+
+# Bayesian Spam filter example.
+# We try to find the probability of a message's classification being spam
+# or ham using a naive bayesian filter, and a second filter using Fisher's
+# method to analyse the plausibility of the first filter's result.
+#
+# In essence the bayesian filter tries to find the probability of the message
+# being spam using the message's features and previously seen messages.
+#
+# Suppose we have the random variables:
+# S = {:Spam, :Ham}
+# Document = Set of words/features = {Wi ... Wn}
+# Wi = word Wi present or not present {true, false}
+#
+# then
+#
+# P(S|Document) = P(S|W1) * P(S|W2) * ... * P(S|Wn)
+#
+# meaning we assume all features/words to be statistically independent (hence
+# naive bayesian filter).
+#
+# Finding words in old messages and their spam/ham counts, we can drive the
+# filter.
+#
+# Next let's find the probability for spam given a word P(S|Wi):
+#
+#             P(Wi|S) * P(S)
+# P(S|Wi) = ----------------
+#                P(Wi)
+#
+# But to minimize computational effort a classifier for each word assuming a
+# uniform prior distribution P(S) is precomputed, and the true prior is used
+# later during inference. So we can store the classifiers directly in our
+# database instead of recomputing them over and over again.
+#
+# P(S|Document) = < P(S|W1) * P(S|W2) * ... >
+#               = < P(W1|S) * prior * P(W2|S) * prior * ... >
+#
+# here < P(...) > stands for "alpha * P(...)" and expresses normalization,
+# which is done automatically by our library. Thus
+#
+#             P(Wi|S) * P(S)
+# P(S|Wi) = ---------------- = < P(Wi|S) * P(S) >
+#                P(Wi)
+#
+# First we need to explain how the classifiers are precomputed and how these
+# precomputed classifiers are used to do the classification:
+#
+# Suppose P_uni is the uniform distribution for spam/ham, thus
+# P_uni(spam) = 0.5 and P_uni(ham) = 0.5. Then
+#
+#                 P(Wi | S) * P_uni(S)    P(Wi | S) * P_uni(S)
+# P_uni(S | Wi) = -------------------- = ------------------------------------
+#                        P(Wi)           Sum(s={spam,ham}) P(Wi|s) * P_uni(s)
+#
+#               = < P(Wi|S) * P_uni(S) >
+#
+# Now suppose the real prior is given, thus with the new prior:
+#
+# P_prior(S|Wi) = < P(Wi|S) * P_prior(S) >
+#
+#                   P(Wi|S) * P_prior(S)   P_uni(S|Wi) * P_prior(S)
+#                 = -------------------- = ------------------------
+#                          P(Wi)                  P_uni(S)
+#
+#                 = < P_uni(S|Wi) * P_prior(S) >
+#
+#                 = P(S|Wi)
+#
+# P(S|Document) = < P(S|W1) * P(S|W2) * ... >
+#               = < P(W1|S) * P_prior(S) * P(W2|S) * P_prior(S) * ... >
+#               = < P_uni(S|W1) * P_prior(S) * P_uni(S|W2) * P_prior(S) * ... >
+#
+# Using these, our classifiers to store in the database are P_uni(S|Wi) for
+# each word found during learning. So when learning from new messages not all
+# classifiers need to be recomputed. Alternatively one may want to store
+# P_prior(S|Wi) in the database, but then all classifiers need to be updated
+# when learning from new messages. One may even assume the prior to always
+# be uniformly distributed. In that case P(S|Document) becomes
+# P(S|Document) = < P_uni(S|W1) * P_uni(S|W2) ... >
+#
+# Instead of using the classifiers for all words found, only a subset is used.
+# This subset of classifiers is found by scoring the classifiers and using
+# the classifiers with the highest scores for the words found in the
+# document.
+#
+# Scoring is done by computing the 'quadratic distance' of a classifier to the
+# uniform distribution:
+# score = ( 0.5 - P_uni(S=spam|Wi) )^2 + ( 0.5 - P_uni(S=ham|Wi) )^2
+#
+# Furthermore if a classifier assumes P_uni(S=spam|Wi) = 0 or
+# P_uni(S=ham|Wi) = 0, the probability will be adjusted to 0.01.
+#
+
+S = [:Spam, :Ham]
+
+# module to be mixed into a 'Spam Feature Database' to compute probabilities
+# from the database.
+#
+# It's assumed that the 'Spam Feature Database' provides the following
+# functions:
+#
+# countWord(word:String, type:{:Spam, :Ham}) => Int # occurrences of word given
+#                                                   # Spam/Ham messages
+#
+# countType(type:{:Spam, :Ham}) => Int # number of Spam/Ham messages learned
+#
+module SpamDatabaseProbabilities
+    # probabilities
+    #
+    # S = {:Spam, :Ham} ; set of possible message types
+    # P(S) <- prior probability
+    #
+    # W = {set of known words}
+    # P(W|S) <- likelihood
+
+    def pMsgType # P(S)
+        prob do
+            dist @msgCounts.zip(types)
+        end
+    end
+
+    def pWord(word, type) # P(W == word | S == type)
+        n = countWord(word, type).to_f
+        total = countType(type).to_f
+        flip n / total, true, false
+    end
+
+    # P(S | W == word) = < P(W == word | S) * prior >
+    def pHasWord(word, clazz)
+        guard( pWord(word, clazz) )
+        clazz
+    end
+end
+
+# our test database
+class SpamBaseKnowledge
+    include SpamDatabaseProbabilities
+
+    def initialize
+        @msgCounts = [103, 57]
+        @wordCountTable = block1({
+            "the" => [1, 2],
+            "quick" => [1, 1],
+            "brown" => [0, 1],
+            "fox" => [0, 1],
+            "jumps" => [0, 1],
+            "over" => [0, 1],
+            "lazy" => [0, 1],
+            "dog" => [0, 1],
+            "make" => [1, 0],
+            "money" => [1, 0],
+            "in" => [1,0],
+            "online" => [1,0],
+            "casino" => [1, 0],
+            "free" => [57, 6],
+            "bayes" => [1, 10],
+            "monad" => [0, 22],
+            "hello" => [30, 32],
+            "asdf" => [40, 2]
+        }) { |h| h.default = [0,0] }
+    end
+
+    def types
+        S
+    end
+
+    def knownWords
+        @wordCountTable.keys
+    end
+
+    def countType(type)
+        if type != :Spam && type != :Ham
+            return 0
+        else
+            @msgCounts[ type2Index type ]
+        end
+    end
+
+    def countWord(word, type)
+        @wordCountTable[word][ type2Index type ]
+    end
+
+    private
+    def type2Index(type)
+        if type == :Spam then 0 else 1 end
+    end
+end
+
+# The naive bayesian classifier.
+BayesianStrategy = proc {|classifiers, prior, _, _|
+    classifiers.map { |c|
+        # compute < P_uni(S|Wi) * P_prior(S) >
+        # and use nil for invalid cases when doing bayesian inference
+        # (it is important to keep nil for the invalid cases until the
+        # end, so normalization accounts for them).
+        prior.dep { |t|
+            c.map { |t_c| t == t_c ? t : nil }
+        }
+    }.inject { |da, db| # multiply all probabilities (naive bayesian part)
+        da.dep { |t|
+            db.map { |t_b| t == t_b ? t : nil }
+        }
+    }.normalize
+}
+
+# use the bayesian classifier and analyse the result using Fisher's method
+FisherStrategy = proc {|classifiers, prior, n, words|
+    hypothesis = BayesianStrategy.call(classifiers, prior, n, words)
+    dof = classifiers.length # dof / 2
+    map = Hash.new(0)
+
+    for p,k in hypothesis
+        # chi_square = -2.0 * sum(i) { log(p_i) }
+        #            = -2.0 * log(p)
+        #
+        # compute the p-value by solving
+        #
+        # integral( x^(n-1) * exp(-x/2) / (gamma(n) * 2^n), -2 log(p), inf, dx)
+        #
+        #   integral( x^(n-1) * exp(-x/2), -2 log(p), inf, dx)
+        # = --------------------------------------------------
+        #                   gamma(n) * 2^n
+        #
+        # = p * Sum(i = 1 to n) { (-log(p))^(n - i) / (n - i)! }
+        #
+        # = p + p * Sum(i = 1 to n-1) { (-log(p))^(n - i) / (n - i)! }
+        #
+        # with n = dof
+
+        m = -Math.log(p) # 0.5 chi
+        t = p # exp(-m) = exp(log(p)) = p
+
+        # compute p value
+        tmp = 1.upto(dof-1).reduce(t) {|sum,i|
+            t *= m / i.to_f
+            sum + t
+        }
+
+        map[k] = if tmp < 1.0 then tmp else 1.0 end
+    end
+    map
+}
+
+# other part of the database computing, scoring and storing the classifiers
+# P_uni(S|Wi)
+class SpamClassifier
+
+    def initialize(knowledge, strategie)
+        @knowledge = knowledge # our database
+        @classifiers = {} # the classifiers
+        @strategie = strategie # the strategy to use, naive bayesian or fisher's method
+
+        buildClassifiers {|w,s,probs|
+            @classifiers[w] = [s,probs]
+        }
+    end
+
+    def pMsgTypeByWords(words, n = 15, prior = @knowledge.pMsgType)
+        @strategie.call(findClassifiers(words, n), prior, n, words)
+    end
+
+    # classify a message using the n most prominent classifiers
+    def classify(words, n = 15)
+        pMsgTypeByWords(words, n).most_probable
+    end
+
+    private
+    def characteristic(f)
+        norm_prob do
+            f.call uniform(@knowledge.types)
+        end
+    end
+
+    def score(&blk)
+        characteristic(blk).distance prob{ uniform(@knowledge.types) }
+    end
+
+    def buildClassifiers
+        @knowledge.knownWords.each {|w,types|
+            s = score do |prior|
+                @knowledge.pHasWord(w,prior)
+            end
+            probs = norm_prob do
+                @knowledge.pHasWord(w, uniform(@knowledge.types))
+            end
+            yield w, s, probs.adjust_min
+        }
+    end
+
+    def findClassifiers(words, n)
+        classifiers = words.map {|w| [w, @classifiers[w]] }.delete_if {|w,c| c == nil}
+        classifiers.sort! {|x,y| x[1][0] <=> y[1][0]}
+        classifiers[0,n].map {|w,(s,prob)|
+            prob
+        }
+    end
+end
+
+# run some tests using the test database, some key words and the different
+# strategies
+classifiers = [ ["bayesian", SpamClassifier.new(SpamBaseKnowledge.new, BayesianStrategy)],
+                ["fisher's method", SpamClassifier.new(SpamBaseKnowledge.new, FisherStrategy)] ]
+
+testCorpus = [["free"],
+              ["monad"],
+              ["free", "asdf", "bayes", "quick", "jump", "test"],
+              ["free", "monad", "asdf", "bayes", "quick", "jump", "test"]
+             ]
+
+puts "\ntest classifier"
+testCorpus.each do |data|
+    printf "use corpus: #{data}\n"
+    classifiers.each do |n, c|
+        puts n
+        puts c.pMsgTypeByWords(data)
+        puts ""
+    end
+end
+
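The central claim in the new file's header comment, that classifiers P_uni(S|Wi) precomputed under a uniform prior can simply be re-weighted by the real prior at inference time, is easy to spot-check numerically. A plain-Ruby sketch using the counts for "free" from the test database above (the `normalize` lambda is an illustrative stand-in for the normalization the library performs automatically):

    # P(w = "free" | S) from the word table (57/103 spam, 6/57 ham), and
    # P(S) from @msgCounts = [103, 57].
    types      = [:Spam, :Ham]
    likelihood = { Spam: 57.0 / 103, Ham: 6.0 / 57 }
    prior      = { Spam: 103.0 / 160, Ham: 57.0 / 160 }

    normalize = ->(h) { z = h.values.sum; h.transform_values { |v| v / z } }

    # direct posterior: < P(Wi|S) * P_prior(S) >
    direct = normalize.call(types.to_h { |s| [s, likelihood[s] * prior[s]] })

    # classifier precomputed under the uniform prior: < P(Wi|S) * 0.5 >
    p_uni = normalize.call(types.to_h { |s| [s, likelihood[s] * 0.5] })

    # re-weighted at inference time: < P_uni(S|Wi) * P_prior(S) >
    reweighted = normalize.call(types.to_h { |s| [s, p_uni[s] * prior[s]] })

    p direct      # => {:Spam=>0.9047..., :Ham=>0.0952...}
    p reweighted  # the same distribution, as the derivation promises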
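The least obvious code in the file is FisherStrategy's p-value loop: for the statistic chi^2 = -2 log(p) with 2*dof degrees of freedom, the upper-tail integral in the comment collapses to the partial exponential series p * Sum(i = 0 .. dof-1) (-log p)^i / i!, which the loop accumulates term by term. The same computation as a standalone function (a sketch mirroring the diff's loop; the name `fisher_p_value` is hypothetical, not part of the gem):

    # Upper-tail chi-square probability for x = -2 * Math.log(p) with
    # 2 * dof degrees of freedom, via the partial exponential series
    #   P = p * sum(i = 0 .. dof - 1) { (-Math.log(p))**i / i! }
    def fisher_p_value(p, dof)
      m    = -Math.log(p)
      term = p               # i = 0 term: p * m**0 / 0!
      sum  = term
      (1...dof).each do |i|
        term *= m / i        # extend m**i / i! one factor at a time
        sum  += term
      end
      [sum, 1.0].min         # clamp rounding noise, as the diff's loop does
    end

    fisher_p_value(0.05, 3)  # => ~0.42: a combined probability of 0.05 over
                             # three classifiers is not yet implausible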
data/lib/probdsl.rb
CHANGED
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
 segments:
 - 0
 - 0
-- 2
-version: 0.0.2
+- 3
+version: 0.0.3
 platform: ruby
 authors:
 - Steffen Siering
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []

-date: 2010-03-
+date: 2010-03-26 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -57,6 +57,7 @@ files:
 - examples/diagnosis.rb
 - examples/montyhall.rb
 - examples/paradox.rb
+- examples/spamplan.rb
 - examples/test.rb
 - LICENSE
 has_rdoc: true