rb_prob 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,25 @@
+ Copyright (c) 2010, Steffen Siering
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+     * Redistributions of source code must retain the above copyright
+       notice, this list of conditions and the following disclaimer.
+     * Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+     * Neither the name of Steffen Siering nor the
+       names of its contributors may be used to endorse or promote products
+       derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
data/examples/alarm.rb ADDED
@@ -0,0 +1,283 @@
+
+ require 'rubygems'
+
+ require 'prob'
+ include Probably
+
+ # Alarm example from "Artificial Intelligence - A Modern Approach" by Russell
+ # and Norvig, page 493 ff.
+ #
+ # Suppose you have a new, fairly reliable burglar alarm at home, but occasionally
+ # it responds to minor earthquakes. You also have two neighbors, John and Mary,
+ # who have promised to call you at work when they hear the alarm. John always
+ # calls when he hears the alarm, but sometimes confuses the telephone ringing
+ # with the alarm and calls then, too. Mary, on the other hand, likes rather
+ # loud music and sometimes misses the alarm altogether.
+ #
+ # So the bayesian network is:
+ #
+ #    B     E
+ #     \   /
+ #    _\| |/_
+ #       A
+ #      / \
+ #   |/_   _\|
+ #   J       M
+ #
+ # with probabilities:
+ #   P(B) = 0.001
+ #   P(E) = 0.002
+ #
+ #   P(A| B=true,  E=true)  = 0.95
+ #   P(A| B=true,  E=false) = 0.94
+ #   P(A| B=false, E=true)  = 0.29
+ #   P(A| B=false, E=false) = 0.001
+ #
+ #   P(J| A=true)  = 0.9
+ #   P(J| A=false) = 0.05
+ #
+ #   P(M| A=true)  = 0.7
+ #   P(M| A=false) = 0.01
+ #
+ # where B = burglar, E = earthquake, A = alarm, J = John calls and
+ # M = Mary calls
+ #
+ # ----------------------------------------------------------------------------
+ #
+ # Next we want to develop some 'equivalent' functions for querying that
+ # network and run some benchmarks.
+ #
+
+ # first let's encode the probabilities from the network
+ # P(B)
+ PBurglary = choose(0.001, :B, :notB)
+
+ # P(E)
+ PEarthquake = choose(0.002, :E, :notE)
+
+ # P(A| B = b, E = e)
+ def p_alarm(b, e)
+   pAlarmTable = {
+     [:B,    :E]    => 0.95,
+     [:B,    :notE] => 0.94,
+     [:notB, :E]    => 0.29,
+     [:notB, :notE] => 0.001
+   }
+
+   choose(pAlarmTable[[b, e]], :A, :notA)
+ end
+
+ # P(J| A = a)
+ def p_john(a)
+   choose(a == :A ? 0.9 : 0.05, :J, :notJ)
+ end
+
+ # P(M| A = a)
+ def p_mary(a)
+   choose(a == :A ? 0.7 : 0.01, :M, :notM)
+ end
+
+ # computes the joint probability and transforms the result using the given
+ # block (if any), allowing marginalization over random variables by
+ # "leaving them out"
+ #
+ # for example:
+ #   mk_joint_p {|b,e,a,j,m| [b,e,a]} will find P(b,e,a) = Sum(j,m) { P(b,e,a,j,m) }
+ #
+ def mk_joint_p(&blk)
+   PBurglary.dep { |b|
+     PEarthquake.dep { |e|
+       p_alarm(b, e).dep { |a|
+         p_john(a).dep { |j|
+           p_mary(a).dep { |m|
+             mkState(if blk then blk.call([b,e,a,j,m])
+                     else [b,e,a,j,m] end)
+           }
+         }
+       }
+     }
+   }
+ end
+
+ # computes the (optionally conditional) joint probability of the (free)
+ # random variables, like mk_joint_p.
+ #
+ # To compute a conditional probability, set random variables to a known state.
+ # For example
+ #   mk_joint_p2( {:john => :J, :mary => :M} )
+ # will compute
+ #   P(B,E,A| J = true, M = true)
+ #
+ # or
+ #   mk_joint_p2({:john => :J, :mary => :M}) {|b,e,a,j,m| b} will find
+ #   P(B | J = true, M = true)
+ def mk_joint_p2(tsts = {}, &blk)
+   PBurglary.dep { |b|
+     condition(!tsts[:burglary] || tsts[:burglary] == b) {
+       PEarthquake.dep { |e|
+         condition(!tsts[:earthquake] || tsts[:earthquake] == e) {
+           p_alarm(b,e).dep { |a|
+             condition(!tsts[:alarm] || tsts[:alarm] == a) {
+               p_john(a).dep { |j|
+                 condition(!tsts[:john] || tsts[:john] == j) {
+                   p_mary(a).dep { |m|
+                     condition(!tsts[:mary] || tsts[:mary] == m) {
+                       mkState(if blk then blk.call [b,e,a,j,m] else [b,e,a,j,m] end)
+                     }}
+                 }}
+             }}
+         }}
+     }}.normalize
+ end
+
+ # like mk_joint_p2, but using event_dep directly instead of mixing in
+ # condition-statements
+ def mk_joint_p3(tsts = {}, &blk)
+   tst_b = ifJust tsts[:burglary]
+   tst_e = ifJust tsts[:earthquake]
+   tst_a = ifJust tsts[:alarm]
+   tst_j = ifJust tsts[:john]
+   tst_m = ifJust tsts[:mary]
+
+   PBurglary.event_dep(tst_b) { |b|
+     PEarthquake.event_dep(tst_e) { |e|
+       p_alarm(b,e).event_dep(tst_a) { |a|
+         p_john(a).event_dep(tst_j) { |j|
+           p_mary(a).event_dep(tst_m) { |m|
+             mkState(if blk then blk.call [b,e,a,j,m] else [b,e,a,j,m] end)
+           }
+         }
+       }
+     }
+   }.normalize
+ end
+
+ # precompute the joint probability, so bayesian inference can be done using
+ # filter, map and query?
+ PJoint = mk_joint_p
+
+ puts 'P(B|M=true, J=true) :'
+ puts mk_joint_p3({:mary => :M, :john => :J}) {|b,e,a,j,m| b }
+
+ # puts "\njoint probability:"
+ # puts "=================="
+ # puts PJoint
+
+ # compute P(B | M=true, J=true, E=false, A=true) using all 3 different
+ # functions mk_joint_p, mk_joint_p2 and mk_joint_p3:
+ puts "\nP(B | M=true, J=true, E=false, A=true)"
+ puts "====================================="
+ puts mk_joint_p2({:mary => :M, :john => :J, :earthquake => :notE, :alarm => :A}) { |b,e,a,j,m| b }.query?(&just(:B))
+ puts mk_joint_p3({:mary => :M, :john => :J, :earthquake => :notE, :alarm => :A}) { |b,e,a,j,m| b }.probability(:B)
+ puts PJoint.filter {|b,e,a,j,m| e == :notE && j == :J && m == :M && a == :A }.query? {|b,e,a,j,m| b == :B }
+
+ # do some benchmarking:
+
+ require 'benchmark'
+
+ Benchmark.bmbm { |x|
+   i = 1000
+   x.report('joint probability:') {
+     (1..i).each {
+       mk_joint_p.filter {|b,e,a,j,m| e == :notE && j == :J && m == :M && a == :A }.query? {|b,e,a,j,m| b == :B }
+     }
+   }
+
+   x.report('joint probability precomputed:') {
+     (1..i).each {
+       PJoint.filter {|b,e,a,j,m| e == :notE && j == :J && m == :M && a == :A}.query? {|b,e,a,j,m| b == :B}
+     }
+   }
+
+   x.report('direct:') {
+     (1..i).each {
+       mk_joint_p {|b,e,a,j,m|
+         if e == :notE && j == :J && m == :M && a == :A
+           [b,a]
+         else
+           nil
+         end
+       }.query? {|b,a| b == :B}
+     }
+   }
+
+   x.report('direct with conditions:') {
+     (1..i).each {
+       mk_joint_p2({:mary => :M, :john => :J, :earthquake => :notE, :alarm => :A}) { |b,e,a,j,m| b }.query?(&just(:B))
+     }
+   }
+
+   x.report('direct with event condition:') {
+     (1..i).each {
+       mk_joint_p3({:mary => :M, :john => :J, :earthquake => :notE, :alarm => :A}) { |b,e,a,j,m| b }.query?(&just(:B))
+     }
+   }
+ }
+
+ # I'm too lazy to write up an interpretation of the benchmarks,
+ # but I guess you can make up your own mind...
+ # In short: it's always a trade-off between space and time usage, and MacRuby
+ # must improve its floating point performance...
+ #
+ # my results (on unibody MacBook 2GHz with Snow Leopard):
+ #
+ # ===========================================================================
+ #
+ # $ ruby -version
+ # ruby 1.8.7 (2008-08-11 patchlevel 72) [universal-darwin10.0]
+ #
+ # Rehearsal ------------------------------------------------------------------
+ # joint probability:              3.080000   0.190000   3.270000 (  3.273073)
+ # joint probability precomputed:  0.170000   0.000000   0.170000 (  0.171786)
+ # direct:                         2.450000   0.180000   2.630000 (  2.638515)
+ # direct with conditions:         0.780000   0.050000   0.830000 (  0.829055)
+ # direct with event condition:    0.960000   0.070000   1.030000 (  1.024606)
+ # --------------------------------------------------------- total: 7.930000sec
+ #
+ #                                     user     system      total        real
+ # joint probability:              3.010000   0.110000   3.120000 (  3.132044)
+ # joint probability precomputed:  0.170000   0.000000   0.170000 (  0.165960)
+ # direct:                         2.470000   0.150000   2.620000 (  2.634326)
+ # direct with conditions:         0.770000   0.050000   0.820000 (  0.810167)
+ # direct with event condition:    0.930000   0.050000   0.980000 (  0.995371)
+ #
+ # ===========================================================================
+ #
+ # $ jruby -version
+ # jruby 1.4.0 (ruby 1.8.7 patchlevel 174) (2009-11-02 69fbfa3) (Java HotSpot(TM) 64-Bit Server VM 1.6.0_17) [x86_64-java]
+ #
+ # Rehearsal ------------------------------------------------------------------
+ # joint probability:              3.100000   0.000000   3.100000 (  3.100000)
+ # joint probability precomputed:  0.148000   0.000000   0.148000 (  0.148000)
+ # direct:                         0.988000   0.000000   0.988000 (  0.988000)
+ # direct with conditions:         0.424000   0.000000   0.424000 (  0.424000)
+ # direct with event condition:    0.558000   0.000000   0.558000 (  0.558000)
+ # --------------------------------------------------------- total: 5.217999sec
+ #
+ #                                     user     system      total        real
+ # joint probability:              0.992000   0.000000   0.992000 (  0.992000)
+ # joint probability precomputed:  0.087000   0.000000   0.087000 (  0.087000)
+ # direct:                         0.621000   0.000000   0.621000 (  0.621000)
+ # direct with conditions:         0.321000   0.000000   0.321000 (  0.321000)
+ # direct with event condition:    0.327000   0.000000   0.327000 (  0.327000)
+ #
+ # ===========================================================================
+ #
+ # $ macruby -version
+ # MacRuby version 0.5 (ruby 1.9.0) [universal-darwin10.0, x86_64]
+ #
+ # Rehearsal ------------------------------------------------------------------
+ # joint probability:              7.710000   0.220000   7.930000 (  6.988403)
+ # joint probability precomputed:  0.140000   0.000000   0.140000 (  0.135137)
+ # direct:                         5.550000   0.170000   5.720000 (  5.117666)
+ # direct with conditions:         1.740000   0.060000   1.800000 (  1.490908)
+ # direct with event condition:    1.750000   0.060000   1.810000 (  1.526937)
+ # -------------------------------------------------------- total: 17.400000sec
+ #
+ #                                     user     system      total        real
+ # joint probability:              7.610000   0.230000   7.840000 (  6.693219)
+ # joint probability precomputed:  0.120000   0.010000   0.130000 (  0.118537)
+ # direct:                         5.600000   0.190000   5.790000 (  4.846050)
+ # direct with conditions:         1.720000   0.070000   1.790000 (  1.484840)
+ # direct with event condition:    1.750000   0.060000   1.810000 (  1.507850)
+ #
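The combinators used above compose the same way on any network. Below is a minimal sketch of the core pattern (choose, dep, filter, query?), assuming prob.rb is on the load path; the two-node rain/wet-grass network and its numbers are made up purely for illustration:

require 'prob'
include Probably

# P(R): made-up prior, it rains with probability 0.2
p_rain = choose(0.2, :Rain, :Dry)

# P(W|R): made-up conditional, grass is wet with probability 0.9 given rain
def p_wet(r)
  choose(r == :Rain ? 0.9 : 0.1, :Wet, :DryGrass)
end

# joint P(R,W), built exactly like mk_joint_p above
p_joint = p_rain.dep { |r| p_wet(r).dep { |w| mkState([r, w]) } }

# P(R = :Rain | W = :Wet): 0.18 / (0.18 + 0.08), roughly 0.692
puts p_joint.filter { |r, w| w == :Wet }.query? { |r, w| r == :Rain }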
data/examples/diagnosis.rb ADDED
@@ -0,0 +1,87 @@
+
+ require 'rubygems'
+
+ require 'prob'
+ include Probably
+
+ #
+ # Problem:
+ # Given a positive or negative test for a specific illness, we want to know
+ # the probability of being ill or healthy.
+ #
+ # Suppose the random variables I and T are given, with I = {Ill, Healthy}
+ # being the health status and T = {Negative, Positive} the test result.
+ #
+ # It is known that the probability of being 'ill' is 1 in a 1000,
+ # thus:
+ #   P(I = Ill) = 0.001 and P(I = Healthy) = 0.999
+ #
+ # Furthermore we know that the test has an accuracy of 99%, thus
+ #   P(T = Positive | I = Ill )     = 0.99
+ #   P(T = Negative | I = Ill )     = 0.01
+ #   P(T = Positive | I = Healthy ) = 0.01
+ #   P(T = Negative | I = Healthy ) = 0.99
+ #
+ # Task:
+ # compute the probability of being 'ill', given the test was positive.
+ # Using bayes rule:
+ #
+ #   P(T, I) = P(T|I) * P(I) = P(I|T) * P(T)
+ #
+ # =>
+ #
+ #             P(T|I) * P(I)
+ #   P(I|T) = --------------- = < P(T|I) * P(I) >
+ #                 P(T)
+ #
+ #
+
+ PFalseNegative = 0.01 # constant for P( T = Negative | I = Ill )
+ PFalsePositive = 0.01 # constant for P( T = Positive | I = Healthy )
+
+ # define: P(I)
+ PDisease = choose 0.001, :ILL, :HEALTHY
+
+ # P(T|I)
+ def pTest(i)
+   choose(i == :ILL ? PFalseNegative : 1 - PFalsePositive,
+          :Negative, :Positive)
+ end
+
+
+ # P(I,T) = P(T|I) * P(I)
+ # combine states and save the joint distribution in a constant
+ PTest = PDisease.dep { |i|
+   pTest(i).dep { |t| mkState([i,t]) }
+ }
+
+ testpred = Proc.new { |disease, test| disease == :ILL }
+
+ p PTest
+
+ # using filter on PTest, which is P(I,T), we find
+ # P( I | T = Positive )
+ p "probability of I if test is Positive:"
+ p PTest.filter { |disease, test| test == :Positive }
+
+ # using the testpred function and query? we can find the probability of all
+ # events testpred returns true for. In this case P( I = Ill | T = Positive )
+ p "probability of being ill"
+ p PTest.filter { |disease, test| test == :Positive }.query?(&testpred)
+
+ # next find the most probable explanation if the test was Positive:
+ p "most probable"
+ p PTest.filter { |disease, test| test == :Positive }.most_probable
+
+ # alternatively, using condition on the monadic computation directly and
+ # normalizing the result, the multiplications and memory needed may be reduced:
+ # event_dep is like 'dep {|var| condition(var == :Positive) { ... } }'
+ p "another way of finding P(I|T=Positive)"
+ p PDisease.dep { |i|
+   # event_dep will execute the block only if
+   # the test was :Positive, and return 'nil' otherwise
+   pTest(i).event_dep(just :Positive) {
+     mkState(i)
+   }
+ }.normalize
+
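As a cross-check, the posterior printed above can be computed by hand from Bayes' rule using the same constants; a small plain-Ruby sketch:

p_ill     = 0.001  # P(I = Ill)
p_pos_ill = 0.99   # P(T = Positive | I = Ill), i.e. 1 - PFalseNegative
p_pos_ok  = 0.01   # P(T = Positive | I = Healthy), i.e. PFalsePositive

# P(I = Ill | T = Positive) = P(T|I) * P(I) / P(T)
p_pos = p_pos_ill * p_ill + p_pos_ok * (1 - p_ill)
puts p_pos_ill * p_ill / p_pos  # => ~0.0902, matching the filtered distribution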
data/examples/drugtest.rb ADDED
@@ -0,0 +1,44 @@
+
+ require 'rubygems'
+
+ require 'prob'
+ include Probably
+
+ # same problem as in diagnosis.rb, but with drug users and a drug test;
+ # just using some different methods to implement the same queries...
+
+ def drugTest(puser = 0.001, p_posifuser = 0.99, p_posifclean = 0.01)
+   choose(puser, :User, :Clean).dep { |user|
+     choose(if user == :User then p_posifuser else p_posifclean end,
+            :Pos, :Neg).dep { |test|
+       mkState([user, test])
+     }
+   }
+ end
+
+ def drugTest2
+   drugTest.dep { |u,t|
+     if t == :Pos then mkState(u) else nil end
+   }
+ end
+
+ def drugTest3(puser = 0.001, p_posifuser = 0.99, p_posifclean = 0.01)
+   choose(puser, :User, :Clean).dep { |user|
+     choose(if user == :User then p_posifuser else p_posifclean end,
+            :Pos, :Neg).dep { |test|
+       condition(test == :Pos) {
+         mkState user
+       }
+     }
+   }.normalize
+ end
+
+ #p drugTest2
+
+ p drugTest
+ p drugTest.filter { |u,t| t == :Pos }
+ p drugTest(0.5).filter { |u,t| t == :Pos }
+
+ p drugTest3
+ # p drugTest3(0.5)
+
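The drugTest(0.5) call above shows how strongly the posterior depends on the prior; the closed form makes this explicit. A plain-Ruby sketch (no library needed), with the helper name chosen here just for illustration:

def p_user_given_pos(p_user, p_pos_if_user = 0.99, p_pos_if_clean = 0.01)
  # P(User | Pos) = P(Pos|User) * P(User) / P(Pos)
  p_pos = p_pos_if_user * p_user + p_pos_if_clean * (1 - p_user)
  p_pos_if_user * p_user / p_pos
end

puts p_user_given_pos(0.001) # => ~0.09: with rare users, most positives are false alarms
puts p_user_given_pos(0.5)   # => 0.99:  with an even prior, the test dominates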
data/examples/montyhall.rb ADDED
@@ -0,0 +1,106 @@
+
+ require 'rubygems'
+
+ require 'prob'
+ include Probably
+
+ # the monty hall problem is a simple game-show based probability puzzle with
+ # a puzzling outcome :)
+ #
+ # Suppose you are on a game show and you are given the choice of 3 doors.
+ # Behind one of these doors is the prize and behind the others a goat. Only
+ # the host knows behind which door the prize is, and will open one door with
+ # a goat after you made your first choice. Next you can choose if you want to
+ # switch doors or not.
+ #
+ # Question:
+ # What is the best strategy? Stay or switch?
+ # What are the probabilities of winning for each of these strategies?
+ #
+
+ # first we want to encode our state.
+ #
+ # these are the doors one can choose from:
+ $doors = [:A, :B, :C]
+
+ # the final state is a hashmap with keys:
+ #   :open     => door opened by the host
+ #   :prize    => door the prize is behind
+ #   :selected => door selected by the player
+
+ # testing function on a state to find out if we win or lose
+ $testWinner = proc do |s|
+   if s[:prize] == s[:selected]
+     :Winner
+   else
+     :Loser
+   end
+ end
+
+ # apply event function $testWinner on
+ # each possible state
+ def winnerProb(prob)
+   prob.map(&$testWinner)
+ end
+
+ # Let us encode the problem with random variables:
+ #
+ # P  = doors : door the prize was put behind
+ # C1 = doors : the door chosen in the first round by the player
+ # O  = doors : the door opened by the show's host
+ #
+
+ # first step: let's hide the prize
+ # P(P = A) = 1/3
+ # P(P = B) = 1/3
+ # P(P = C) = 1/3
+ hide = uniform( $doors.map { |d| {:prize => d} } )
+
+ # and then let the player choose one door:
+ # P(C1 = A) = 1/3
+ # P(C1 = B) = 1/3
+ # P(C1 = C) = 1/3
+ choose = uniform( $doors.map { |d| {:selected => d} } )
+
+ # combine events P and C1 and create the state representation:
+ # P(C1,P) = P(C1) * P(P)  <- because events P and C1 are independent
+ hideThenChoose = hide.mult(choose) { |p,s|
+   {:prize => p[:prize], :selected => s[:selected]}
+ }
+
+ # compute the probability distribution of the host opening a specific door
+ # given the events P and C1:
+ # P(O|C1,P)
+ # with O != C1 and O != P
+ opened = hideThenChoose.dep do |s|
+   s_ = ($doors - [s[:prize], s[:selected]]).map do |d|
+     {:open => d, :prize => s[:prize], :selected => s[:selected]}
+   end
+   uniform s_
+ end
+ #p opened
+
+ # finally implement strategy 'stay'
+ def stay(prob)
+   prob
+ end
+
+ # and strategy 'switch', choosing a door C2 with
+ # C2 != O and C2 != C1.
+ # find P(C2|O, C1, P)
+ def switch(prob)
+   prob.dep do |s|
+     s_ = ($doors - [s[:selected], s[:open]]).map do |d|
+       {:open => s[:open], :selected => d, :prize => s[:prize]}
+     end
+     uniform s_
+   end
+ end
+
+ # print some results
+ puts 'if stay, most probable result: ', winnerProb(stay(opened)).most_probable
+ puts 'if switch, most probable result: ', winnerProb(switch(opened)).most_probable
+ puts ''
+ puts 'if stay, probability of winning: ', winnerProb(stay(opened)).probability(:Winner)
+ puts 'if switch, probability of winning: ', winnerProb(switch(opened)).probability(:Winner)
+
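The exact 1/3 vs. 2/3 result above can be cross-checked with a quick Monte Carlo simulation; a sketch in plain Ruby (Array#sample assumes Ruby >= 1.9):

doors = [:A, :B, :C]
n = 100_000
stay_wins = switch_wins = 0

n.times do
  prize    = doors.sample                        # prize is hidden
  selected = doors.sample                        # player picks a door
  opened   = (doors - [prize, selected]).sample  # host opens a goat door
  stay_wins   += 1 if selected == prize
  switch_wins += 1 if (doors - [selected, opened]).first == prize
end

puts "stay:   #{stay_wins.to_f / n}"   # => ~0.333
puts "switch: #{switch_wins.to_f / n}" # => ~0.667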
data/examples/spamplan.rb ADDED
@@ -0,0 +1,326 @@
+ #!/usr/bin/env ruby
+
+ #require 'rubygems'
+
+ require '../lib/prob'
+ include Probably
+
+ # Bayesian spam filter example.
+ # We try to find the probability of a message's classification being spam
+ # or ham, using a naive bayesian filter and a second filter using fisher's
+ # method to analyse the plausibility of the first filter's result.
+ #
+ # In essence, the bayesian filter tries to find the probability of the message
+ # being spam using the message's features and previously seen messages.
+ #
+ # Suppose we have the random variables:
+ #   S        = {:Spam, :Ham}
+ #   Document = set of words/features = {Wi ... Wn}
+ #   Wi       = word Wi present or not present {true, false}
+ #
+ # then
+ #
+ # P(S|Document) = P(S|W1) * P(S|W2) * ... * P(S|Wn)
+ #
+ # meaning we assume all features/words to be statistically independent (hence
+ # naive bayesian filter).
+ #
+ # Counting the words in previously seen messages and their spam/ham
+ # occurrences, we can train the filter.
+ #
+ # Next let's find the probability for spam given a word, P(S|Wi):
+ #
+ #            P(Wi|S) * P(S)
+ # P(S|Wi) = ---------------
+ #                P(Wi)
+ #
+ # But to minimize computational effort, a classifier for each word assuming a
+ # uniform prior distribution P(S) is precomputed, and the true prior is used
+ # later during inference. So we can store the classifiers directly in our
+ # database instead of recomputing them over and over again.
+ #
+ # P(S|Document) = < P(S|W1) * P(S|W2) * ... >
+ #               = < P(W1|S) * prior * P(W2|S) * prior * ... >
+ #
+ # here < P(...) > stands for "alpha * P(...)" and expresses normalization, which
+ # is done automatically by our library. Thus
+ #
+ #            P(Wi|S) * P(S)
+ # P(S|Wi) = ---------------- = < P(Wi|S) * P(S) >
+ #                P(Wi)
+ #
+ # First we need to explain how the classifiers are precomputed and how these
+ # precomputed classifiers are used to do the classification:
+ #
+ # Suppose P_uni is the uniform distribution for spam/ham, thus P_uni(spam) = 0.5
+ # and P_uni(ham) = 0.5. Then
+ #
+ #                 P(Wi | S) * P_uni(S)       P(Wi | S) * P_uni(S)
+ # P_uni(S | Wi) = -------------------- = ------------------------------------
+ #                        P(Wi)           Sum(s={spam,ham}) P(Wi|s) * P_uni(s)
+ #
+ #               = < P(Wi|S) * P_uni(S) >
+ #
+ # Now suppose the real prior is given, thus with the new prior:
+ #
+ # P_prior(S|Wi) = < P(Wi|S) * P_prior(S) >
+ #
+ #                 P(Wi|S) * P_prior(S)   P_uni(S|Wi) * P_prior(S)
+ #               = -------------------- = ------------------------
+ #                        P(Wi)                   P_uni(S)
+ #
+ #               = < P_uni(S|Wi) * P_prior(S) >
+ #
+ #               = P(S|Wi)
+ #
+ # P(S|Document) = < P(S|W1) * P(S|W2) * ... >
+ #               = < P(W1|S) * P_prior(S) * P(W2|S) * P_prior(S) * ... >
+ #               = < P_uni(S|W1) * P_prior(S) * P_uni(S|W2) * P_prior(S) * ... >
+ #
+ # Using these, the classifiers to store in the database are P_uni(S|Wi) for
+ # each word found during learning. So when learning from new messages, not all
+ # classifiers need to be recomputed. Alternatively one may want to store
+ # P_prior(S|Wi) in the database, but then all classifiers need to be updated
+ # when learning from new messages. One may even assume the prior to always
+ # be uniformly distributed. In that case P(S|Document) becomes
+ # P(S|Document) = < P_uni(S|W1) * P_uni(S|W2) ... >
+ #
+ # Instead of using the classifiers for all words found, only a subset is used.
+ # This subset is found by scoring the classifiers and using the ones with the
+ # highest scores for the words found in the document.
+ #
+ # Scoring is done by computing the 'quadratic distance' of a classifier to the
+ # uniform distribution:
+ # score = ( 0.5 - P_uni(S=spam|Wi) )^2 + ( 0.5 - P_uni(S=ham|Wi) )^2
+ #
+ # Furthermore, if a classifier assumes P_uni(S=spam|Wi) = 0 or P_uni(S=ham|Wi) = 0,
+ # the probability will be adjusted to 0.01.
+ #
+
+ S = [:Spam, :Ham]
+
+ # module to be mixed into a 'spam feature database' to compute probabilities
+ # from the database.
+ #
+ # It's assumed that the 'spam feature database' provides the following
+ # functions:
+ #
+ # countWord(word:String, type:{:Spam, :Ham}) => Int # occurrences of word given
+ #                                                   # Spam/Ham messages
+ #
+ # countType(type:{:Spam, :Ham}) => Int # number of Spam/Ham messages learned
+ #
+ module SpamDatabaseProbabilities
+   # probabilities
+   #
+   # S = {:Spam, :Ham} ; set of possible message types
+   # P(S)   <- prior probability
+   #
+   # W = {set of known words}
+   # P(W|S) <- likelihood
+
+   def pMsgType # P(S)
+     enumDist types, @msgCounts
+   end
+
+   def pWord(word, type) # P(W == word | S == type)
+     n     = countWord(word, type).to_f
+     total = countType(type).to_f
+     choose n / total, true, false
+   end
+
+   # P(S | W == word) = < P(W == word | S) * prior >
+   def pHasWord(word, prior = pMsgType)
+     prior.dep { |t|
+       pWord(word, t).event_dep(just true) {
+         mkState(t)
+       }
+     }.normalize
+   end
+
+   # P(S | W1 == word1, W2 == word2, ...) = < P(S|W1) * P(S|W2) * ... >
+   def pHasWords(words, prior = pMsgType)
+     words.reduce(prior) { |p,w| pHasWord(w, p) }
+   end
+ end
+
+ # our test database
+ class SpamBaseKnowledge
+   include SpamDatabaseProbabilities
+
+   def initialize
+     @msgCounts = [103, 57]
+     @wordCountTable = block1({
+       "the"    => [1, 2],
+       "quick"  => [1, 1],
+       "brown"  => [0, 1],
+       "fox"    => [0, 1],
+       "jumps"  => [0, 1],
+       "over"   => [0, 1],
+       "lazy"   => [0, 1],
+       "dog"    => [0, 1],
+       "make"   => [1, 0],
+       "money"  => [1, 0],
+       "in"     => [1, 0],
+       "online" => [1, 0],
+       "casino" => [1, 0],
+       "free"   => [57, 6],
+       "bayes"  => [1, 10],
+       "monad"  => [0, 22],
+       "hello"  => [30, 32],
+       "asdf"   => [40, 2]
+     }) { |h| h.default = [0,0] }
+   end
+
+   def types
+     S
+   end
+
+   def knownWords
+     @wordCountTable.keys
+   end
+
+   def countType(type)
+     if type != :Spam && type != :Ham
+       return 0
+     else
+       @msgCounts[ type2Index type ]
+     end
+   end
+
+   def countWord(word, type)
+     @wordCountTable[word][ type2Index type ]
+   end
+
+   private
+   def type2Index(type)
+     if type == :Spam then 0 else 1 end
+   end
+ end
+
+ # The naive bayesian classifier.
+ BayesianStrategy = proc { |classifiers, prior, _, _|
+   classifiers.map { |c|
+     # compute < P_uni(S|Wi) * P_prior(S) >
+     # and use nil for invalid cases when doing bayesian inference (it is
+     # important to keep nil for invalid cases until the very end, for
+     # normalization).
+     prior.dep { |t|
+       c.map { |t_c| t == t_c ? t : nil }
+     }
+   }.inject { |da, db| # multiply all probabilities (the naive bayesian part)
+     da.dep { |t|
+       db.map { |t_b| t == t_b ? t : nil }
+     }
+   }.normalize
+ }
+
+ # use the bayesian classifier and analyse the result using fisher's method
+ FisherStrategy = proc { |classifiers, prior, n, words|
+   hypothesis = BayesianStrategy.call(classifiers, prior, n, words)
+   dof = classifiers.length # dof / 2
+   map = Hash.new(0)
+
+   for p,k in hypothesis
+     # chi_square = -2.0 * sum(i) { log(p_i) }
+     #            = -2.0 * log(p)
+     #
+     # compute the p-value by solving
+     #
+     # integral( x^(n-1) * exp(-x/2) / (gamma(n) * 2^n), -2 log(p), inf, dx)
+     #
+     #   integral( x^(n-1) * exp(-x/2), -2 log(p), inf, dx)
+     # = --------------------------------------------------
+     #                  gamma(n) * 2^n
+     #
+     # = p * Sum(i = 1 to n) { (-log(p))^(n - i) / (n - i)! }
+     #
+     # = p + p * Sum(i = 1 to n-1) { (-log(p))^(n - i) / (n - i)! }
+     #
+     # with n = dof
+
+     m = -Math.log(p) # 0.5 chi
+     t = p            # exp(-m) = exp(log(p)) = p
+
+     # compute p value
+     tmp = 1.upto(dof-1).reduce(t) { |sum,i|
+       t *= m / i.to_f
+       sum + t
+     }
+
+     map[k] = if tmp < 1.0 then tmp else 1.0 end
+   end
+   map
+ }
+
+ # other part of the database: computing, scoring and storing the classifiers
+ # P_uni(S|Wi)
+ class SpamClassifier
+
+   def initialize(knowledge, strategie)
+     @knowledge   = knowledge # our database
+     @classifiers = {}        # the classifiers
+     @strategie   = strategie # the strategy to use: naive bayesian or fisher's method
+
+     buildClassifiers { |w,s,probs|
+       @classifiers[w] = [s,probs]
+     }
+   end
+
+   def pMsgTypeByWords(words, n = 15, prior = @knowledge.pMsgType)
+     @strategie.call(findClassifiers(words, n), prior, n, words)
+   end
+
+   # classify a message using the n most prominent classifiers
+   def classify(words, n = 15)
+     pMsgTypeByWords(words, n).most_probable
+   end
+
+   private
+   def characteristic(f)
+     f.call uniform(@knowledge.types)
+   end
+
+   def score(f = nil, &blk)
+     pDistance( characteristic(f || blk), uniform(@knowledge.types) )
+   end
+
+   def buildClassifiers
+     @knowledge.knownWords.each { |w|
+       s     = score { |prior| @knowledge.pHasWord(w, prior) }
+       probs = adjustMinimums(@knowledge.pHasWord(w, uniform(S)))
+       yield w, s, probs
+     }
+   end
+
+   def findClassifiers(words, n)
+     classifiers = words.map { |w| [w, @classifiers[w]] }.delete_if { |w,c| c == nil }
+     classifiers.sort! { |x,y| y[1][0] <=> x[1][0] } # highest scores first
+     classifiers[0,n].map { |w,(s,prob)|
+       prob
+     }
+   end
+ end
+
+ # run some tests using the test database, some key words and the different
+ # strategies
+ classifiers = [ ["bayesian",        SpamClassifier.new(SpamBaseKnowledge.new, BayesianStrategy)],
+                 ["fisher's method", SpamClassifier.new(SpamBaseKnowledge.new, FisherStrategy)] ]
+
+ testCorpus = [ ["free"],
+                ["monad"],
+                ["free", "asdf", "bayes", "quick", "jump", "test"],
+                ["free", "monad", "asdf", "bayes", "quick", "jump", "test"]
+              ]
+
+ puts "\ntest classifier"
+ testCorpus.each do |data|
+   printf "use corpus: #{data}\n"
+   classifiers.each do |n, c|
+     puts n
+     puts c.pMsgTypeByWords(data)
+     puts ""
+   end
+ end
+
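For a two-word message, the normalized product that BayesianStrategy computes can be reproduced with plain arithmetic; a sketch where the per-word classifier values P_uni(S|Wi) are made up for illustration (the prior matches @msgCounts = [103, 57] above):

# hypothetical per-word classifiers P_uni(S|Wi)
p_spam_w1, p_ham_w1 = 0.90, 0.10 # word 1 looks spammy
p_spam_w2, p_ham_w2 = 0.30, 0.70 # word 2 looks hammy

prior_spam = 103.0 / 160         # P_prior(:Spam)
prior_ham  =  57.0 / 160         # P_prior(:Ham)

# < P_uni(S|W1) * P_prior(S) * P_uni(S|W2) * P_prior(S) >, as derived above
spam = (p_spam_w1 * prior_spam) * (p_spam_w2 * prior_spam)
ham  = (p_ham_w1  * prior_ham)  * (p_ham_w2  * prior_ham)
z    = spam + ham
puts "spam: #{spam / z}, ham: #{ham / z}"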
data/lib/prob.rb ADDED
@@ -0,0 +1,328 @@
+
+ # The Probably module provides functions and a discrete Distribution class for
+ # monadic functional probabilistic programming in ruby.
+
+ puts 'loading rb_prob'
+
+ module Probably
+
+   # simple helper: runs the given block with its first argument and
+   # returns that first argument
+   def block1(x, &blk)
+     blk.call(x)
+     x
+   end
+
+   # given a block, return a new Proc defined on the range [0..1]
+   def mkShapeFunction
+     proc { |x|
+       if x < 0 || x > 1.0 then 0 else yield x end
+     }
+   end
+
+   # creates a Proc computing a gaussian distribution
+   # in range [0..1] given a mean and deviation
+   def normalDistShape(mean, dev)
+     mkShapeFunction { |x|
+       u = (x - mean) / dev
+       Math.exp(-0.5 * u * u) / Math.sqrt(2 * Math::PI)
+     }
+   end
+
+   # The discrete Distribution representation class
+   class Distribution
+     include Enumerable
+
+     protected
+     def initializeLists(data, shape)
+       @map = Hash.new(0)
+       count = data.length
+       data.each_with_index { |val, i|
+         @map[val] += shape.call( Float(i + 1) / count )
+       }
+     end
+
+     def initializeMap(m)
+       @map = Hash.new(0)
+       m.each { |k,v| @map[k] = v }
+       self.normalizeProbabilities
+     end
+
+     def normalizeProbabilities
+       sum = Float( @map.values.inject(:+) )
+       @map.keys.each { |k| @map[k] /= sum } if sum != 1.0
+     end
+
+     public
+
+     # Creates a new discrete Distribution with
+     # said constructor type (init_type) and initial data;
+     # upon construction the data are automatically normalized.
+     # if init_type is:
+     # - :MAP     then the given map is used directly and should not
+     #            be used anymore by anyone else but the current
+     #            distribution class
+     # - :MAPCOPY then the given map is copied for further use
+     # - :LISTS   then the second parameter is the list of values and the
+     #            third parameter a shape function assigning each value its
+     #            (relative) probability
+     def initialize(init_type, *data)
+       case init_type
+       when :MAP
+         @map = data[0]
+       when :MAPCOPY
+         initializeMap(data[0])
+       when :LISTS
+         initializeLists(data[0], data[1])
+       else
+         raise "unable to create probability distribution"
+       end
+       self.normalizeProbabilities
+     end
+
+     # set of keys in the distribution
+     def keys
+       @map.keys
+     end
+
+     # returns the normalized distribution, removing
+     # all nil values.
+     # In combination with condition, normalize must be used
+     # to compute the normalization of bayes' theorem
+     def normalize
+       if @map[nil] > 0.0
+         filter { |v| v != nil }
+       else
+         self
+       end
+     end
+
+     # returns the probability of event val from
+     # the distribution
+     def probability(val)
+       @map[val]
+     end
+
+     # use most_probable to retrieve the most probable event and
+     # its probability from the given distribution
+     def most_probable
+       @map.reduce { |best, value|
+         if best[1] < value[1] then value else best end
+       }
+     end
+
+     # randomly pick a key-value pair with respect to its probability
+     # in the given distribution
+     def pick
+       r = rand
+       sum = 0
+       for k,p in @map
+         sum += p
+         return k,p if r < sum
+       end
+       return nil
+     end
+
+     # iterates over (probability, value) pairs
+     def each
+       @map.each { |k, p| yield p, k }
+     end
+
+     # transforms each value with the given block, merging values
+     # that map to the same result
+     def map
+       tmp = Hash.new(0)
+       for k,p in @map
+         tmp[yield(k)] += p
+       end
+       Distribution.new(:MAP, tmp)
+     end
+
+     # keeps only the values the block accepts; the result is renormalized
+     def filter
+       Distribution.new :MAP, @map.reject { |k,v|
+         !(yield k)
+       }
+     end
+
+     # sums the probabilities of all values the block accepts
+     def query?
+       @map.reduce(0) { |probability, (dat,dp)|
+         if yield dat then probability + dp
+         else probability end
+       }
+     end
+
+     # flattens a distribution of distributions
+     def join
+       tmp = Hash.new(0)
+
+       for dist,p1 in @map
+         for p2, k in dist
+           tmp[k] += p1 * p2
+         end
+       end
+       Distribution.new(:MAP, tmp)
+     end
+
+     # monadic bind: the block maps each value to a distribution (or nil),
+     # and the results are combined into one distribution
+     def dep
+       m = Hash.new(0)
+       for k1,p1 in @map
+         tmp = yield k1
+         if tmp != nil
+           for p2, k in tmp
+             m[k] += p1 * p2
+           end
+         end
+       end
+       Distribution.new(:MAP, m)
+     end
+
+     # like dep, but the block only runs for values accepted by pred;
+     # all other values are mapped to the nil state
+     def event_dep(pred)
+       self.dep { |x|
+         if !pred.call x
+           mkState nil
+         else
+           yield x
+         end
+       }
+     end
+
+     # product distribution of two independent distributions; an optional
+     # block combines the value pairs
+     def mult(dist2)
+       self.dep do |k|
+         if block_given? then dist2.map { |k2| yield(k, k2) }
+         else dist2.map { |k2| [k, k2] }
+         end
+       end
+     end
+
+     def * (dist2)
+       self.mult dist2
+     end
+
+     # computes the expectation, given that the keys in the distribution
+     # are numeric
+     def expectation
+       @map.reduce(0) { |sum, (k,p)| sum + k.to_f * p }
+     end
+
+     # computes the variance, given that the keys in the distribution
+     # are numeric
+     def variance
+       expected = self.expectation
+       @map.reduce(0) { |sum, (k,p)|
+         tmp = (k.to_f - expected)
+         sum + tmp * tmp * p
+       }
+     end
+
+     # computes the standard deviation, given that the keys in the
+     # distribution are numeric
+     def std_dev
+       Math.sqrt( self.variance )
+     end
+
+     def to_s
+       @map.reduce("") { |str,(k,p)|
+         str + "#{k} : #{p * 100} %\n"
+       }
+     end
+   end
+
+   # creates a uniformly distributed Distribution from an array of values
+   def uniform(data)
+     Distribution.new :LISTS, data, mkShapeFunction { |x| 1 }
+   end
+
+   # creates a linearly distributed Distribution from an array of values
+   def linear(data)
+     Distribution.new :LISTS, data, mkShapeFunction { |x| x }
+   end
+
+   # creates an exp(-x) distributed Distribution from an array of values
+   def negExp(data)
+     Distribution.new :LISTS, data, mkShapeFunction { |x| Math.exp(-x) }
+   end
+
+   # creates a Distribution from an array of values using a gaussian distribution
+   def normal(data, mean = 0.5, dev = 0.5)
+     Distribution.new :LISTS, data, normalDistShape(mean, dev)
+   end
+
+   # creates a distribution from two arrays, the first holding the
+   # distribution's values and the second the corresponding probabilities
+   # (to be normalized)
+   # - data: array of input values
+   # - dist: array of probabilities
+   def enumDist(data, dist)
+     if data.length != dist.length
+       raise "data and distribution length must be equal"
+     end
+
+     # the shape function is called with x = (i+1)/length, so round back to
+     # the integer index i
+     Distribution.new :LISTS, data, mkShapeFunction { |x| dist[(x * dist.length).round - 1] }
+   end
+
+   # Creates a new probability distribution from a given map:
+   # m = { key1 => probability1, key2 => probability2, key3 => ... }
+   def mapDist(m)
+     Distribution.new :MAPCOPY, m
+   end
+
+   def distWithShape(data, &blk)
+     Distribution.new :LISTS, data, mkShapeFunction(&blk)
+   end
+
+   # binary distribution: elem1 with probability p, elem2 with 1 - p
+   def choose(p, elem1, elem2)
+     tmp = Hash.new(0)
+     tmp[elem1] = p
+     tmp[elem2] = 1 - p
+     Distribution.new :MAP, tmp
+   end
+
+   # the monadic unit: a distribution with a single certain value
+   def mkState(a)
+     tmp = Hash.new(0)
+     tmp[a] = 1
+     Distribution.new :MAP, tmp
+   end
+
+   def histogram(a)
+     block1(Hash.new(0)) do |r|
+       for x in a
+         r[x] += 1
+       end
+     end
+   end
+
+   # returns the block's distribution if b holds, else the nil state
+   # (to be pruned later by normalize)
+   def condition(b)
+     if b then yield else mkState nil end
+   end
+
+   # events
+   def mkEvent(&f)
+     f
+   end
+
+   def just(x)
+     mkEvent { |y| x == y }
+   end
+
+   def ifJust(x)
+     if x == nil then proc { |y| true }
+     else proc { |y| x == y } end
+   end
+
+   def oneOf(*elems)
+     proc { |y| elems.include? y }
+   end
+
+   # squared euclidean distance between two distributions
+   def pDistance(dist1, dist2)
+     (dist1.keys | dist2.keys).reduce(0) { |sum,k|
+       tmp = dist1.probability(k) - dist2.probability(k)
+       sum + tmp * tmp
+     }
+   end
+
+   # raises all probabilities below newMin to newMin (result is renormalized)
+   def adjustMinimums(dist, newMin = 0.01)
+     tmp = Hash.new(0)
+     dist.each do |p,k|
+       tmp[k] = if p > newMin then p else newMin end
+     end
+     Distribution.new :MAP, tmp
+   end
+
+ end
+
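A minimal usage sketch of the Distribution API defined above (uniform, mult with a block, probability, expectation, most_probable), using the sum of two dice:

require 'prob'
include Probably

die = uniform([1, 2, 3, 4, 5, 6])

# joint distribution of two independent dice, mapped to their sum
sum = die.mult(die) { |a, b| a + b }

puts sum.probability(7) # => ~0.1667 (6 of the 36 outcomes)
puts sum.expectation    # => 7.0
p    sum.most_probable  # => [7, 0.1666...]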
metadata ADDED
@@ -0,0 +1,61 @@
+ --- !ruby/object:Gem::Specification
+ name: rb_prob
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Steffen Siering
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2010-03-21 00:00:00 +01:00
+ default_executable:
+ dependencies: []
+
+ description: "monadic probabilistic programming library for ruby. for examples see github repository: http://github.com/urso/rb_prob"
+ email: steffen <dot> siering -> gmail <dot> com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - lib/prob.rb
+ - examples/alarm.rb
+ - examples/diagnosis.rb
+ - examples/drugtest.rb
+ - examples/montyhall.rb
+ - examples/spamplan.rb
+ - LICENSE
+ has_rdoc: true
+ homepage: http://github.com/urso/rb_prob
+ licenses:
+ - BSD3
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.3.5
+ signing_key:
+ specification_version: 3
+ summary: monadic probabilistic programming for ruby
+ test_files: []
+