sbn 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/inference.rb ADDED
@@ -0,0 +1,65 @@
+ class Sbn
+   class Net
+     MCMC_DEFAULT_SAMPLE_COUNT = 2000
+ 
+     # Returns a hash containing the estimated posterior probability of each
+     # possible state for the specified variable, based on previously-supplied
+     # evidence, using the Markov Chain Monte Carlo algorithm. The MCMC algorithm
+     # generates each event by making a random change to the preceding event. The
+     # next state is generated by randomly sampling a value for one of the
+     # nonevidence variables Xi, conditioned on the current values of the
+     # variables in the Markov blanket of Xi. MCMC basically wanders randomly
+     # around the state space--the space of possible complete
+     # assignments--flipping one variable at a time, but keeping the evidence
+     # variables fixed. The sampling process works because it settles into a
+     # "dynamic equilibrium" in which the long-run fraction of time spent in each
+     # state is proportional to its posterior probability.
+     #
+     # Optionally accepts a block that receives a number between 0 and 1
+     # indicating the fraction of the sampling completed so far.
+     def query_variable(varname, callback = nil)
+       # keep track of number of times a state has been observed
+       state_frequencies = {}
+       varname = varname.to_underscore_sym
+       states = @variables[varname].states
+       states.each {|s| state_frequencies[s] ||= 0 }
+ 
+       e = generate_random_event
+       relevant_evidence = e.reject {|key, val| @variables[key].set_in_evidence?(@evidence) }
+ 
+       MCMC_DEFAULT_SAMPLE_COUNT.times do |n|
+         state = e[varname]
+         state_frequencies[state] += 1
+ 
+         relevant_evidence.each do |vname, vstate|
+           e[vname] = @variables[vname].get_random_state_with_markov_blanket(e)
+         end
+         yield(n / MCMC_DEFAULT_SAMPLE_COUNT.to_f) if block_given?
+       end
+ 
+       # normalize results
+       magnitude = 0
+       returnval = {}
+       state_frequencies.each_value {|count| magnitude += count }
+       state_frequencies.each {|state, count| returnval[state] = count / magnitude.to_f }
+       returnval
+     end
+ 
+     private
+     # Returns an event in which variables that are not fixed by the evidence are set
+     # to random states whose frequencies (after repeated calls) are consistent
+     # with the network's joint probability distribution.
+     def generate_random_event
+       unset_variables = @variables.reject {|name, variable| variable.set_in_evidence?(@evidence) }
+       new_evidence = @evidence.dup
+       until unset_variables.empty? do
+         settable_variables = unset_variables.reject {|name, variable| !variable.can_be_evaluated?(new_evidence) }
+         settable_variables.each do |name, variable|
+           unset_variables.delete(name)
+           new_evidence[name] = variable.get_random_state(new_evidence)
+         end
+       end
+       new_evidence
+     end
+   end
+ end
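
For orientation, a hypothetical usage sketch of the inference API above (it is not part of the package). The Sbn::Variable constructor signature, the CPT ordering, and the ability to pass a full CPT before wiring parents are assumptions inferred from the other files in this diff; add_child, set_evidence and query_variable all appear in the code shown here.

    # Hypothetical usage sketch (not from the gem's sources or docs).
    require 'sbn'

    net  = Sbn::Net.new("rain_net")
    rain = Sbn::Variable.new(net, :rain,      [0.2, 0.8])            # P(rain), P(no rain)
    wet  = Sbn::Variable.new(net, :grass_wet, [0.9, 0.1, 0.2, 0.8])  # CPT ordering assumed
    rain.add_child(wet)

    net.set_evidence :grass_wet => :true
    # The block receives the fraction of the 2000 MCMC samples drawn so far.
    posterior = net.query_variable(:rain) {|fraction| print "\r#{(fraction * 100).to_i}%" }
    puts
    p posterior   # => { :true => ..., :false => ... } (estimates vary from run to run)
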
data/lib/learning.rb ADDED
@@ -0,0 +1,141 @@
+ class Sbn
+   class Variable
+     NEGLIGIBLE_PROBABILITY = 0.0001
+ 
+     def is_complete_evidence?(evidence) # :nodoc:
+       varnames = [evidence_name.to_s]
+       @parents.each {|p| varnames << p.name.to_s }
+       yield(varnames) if block_given?
+ 
+       # ignore covariables when determining whether evidence is complete or not
+       varnames.map! do |n|
+         n = n.split('_').first if n =~ /covar/
+         n
+       end
+       varnames.uniq!
+       varnames.sort!
+ 
+       keys = evidence.keys.map {|k| k.to_s }
+       keys.sort!
+       varnames & keys == varnames
+     end
+ 
+     def add_sample_point(evidence)
+       # reject incomplete evidence sets
+       raise "Incomplete sample points" unless is_complete_evidence?(evidence)
+ 
+       # Because string variables add new variables to the net during learning,
+       # the process of determining state frequencies has to be deferred until
+       # the end. For now, we'll just store the evidence and use it later.
+       @sample_points ||= []
+       @sample_points << evidence
+     end
+ 
+     def set_probabilities_from_sample_points!
+       return unless @sample_points
+       accumulate_state_frequencies
+ 
+       # find the sums for each parent combination so we
+       # know how to normalize their associated states
+       sums = {}
+       state_combinations.each do |comb|
+         parent_comb = comb.dup
+ 
+         # remove state for this node so that all
+         # that is left is the parent combination
+         parent_comb.pop
+         @state_frequencies[comb] ||= 0
+         sums[parent_comb] ||= 0
+ 
+         sums[parent_comb] += @state_frequencies[comb]
+       end
+ 
+       probabilities = []
+       count_of_zero_prob_states, count_of_nonzero_prob_states = {}, {}
+       last_state = @states.first
+       state_combinations.each do |comb|
+         state = comb.last
+         parent_comb = comb.dup
+         parent_comb.pop
+         prob = @state_frequencies[comb] / sums[parent_comb].to_f
+         probabilities << (prob == 0.0 ? NEGLIGIBLE_PROBABILITY : prob)
+ 
+         # Keep track of how many of this node's states were
+         # empty for this particular parent combination, so that
+         # we can pad them with tiny numbers later. Otherwise,
+         # some exact inference algorithms will fail.
+         if prob == 0.0
+           count_of_zero_prob_states[parent_comb] ||= 0
+           count_of_zero_prob_states[parent_comb] += 1
+         else
+           count_of_nonzero_prob_states[parent_comb] ||= 0
+           count_of_nonzero_prob_states[parent_comb] += 1
+         end
+       end
+ 
+       # pad the zero probabilities
+       count = 0
+       state_combinations.each do |comb|
+         state = comb.last
+         parent_comb = comb.dup
+         parent_comb.pop
+         amount_to_subtract = (count_of_zero_prob_states[parent_comb] || 0) *
+                              NEGLIGIBLE_PROBABILITY /
+                              count_of_nonzero_prob_states[parent_comb].to_f
+         p = probabilities[count]
+         p = (p > NEGLIGIBLE_PROBABILITY ? p - amount_to_subtract : p)
+         probabilities[count] = p
+         count += 1
+       end
+ 
+       # assign new probabilities
+       set_probabilities(probabilities)
+     end
+ 
+     private
+     def accumulate_state_frequencies
+       @sample_points.each do |evidence|
+         combination_instance = []
+         @parents.each {|p| combination_instance << p.get_observed_state(evidence) }
+         combination_instance << get_observed_state(evidence)
+         @state_frequencies[combination_instance] ||= 0
+         @state_frequencies[combination_instance] += 1
+       end
+     end
+   end
+ 
+   class Net
+     # Expects data to be an array of hashes containing complete sets of evidence
+     # for all variables in the network. Constructs probability tables for each variable
+     # based on the data.
+     def learn(data)
+       data.each {|evidence| add_sample_point(evidence) }
+       set_probabilities_from_sample_points!
+     end
+ 
+     def add_sample_point(evidence)
+       evidence = symbolize_evidence(evidence)
+       @variables.keys.each {|key| @variables[key].add_sample_point(evidence) }
+     end
+ 
+     def set_probabilities_from_sample_points!
+       # we must first conduct learning on parents then their children
+       unlearned_variables = @variables.keys
+ 
+       count = 0
+       size = @variables.size.to_f
+       until unlearned_variables.empty?
+         learnable_variables = @variables.reject do |name, var|
+           reject = false
+           var.parents.each {|p| reject = true if unlearned_variables.include?(p.name) }
+           reject
+         end
+         learnable_variables.keys.each do |key|
+           @variables[key].set_probabilities_from_sample_points!
+           count += 1
+           unlearned_variables.delete(key)
+         end
+       end
+     end
+   end
+ end
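
A hypothetical sketch of the learning entry point shown above (not part of the package). Each element of the data array must be a complete evidence hash covering every variable, which is what is_complete_evidence? enforces; the variables are created without probabilities on the assumption that learn() fills them in, following the same assumed constructor form as in the earlier sketch.

    # Hypothetical parameter-learning sketch (not from the gem's sources or docs).
    net  = Sbn::Net.new("rain_net")
    rain = Sbn::Variable.new(net, :rain)        # probabilities omitted; learn() fills them in
    wet  = Sbn::Variable.new(net, :grass_wet)
    rain.add_child(wet)

    # One hash per observed case, with a state for every variable in the net.
    data = [
      {:rain => :true,  :grass_wet => :true},
      {:rain => :true,  :grass_wet => :false},
      {:rain => :false, :grass_wet => :false},
      {:rain => :false, :grass_wet => :true}
    ]
    net.learn(data)   # add_sample_point for each case, then set_probabilities_from_sample_points!
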
data/lib/net.rb ADDED
@@ -0,0 +1,49 @@
+ class Sbn
+   class Net
+     attr_reader :name, :variables
+ 
+     def initialize(name = '')
+       @@net_count ||= 0
+       @@net_count += 1
+       @name = (name.empty? ? "net_#{@@net_count}" : name.to_underscore_sym)
+       @variables = {}
+       @evidence = {}
+     end
+ 
+     def ==(obj); test_equal(obj); end
+     def eql?(obj); test_equal(obj); end
+     def ===(obj); test_equal(obj); end
+ 
+     def add_variable(variable)
+       name = variable.name
+       if @variables.has_key? name
+         raise "Variable of name #{name} has already been added to this net"
+       end
+       @variables[name] = variable
+     end
+ 
+     def symbolize_evidence(evidence) # :nodoc:
+       newevidence = {}
+       evidence.each do |key, val|
+         key = key.to_underscore_sym
+         raise "Invalid variable name #{key}." unless @variables.has_key?(key)
+         newevidence[key] = @variables[key].transform_evidence_value(val)
+       end
+       newevidence
+     end
+ 
+     def set_evidence(event)
+       @evidence = symbolize_evidence(event)
+     end
+ 
+     private
+     def test_equal(net)
+       returnval = true
+       returnval = false unless net.class == self.class and self.class == Net
+       returnval = false unless net.name == @name
+       returnval = false unless @variables.keys.map {|k| k.to_s}.sort == net.variables.keys.map {|k| k.to_s}.sort
+       net.variables.each {|name, variable| returnval = false unless variable == @variables[name] }
+       returnval
+     end
+   end
+ end
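
A short hypothetical sketch of how set_evidence normalizes its input (not part of the package; the exact behavior of the to_underscore_sym helper, which is defined elsewhere in the gem, is assumed here):

    # Hypothetical sketch of evidence normalization (helper behavior assumed).
    net  = Sbn::Net.new("wetness_net")
    rain = Sbn::Variable.new(net, :rain, [0.2, 0.8])
    wet  = Sbn::Variable.new(net, :grass_wet, [0.5, 0.5])

    net.set_evidence "Grass Wet" => :true, :rain => :false
    # symbolize_evidence converts each key with to_underscore_sym (:grass_wet, :rain),
    # raises for keys that do not name a variable in the net, and passes each value
    # through that variable's transform_evidence_value before storing it.
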
data/lib/numeric_variable.rb ADDED
@@ -0,0 +1,94 @@
+ require File.dirname(__FILE__) + '/variable'
+ 
+ class Sbn
+   class NumericVariable < Variable
+     DEFAULT_FIRST_STDEV_STATE_COUNT = 14
+     DEFAULT_SECOND_STDEV_STATE_COUNT = 6
+ 
+     attr_reader :state_thresholds
+ 
+     def initialize(net, name, probabilities = [], state_thresholds = [], options = {})
+       @state_count_one = options.fetch(:first_stdev_state_count, DEFAULT_FIRST_STDEV_STATE_COUNT).to_f.round
+       @state_count_two = options.fetch(:second_stdev_state_count, DEFAULT_SECOND_STDEV_STATE_COUNT).to_f.round
+       @state_count_one += 1 if @state_count_one.odd?
+       @state_count_two += 1 if @state_count_two.odd?
+       @state_thresholds = state_thresholds
+       states = generate_states_from_thresholds
+       super(net, name, probabilities, states)
+     end
+ 
+     # alter the state table based on the variance of the sample points
+     def set_probabilities_from_sample_points! # :nodoc:
+       values = []
+       @sample_points.each {|evidence| values << evidence[@name] }
+       stdev = values.standard_deviation
+       average = values.average
+       increment_amount_for_first_stdev = stdev * 2.0 / @state_count_one.to_f
+       increment_amount_for_second_stdev = stdev * 2.0 / @state_count_two.to_f
+       current_position = average - (stdev * 2.0)
+       @state_thresholds = []
+ 
+       # start on the left, two standard deviations away from the average
+       (@state_count_two / 2).times do
+         @state_thresholds << current_position
+         current_position += increment_amount_for_second_stdev
+       end
+ 
+       # continue to add thresholds within the first standard deviation
+       @state_count_one.times do
+         @state_thresholds << current_position
+         current_position += increment_amount_for_first_stdev
+       end
+ 
+       # add thresholds to the second standard deviation on the right
+       (@state_count_two / 2).times do
+         @state_thresholds << current_position
+         current_position += increment_amount_for_second_stdev
+       end
+       @states = generate_states_from_thresholds
+ 
+       # Now that states have been determined, call parent
+       # class to finish processing sample points.
+       super
+     end
+ 
+     def to_xmlbif_variable(xml) # :nodoc:
+       super(xml) {|x| x.property("StateThresholds = #{@state_thresholds.join(',')}") }
+     end
+ 
+     def get_observed_state(evidence) # :nodoc:
+       num = evidence[@name]
+       thresholds = @state_thresholds.dup
+       index = 0
+       t = thresholds.shift
+       while num >= t and !thresholds.empty? do
+         t = thresholds.shift
+         index += 1
+       end
+       index += 1 if num >= t and thresholds.empty?
+       @states[index]
+     end
+ 
+     def transform_evidence_value(val) # :nodoc:
+       val.to_f
+     end
+ 
+     private
+     def generate_states_from_thresholds
+       returnval = []
+       unless @state_thresholds.empty?
+         th = @state_thresholds.map {|t| t.to_s.sub('.', '_') }
+         th.each_index do |i|
+           if i == 0
+             returnval << "lt#{th[0]}"
+           else
+             returnval << "gte#{th[i - 1]}lt#{th[i]}"
+           end
+         end
+         returnval << "gte#{th[th.size - 1]}"
+         returnval.map! {|state| state.to_sym }
+       end
+       returnval
+     end
+   end
+ end
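
To make the threshold-to-state mapping concrete, here is a hypothetical sketch with invented threshold values; the state names and return values follow from generate_states_from_thresholds and get_observed_state above, while the constructor call itself is an assumption.

    # Hypothetical sketch of numeric discretization (threshold values invented).
    net  = Sbn::Net.new("weather_net")
    temp = Sbn::NumericVariable.new(net, :temperature, [0.25, 0.25, 0.25, 0.25], [10.0, 20.0, 30.0])

    temp.states
    # => [:lt10_0, :gte10_0lt20_0, :gte20_0lt30_0, :gte30_0]
    temp.get_observed_state(:temperature => 25.0)   # => :gte20_0lt30_0
    temp.get_observed_state(:temperature => 5.0)    # => :lt10_0
    # After learning, set_probabilities_from_sample_points! replaces these thresholds
    # with ones derived from the samples' mean and standard deviation, spanning two
    # standard deviations either side of the mean and finer-grained near the mean.
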
data/lib/sbn.rb ADDED
@@ -0,0 +1,6 @@
+ require 'rubygems'
+ require 'active_support'
+ gem 'builder', '>=2.0'
+ require 'builder'
+ 
+ Dir[File.join(File.dirname(__FILE__), '*.rb')].sort.each { |lib| require lib unless File.basename(lib) == 'sbn.rb' }
data/lib/string_variable.rb ADDED
@@ -0,0 +1,176 @@
+ require File.dirname(__FILE__) + '/variable'
+ 
+ class Sbn
+   class StringCovariable < Variable # :nodoc:
+     attr_reader :text_to_match
+ 
+     def initialize(net, manager_name, text_to_match, probabilities)
+       @@covar_count ||= 0
+       @@covar_count += 1
+       @manager_name = manager_name
+       @text_to_match = text_to_match.downcase
+       super(net, "#{@manager_name}_covar_#{@@covar_count}", probabilities)
+     end
+ 
+     def to_xmlbif_variable(xml)
+       super(xml) do |x|
+         x.property("ManagerVariableName = #{@manager_name.to_s}")
+         x.property("TextToMatch = #{@text_to_match.inspect}")
+       end
+     end
+ 
+     def evidence_name # :nodoc:
+       @manager_name
+     end
+ 
+     def get_observed_state(evidence) # :nodoc:
+       evidence[@manager_name].include?(@text_to_match) ? :true : :false
+     end
+ 
+     def transform_evidence_value(val) # :nodoc:
+       raise "Evidence should not be provided for string covariables"
+     end
+ 
+     def set_in_evidence?(evidence) # :nodoc:
+       evidence.has_key?(@manager_name)
+     end
+ 
+     private
+     def test_equal(covariable)
+       returnval = true
+       returnval = false unless self.class == covariable.class and self.is_a? StringCovariable
+       returnval = false unless returnval and @manager_name == covariable.instance_eval('@manager_name')
+       returnval = false unless returnval and @text_to_match == covariable.instance_eval('@text_to_match')
+       returnval = false unless returnval and super(covariable)
+       returnval
+     end
+   end
+ 
+   class StringVariable < Variable
+     DEFAULT_NGRAM_SIZES = [3, 5, 10]
+ 
+     def initialize(net, name = '')
+       @net = net
+       @covariables = {}
+       @covariable_children = []
+       @covariable_parents = []
+       super(net, name, [], [])
+     end
+ 
+     # create co-variables when new n-grams are encountered
+     def add_sample_point(evidence) # :nodoc:
+       val = evidence[@name].downcase.strip
+       len = val.length
+       ngrams = []
+ 
+       # Break the string into n-grams of a few fixed sizes (3, 5, and 10
+       # characters by default); using every possible n-gram length would
+       # impose a severe computational burden.
+       DEFAULT_NGRAM_SIZES.each {|n| ngrams.concat val.ngrams(n) }
+       ngrams.uniq!
+       ngrams.each do |ng|
+         unless @covariables.has_key?(ng)
+           # these probabilities are temporary and will get erased after learning
+           newcovar = StringCovariable.new(@net, @name, ng, [0.5, 0.5])
+           count = 0
+           @covariable_parents.each {|p| newcovar.add_parent(p) }
+           @covariable_children.each {|p| newcovar.add_child(p) }
+           @covariables[ng] = newcovar
+         end
+         @covariables[ng].add_sample_point(evidence)
+       end
+     end
+ 
+     # returns an array of the variable's string covariables in alphabetical order
+     def covariables # :nodoc:
+       returnval = []
+       @covariables.keys.sort.each {|key| returnval << @covariables[key] }
+       returnval
+     end
+ 
+     def to_xmlbif_variable(xml) # :nodoc:
+       super(xml) do |x|
+         covars = @covariables.keys.sort
+         parents = @covariable_parents.map {|p| p.name }
+         x.property("Covariables = #{covars.join(',')}") unless covars.empty?
+ 
+         # A string variable's parents cannot be specified in the "given"
+         # section below, because only its covariables actually have them.
+         x.property("Parents = #{parents.join(',')}") unless parents.empty?
+       end
+     end
+ 
+     def to_xmlbif_definition(xml) # :nodoc:
+       # string variables do not have any direct probabilities--only their covariables
+     end
+ 
+     # This node never influences the probabilities. Its sole
+     # responsibility is to manage the co-variables, so it should
+     # always appear to be set in the evidence so that it won't
+     # waste time in the inference process.
+     def set_in_evidence?(evidence) # :nodoc:
+       true
+       # raise "String variables should never be used in inference--only their covariables"
+     end
+ 
+     # This method is used when reconstituting saved networks
+     def add_covariable(covariable) # :nodoc:
+       @covariable_children.each {|child| covariable.add_child(child) }
+       @covariable_parents.each {|parent| covariable.add_parent(parent) }
+       @covariables[covariable.text_to_match] = covariable
+     end
+ 
+     # This node never has any parents or children. It just
+     # sets the parents or children of its covariables.
+     def add_child_no_recurse(variable) # :nodoc:
+       return if variable == self or @covariable_children.include?(variable)
+       if variable.is_a?(StringVariable)
+         @covariable_children.concat variable.covariables
+         @covariables.each {|ng, covar| variable.covariables.each {|varcovar| covar.add_child(varcovar) } }
+       else
+         @covariable_children << variable
+         @covariables.each {|ng, covar| covar.add_child(variable) }
+       end
+       variable.generate_probability_table
+     end
+ 
+     def add_parent_no_recurse(variable) # :nodoc:
+       return if variable == self or @covariable_parents.include?(variable)
+       if variable.is_a?(StringVariable)
+         @covariable_parents.concat variable.covariables
+         @covariables.each {|ng, covar| variable.covariables.each {|varcovar| covar.add_parent(varcovar) } }
+       else
+         @covariable_parents << variable
+         @covariables.each {|ng, covar| covar.add_parent(variable) }
+       end
+       generate_probability_table
+     end
+ 
+     def generate_probability_table # :nodoc:
+       @covariables.each {|ng, covar| covar.generate_probability_table }
+     end
+ 
+     def is_complete_evidence?(evidence) # :nodoc:
+       parent_names = @covariable_parents.map {|p| p.name.to_s }
+       super(evidence) {|varnames| varnames.concat(parent_names) }
+     end
+ 
+     def transform_evidence_value(val) # :nodoc:
+       val.to_s.downcase
+     end
+ 
+     private
+     def test_equal(variable)
+       returnval = true
+       returnval = false unless self.class == variable.class and self.is_a? StringVariable
+       returnval = false unless returnval and @name == variable.name
+       returnval = false unless returnval and @covariable_children == variable.instance_eval('@covariable_children')
+       returnval = false unless returnval and @covariable_parents == variable.instance_eval('@covariable_parents')
+       @covariables.each do |key, val|
+         break unless returnval
+         returnval = false unless val == variable.instance_eval('@covariables')[key]
+       end
+       returnval
+     end
+   end
+ end
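
Finally, a hypothetical sketch of the covariable matching behavior defined above (not part of the package; String#ngrams is a helper defined elsewhere in the gem, and the constructor call follows the definitions shown here):

    # Hypothetical sketch (not from the gem's sources or docs).
    net   = Sbn::Net.new("title_net")
    covar = Sbn::StringCovariable.new(net, :title, "che", [0.5, 0.5])

    # The covariable's observed state is whether the manager variable's
    # evidence string contains its n-gram.
    covar.get_observed_state(:title => "cheap meds")     # => :true
    covar.get_observed_state(:title => "meeting notes")  # => :false

    # In normal use these covariables are created automatically: during learning,
    # StringVariable#add_sample_point splits each observed string into n-grams of
    # the sizes in DEFAULT_NGRAM_SIZES and spawns one StringCovariable per new n-gram.
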