sbn 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/inference.rb ADDED
@@ -0,0 +1,65 @@
+ class Sbn
+   class Net
+     MCMC_DEFAULT_SAMPLE_COUNT = 2000
+ 
+     # Returns a hash containing the estimated posterior probability of each
+     # possible state for the specified variable, based on previously-supplied
+     # evidence, using the Markov Chain Monte Carlo algorithm. The MCMC algorithm
+     # generates each event by making a random change to the preceding event. The
+     # next state is generated by randomly sampling a value for one of the
+     # nonevidence variables Xi, conditioned on the current values of the
+     # variables in the Markov blanket of Xi. MCMC basically wanders randomly
+     # around the state space--the space of possible complete
+     # assignments--flipping one variable at a time, but keeping the evidence
+     # variables fixed. The sampling process works because it settles into a
+     # "dynamic equilibrium" in which the long-run fraction of time spent in each
+     # state is proportional to its posterior probability.
+     #
+     # Optionally accepts a block that receives a number between 0 and 1
+     # indicating the fraction of the sampling completed so far.
+     def query_variable(varname, callback = nil)
+       # keep track of number of times a state has been observed
+       state_frequencies = {}
+       varname = varname.to_underscore_sym
+       states = @variables[varname].states
+       states.each {|s| state_frequencies[s] ||= 0 }
+ 
+       e = generate_random_event
+       relevant_evidence = e.reject {|key, val| @variables[key].set_in_evidence?(@evidence) }
+ 
+       MCMC_DEFAULT_SAMPLE_COUNT.times do |n|
+         state = e[varname]
+         state_frequencies[state] += 1
+ 
+         relevant_evidence.each do |vname, vstate|
+           e[vname] = @variables[vname].get_random_state_with_markov_blanket(e)
+         end
+         yield(n / MCMC_DEFAULT_SAMPLE_COUNT.to_f) if block_given?
+       end
+ 
+       # normalize results
+       magnitude = 0
+       returnval = {}
+       state_frequencies.each_value {|count| magnitude += count }
+       state_frequencies.each {|state, count| returnval[state] = count / magnitude.to_f }
+       returnval
+     end
+ 
+     private
+     # Returns an event in which variables that are not fixed by the evidence are set
+     # to random states whose frequencies (after repeated calls) are consistent
+     # with the network's joint probability distribution.
+     def generate_random_event
+       unset_variables = @variables.reject {|name, variable| variable.set_in_evidence?(@evidence) }
+       new_evidence = @evidence.dup
+       until unset_variables.empty? do
+         settable_variables = unset_variables.reject {|name, variable| !variable.can_be_evaluated?(new_evidence) }
+         settable_variables.each do |name, variable|
+           unset_variables.delete(name)
+           new_evidence[name] = variable.get_random_state(new_evidence)
+         end
+       end
+       new_evidence
+     end
+   end
+ end
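
For orientation, a hypothetical usage sketch of the inference API above (it is not part of the package). The Sbn::Variable constructor signature, the CPT ordering, and the ability to pass a full CPT before wiring parents are assumptions inferred from the other files in this diff; add_child, set_evidence and query_variable all appear in the code shown here.

    # Hypothetical usage sketch (not from the gem's sources or docs).
    require 'sbn'

    net  = Sbn::Net.new("rain_net")
    rain = Sbn::Variable.new(net, :rain,      [0.2, 0.8])            # P(rain), P(no rain)
    wet  = Sbn::Variable.new(net, :grass_wet, [0.9, 0.1, 0.2, 0.8])  # CPT ordering assumed
    rain.add_child(wet)

    net.set_evidence :grass_wet => :true
    # The block receives the fraction of the 2000 MCMC samples drawn so far.
    posterior = net.query_variable(:rain) {|fraction| print "\r#{(fraction * 100).to_i}%" }
    puts
    p posterior   # => { :true => ..., :false => ... } (estimates vary from run to run)
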
data/lib/learning.rb ADDED
@@ -0,0 +1,141 @@
+ class Sbn
+   class Variable
+     NEGLIGIBLE_PROBABILITY = 0.0001
+ 
+     def is_complete_evidence?(evidence) # :nodoc:
+       varnames = [evidence_name.to_s]
+       @parents.each {|p| varnames << p.name.to_s }
+       yield(varnames) if block_given?
+ 
+       # ignore covariables when determining whether evidence is complete or not
+       varnames.map! do |n|
+         n = n.split('_').first if n =~ /covar/
+         n
+       end
+       varnames.uniq!
+       varnames.sort!
+ 
+       keys = evidence.keys.map {|k| k.to_s }
+       keys.sort!
+       varnames & keys == varnames
+     end
+ 
+     def add_sample_point(evidence)
+       # reject incomplete evidence sets
+       raise "Incomplete sample points" unless is_complete_evidence?(evidence)
+ 
+       # Because string variables add new variables to the net during learning,
+       # the process of determining state frequencies has to be deferred until
+       # the end. For now, we'll just store the evidence and use it later.
+       @sample_points ||= []
+       @sample_points << evidence
+     end
+ 
+     def set_probabilities_from_sample_points!
+       return unless @sample_points
+       accumulate_state_frequencies
+ 
+       # find the sums for each parent combination so we
+       # know how to normalize their associated states
+       sums = {}
+       state_combinations.each do |comb|
+         parent_comb = comb.dup
+ 
+         # remove state for this node so that all
+         # that is left is the parent combination
+         parent_comb.pop
+         @state_frequencies[comb] ||= 0
+         sums[parent_comb] ||= 0
+ 
+         sums[parent_comb] += @state_frequencies[comb]
+       end
+ 
+       probabilities = []
+       count_of_zero_prob_states, count_of_nonzero_prob_states = {}, {}
+       last_state = @states.first
+       state_combinations.each do |comb|
+         state = comb.last
+         parent_comb = comb.dup
+         parent_comb.pop
+         prob = @state_frequencies[comb] / sums[parent_comb].to_f
+         probabilities << (prob == 0.0 ? NEGLIGIBLE_PROBABILITY : prob)
+ 
+         # Keep track of how many of this node's states were
+         # empty for this particular parent combination, so that
+         # we can pad them with tiny numbers later. Otherwise,
+         # some exact inference algorithms will fail.
+         if prob == 0.0
+           count_of_zero_prob_states[parent_comb] ||= 0
+           count_of_zero_prob_states[parent_comb] += 1
+         else
+           count_of_nonzero_prob_states[parent_comb] ||= 0
+           count_of_nonzero_prob_states[parent_comb] += 1
+         end
+       end
+ 
+       # pad the zero probabilities
+       count = 0
+       state_combinations.each do |comb|
+         state = comb.last
+         parent_comb = comb.dup
+         parent_comb.pop
+         amount_to_subtract = (count_of_zero_prob_states[parent_comb] || 0) *
+                              NEGLIGIBLE_PROBABILITY /
+                              count_of_nonzero_prob_states[parent_comb].to_f
+         p = probabilities[count]
+         p = (p > NEGLIGIBLE_PROBABILITY ? p - amount_to_subtract : p)
+         probabilities[count] = p
+         count += 1
+       end
+ 
+       # assign new probabilities
+       set_probabilities(probabilities)
+     end
+ 
+     private
+     def accumulate_state_frequencies
+       @sample_points.each do |evidence|
+         combination_instance = []
+         @parents.each {|p| combination_instance << p.get_observed_state(evidence) }
+         combination_instance << get_observed_state(evidence)
+         @state_frequencies[combination_instance] ||= 0
+         @state_frequencies[combination_instance] += 1
+       end
+     end
+   end
+ 
+   class Net
+     # Expects data to be an array of hashes containing complete sets of evidence
+     # for all variables in the network. Constructs probability tables for each variable
+     # based on the data.
+     def learn(data)
+       data.each {|evidence| add_sample_point(evidence) }
+       set_probabilities_from_sample_points!
+     end
+ 
+     def add_sample_point(evidence)
+       evidence = symbolize_evidence(evidence)
+       @variables.keys.each {|key| @variables[key].add_sample_point(evidence) }
+     end
+ 
+     def set_probabilities_from_sample_points!
+       # we must first conduct learning on parents then their children
+       unlearned_variables = @variables.keys
+ 
+       count = 0
+       size = @variables.size.to_f
+       until unlearned_variables.empty?
+         learnable_variables = @variables.reject do |name, var|
+           reject = false
+           var.parents.each {|p| reject = true if unlearned_variables.include?(p.name) }
+           reject
+         end
+         learnable_variables.keys.each do |key|
+           @variables[key].set_probabilities_from_sample_points!
+           count += 1
+           unlearned_variables.delete(key)
+         end
+       end
+     end
+   end
+ end
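
A hypothetical sketch of the learning entry point shown above (not part of the package). Each element of the data array must be a complete evidence hash covering every variable, which is what is_complete_evidence? enforces; the variables are created without probabilities on the assumption that learn() fills them in, following the same assumed constructor form as in the earlier sketch.

    # Hypothetical parameter-learning sketch (not from the gem's sources or docs).
    net  = Sbn::Net.new("rain_net")
    rain = Sbn::Variable.new(net, :rain)        # probabilities omitted; learn() fills them in
    wet  = Sbn::Variable.new(net, :grass_wet)
    rain.add_child(wet)

    # One hash per observed case, with a state for every variable in the net.
    data = [
      {:rain => :true,  :grass_wet => :true},
      {:rain => :true,  :grass_wet => :false},
      {:rain => :false, :grass_wet => :false},
      {:rain => :false, :grass_wet => :true}
    ]
    net.learn(data)   # add_sample_point for each case, then set_probabilities_from_sample_points!
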
data/lib/net.rb ADDED
@@ -0,0 +1,49 @@
+ class Sbn
+   class Net
+     attr_reader :name, :variables
+ 
+     def initialize(name = '')
+       @@net_count ||= 0
+       @@net_count += 1
+       @name = (name.empty? ? "net_#{@@net_count}" : name.to_underscore_sym)
+       @variables = {}
+       @evidence = {}
+     end
+ 
+     def ==(obj); test_equal(obj); end
+     def eql?(obj); test_equal(obj); end
+     def ===(obj); test_equal(obj); end
+ 
+     def add_variable(variable)
+       name = variable.name
+       if @variables.has_key? name
+         raise "Variable of name #{name} has already been added to this net"
+       end
+       @variables[name] = variable
+     end
+ 
+     def symbolize_evidence(evidence) # :nodoc:
+       newevidence = {}
+       evidence.each do |key, val|
+         key = key.to_underscore_sym
+         raise "Invalid variable name #{key}." unless @variables.has_key?(key)
+         newevidence[key] = @variables[key].transform_evidence_value(val)
+       end
+       newevidence
+     end
+ 
+     def set_evidence(event)
+       @evidence = symbolize_evidence(event)
+     end
+ 
+     private
+     def test_equal(net)
+       returnval = true
+       returnval = false unless net.class == self.class and self.class == Net
+       returnval = false unless net.name == @name
+       returnval = false unless @variables.keys.map {|k| k.to_s}.sort == net.variables.keys.map {|k| k.to_s}.sort
+       net.variables.each {|name, variable| returnval = false unless variable == @variables[name] }
+       returnval
+     end
+   end
+ end
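
A short hypothetical sketch of how set_evidence normalizes its input (not part of the package; the exact behavior of the to_underscore_sym helper, which is defined elsewhere in the gem, is assumed here):

    # Hypothetical sketch of evidence normalization (helper behavior assumed).
    net  = Sbn::Net.new("wetness_net")
    rain = Sbn::Variable.new(net, :rain, [0.2, 0.8])
    wet  = Sbn::Variable.new(net, :grass_wet, [0.5, 0.5])

    net.set_evidence "Grass Wet" => :true, :rain => :false
    # symbolize_evidence converts each key with to_underscore_sym (:grass_wet, :rain),
    # raises for keys that do not name a variable in the net, and passes each value
    # through that variable's transform_evidence_value before storing it.
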
data/lib/numeric_variable.rb ADDED
@@ -0,0 +1,94 @@
+ require File.dirname(__FILE__) + '/variable'
+ 
+ class Sbn
+   class NumericVariable < Variable
+     DEFAULT_FIRST_STDEV_STATE_COUNT = 14
+     DEFAULT_SECOND_STDEV_STATE_COUNT = 6
+ 
+     attr_reader :state_thresholds
+ 
+     def initialize(net, name, probabilities = [], state_thresholds = [], options = {})
+       @state_count_one = options.fetch(:first_stdev_state_count, DEFAULT_FIRST_STDEV_STATE_COUNT).to_f.round
+       @state_count_two = options.fetch(:second_stdev_state_count, DEFAULT_SECOND_STDEV_STATE_COUNT).to_f.round
+       @state_count_one += 1 if @state_count_one.odd?
+       @state_count_two += 1 if @state_count_two.odd?
+       @state_thresholds = state_thresholds
+       states = generate_states_from_thresholds
+       super(net, name, probabilities, states)
+     end
+ 
+     # alter the state table based on the variance of the sample points
+     def set_probabilities_from_sample_points! # :nodoc:
+       values = []
+       @sample_points.each {|evidence| values << evidence[@name] }
+       stdev = values.standard_deviation
+       average = values.average
+       increment_amount_for_first_stdev = stdev * 2.0 / @state_count_one.to_f
+       increment_amount_for_second_stdev = stdev * 2.0 / @state_count_two.to_f
+       current_position = average - (stdev * 2.0)
+       @state_thresholds = []
+ 
+       # start on the left, two standard deviations away from the average
+       (@state_count_two / 2).times do
+         @state_thresholds << current_position
+         current_position += increment_amount_for_second_stdev
+       end
+ 
+       # continue to add thresholds within the first standard deviation
+       @state_count_one.times do
+         @state_thresholds << current_position
+         current_position += increment_amount_for_first_stdev
+       end
+ 
+       # add thresholds to the second standard deviation on the right
+       (@state_count_two / 2).times do
+         @state_thresholds << current_position
+         current_position += increment_amount_for_second_stdev
+       end
+       @states = generate_states_from_thresholds
+ 
+       # Now that states have been determined, call parent
+       # class to finish processing sample points.
+       super
+     end
+ 
+     def to_xmlbif_variable(xml) # :nodoc:
+       super(xml) {|x| x.property("StateThresholds = #{@state_thresholds.join(',')}") }
+     end
+ 
+     def get_observed_state(evidence) # :nodoc:
+       num = evidence[@name]
+       thresholds = @state_thresholds.dup
+       index = 0
+       t = thresholds.shift
+       while num >= t and !thresholds.empty? do
+         t = thresholds.shift
+         index += 1
+       end
+       index += 1 if num >= t and thresholds.empty?
+       @states[index]
+     end
+ 
+     def transform_evidence_value(val) # :nodoc:
+       val.to_f
+     end
+ 
+     private
+     def generate_states_from_thresholds
+       returnval = []
+       unless @state_thresholds.empty?
+         th = @state_thresholds.map {|t| t.to_s.sub('.', '_') }
+         th.each_index do |i|
+           if i == 0
+             returnval << "lt#{th[0]}"
+           else
+             returnval << "gte#{th[i - 1]}lt#{th[i]}"
+           end
+         end
+         returnval << "gte#{th[th.size - 1]}"
+         returnval.map! {|state| state.to_sym }
+       end
+       returnval
+     end
+   end
+ end
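
To make the threshold-to-state mapping concrete, here is a hypothetical sketch with invented threshold values; the state names and return values follow from generate_states_from_thresholds and get_observed_state above, while the constructor call itself is an assumption.

    # Hypothetical sketch of numeric discretization (threshold values invented).
    net  = Sbn::Net.new("weather_net")
    temp = Sbn::NumericVariable.new(net, :temperature, [0.25, 0.25, 0.25, 0.25], [10.0, 20.0, 30.0])

    temp.states
    # => [:lt10_0, :gte10_0lt20_0, :gte20_0lt30_0, :gte30_0]
    temp.get_observed_state(:temperature => 25.0)   # => :gte20_0lt30_0
    temp.get_observed_state(:temperature => 5.0)    # => :lt10_0
    # After learning, set_probabilities_from_sample_points! replaces these thresholds
    # with ones derived from the samples' mean and standard deviation, spanning two
    # standard deviations either side of the mean and finer-grained near the mean.
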
data/lib/sbn.rb ADDED
@@ -0,0 +1,6 @@
+ require 'rubygems'
+ require 'active_support'
+ gem 'builder', '>=2.0'
+ require 'builder'
+ 
+ Dir[File.join(File.dirname(__FILE__), '*.rb')].sort.each { |lib| require lib unless File.basename(lib) == 'sbn.rb' }
data/lib/string_variable.rb ADDED
@@ -0,0 +1,176 @@
+ require File.dirname(__FILE__) + '/variable'
+ 
+ class Sbn
+   class StringCovariable < Variable # :nodoc:
+     attr_reader :text_to_match
+ 
+     def initialize(net, manager_name, text_to_match, probabilities)
+       @@covar_count ||= 0
+       @@covar_count += 1
+       @manager_name = manager_name
+       @text_to_match = text_to_match.downcase
+       super(net, "#{@manager_name}_covar_#{@@covar_count}", probabilities)
+     end
+ 
+     def to_xmlbif_variable(xml)
+       super(xml) do |x|
+         x.property("ManagerVariableName = #{@manager_name.to_s}")
+         x.property("TextToMatch = #{@text_to_match.inspect}")
+       end
+     end
+ 
+     def evidence_name # :nodoc:
+       @manager_name
+     end
+ 
+     def get_observed_state(evidence) # :nodoc:
+       evidence[@manager_name].include?(@text_to_match) ? :true : :false
+     end
+ 
+     def transform_evidence_value(val) # :nodoc:
+       raise "Evidence should not be provided for string covariables"
+     end
+ 
+     def set_in_evidence?(evidence) # :nodoc:
+       evidence.has_key?(@manager_name)
+     end
+ 
+     private
+     def test_equal(covariable)
+       returnval = true
+       returnval = false unless self.class == covariable.class and self.is_a? StringCovariable
+       returnval = false unless returnval and @manager_name == covariable.instance_eval('@manager_name')
+       returnval = false unless returnval and @text_to_match == covariable.instance_eval('@text_to_match')
+       returnval = false unless returnval and super(covariable)
+       returnval
+     end
+   end
+ 
+   class StringVariable < Variable
+     DEFAULT_NGRAM_SIZES = [3, 5, 10]
+ 
+     def initialize(net, name = '')
+       @net = net
+       @covariables = {}
+       @covariable_children = []
+       @covariable_parents = []
+       super(net, name, [], [])
+     end
+ 
+     # create co-variables when new n-grams are encountered
+     def add_sample_point(evidence) # :nodoc:
+       val = evidence[@name].downcase.strip
+       len = val.length
+       ngrams = []
+ 
+       # Break the string into n-grams of a few fixed sizes (3, 5, and 10
+       # characters by default); using every possible n-gram length would
+       # impose a severe computational burden.
+       DEFAULT_NGRAM_SIZES.each {|n| ngrams.concat val.ngrams(n) }
+       ngrams.uniq!
+       ngrams.each do |ng|
+         unless @covariables.has_key?(ng)
+           # these probabilities are temporary and will get erased after learning
+           newcovar = StringCovariable.new(@net, @name, ng, [0.5, 0.5])
+           count = 0
+           @covariable_parents.each {|p| newcovar.add_parent(p) }
+           @covariable_children.each {|p| newcovar.add_child(p) }
+           @covariables[ng] = newcovar
+         end
+         @covariables[ng].add_sample_point(evidence)
+       end
+     end
+ 
+     # returns an array of the variable's string covariables in alphabetical order
+     def covariables # :nodoc:
+       returnval = []
+       @covariables.keys.sort.each {|key| returnval << @covariables[key] }
+       returnval
+     end
+ 
+     def to_xmlbif_variable(xml) # :nodoc:
+       super(xml) do |x|
+         covars = @covariables.keys.sort
+         parents = @covariable_parents.map {|p| p.name }
+         x.property("Covariables = #{covars.join(',')}") unless covars.empty?
+ 
+         # A string variable's parents cannot be specified in the "given"
+         # section below, because only its covariables actually have them.
+         x.property("Parents = #{parents.join(',')}") unless parents.empty?
+       end
+     end
+ 
+     def to_xmlbif_definition(xml) # :nodoc:
+       # string variables do not have any direct probabilities--only their covariables
+     end
+ 
+     # This node never influences the probabilities. Its sole
+     # responsibility is to manage the co-variables, so it should
+     # always appear to be set in the evidence so that it won't
+     # waste time in the inference process.
+     def set_in_evidence?(evidence) # :nodoc:
+       true
+       # raise "String variables should never be used in inference--only their covariables"
+     end
+ 
+     # This method is used when reconstituting saved networks
+     def add_covariable(covariable) # :nodoc:
+       @covariable_children.each {|child| covariable.add_child(child) }
+       @covariable_parents.each {|parent| covariable.add_parent(parent) }
+       @covariables[covariable.text_to_match] = covariable
+     end
+ 
+     # This node never has any parents or children. It just
+     # sets the parents or children of its covariables.
+     def add_child_no_recurse(variable) # :nodoc:
+       return if variable == self or @covariable_children.include?(variable)
+       if variable.is_a?(StringVariable)
+         @covariable_children.concat variable.covariables
+         @covariables.each {|ng, covar| variable.covariables.each {|varcovar| covar.add_child(varcovar) } }
+       else
+         @covariable_children << variable
+         @covariables.each {|ng, covar| covar.add_child(variable) }
+       end
+       variable.generate_probability_table
+     end
+ 
+     def add_parent_no_recurse(variable) # :nodoc:
+       return if variable == self or @covariable_parents.include?(variable)
+       if variable.is_a?(StringVariable)
+         @covariable_parents.concat variable.covariables
+         @covariables.each {|ng, covar| variable.covariables.each {|varcovar| covar.add_parent(varcovar) } }
+       else
+         @covariable_parents << variable
+         @covariables.each {|ng, covar| covar.add_parent(variable) }
+       end
+       generate_probability_table
+     end
+ 
+     def generate_probability_table # :nodoc:
+       @covariables.each {|ng, covar| covar.generate_probability_table }
+     end
+ 
+     def is_complete_evidence?(evidence) # :nodoc:
+       parent_names = @covariable_parents.map {|p| p.name.to_s }
+       super(evidence) {|varnames| varnames.concat(parent_names) }
+     end
+ 
+     def transform_evidence_value(val) # :nodoc:
+       val.to_s.downcase
+     end
+ 
+     private
+     def test_equal(variable)
+       returnval = true
+       returnval = false unless self.class == variable.class and self.is_a? StringVariable
+       returnval = false unless returnval and @name == variable.name
+       returnval = false unless returnval and @covariable_children == variable.instance_eval('@covariable_children')
+       returnval = false unless returnval and @covariable_parents == variable.instance_eval('@covariable_parents')
+       @covariables.each do |key, val|
+         break unless returnval
+         returnval = false unless val == variable.instance_eval('@covariables')[key]
+       end
+       returnval
+     end
+   end
+ end
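
Finally, a hypothetical sketch of the covariable matching behavior defined above (not part of the package; String#ngrams is a helper defined elsewhere in the gem, and the constructor call follows the definitions shown here):

    # Hypothetical sketch (not from the gem's sources or docs).
    net   = Sbn::Net.new("title_net")
    covar = Sbn::StringCovariable.new(net, :title, "che", [0.5, 0.5])

    # The covariable's observed state is whether the manager variable's
    # evidence string contains its n-gram.
    covar.get_observed_state(:title => "cheap meds")     # => :true
    covar.get_observed_state(:title => "meeting notes")  # => :false

    # In normal use these covariables are created automatically: during learning,
    # StringVariable#add_sample_point splits each observed string into n-grams of
    # the sizes in DEFAULT_NGRAM_SIZES and spawns one StringCovariable per new n-gram.
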