finite_mdp 0.2.0 → 0.3.0
- checksums.yaml +4 -4
- data/README.rdoc +8 -12
- data/lib/finite_mdp/array_model.rb +226 -0
- data/lib/finite_mdp/hash_model.rb +10 -9
- data/lib/finite_mdp/model.rb +19 -18
- data/lib/finite_mdp/solver.rb +96 -83
- data/lib/finite_mdp/table_model.rb +28 -19
- data/lib/finite_mdp/vector_valued.rb +5 -5
- data/lib/finite_mdp/version.rb +2 -1
- data/lib/finite_mdp.rb +3 -2
- data/test/finite_mdp/finite_mdp_test.rb +151 -98
- metadata +33 -4
data/lib/finite_mdp/solver.rb
CHANGED
@@ -1,3 +1,9 @@
+# frozen_string_literal: true
+
+# We use A to denote a matrix, which rubocop does not like.
+# rubocop:disable Style/MethodName
+# rubocop:disable Style/VariableName
+
 require 'narray'
 
 #
@@ -29,59 +35,54 @@ class FiniteMDP::Solver
   # @param [Hash<state, Float>] value initial value for each state; defaults to
   #   zero for every state
   #
-  def initialize
-    @model = model
+  def initialize(model, discount, policy: nil, value: Hash.new(0))
     @discount = discount
 
     # get the model data into a more compact form for calculation; this means
     # that we number the states and actions for faster lookups (avoid most of
-    # the hashing)
-
-
-
-
-
-
-          pr = model.transition_probability(state, action, next_state)
-          [state_to_num[next_state], pr,
-            model.reward(state, action, next_state)] if pr > 0
-        }.compact
-      }
-    }
+    # the hashing)
+    @model =
+      if model.is_a?(FiniteMDP::ArrayModel)
+        model
+      else
+        FiniteMDP::ArrayModel.from_model(model)
+      end
 
     # convert initial values and policies to compact form
-    @array_value =
-
-
-
-
-
-
-
-      # default to the first action, arbitrarily
-      @array_policy = [0]*model_states.size
-    end
+    @array_value = @model.states.map { |state| value[state] }
+    @array_policy =
+      if policy
+        @model.states.map do |state|
+          @model.actions(state).index(policy[state])
+        end
+      else
+        [0] * @model.num_states
+      end
 
     raise 'some initial values are missing' if
-      @array_value.any?
+      @array_value.any?(&:nil?)
     raise 'some initial policy actions are missing' if
-      @array_policy.any?
+      @array_policy.any?(&:nil?)
 
     @policy_A = nil
   end
 
   #
-  # @return [
-  #   while it is being solved
+  # @return [ArrayModel] the model being solved; read only; do not change the
+  #   model while it is being solved
   #
   attr_reader :model
 
-  #
+  #
+  # @return [Float] discount factor, in (0, 1]
+  #
+  attr_reader :discount
+
+  #
   # Current value estimate for each state.
   #
   # The result is converted from the solver's internal representation, so you
-  # cannot affect the solver by changing the result.
+  # cannot affect the solver by changing the result.
   #
   # @return [Hash<state, Float>] from states to values; read only; any changes
   #   made to the returned object will not affect the solver
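In 0.3.0 the constructor takes the optional policy and initial value as keyword arguments and converts the model to an ArrayModel internally. A minimal usage sketch; the two-state weather model is hypothetical, invented for illustration:

    require 'finite_mdp'

    # Each row is [state, action, next_state, probability, reward].
    rows = [
      [:sunny, :stay, :sunny, 0.9,  1.0],
      [:sunny, :stay, :rainy, 0.1,  1.0],
      [:rainy, :stay, :rainy, 0.8, -1.0],
      [:rainy, :stay, :sunny, 0.2, -1.0]
    ]
    model = FiniteMDP::TableModel.new(rows)

    # policy: and value: are keyword arguments; these are their defaults.
    solver = FiniteMDP::Solver.new(model, 0.95, policy: nil, value: Hash.new(0))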
@@ -98,14 +99,12 @@ class FiniteMDP::Solver
   #
   def state_action_value
     q = {}
-    states
-
-
-
-
-
-          pr * (r + @discount * @array_value[next_state_n])}.inject(:+)
-        q[[state, state_actions[action_n]]] = q_sa
+    model.states.each_with_index do |state, state_n|
+      model.actions(state).each_with_index do |action, action_n|
+        q_sa = model.array[state_n][action_n].map do |next_state_n, pr, r|
+          pr * (r + @discount * @array_value[next_state_n])
+        end.inject(:+)
+        q[[state, action]] = q_sa
       end
     end
     q
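Continuing the sketch above: state_action_value returns a hash keyed by [state, action] pairs, so a Q-value lookup reads:

    q = solver.state_action_value
    q[[:sunny, :stay]] # => Float (current estimate of Q(sunny, stay))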
@@ -118,8 +117,9 @@ class FiniteMDP::Solver
   #   made to the returned object will not affect the solver
   #
   def policy
-    Hash[model.states.zip(@array_policy).map
-      [state, model.actions(state)[action_n]]
+    Hash[model.states.zip(@array_policy).map do |state, action_n|
+      [state, model.actions(state)[action_n]]
+    end]
   end
 
   #
@@ -135,7 +135,7 @@ class FiniteMDP::Solver
   #
   def evaluate_policy
     delta = 0.0
-
+    model.array.each_with_index do |actions, state_n|
       next_state_ns = actions[@array_policy[state_n]]
       new_value = backup(next_state_ns)
       delta = [delta, (@array_value[state_n] - new_value).abs].max
@@ -162,16 +162,16 @@ class FiniteMDP::Solver
   def evaluate_policy_exact
     if @policy_A
       # update only those rows for which the policy has changed
-      @policy_A_action.zip(@array_policy)
-        each_with_index do |(old_action_n, new_action_n), state_n|
+      @policy_A_action.zip(@array_policy)
+        .each_with_index do |(old_action_n, new_action_n), state_n|
         next if old_action_n == new_action_n
         update_policy_Ab state_n, new_action_n
       end
     else
       # initialise the A and the b for Ax = b
-      num_states =
+      num_states = model.num_states
       @policy_A = NMatrix.float(num_states, num_states)
-      @policy_A_action = [-1]*num_states
+      @policy_A_action = [-1] * num_states
       @policy_b = NVector.float(num_states)
 
       @array_policy.each_with_index do |action_n, state_n|
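For reference, exact policy evaluation solves the usual linear system for the value of the current policy \pi:

    (I - \gamma P_\pi)\, v = r_\pi,
    \qquad
    (P_\pi)_{ss'} = p(s' \mid s, \pi(s)),
    \qquad
    (r_\pi)_s = \sum_{s'} p(s' \mid s, \pi(s))\, r(s, \pi(s), s')

This is what update_policy_Ab (later in this diff) builds row by row: -discount * probability for each successor entry, += 1 on the diagonal, and the probability-weighted reward in b.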
@@ -189,26 +189,30 @@ class FiniteMDP::Solver
   #
   # This is the 'policy improvement' step in Figure 4.3 of Sutton and Barto
   # (1998).
-  #
-  # @return [Boolean] false iff the policy changed for any state
   #
-
-
-
+  # @param [Float] tolerance non-negative tolerance; for the policy to change,
+  #   the action must be at least this much better than the current
+  #   action
+  #
+  # @return [Integer] number of states that changed
+  #
+  def improve_policy(tolerance: Float::EPSILON)
+    changed = 0
+    model.array.each_with_index do |actions, state_n|
       a_max = nil
       v_max = -Float::MAX
       actions.each_with_index do |next_state_ns, action_n|
         v = backup(next_state_ns)
-        if v > v_max
+        if v > v_max + tolerance
           a_max = action_n
           v_max = v
         end
       end
       raise "no feasible actions in state #{state_n}" unless a_max
-
+      changed += 1 if @array_policy[state_n] != a_max
       @array_policy[state_n] = a_max
     end
-
+    changed
   end
 
   #
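improve_policy now reports how many states changed instead of a boolean, and the tolerance keeps near-identical actions from registering as improvements on every pass (the convergence problem the new @param documents). A sketch:

    # Returns the number of states whose greedy action changed.
    num_changed = solver.improve_policy(tolerance: 1e-9)
    puts "greedy action changed in #{num_changed} states"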
@@ -223,7 +227,7 @@ class FiniteMDP::Solver
   #
   def value_iteration_single
     delta = 0.0
-
+    model.array.each_with_index do |actions, state_n|
       a_max = nil
       v_max = -Float::MAX
       actions.each_with_index do |next_state_ns, action_n|
@@ -247,7 +251,7 @@ class FiniteMDP::Solver
   #
   # @param [Float] tolerance small positive number
   #
-  # @param [Integer, nil] max_iters terminate after this many iterations, even
+  # @param [Integer, nil] max_iters terminate after this many iterations, even
   #   if the value function has not converged; nil means that there is
   #   no limit on the number of iterations
   #
@@ -260,7 +264,7 @@ class FiniteMDP::Solver
   # @yieldparam [Float] delta largest change in the value function in the last
   #   iteration
   #
-  def value_iteration
+  def value_iteration(tolerance:, max_iters: nil)
     delta = Float::MAX
     num_iters = 0
     loop do
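The tolerance and iteration cap are now keyword arguments. A sketch, assuming the block receives the iteration count and delta in the order the @yieldparam tags suggest:

    solver.value_iteration(tolerance: 1e-6, max_iters: 1000) do |num_iters, delta|
      puts format('value iteration %d: delta %.2e', num_iters, delta)
    end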
@@ -281,6 +285,11 @@ class FiniteMDP::Solver
   #   phase ends if the largest change in the value function
   #   (<tt>delta</tt>) is below this tolerance
   #
+  # @param [Float] policy_tolerance small positive number; when comparing
+  #   actions during policy improvement, ignore value function differences
+  #   smaller than this tolerance; this helps with convergence when there
+  #   are several equivalent or extremely similar actions
+  #
   # @param [Integer, nil] max_value_iters terminate the policy evaluation
   #   phase after this many iterations, even if the value function has not
   #   converged; nil means that there is no limit on the number of
@@ -298,16 +307,20 @@ class FiniteMDP::Solver
   # @yieldparam [Integer] num_policy_iters policy improvement iterations done so
   #   far
   #
+  # @yieldparam [Integer?] actions_changed number of actions that changed in
+  #   the policy improvement phase, if any
+  #
   # @yieldparam [Integer] num_value_iters policy evaluation iterations done so
   #   far for the current policy improvement iteration
   #
   # @yieldparam [Float] delta largest change in the value function in the last
   #   policy evaluation iteration
   #
-  def policy_iteration
-
+  def policy_iteration(value_tolerance:,
+      policy_tolerance: value_tolerance / 2.0, max_value_iters: nil,
+      max_policy_iters: nil)
 
-
+    num_actions_changed = nil
     num_policy_iters = 0
     loop do
       # policy evaluation
@@ -315,19 +328,19 @@ class FiniteMDP::Solver
       loop do
         value_delta = evaluate_policy
         num_value_iters += 1
+        yield(num_policy_iters, num_actions_changed, num_value_iters,
+          value_delta) if block_given?
 
         break if value_delta < value_tolerance
        break if max_value_iters && num_value_iters >= max_value_iters
-        yield num_policy_iters, num_value_iters, value_delta if block_given?
       end
 
       # policy improvement
-
+      num_actions_changed = improve_policy(tolerance: policy_tolerance)
       num_policy_iters += 1
-
-
+      return true if num_actions_changed == 0
+      return false if max_policy_iters && num_policy_iters >= max_policy_iters
     end
-    stable
   end
 
   #
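A sketch of the new calling convention; note that actions_changed is nil for the evaluation sweeps before the first improvement step, and the return value is true when the policy is stable:

    stable = solver.policy_iteration(value_tolerance: 1e-6) do |pi_n, changed, vi_n, delta|
      puts "improvement #{pi_n} (#{changed.inspect} changed): " \
           "eval #{vi_n}, delta #{delta}"
    end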
@@ -343,18 +356,19 @@ class FiniteMDP::Solver
   #
   # @yieldparam [Integer] num_iters policy improvement iterations done so far
   #
-
-
+  # @yieldparam [Integer] num_actions_changed number of actions that changed in
+  #   the last policy improvement phase
+  #
+  def policy_iteration_exact(max_iters: nil)
     num_iters = 0
     loop do
       evaluate_policy_exact
-
+      num_actions_changed = improve_policy
      num_iters += 1
-
-
-
+      yield num_iters, num_actions_changed if block_given?
+      return true if num_actions_changed == 0
+      return false if max_iters && num_iters >= max_iters
     end
-    stable
   end
 
   private
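The exact variant follows the same pattern but needs no value tolerance, since each policy is evaluated by solving the linear system directly:

    stable = solver.policy_iteration_exact(max_iters: 20) do |n, changed|
      puts "exact policy iteration #{n}: #{changed} actions changed"
    end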
@@ -362,30 +376,29 @@ class FiniteMDP::Solver
   #
   # Updated value estimate for a state with the given successor states.
   #
-  def backup
-    next_state_ns.map
-      probability*(reward + @discount
-
+  def backup(next_state_ns)
+    next_state_ns.map do |next_state_n, probability, reward|
+      probability * (reward + @discount * @array_value[next_state_n])
+    end.inject(:+)
   end
 
   #
   # Update the row in A the entry in b (in Ax=b) for the given state; see
   # {#evaluate_policy_exact}.
   #
-  def update_policy_Ab
+  def update_policy_Ab(state_n, action_n)
     # clear out the old values for state_n's row
     @policy_A[true, state_n] = 0.0
 
     # set new values according to state_n's successors under the current policy
     b_n = 0
-    next_state_ns =
+    next_state_ns = model.array[state_n][action_n]
     next_state_ns.each do |next_state_n, probability, reward|
-      @policy_A[next_state_n, state_n] = -@discount*probability
-      b_n += probability*reward
+      @policy_A[next_state_n, state_n] = -@discount * probability
+      b_n += probability * reward
     end
     @policy_A[state_n, state_n] += 1
     @policy_A_action[state_n] = action_n
     @policy_b[state_n] = b_n
   end
 end
-
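backup is the standard one-state Bellman backup: over the stored successor triples (next state, probability, reward) it computes

    V(s) \leftarrow \sum_{s'} p(s' \mid s, a)\,\bigl(r(s, a, s') + \gamma V(s')\bigr)

with \gamma the discount factor.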
data/lib/finite_mdp/table_model.rb
CHANGED
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 #
 # A finite markov decision process model for which the states, actions,
 # transition probabilities and rewards are specified as a table. This is a
@@ -12,7 +13,7 @@ class FiniteMDP::TableModel
   # @param [Array<[state, action, state, Float, Float]>] rows each row is
   #   [state, action, next state, probability, reward]
   #
-  def initialize
+  def initialize(rows)
     @rows = rows
   end
 
@@ -28,7 +29,7 @@ class FiniteMDP::TableModel
   # @return [Array<state>] not empty; no duplicate states
   #
   def states
-    @rows.map{|row| row[0]}.uniq
+    @rows.map { |row| row[0] }.uniq
   end
 
   #
@@ -38,23 +39,23 @@ class FiniteMDP::TableModel
   #
   # @return [Array<action>] not empty; no duplicate actions
   #
-  def actions
-    @rows.map{|row| row[1] if row[0] == state}.compact.uniq
+  def actions(state)
+    @rows.map { |row| row[1] if row[0] == state }.compact.uniq
   end
 
   #
   # Possible successor states after taking the given action in the given state;
   # see {Model#next_states}.
-  #
+  #
   # @param [state] state
   #
   # @param [action] action
   #
   # @return [Array<state>] not empty; no duplicate states
   #
-  def next_states
-    @rows.map{|row| row[2] if row[0] == state && row[1] == action}.compact
-  end
+  def next_states(state, action)
+    @rows.map { |row| row[2] if row[0] == state && row[1] == action }.compact
+  end
 
   #
   # Probability of the given transition; see {Model#transition_probability}.
@@ -66,10 +67,10 @@ class FiniteMDP::TableModel
   # @param [state] next_state
   #
   # @return [Float] in [0, 1]; zero if the transition is not in the table
-  #
-  def transition_probability
-
-
+  #
+  def transition_probability(state, action, next_state)
+    row = find_row(state, action, next_state)
+    row ? row[3] : 0
   end
 
   #
|
|
83
84
|
#
|
84
85
|
# @return [Float, nil] nil if the transition is not in the table
|
85
86
|
#
|
86
|
-
def reward
|
87
|
-
|
88
|
-
|
87
|
+
def reward(state, action, next_state)
|
88
|
+
row = find_row(state, action, next_state)
|
89
|
+
row[4] if row
|
89
90
|
end
|
90
91
|
|
91
92
|
#
|
@@ -105,18 +106,26 @@ class FiniteMDP::TableModel
|
|
105
106
|
#
|
106
107
|
# @return [TableModel]
|
107
108
|
#
|
108
|
-
def self.from_model
|
109
|
+
def self.from_model(model, sparse = true)
|
109
110
|
rows = []
|
110
111
|
model.states.each do |state|
|
111
112
|
model.actions(state).each do |action|
|
112
113
|
model.next_states(state, action).each do |next_state|
|
113
114
|
pr = model.transition_probability(state, action, next_state)
|
114
|
-
|
115
|
-
|
115
|
+
next unless pr > 0 || !sparse
|
116
|
+
reward = model.reward(state, action, next_state)
|
117
|
+
rows << [state, action, next_state, pr, reward]
|
116
118
|
end
|
117
119
|
end
|
118
120
|
end
|
119
121
|
FiniteMDP::TableModel.new(rows)
|
120
122
|
end
|
121
|
-
end
|
122
123
|
|
124
|
+
private
|
125
|
+
|
126
|
+
def find_row(state, action, next_state)
|
127
|
+
@rows.find do |row|
|
128
|
+
row[0] == state && row[1] == action && row[2] == next_state
|
129
|
+
end
|
130
|
+
end
|
131
|
+
end
|
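from_model gains a sparse flag: zero-probability transitions are skipped by default, and passing sparse = false keeps them. Reusing the model from the solver sketch:

    sparse_table = FiniteMDP::TableModel.from_model(model)
    dense_table  = FiniteMDP::TableModel.from_model(model, false)
    sparse_table.states          # => [:sunny, :rainy]
    sparse_table.actions(:sunny) # => [:stay]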
data/lib/finite_mdp/vector_valued.rb
CHANGED
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 #
 # Define an object's hash code and equality (in the sense of <tt>eql?</tt>)
 # according to its array representation (<tt>to_a</tt>). See notes for {Model}
@@ -7,7 +8,7 @@
 #
 # @example
 #
-#   class MyPoint
+#   class MyPoint
 #     include FiniteMDP::VectorValued
 #
 #     def initialize x, y
@@ -31,7 +32,7 @@ module FiniteMDP::VectorValued
   # @return [Integer]
   #
   def hash
-
+    to_a.hash
   end
 
   #
@@ -39,8 +40,7 @@ module FiniteMDP::VectorValued
   #
   # @return [Boolean]
   #
-  def eql?
-
+  def eql?(other)
+    to_a.eql? other.to_a
   end
 end
-
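A runnable sketch based on the module's own @example: identity is delegated to to_a, so two points with equal coordinates behave as the same hash key:

    class MyPoint
      include FiniteMDP::VectorValued

      def initialize(x, y)
        @x = x
        @y = y
      end

      attr_reader :x, :y

      # VectorValued defines hash and eql? in terms of this array.
      def to_a
        [x, y]
      end
    end

    MyPoint.new(0, 0).eql?(MyPoint.new(0, 0))     # => true
    { MyPoint.new(0, 0) => 1 }[MyPoint.new(0, 0)] # => 1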
data/lib/finite_mdp/version.rb
CHANGED
data/lib/finite_mdp.rb
CHANGED
@@ -1,14 +1,15 @@
+# frozen_string_literal: true
 require 'enumerator'
 
 require 'finite_mdp/version'
 require 'finite_mdp/vector_valued'
 require 'finite_mdp/model'
+require 'finite_mdp/array_model'
 require 'finite_mdp/hash_model'
 require 'finite_mdp/table_model'
 require 'finite_mdp/solver'
 
-# TODO maybe for efficiency it would be worth including a special case for
+# TODO: maybe for efficiency it would be worth including a special case for
 #   models in which rewards depend only on the state -- a few minor
 #   simplifications are possible in the solver, but it won't make a huge
 #   difference.
-