finite_mdp 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/finite_mdp/solver.rb ADDED
@@ -0,0 +1,344 @@
+ require 'narray'
+
+ #
+ # Find optimal values and policies using policy iteration and/or value
+ # iteration. The methods here are suitable for finding deterministic policies
+ # for infinite-horizon problems.
+ #
+ # The computations are carried out on an intermediate form of the given model,
+ # which is stored using nested arrays:
+ #   model[state_num][action_num] = [[next_state_num, probability, reward], ...]
+ # The solver assigns numbers to each state and each action automatically. Note
+ # that the successor state data are stored in sparse format, and any transitions
+ # that are in the given model but have zero probability are not stored.
+ #
+ # TODO implement backward induction for finite horizon problems
+ #
+ # TODO maybe implement a 'dense' storage format for models with many successor
+ # states, probably as a different solver class
+ #
+ class FiniteMDP::Solver
+   #
+   # @param [Model] model
+   #
+   # @param [Float] discount in (0, 1]
+   #
+   # @param [Hash<state, action>, nil] policy initial policy; if nil, an
+   #   arbitrary action is selected for each state
+   #
+   # @param [Hash<state, Float>] value initial value for each state; defaults to
+   #   zero for every state
+   #
+   def initialize model, discount, policy=nil, value=Hash.new(0)
+     @model = model
+     @discount = discount
+
+     # get the model data into a more compact form for calculation; this means
+     # that we number the states and actions for faster lookups (avoid most of
+     # the hashing); the 'next states' data are still stored in sparse format
+     # (that is, only transitions with positive probability are stored)
+     model_states = model.states
+     state_to_num = Hash[model_states.zip((0...model_states.size).to_a)]
+     @array_model = model_states.map {|state|
+       model.actions(state).map {|action|
+         model.next_states(state, action).map {|next_state|
+           pr = model.transition_probability(state, action, next_state)
+           [state_to_num[next_state], pr,
+             model.reward(state, action, next_state)] if pr > 0
+         }.compact
+       }
+     }
+
+     # convert initial values and policies to compact form
+     @array_value = model_states.map {|state| value[state]}
+     if policy
+       action_to_num = model_states.map{|state|
+         actions = model.actions(state)
+         Hash[actions.zip((0...actions.size).to_a)]
+       }
+       @array_policy = action_to_num.zip(model_states).
+         map {|a_to_n, state| a_to_n[policy[state]]}
+     else
+       # default to the first action, arbitrarily
+       @array_policy = [0]*model_states.size
+     end
+
+     raise 'some initial values are missing' if
+       @array_value.any? {|v| v.nil?}
+     raise 'some initial policy actions are missing' if
+       @array_policy.any? {|a| a.nil?}
+
+     @policy_A = nil
+   end
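
To illustrate the compact form the constructor builds (a sketch with invented state and action names, not data from the gem): for a hypothetical model with states :low and :high and a single action :wait in each state, @array_model would look roughly like the following, where each triple is [next_state_num, probability, reward]:

    array_model = [
      [ [[0, 0.9, 0.0], [1, 0.1, 1.0]] ],  # state 0 (:low),  action :wait
      [ [[1, 1.0, 1.0]] ]                  # state 1 (:high), action :wait
    ]

State and action numbers follow the order in which model.states and model.actions(state) return them.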
+
+   #
+   # @return [Model] the model being solved; read only; do not change the model
+   #   while it is being solved
+   #
+   attr_reader :model
+
+   #
+   # Current value estimate for each state.
+   #
+   # The result is converted from the solver's internal representation, so you
+   # cannot affect the solver by changing the result.
+   #
+   # @return [Hash<state, Float>] from states to values; read only; any changes
+   #   made to the returned object will not affect the solver
+   #
+   def value
+     Hash[model.states.zip(@array_value)]
+   end
+
+   #
+   # Current estimate of the optimal action for each state.
+   #
+   # @return [Hash<state, action>] from states to actions; read only; any changes
+   #   made to the returned object will not affect the solver
+   #
+   def policy
+     Hash[model.states.zip(@array_policy).map{|state, action_n|
+       [state, model.actions(state)[action_n]]}]
+   end
+
+   #
+   # Refine the estimate of the value function for the current policy. This is
+   # done by iterating the Bellman equations; see also {#evaluate_policy_exact}
+   # for a different approach.
+   #
+   # This is the 'policy evaluation' step in Figure 4.3 of Sutton and Barto
+   # (1998).
+   #
+   # @return [Float] largest absolute change (over all states) in the value
+   #   function
+   #
+   def evaluate_policy
+     delta = 0.0
+     @array_model.each_with_index do |actions, state_n|
+       next_state_ns = actions[@array_policy[state_n]]
+       new_value = backup(next_state_ns)
+       delta = [delta, (@array_value[state_n] - new_value).abs].max
+       @array_value[state_n] = new_value
+     end
+     delta
+   end
+
+   #
+   # Evaluate the value function for the current policy by solving a linear
+   # system of n equations in n unknowns, where n is the number of states in the
+   # model.
+   #
+   # This routine currently uses dense linear algebra, so it requires that the
+   # full n-by-n matrix be stored in memory. This may be a problem for moderately
+   # large n.
+   #
+   # All of the coefficients (A and b in Ax = b) are computed on the first call, but
+   # subsequent calls recompute only those rows for which the policy has changed
+   # since the last call.
+   #
+   # @return [nil]
+   #
+   def evaluate_policy_exact
+     if @policy_A
+       # update only those rows for which the policy has changed
+       @policy_A_action.zip(@array_policy).
+         each_with_index do |(old_action_n, new_action_n), state_n|
+         next if old_action_n == new_action_n
+         update_policy_Ab state_n, new_action_n
+       end
+     else
+       # initialise the A and the b for Ax = b
+       num_states = @array_model.size
+       @policy_A = NMatrix.float(num_states, num_states)
+       @policy_A_action = [-1]*num_states
+       @policy_b = NVector.float(num_states)
+
+       @array_policy.each_with_index do |action_n, state_n|
+         update_policy_Ab state_n, action_n
+       end
+     end
+
+     value = @policy_b / @policy_A # solve linear system
+     @array_value = value.to_a
+     nil
+   end
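
For reference, the system assembled here is the standard linear-algebra form of policy evaluation (general MDP theory, not anything specific to this gem). Writing P_pi and r_pi for the transition matrix and expected one-step rewards under the current policy, the Bellman equation for a fixed policy gives

    v_\pi = r_\pi + \gamma P_\pi v_\pi
    \quad\Longrightarrow\quad
    (I - \gamma P_\pi)\, v_\pi = r_\pi

so A = I - gamma P_pi and b = r_pi; this is what update_policy_Ab (defined later in this file) fills in row by row, via the +1 added on the diagonal and the -discount*probability entries off the diagonal.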
+
+   #
+   # Make policy greedy with respect to the current value function.
+   #
+   # This is the 'policy improvement' step in Figure 4.3 of Sutton and Barto
+   # (1998).
+   #
+   # @return [Boolean] false iff the policy changed for any state
+   #
+   def improve_policy
+     stable = true
+     @array_model.each_with_index do |actions, state_n|
+       a_max = nil
+       v_max = -Float::MAX
+       actions.each_with_index do |next_state_ns, action_n|
+         v = backup(next_state_ns)
+         if v > v_max
+           a_max = action_n
+           v_max = v
+         end
+       end
+       raise "no feasible actions in state #{state_n}" unless a_max
+       stable = false if @array_policy[state_n] != a_max
+       @array_policy[state_n] = a_max
+     end
+     stable
+   end
+
+   #
+   # A single iteration of value iteration.
+   #
+   # This is the algorithm from Figure 4.5 of Sutton and Barto (1998). It is
+   # mostly equivalent to calling {#evaluate_policy} and then {#improve_policy},
+   # but it is somewhat more efficient.
+   #
+   # @return [Float] largest absolute change (over all states) in the value
+   #   function
+   #
+   def value_iteration_single
+     delta = 0.0
+     @array_model.each_with_index do |actions, state_n|
+       a_max = nil
+       v_max = -Float::MAX
+       actions.each_with_index do |next_state_ns, action_n|
+         v = backup(next_state_ns)
+         if v > v_max
+           a_max = action_n
+           v_max = v
+         end
+       end
+       delta = [delta, (@array_value[state_n] - v_max).abs].max
+       @array_value[state_n] = v_max
+       @array_policy[state_n] = a_max
+     end
+     delta
+   end
+
+   #
+   # Value iteration; call {#value_iteration_single} up to
+   # <tt>max_iters</tt> times until the largest change in the value function
+   # (<tt>delta</tt>) is less than <tt>tolerance</tt>.
+   #
+   # @param [Float] tolerance small positive number
+   #
+   # @param [Integer, nil] max_iters terminate after this many iterations, even
+   #   if the value function has not converged; nil means that there is
+   #   no limit on the number of iterations
+   #
+   # @return [Boolean] true iff iteration converged to within tolerance
+   #
+   def value_iteration tolerance, max_iters=nil
+     delta = Float::MAX
+     num_iters = 0
+     loop do
+       delta = value_iteration_single
+       num_iters += 1
+
+       break if delta < tolerance
+       break if max_iters && num_iters > max_iters
+     end
+     delta < tolerance
+   end
+
+   #
+   # Solve with policy iteration using approximate (iterative) policy evaluation.
+   #
+   # @param [Float] value_tolerance small positive number; the policy evaluation
+   #   phase ends if the largest change in the value function
+   #   (<tt>delta</tt>) is below this tolerance
+   #
+   # @param [Integer, nil] max_value_iters terminate the policy evaluation
+   #   phase after this many iterations, even if the value function has not
+   #   converged; nil means that there is no limit on the number of
+   #   iterations in each policy evaluation phase
+   #
+   # @param [Integer, nil] max_policy_iters terminate after this many
+   #   iterations, even if a stable policy has not been obtained; nil means
+   #   that there is no limit on the number of iterations
+   #
+   # @return [Boolean] true iff a stable policy was obtained
+   #
+   def policy_iteration value_tolerance, max_value_iters=nil,
+       max_policy_iters=nil
+
+     stable = false
+     num_policy_iters = 0
+     loop do
+       # policy evaluation
+       num_value_iters = 0
+       loop do
+         value_delta = evaluate_policy
+         num_value_iters += 1
+
+         break if value_delta < value_tolerance
+         break if max_value_iters && num_value_iters > max_value_iters
+       end
+
+       # policy improvement
+       stable = improve_policy
+       num_policy_iters += 1
+       break if stable
+       break if max_policy_iters && num_policy_iters > max_policy_iters
+     end
+     stable
+   end
+
+   #
+   # Solve with policy iteration using exact policy evaluation.
+   #
+   # @param [Integer, nil] max_iters terminate after this many
+   #   iterations, even if a stable policy has not been obtained; nil means
+   #   that there is no limit on the number of iterations
+   #
+   # @return [Boolean] true iff a stable policy was obtained
+   #
+   def policy_iteration_exact max_iters=nil
+     stable = false
+     num_iters = 0
+     loop do
+       evaluate_policy_exact
+       stable = improve_policy
+       num_iters += 1
+       break if stable
+       break if max_iters && num_iters > max_iters
+     end
+     stable
+   end
+
+   private
+
+   #
+   # Updated value estimate for a state with the given successor states.
+   #
+   def backup next_state_ns
+     next_state_ns.map {|next_state_n, probability, reward|
+       probability*(reward + @discount*@array_value[next_state_n])
+     }.inject(:+)
+   end
+
+   #
+   # Update the row in A and the entry in b (in Ax=b) for the given state; see
+   # {#evaluate_policy_exact}.
+   #
+   def update_policy_Ab state_n, action_n
+     # clear out the old values for state_n's row
+     @policy_A[true, state_n] = 0.0
+
+     # set new values according to state_n's successors under the current policy
+     b_n = 0
+     next_state_ns = @array_model[state_n][action_n]
+     next_state_ns.each do |next_state_n, probability, reward|
+       @policy_A[next_state_n, state_n] = -@discount*probability
+       b_n += probability*reward
+     end
+     @policy_A[state_n, state_n] += 1
+     @policy_A_action[state_n] = action_n
+     @policy_b[state_n] = b_n
+   end
+ end
+
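A minimal end-to-end sketch of how this solver is intended to be used (the model data are invented for illustration; the rows follow the [state, action, next state, probability, reward] convention of the TableModel class in the next file):

    require 'finite_mdp'

    model = FiniteMDP::TableModel.new [
      [:sunny, :walk,  :sunny, 0.8,  1.0],
      [:sunny, :walk,  :rainy, 0.2,  1.0],
      [:sunny, :drive, :sunny, 0.8,  0.5],
      [:sunny, :drive, :rainy, 0.2,  0.5],
      [:rainy, :walk,  :sunny, 0.3, -1.0],
      [:rainy, :walk,  :rainy, 0.7, -1.0],
      [:rainy, :drive, :sunny, 0.3,  0.5],
      [:rainy, :drive, :rainy, 0.7,  0.5]]

    solver = FiniteMDP::Solver.new(model, 0.95)
    solver.policy_iteration_exact       # or solver.value_iteration(1e-6, 1000)
    solver.value                        # Hash from state to estimated value
    solver.policy                       # Hash from state to chosen action

Either entry point should leave #value and #policy populated; exact policy iteration solves a linear system per iteration, while value iteration only needs repeated backups.
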
data/lib/finite_mdp/table_model.rb ADDED
@@ -0,0 +1,122 @@
+ #
+ # A finite Markov decision process model for which the states, actions,
+ # transition probabilities and rewards are specified as a table. This is a
+ # common way of specifying small models.
+ #
+ # The states and actions can be arbitrary objects; see notes for {Model}.
+ #
+ class FiniteMDP::TableModel
+   include FiniteMDP::Model
+
+   #
+   # @param [Array<[state, action, state, Float, Float]>] rows each row is
+   #   [state, action, next state, probability, reward]
+   #
+   def initialize rows
+     @rows = rows
+   end
+
+   #
+   # @return [Array<[state, action, state, Float, Float]>] each row is [state,
+   #   action, next state, probability, reward]
+   #
+   attr_accessor :rows
+
+   #
+   # States in this model; see {Model#states}.
+   #
+   # @return [Array<state>] not empty; no duplicate states
+   #
+   def states
+     @rows.map{|row| row[0]}.uniq
+   end
+
+   #
+   # Actions that are valid for the given state; see {Model#actions}.
+   #
+   # @param [state] state
+   #
+   # @return [Array<action>] not empty; no duplicate actions
+   #
+   def actions state
+     @rows.map{|row| row[1] if row[0] == state}.compact.uniq
+   end
+
+   #
+   # Possible successor states after taking the given action in the given state;
+   # see {Model#next_states}.
+   #
+   # @param [state] state
+   #
+   # @param [action] action
+   #
+   # @return [Array<state>] not empty; no duplicate states
+   #
+   def next_states state, action
+     @rows.map{|row| row[2] if row[0] == state && row[1] == action}.compact
+   end
+
+   #
+   # Probability of the given transition; see {Model#transition_probability}.
+   #
+   # @param [state] state
+   #
+   # @param [action] action
+   #
+   # @param [state] next_state
+   #
+   # @return [Float] in [0, 1]; zero if the transition is not in the table
+   #
+   def transition_probability state, action, next_state
+     @rows.map{|row| row[3] if row[0] == state &&
+       row[1] == action && row[2] == next_state}.compact.first || 0
+   end
+
+   #
+   # Reward for a given transition; see {Model#reward}.
+   #
+   # @param [state] state
+   #
+   # @param [action] action
+   #
+   # @param [state] next_state
+   #
+   # @return [Float, nil] nil if the transition is not in the table
+   #
+   def reward state, action, next_state
+     @rows.map{|row| row[4] if row[0] == state &&
+       row[1] == action && row[2] == next_state}.compact.first
+   end
+
+   #
+   # @return [String] can be quite large
+   #
+   def inspect
+     rows.map(&:inspect).join("\n")
+   end
+
+   #
+   # Convert any model into a table model.
+   #
+   # @param [Model] model
+   #
+   # @param [Boolean] sparse do not store rows for transitions with zero
+   #   probability
+   #
+   # @return [TableModel]
+   #
+   def self.from_model model, sparse=true
+     rows = []
+     model.states.each do |state|
+       model.actions(state).each do |action|
+         model.next_states(state, action).each do |next_state|
+           pr = model.transition_probability(state, action, next_state)
+           rows << [state, action, next_state, pr,
+             model.reward(state, action, next_state)] if pr > 0 || !sparse
+         end
+       end
+     end
+     FiniteMDP::TableModel.new(rows)
+   end
+ end
+
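Continuing the invented rows from the sketch after the solver, the accessors above are simple scans over the table:

    model.states                                         #=> [:sunny, :rainy]
    model.actions(:sunny)                                #=> [:walk, :drive]
    model.next_states(:sunny, :walk)                     #=> [:sunny, :rainy]
    model.transition_probability(:sunny, :walk, :rainy)  #=> 0.2
    model.reward(:sunny, :walk, :rainy)                  #=> 1.0

    # from_model converts any Model into this tabular form, dropping
    # zero-probability transitions unless sparse is false
    table = FiniteMDP::TableModel.from_model(model)

Every lookup scans all rows, which fits the stated intent of specifying small models.
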
data/lib/finite_mdp/vector_valued.rb ADDED
@@ -0,0 +1,46 @@
+ #
+ # Define an object's hash code and equality (in the sense of <tt>eql?</tt>)
+ # according to its array representation (<tt>to_a</tt>). See notes for {Model}
+ # for why this might be useful.
+ #
+ # A class that includes this module must define <tt>to_a</tt>.
+ #
+ # @example
+ #
+ #   class MyPoint
+ #     include FiniteMDP::VectorValued
+ #
+ #     def initialize x, y
+ #       @x, @y = x, y
+ #     end
+ #
+ #     attr_accessor :x, :y
+ #
+ #     # must implement to_a to make VectorValued work
+ #     def to_a
+ #       [x, y]
+ #     end
+ #   end
+ #
+ #   MyPoint.new(0, 0).eql?(MyPoint.new(0, 0)) #=> true as expected
+ #
+ module FiniteMDP::VectorValued
+   #
+   # Redefine hashing based on +to_a+.
+   #
+   # @return [Integer]
+   #
+   def hash
+     self.to_a.hash
+   end
+
+   #
+   # Redefine equality based on +to_a+.
+   #
+   # @return [Boolean]
+   #
+   def eql? state
+     self.to_a.eql? state.to_a
+   end
+ end
+
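Why this matters for the rest of the gem: states and actions are used as Hash keys (for example, state_to_num and action_to_num in the solver's constructor), so two objects that describe the same state must agree on hash and eql?. A short sketch using the MyPoint class from the example above:

    a = MyPoint.new(0, 0)
    b = MyPoint.new(0, 0)
    a.eql?(b)       #=> true
    { a => 42 }[b]  #=> 42 -- b finds the entry stored under a

Without the module (or an equivalent hash/eql? pair), the two points would be treated as distinct hash keys.
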
data/lib/finite_mdp/version.rb ADDED
@@ -0,0 +1,3 @@
+ module FiniteMDP
+   VERSION = '0.0.1'
+ end
data/lib/finite_mdp.rb ADDED
@@ -0,0 +1,14 @@
+ require 'enumerator'
+
+ require 'finite_mdp/version'
+ require 'finite_mdp/vector_valued'
+ require 'finite_mdp/model'
+ require 'finite_mdp/hash_model'
+ require 'finite_mdp/table_model'
+ require 'finite_mdp/solver'
+
+ # TODO maybe for efficiency it would be worth including a special case for
+ # models in which rewards depend only on the state -- a few minor
+ # simplifications are possible in the solver, but it won't make a huge
+ # difference.
+