finite_mdp 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +229 -0
- data/lib/finite_mdp/hash_model.rb +123 -0
- data/lib/finite_mdp/model.rb +195 -0
- data/lib/finite_mdp/solver.rb +344 -0
- data/lib/finite_mdp/table_model.rb +122 -0
- data/lib/finite_mdp/vector_valued.rb +46 -0
- data/lib/finite_mdp/version.rb +3 -0
- data/lib/finite_mdp.rb +14 -0
- data/test/finite_mdp_test.rb +347 -0
- metadata +94 -0
data/README.rdoc
ADDED
@@ -0,0 +1,229 @@
|
|
1
|
+
= finite_mdp
|
2
|
+
|
3
|
+
* https://github.com/jdleesmiller/finite_mdp
|
4
|
+
|
5
|
+
== SYNOPSIS
|
6
|
+
|
7
|
+
Solve small, finite Markov Decision Process (MDP) models.
|
8
|
+
|
9
|
+
This library provides several ways of describing an MDP model (see
|
10
|
+
{FiniteMDP::Model}) and some reasonably efficient implementations of policy
|
11
|
+
iteration and value iteration to solve it (see {FiniteMDP::Solver}).
|
12
|
+
|
13
|
+
=== Usage
|
14
|
+
|
15
|
+
==== Example 1: Recycling Robot
|
16
|
+
|
17
|
+
The following shows how to solve the recycling robot model (example 3.7) from
|
18
|
+
<cite>Sutton and Barto (1998). Reinforcement Learning: An Introduction</cite>.
|
19
|
+
|
20
|
+
<blockquote>
|
21
|
+
At each time step, the robot decides whether it should (1) actively search for a
|
22
|
+
can, (2) remain stationary and wait for someone to bring it a can, or (3) go
|
23
|
+
back to home base to recharge its battery. The best way to find cans is to
|
24
|
+
actively search for them, but this runs down the robot's battery, whereas
|
25
|
+
waiting does not. Whenever the robot is searching, the possibility exists that
|
26
|
+
its battery will become depleted. In this case the robot must shut down and wait
|
27
|
+
to be rescued (producing a low reward). The agent makes its decisions solely as
|
28
|
+
a function of the energy level of the battery. It can distinguish two levels,
|
29
|
+
high and low.
|
30
|
+
</blockquote>
|
31
|
+
|
32
|
+
The transition model is described in Table 3.1, which can be fed directly into
|
33
|
+
FiniteMDP using the {FiniteMDP::TableModel}, as follows.
|
34
|
+
|
35
|
+
require 'finite_mdp'
|
36
|
+
|
37
|
+
alpha = 0.1 # Pr(stay at high charge if searching | now have high charge)
|
38
|
+
beta = 0.1 # Pr(stay at low charge if searching | now have low charge)
|
39
|
+
r_search = 2 # reward for searching
|
40
|
+
r_wait = 1 # reward for waiting
|
41
|
+
r_rescue = -3 # reward (actually penalty) for running out of charge
|
42
|
+
|
43
|
+
model = FiniteMDP::TableModel.new [
|
44
|
+
[:high, :search, :high, alpha, r_search],
|
45
|
+
[:high, :search, :low, 1-alpha, r_search],
|
46
|
+
[:low, :search, :high, 1-beta, r_rescue],
|
47
|
+
[:low, :search, :low, beta, r_search],
|
48
|
+
[:high, :wait, :high, 1, r_wait],
|
49
|
+
[:high, :wait, :low, 0, r_wait],
|
50
|
+
[:low, :wait, :high, 0, r_wait],
|
51
|
+
[:low, :wait, :low, 1, r_wait],
|
52
|
+
[:low, :recharge, :high, 1, 0],
|
53
|
+
[:low, :recharge, :low, 0, 0]]
|
54
|
+
|
55
|
+
solver = FiniteMDP::Solver.new(model, 0.95) # discount factor 0.95
|
56
|
+
solver.policy_iteration 1e-4
|
57
|
+
solver.policy #=> {:high=>:search, :low=>:recharge}
|
58
|
+
|
59
|
+
==== Example 2: Grid Worlds
|
60
|
+
|
61
|
+
A more complicated example: the grid world from
|
62
|
+
<cite>Russell and Norvig (2003). Artificial Intelligence: A Modern
|
63
|
+
Approach</cite>, Chapter 17.
|
64
|
+
|
65
|
+
Here we describe the model as a class that implements the {FiniteMDP::Model}
|
66
|
+
interface. The model contains terminal states, which we represent with a special
|
67
|
+
absorbing state with zero reward, called :stop.
|
68
|
+
|
69
|
+
require 'finite_mdp'
|
70
|
+
|
71
|
+
class AIMAGridModel
|
72
|
+
include FiniteMDP::Model
|
73
|
+
|
74
|
+
#
|
75
|
+
# @param [Array<Array<Float, nil>>] grid rewards at each point, or nil if a
|
76
|
+
# grid square is an obstacle
|
77
|
+
#
|
78
|
+
# @param [Array<[i, j]>] terminii coordinates of the terminal states
|
79
|
+
#
|
80
|
+
def initialize grid, terminii
|
81
|
+
@grid, @terminii = grid, terminii
|
82
|
+
end
|
83
|
+
|
84
|
+
attr_reader :grid, :terminii
|
85
|
+
|
86
|
+
# every position on the grid is a state, except for obstacles, which are
|
87
|
+
# indicated by a nil in the grid
|
88
|
+
def states
|
89
|
+
is, js = (0...grid.size).to_a, (0...grid.first.size).to_a
|
90
|
+
is.product(js).select {|i, j| grid[i][j]} + [:stop]
|
91
|
+
end
|
92
|
+
|
93
|
+
# can move north, east, south or west on the grid
|
94
|
+
MOVES = {
|
95
|
+
'^' => [-1, 0],
|
96
|
+
'>' => [ 0, 1],
|
97
|
+
'v' => [ 1, 0],
|
98
|
+
'<' => [ 0, -1]}
|
99
|
+
|
100
|
+
# agent can move north, south, east or west (unless it's in the :stop
|
101
|
+
# state); if it tries to move off the grid or into an obstacle, it stays
|
102
|
+
# where it is
|
103
|
+
def actions state
|
104
|
+
if state == :stop || terminii.member?(state)
|
105
|
+
[:stop]
|
106
|
+
else
|
107
|
+
MOVES.keys
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
# define the transition model
|
112
|
+
def transition_probability state, action, next_state
|
113
|
+
if state == :stop || terminii.member?(state)
|
114
|
+
(action == :stop && next_state == :stop) ? 1 : 0
|
115
|
+
else
|
116
|
+
# agent usually succeeds in moving forward, but sometimes it ends up
|
117
|
+
# moving left or right
|
118
|
+
move = case action
|
119
|
+
when '^' then [['^', 0.8], ['<', 0.1], ['>', 0.1]]
|
120
|
+
when '>' then [['>', 0.8], ['^', 0.1], ['v', 0.1]]
|
121
|
+
when 'v' then [['v', 0.8], ['<', 0.1], ['>', 0.1]]
|
122
|
+
when '<' then [['<', 0.8], ['^', 0.1], ['v', 0.1]]
|
123
|
+
end
|
124
|
+
move.map {|m, pr|
|
125
|
+
m_state = [state[0] + MOVES[m][0], state[1] + MOVES[m][1]]
|
126
|
+
m_state = state unless states.member?(m_state) # stay in bounds
|
127
|
+
pr if m_state == next_state
|
128
|
+
}.compact.inject(:+) || 0
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
132
|
+
# reward is given by the grid cells; zero reward for the :stop state
|
133
|
+
def reward state, action, next_state
|
134
|
+
state == :stop ? 0 : grid[state[0]][state[1]]
|
135
|
+
end
|
136
|
+
|
137
|
+
# helper for functions below
|
138
|
+
def hash_to_grid hash
|
139
|
+
0.upto(grid.size-1).map{|i| 0.upto(grid[i].size-1).map{|j| hash[[i,j]]}}
|
140
|
+
end
|
141
|
+
|
142
|
+
# print the values in a grid
|
143
|
+
def pretty_value value
|
144
|
+
hash_to_grid(Hash[value.map {|s, v| [s, "%+.3f" % v]}]).map{|row|
|
145
|
+
row.map{|cell| cell || ' '}.join(' ')}
|
146
|
+
end
|
147
|
+
|
148
|
+
# print the policy using ASCII arrows
|
149
|
+
def pretty_policy policy
|
150
|
+
hash_to_grid(policy).map{|row| row.map{|cell|
|
151
|
+
(cell.nil? || cell == :stop) ? ' ' : cell}.join(' ')}
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
155
|
+
# the grid from Figures 17.1, 17.2(a) and 17.3
|
156
|
+
model = AIMAGridModel.new(
|
157
|
+
[[-0.04, -0.04, -0.04, +1],
|
158
|
+
[-0.04, nil, -0.04, -1],
|
159
|
+
[-0.04, -0.04, -0.04, -0.04]],
|
160
|
+
[[0, 3], [1, 3]]) # terminals (the +1 and -1 states)
|
161
|
+
|
162
|
+
# sanity check: probabilities in a row must sum to 1
|
163
|
+
model.check_transition_probabilities_sum
|
164
|
+
|
165
|
+
solver = FiniteMDP::Solver.new(model, 1) # discount factor 1
|
166
|
+
solver.value_iteration(1e-5, 100) #=> true if converged
|
167
|
+
|
168
|
+
puts model.pretty_policy(solver.policy)
|
169
|
+
# output: (matches Figure 17.2(a))
|
170
|
+
# > > >
|
171
|
+
# ^ ^
|
172
|
+
# ^ < < <
|
173
|
+
|
174
|
+
puts model.pretty_value(solver.value)
|
175
|
+
# output: (matches Figure 17.3)
|
176
|
+
# 0.812 0.868 0.918 1.000
|
177
|
+
# 0.762 0.660 -1.000
|
178
|
+
# 0.705 0.655 0.611 0.388
|
179
|
+
|
180
|
+
FiniteMDP::TableModel.from_model(model)
|
181
|
+
#=> [[0, 0], "v", [0, 0], 0.1, -0.04]
|
182
|
+
# [[0, 0], "v", [0, 1], 0.1, -0.04]
|
183
|
+
# [[0, 0], "v", [1, 0], 0.8, -0.04]
|
184
|
+
# [[0, 0], "<", [0, 0], 0.9, -0.04]
|
185
|
+
# [[0, 0], "<", [1, 0], 0.1, -0.04]
|
186
|
+
# [[0, 0], ">", [0, 0], 0.1, -0.04]
|
187
|
+
# [[0, 0], ">", [0, 1], 0.8, -0.04]
|
188
|
+
# [[0, 0], ">", [1, 0], 0.1, -0.04]
|
189
|
+
# ...
|
190
|
+
# [:stop, :stop, :stop, 1, 0]
|
191
|
+
|
192
|
+
Note that python code for this model is also available from the book's authors
|
193
|
+
at http://aima.cs.berkeley.edu/python/mdp.html
|
194
|
+
|
195
|
+
== REQUIREMENTS
|
196
|
+
|
197
|
+
Tested on
|
198
|
+
* ruby 1.8.7 (2010-06-23 patchlevel 299) [i686-linux]
|
199
|
+
* ruby 1.9.2p0 (2010-08-18 revision 29036) [i686-linux]
|
200
|
+
|
201
|
+
== INSTALLATION
|
202
|
+
|
203
|
+
gem install finite_mdp
|
204
|
+
|
205
|
+
== LICENSE
|
206
|
+
|
207
|
+
(The MIT License)
|
208
|
+
|
209
|
+
Copyright (c) 2011 John Lees-Miller
|
210
|
+
|
211
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
212
|
+
a copy of this software and associated documentation files (the
|
213
|
+
'Software'), to deal in the Software without restriction, including
|
214
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
215
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
216
|
+
permit persons to whom the Software is furnished to do so, subject to
|
217
|
+
the following conditions:
|
218
|
+
|
219
|
+
The above copyright notice and this permission notice shall be
|
220
|
+
included in all copies or substantial portions of the Software.
|
221
|
+
|
222
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
223
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
224
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
225
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
226
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
227
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
228
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
229
|
+
|
@@ -0,0 +1,123 @@
|
|
1
|
+
#
# A finite markov decision process model for which the transition
# probabilities and rewards are specified using nested hash tables.
#
# The structure of the nested hash is as follows:
#   hash[:s]         #=> a Hash that maps actions to successor states
#   hash[:s][:a]     #=> a Hash from successor states to pairs (see next)
#   hash[:s][:a][:t] #=> an Array [probability, reward] for transition (s,a,t)
#
# The states and actions can be arbitrary objects; see notes for {Model}.
#
# The {TableModel} is an alternative way of storing these data.
#
class FiniteMDP::HashModel
  include FiniteMDP::Model

  #
  # @param [Hash<state, Hash<action, Hash<state, [Float, Float]>>>] hash see
  #        notes for {HashModel} for an explanation of this structure
  #
  def initialize hash
    @hash = hash
  end

  #
  # @return [Hash<state, Hash<action, Hash<state, [Float, Float]>>>] see notes
  #         for {HashModel} for an explanation of this structure
  #
  attr_accessor :hash

  #
  # States in this model; see {Model#states}.
  #
  # @return [Array<state>] not empty; no duplicate states
  #
  def states
    hash.keys
  end

  #
  # Actions that are valid for the given state; see {Model#actions}.
  #
  # @param [state] state
  #
  # @return [Array<action>] not empty; no duplicate actions
  #
  def actions state
    hash[state].keys
  end

  #
  # Possible successor states after taking the given action in the given state;
  # see {Model#next_states}.
  #
  # @param [state] state
  #
  # @param [action] action
  #
  # @return [Array<state>] not empty; no duplicate states
  #
  def next_states state, action
    hash[state][action].keys
  end

  #
  # Probability of the given transition; see {Model#transition_probability}.
  #
  # @param [state] state
  #
  # @param [action] action
  #
  # @param [state] next_state
  #
  # @return [Float] in [0, 1]; zero if the transition is not in the hash
  #
  def transition_probability state, action, next_state
    # the stored value is a [probability, reward] pair; a missing entry
    # means the transition has zero probability
    pair = hash[state][action][next_state]
    (pair && pair.first) || 0
  end

  #
  # Reward for a given transition; see {Model#reward}.
  #
  # @param [state] state
  #
  # @param [action] action
  #
  # @param [state] next_state
  #
  # @return [Float, nil] nil if the transition is not in the hash
  #
  def reward state, action, next_state
    # the stored value is a [probability, reward] pair; a missing entry
    # yields nil, per the contract above
    pair = hash[state][action][next_state]
    pair && pair.last
  end

  #
  # Convert a generic model into a hash model.
  #
  # @param [Model] model
  #
  # @param [Boolean] sparse do not store entries for transitions with zero
  #        probability
  #
  # @return [HashModel] not nil
  #
  def self.from_model model, sparse=true
    model_hash = {}
    model.states.each do |s|
      action_hash = (model_hash[s] ||= {})
      model.actions(s).each do |a|
        successor_hash = (action_hash[a] ||= {})
        model.next_states(s, a).each do |t|
          pr = model.transition_probability(s, a, t)
          # in sparse mode, skip transitions that cannot occur
          next unless pr > 0 || !sparse
          successor_hash[t] = [pr, model.reward(s, a, t)]
        end
      end
    end
    FiniteMDP::HashModel.new(model_hash)
  end
end
|
123
|
+
|
@@ -0,0 +1,195 @@
|
|
1
|
+
#
# Interface that defines a finite markov decision process model.
#
# There are several approaches to describing the state, action, transition
# probability and reward data for use with this library.
#
# 1. Write the data directly into a {TableModel} or {HashModel}. This is usually
#    the way to go for small models, such as examples from text books.
#
# 1. Write a procedure that generates the data and stores them in a
#    {TableModel} or {HashModel}. This gives the most flexibility in how the
#    data are generated.
#
# 1. Write a class that implements the methods in this module. The methods in
#    this module are a fairly close approximation to the usual way of defining
#    an MDP mathematically, so it can be a useful way of structuring the
#    definition. It can then be converted to one of the other representations
#    (see {TableModel.from_model}) or passed directly to a {Solver}.
#
# The discussion below applies to all of these approaches.
#
# Note that there is no special treatment for terminal states, but they can be
# modeled by including a dummy state (a state with zero reward and one action
# that brings the process back to the dummy state with probability 1).
#
# The states and actions can be arbitrary objects. The only requirement is that
# they support hashing and equality (in the sense of <tt>eql?</tt>), which all
# ruby objects do. Built-in types, such as symbols, arrays and Structs, will
# work as expected. Note, however, that the default hashing and equality
# semantics for custom classes may not be what you want. The following example
# illustrates this:
#
#   class BadGridState
#     def initialize x, y
#       @x, @y = x, y
#     end
#     attr_accessor :x, :y
#   end
#
#   BadGridState.new(1, 1) == BadGridState.new(1, 2) #=> false
#   BadGridState.new(1, 1) == BadGridState.new(1, 1) #=> false (!!!)
#
# This is because, by default, hashing and equality are defined in terms of
# object identifiers, not the 'content' of the objects.
# The preferred solution is to define the state as a <tt>Struct</tt>:
#
#   GoodGridState = Struct.new(:x, :y)
#
#   GoodGridState.new(1, 1) == GoodGridState.new(1, 2) #=> false
#   GoodGridState.new(1, 1) == GoodGridState.new(1, 1) #=> true
#
# <tt>Struct</tt> is part of the ruby standard library, and it implements
# hashing and equality based on object content rather than identity.
#
# Alternatively, if you cannot derive your state class from <tt>Struct</tt>, you
# can define your own hash code and equality check. An easy way to do this is to
# include the {VectorValued} mix-in. It is also notable that you can make the
# default semantics work; you just have to make sure that there is only one
# instance of your state class per state, as in the following example:
#
#   g11 = BadGridState.new(1, 1)
#   g12 = BadGridState.new(1, 2)
#   g21 = BadGridState.new(2, 1)
#   model = FiniteMDP::TableModel.new([
#     [g11, :up, g12, 0, 0.9],
#     [g11, :up, g21, 0, 0.1],
#     [g11, :right, g21, 0, 0.9],
#     # ...
#   ]) # this will work as expected
#
# Note that the {Solver} will convert the model to its own internal
# representation. The efficiency of the methods that define the model is
# important while the solver is building its internal representation, but it
# does not affect the performance of the iterative algorithm used after that.
# Also note that the solver handles state and action numbering internally, so it
# is not necessary to use numbers for the states.
#
module FiniteMDP::Model
  #
  # States in this model.
  #
  # @return [Array<state>] not empty; no duplicate states
  #
  # @abstract
  #
  def states
    raise NotImplementedError
  end

  #
  # Actions that are valid for the given state.
  #
  # All states must have at least one valid action; see notes for {Model}
  # regarding how to encode a terminal state.
  #
  # @param [state] state
  #
  # @return [Array<action>] not empty; no duplicate actions
  #
  # @abstract
  #
  def actions state
    raise NotImplementedError
  end

  #
  # Successor states after taking the given action in the given state. Note that
  # the returned states may occur with zero probability.
  #
  # The default behavior is to return all states as candidate successor states
  # and let {#transition_probability} determine which ones are possible. It can
  # be overridden in sparse models to avoid storing or computing lots of zeros.
  # Also note that {TableModel.from_model} and {HashModel.from_model} can be
  # told to ignore transitions with zero probability, and that the {Solver}
  # ignores them in its internal representation, so you can usually forget about
  # this method.
  #
  # @param [state] state
  #
  # @param [action] action
  #
  # @return [Array<state>] not empty; no duplicate states
  #
  def next_states state, action
    states
  end

  #
  # Probability of the given transition.
  #
  # If the transition is not in the model, in the sense that it would never
  # arise from {#states}, {#actions} and {#next_states}, the result is
  # undefined. Note that {HashModel#transition_probability} and
  # {TableModel#transition_probability} return zero in this case, but this is
  # not part of the contract.
  #
  # @param [state] state
  #
  # @param [action] action
  #
  # @param [state] next_state
  #
  # @return [Float] in [0, 1]; undefined if the transition is not in the model
  #         (see notes above)
  #
  # @abstract
  #
  def transition_probability state, action, next_state
    raise NotImplementedError
  end

  #
  # Reward for a given transition.
  #
  # If the transition is not in the model, in the sense that it would never
  # arise from {#states}, {#actions} and {#next_states}, the result is
  # undefined. Note that {HashModel#reward} and {TableModel#reward} return
  # <tt>nil</tt> in this case, but this is not part of the contract.
  #
  # @param [state] state
  #
  # @param [action] action
  #
  # @param [state] next_state
  #
  # @return [Float, nil] nil only if the transition is not in the model (but the
  #         result is undefined in this case -- it need not be nil; see notes
  #         above)
  #
  # @abstract
  #
  def reward state, action, next_state
    raise NotImplementedError
  end

  #
  # Raise an error if the sum of the transition probabilities for any (state,
  # action) pair is not sufficiently close to 1.
  #
  # @param [Float] tol numerical tolerance
  #
  # @return [nil]
  #
  def check_transition_probabilities_sum tol=1e-6
    states.each do |state|
      actions(state).each do |action|
        pr = next_states(state, action).map{|next_state|
          transition_probability(state, action, next_state)}.inject(:+)
        # fix: the sum must be close to 1 from BOTH sides; the previous check
        # (pr < 1 - tol) silently accepted probability sums greater than 1
        raise "transition probabilities for state #{state.inspect} and
          action #{action.inspect} sum to #{pr}" if (pr - 1).abs > tol
      end
    end
    nil
  end
end
|
195
|
+
|