finite_mdp 0.0.1
- data/README.rdoc +229 -0
- data/lib/finite_mdp/hash_model.rb +123 -0
- data/lib/finite_mdp/model.rb +195 -0
- data/lib/finite_mdp/solver.rb +344 -0
- data/lib/finite_mdp/table_model.rb +122 -0
- data/lib/finite_mdp/vector_valued.rb +46 -0
- data/lib/finite_mdp/version.rb +3 -0
- data/lib/finite_mdp.rb +14 -0
- data/test/finite_mdp_test.rb +347 -0
- metadata +94 -0
data/README.rdoc
ADDED
@@ -0,0 +1,229 @@
= finite_mdp

* https://github.com/jdleesmiller/finite_mdp

== SYNOPSIS

Solve small, finite Markov Decision Process (MDP) models.

This library provides several ways of describing an MDP model (see
{FiniteMDP::Model}) and some reasonably efficient implementations of policy
iteration and value iteration to solve it (see {FiniteMDP::Solver}).

=== Usage

==== Example 1: Recycling Robot

The following shows how to solve the recycling robot model (example 3.7) from
<cite>Sutton and Barto (1998). Reinforcement Learning: An Introduction</cite>.

<blockquote>
At each time step, the robot decides whether it should (1) actively search for a
can, (2) remain stationary and wait for someone to bring it a can, or (3) go
back to home base to recharge its battery. The best way to find cans is to
actively search for them, but this runs down the robot's battery, whereas
waiting does not. Whenever the robot is searching, the possibility exists that
its battery will become depleted. In this case the robot must shut down and wait
to be rescued (producing a low reward). The agent makes its decisions solely as
a function of the energy level of the battery. It can distinguish two levels,
high and low.
</blockquote>

The transition model is described in Table 3.1, which can be fed directly into
FiniteMDP using the {FiniteMDP::TableModel}, as follows.

  require 'finite_mdp'

  alpha    = 0.1 # Pr(stay at high charge if searching | now have high charge)
  beta     = 0.1 # Pr(stay at low charge if searching | now have low charge)
  r_search =  2  # reward for searching
  r_wait   =  1  # reward for waiting
  r_rescue = -3  # reward (actually penalty) for running out of charge

  model = FiniteMDP::TableModel.new [
    [:high, :search,   :high, alpha,   r_search],
    [:high, :search,   :low,  1-alpha, r_search],
    [:low,  :search,   :high, 1-beta,  r_rescue],
    [:low,  :search,   :low,  beta,    r_search],
    [:high, :wait,     :high, 1,       r_wait],
    [:high, :wait,     :low,  0,       r_wait],
    [:low,  :wait,     :high, 0,       r_wait],
    [:low,  :wait,     :low,  1,       r_wait],
    [:low,  :recharge, :high, 1,       0],
    [:low,  :recharge, :low,  0,       0]]

  solver = FiniteMDP::Solver.new(model, 0.95) # discount factor 0.95
  solver.policy_iteration 1e-4
  solver.policy #=> {:high=>:search, :low=>:recharge}
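
Value iteration can be used instead of (or as well as) policy iteration. The
following is a brief illustrative variation on the example above (the
tolerance and iteration limit here are arbitrary choices):

  solver = FiniteMDP::Solver.new(model, 0.95) # discount factor 0.95
  solver.value_iteration(1e-4, 1000) #=> true if converged
  solver.value  # a Hash from each state to its value
  solver.policy # the same optimal policy as found above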

==== Example 2: Grid Worlds

A more complicated example: the grid world from
<cite>Russell and Norvig (2003). Artificial Intelligence: A Modern
Approach</cite>, Chapter 17.

Here we describe the model as a class that implements the {FiniteMDP::Model}
interface. The model contains terminal states, which we represent with a special
absorbing state with zero reward, called :stop.

  require 'finite_mdp'

  class AIMAGridModel
    include FiniteMDP::Model

    #
    # @param [Array<Array<Float, nil>>] grid rewards at each point, or nil if a
    #        grid square is an obstacle
    #
    # @param [Array<[i, j]>] terminii coordinates of the terminal states
    #
    def initialize grid, terminii
      @grid, @terminii = grid, terminii
    end

    attr_reader :grid, :terminii

    # every position on the grid is a state, except for obstacles, which are
    # indicated by a nil in the grid
    def states
      is, js = (0...grid.size).to_a, (0...grid.first.size).to_a
      is.product(js).select {|i, j| grid[i][j]} + [:stop]
    end

    # can move north, east, south or west on the grid
    MOVES = {
      '^' => [-1,  0],
      '>' => [ 0,  1],
      'v' => [ 1,  0],
      '<' => [ 0, -1]}

    # agent can move north, south, east or west (unless it's in the :stop
    # state); if it tries to move off the grid or into an obstacle, it stays
    # where it is
    def actions state
      if state == :stop || terminii.member?(state)
        [:stop]
      else
        MOVES.keys
      end
    end

    # define the transition model
    def transition_probability state, action, next_state
      if state == :stop || terminii.member?(state)
        (action == :stop && next_state == :stop) ? 1 : 0
      else
        # agent usually succeeds in moving forward, but sometimes it ends up
        # moving left or right
        move = case action
               when '^' then [['^', 0.8], ['<', 0.1], ['>', 0.1]]
               when '>' then [['>', 0.8], ['^', 0.1], ['v', 0.1]]
               when 'v' then [['v', 0.8], ['<', 0.1], ['>', 0.1]]
               when '<' then [['<', 0.8], ['^', 0.1], ['v', 0.1]]
               end
        move.map {|m, pr|
          m_state = [state[0] + MOVES[m][0], state[1] + MOVES[m][1]]
          m_state = state unless states.member?(m_state) # stay in bounds
          pr if m_state == next_state
        }.compact.inject(:+) || 0
      end
    end

    # reward is given by the grid cells; zero reward for the :stop state
    def reward state, action, next_state
      state == :stop ? 0 : grid[state[0]][state[1]]
    end

    # helper for functions below
    def hash_to_grid hash
      0.upto(grid.size-1).map{|i| 0.upto(grid[i].size-1).map{|j| hash[[i,j]]}}
    end

    # print the values in a grid
    def pretty_value value
      hash_to_grid(Hash[value.map {|s, v| [s, "%+.3f" % v]}]).map{|row|
        row.map{|cell| cell || ' '}.join(' ')}
    end

    # print the policy using ASCII arrows
    def pretty_policy policy
      hash_to_grid(policy).map{|row| row.map{|cell|
        (cell.nil? || cell == :stop) ? ' ' : cell}.join(' ')}
    end
  end

  # the grid from Figures 17.1, 17.2(a) and 17.3
  model = AIMAGridModel.new(
    [[-0.04, -0.04, -0.04,    +1],
     [-0.04,   nil, -0.04,    -1],
     [-0.04, -0.04, -0.04, -0.04]],
    [[0, 3], [1, 3]]) # terminals (the +1 and -1 states)

  # sanity check: probabilities in a row must sum to 1
  model.check_transition_probabilities_sum

  solver = FiniteMDP::Solver.new(model, 1) # discount factor 1
  solver.value_iteration(1e-5, 100) #=> true if converged

  puts model.pretty_policy(solver.policy)
  # output: (matches Figure 17.2(a))
  # > > >
  # ^   ^
  # ^ < < <

  puts model.pretty_value(solver.value)
  # output: (matches Figure 17.3)
  # 0.812 0.868 0.918 1.000
  # 0.762       0.660 -1.000
  # 0.705 0.655 0.611 0.388

  FiniteMDP::TableModel.from_model(model)
  #=> [[0, 0], "v", [0, 0], 0.1, -0.04]
  #   [[0, 0], "v", [0, 1], 0.1, -0.04]
  #   [[0, 0], "v", [1, 0], 0.8, -0.04]
  #   [[0, 0], "<", [0, 0], 0.9, -0.04]
  #   [[0, 0], "<", [1, 0], 0.1, -0.04]
  #   [[0, 0], ">", [0, 0], 0.1, -0.04]
  #   [[0, 0], ">", [0, 1], 0.8, -0.04]
  #   [[0, 0], ">", [1, 0], 0.1, -0.04]
  #   ...
  #   [:stop, :stop, :stop, 1, 0]
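
A {FiniteMDP::HashModel} can be obtained in the same way; for example (an
illustrative sketch, with values taken from the output above):

  hash_model = FiniteMDP::HashModel.from_model(model)
  hash_model.hash[[0, 0]]['>'][[0, 1]] #=> [0.8, -0.04]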

Note that Python code for this model is also available from the book's authors
at http://aima.cs.berkeley.edu/python/mdp.html

== REQUIREMENTS

Tested on
* ruby 1.8.7 (2010-06-23 patchlevel 299) [i686-linux]
* ruby 1.9.2p0 (2010-08-18 revision 29036) [i686-linux]

== INSTALLATION

  gem install finite_mdp

== LICENSE

(The MIT License)

Copyright (c) 2011 John Lees-Miller

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
'Software'), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/lib/finite_mdp/hash_model.rb
ADDED
@@ -0,0 +1,123 @@
#
# A finite Markov decision process model for which the transition
# probabilities and rewards are specified using nested hash tables.
#
# The structure of the nested hash is as follows:
#   hash[:s]         #=> a Hash that maps actions to successor states
#   hash[:s][:a]     #=> a Hash from successor states to pairs (see next)
#   hash[:s][:a][:t] #=> an Array [probability, reward] for transition (s,a,t)
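#
# For example (an illustrative sketch), the recycling robot model from the
# README could be stored as:
#
#   hash = {
#     :high => {
#       :search => { :high => [0.1, 2],  :low => [0.9, 2] },
#       :wait   => { :high => [1, 1],    :low => [0, 1] } },
#     :low => {
#       :search   => { :high => [0.9, -3], :low => [0.1, 2] },
#       :wait     => { :high => [0, 1],    :low => [1, 1] },
#       :recharge => { :high => [1, 0],    :low => [0, 0] } } }
#   model = FiniteMDP::HashModel.new(hash)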
#
# The states and actions can be arbitrary objects; see notes for {Model}.
#
# The {TableModel} is an alternative way of storing these data.
#
class FiniteMDP::HashModel
  include FiniteMDP::Model

  #
  # @param [Hash<state, Hash<action, Hash<state, [Float, Float]>>>] hash see
  #        notes for {HashModel} for an explanation of this structure
  #
  def initialize hash
    @hash = hash
  end

  #
  # @return [Hash<state, Hash<action, Hash<state, [Float, Float]>>>] see notes
  #         for {HashModel} for an explanation of this structure
  #
  attr_accessor :hash

  #
  # States in this model; see {Model#states}.
  #
  # @return [Array<state>] not empty; no duplicate states
  #
  def states
    hash.keys
  end

  #
  # Actions that are valid for the given state; see {Model#actions}.
  #
  # @param [state] state
  #
  # @return [Array<action>] not empty; no duplicate actions
  #
  def actions state
    hash[state].keys
  end

  #
  # Possible successor states after taking the given action in the given state;
  # see {Model#next_states}.
  #
  # @param [state] state
  #
  # @param [action] action
  #
  # @return [Array<state>] not empty; no duplicate states
  #
  def next_states state, action
    hash[state][action].keys
  end

  #
  # Probability of the given transition; see {Model#transition_probability}.
  #
  # @param [state] state
  #
  # @param [action] action
  #
  # @param [state] next_state
  #
  # @return [Float] in [0, 1]; zero if the transition is not in the hash
  #
  def transition_probability state, action, next_state
    probability, reward = hash[state][action][next_state]
    probability || 0
  end

  #
  # Reward for a given transition; see {Model#reward}.
  #
  # @param [state] state
  #
  # @param [action] action
  #
  # @param [state] next_state
  #
  # @return [Float, nil] nil if the transition is not in the hash
  #
  def reward state, action, next_state
    probability, reward = hash[state][action][next_state]
    reward
  end

  #
  # Convert a generic model into a hash model.
  #
  # @param [Model] model
  #
  # @param [Boolean] sparse do not store entries for transitions with zero
  #        probability
  #
  # @return [HashModel] not nil
  #
  def self.from_model model, sparse=true
    hash = {}
    model.states.each do |state|
      hash[state] ||= {}
      model.actions(state).each do |action|
        hash[state][action] ||= {}
        model.next_states(state, action).each do |next_state|
          pr = model.transition_probability(state, action, next_state)
          hash[state][action][next_state] = [pr,
            model.reward(state, action, next_state)] if pr > 0 || !sparse
        end
      end
    end
    FiniteMDP::HashModel.new(hash)
  end
end
data/lib/finite_mdp/model.rb
ADDED
@@ -0,0 +1,195 @@
#
# Interface that defines a finite Markov decision process model.
#
# There are several approaches to describing the state, action, transition
# probability and reward data for use with this library.
#
# 1. Write the data directly into a {TableModel} or {HashModel}. This is
#    usually the way to go for small models, such as examples from textbooks.
#
# 1. Write a procedure that generates the data and stores them in a
#    {TableModel} or {HashModel}. This gives the most flexibility in how the
#    data are generated.
#
# 1. Write a class that implements the methods in this module. The methods in
#    this module are a fairly close approximation to the usual way of defining
#    an MDP mathematically, so it can be a useful way of structuring the
#    definition. It can then be converted to one of the other representations
#    (see {TableModel.from_model}) or passed directly to a {Solver}.
#
# The discussion below applies to all of these approaches.
#
# Note that there is no special treatment for terminal states, but they can be
# modeled by including a dummy state (a state with zero reward and one action
# that brings the process back to the dummy state with probability 1).
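#
# For example (illustrative only; this row also appears in the README's grid
# world output), such a dummy state can be encoded as a single TableModel row:
#
#   [:stop, :stop, :stop, 1, 0]  # state, action, next state, probability, reward
#
# that is, the only action in :stop returns to :stop with probability 1 and
# zero reward.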
#
# The states and actions can be arbitrary objects. The only requirement is that
# they support hashing and equality (in the sense of <tt>eql?</tt>), which all
# Ruby objects do. Built-in types, such as symbols, arrays and Structs, will
# work as expected. Note, however, that the default hashing and equality
# semantics for custom classes may not be what you want. The following example
# illustrates this:
#
#   class BadGridState
#     def initialize x, y
#       @x, @y = x, y
#     end
#     attr_accessor :x, :y
#   end
#
#   BadGridState.new(1, 1) == BadGridState.new(1, 2) #=> false
#   BadGridState.new(1, 1) == BadGridState.new(1, 1) #=> false (!!!)
#
# This is because, by default, hashing and equality are defined in terms of
# object identifiers, not the 'content' of the objects.
# The preferred solution is to define the state as a <tt>Struct</tt>:
#
#   GoodGridState = Struct.new(:x, :y)
#
#   GoodGridState.new(1, 1) == GoodGridState.new(1, 2) #=> false
#   GoodGridState.new(1, 1) == GoodGridState.new(1, 1) #=> true
#
# <tt>Struct</tt> is part of the Ruby standard library, and it implements
# hashing and equality based on object content rather than identity.
#
# Alternatively, if you cannot derive your state class from <tt>Struct</tt>, you
# can define your own hash code and equality check. An easy way to do this is to
# include the {VectorValued} mix-in. It is also notable that you can make the
# default semantics work; you just have to make sure that there is only one
# instance of your state class per state, as in the following example:
#
#   g11 = BadGridState.new(1, 1)
#   g12 = BadGridState.new(1, 2)
#   g21 = BadGridState.new(2, 1)
#   model = FiniteMDP::TableModel.new([
#     [g11, :up,    g12, 0.9, 0],
#     [g11, :up,    g21, 0.1, 0],
#     [g11, :right, g21, 0.9, 0],
#     # ...
#     ]) # this will work as expected
#
# Note that the {Solver} will convert the model to its own internal
# representation. The efficiency of the methods that define the model is
# important while the solver is building its internal representation, but it
# does not affect the performance of the iterative algorithm used after that.
# Also note that the solver handles state and action numbering internally, so it
# is not necessary to use numbers for the states.
#
module FiniteMDP::Model
  #
  # States in this model.
  #
  # @return [Array<state>] not empty; no duplicate states
  #
  # @abstract
  #
  def states
    raise NotImplementedError
  end

  #
  # Actions that are valid for the given state.
  #
  # All states must have at least one valid action; see notes for {Model}
  # regarding how to encode a terminal state.
  #
  # @param [state] state
  #
  # @return [Array<action>] not empty; no duplicate actions
  #
  # @abstract
  #
  def actions state
    raise NotImplementedError
  end

  #
  # Successor states after taking the given action in the given state. Note
  # that the returned states may occur with zero probability.
  #
  # The default behavior is to return all states as candidate successor states
  # and let {#transition_probability} determine which ones are possible. It can
  # be overridden in sparse models to avoid storing or computing lots of zeros.
  # Also note that {TableModel.from_model} and {HashModel.from_model} can be
  # told to ignore transitions with zero probability, and that the {Solver}
  # ignores them in its internal representation, so you can usually forget
  # about this method.
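  #
  # For example (an illustrative sketch, not part of the original interface),
  # the AIMAGridModel in the README could override this method to return only
  # the current square and its in-bounds neighbours:
  #
  #   def next_states state, action
  #     return [:stop] if state == :stop || terminii.member?(state)
  #     neighbors = MOVES.values.map {|di, dj| [state[0] + di, state[1] + dj]}
  #     ([state] + neighbors.select {|s| states.member?(s)}).uniq
  #   end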
  #
  # @param [state] state
  #
  # @param [action] action
  #
  # @return [Array<state>] not empty; no duplicate states
  #
  def next_states state, action
    states
  end

  #
  # Probability of the given transition.
  #
  # If the transition is not in the model, in the sense that it would never
  # arise from {#states}, {#actions} and {#next_states}, the result is
  # undefined. Note that {HashModel#transition_probability} and
  # {TableModel#transition_probability} return zero in this case, but this is
  # not part of the contract.
  #
  # @param [state] state
  #
  # @param [action] action
  #
  # @param [state] next_state
  #
  # @return [Float] in [0, 1]; undefined if the transition is not in the model
  #         (see notes above)
  #
  # @abstract
  #
  def transition_probability state, action, next_state
    raise NotImplementedError
  end

  #
  # Reward for a given transition.
  #
  # If the transition is not in the model, in the sense that it would never
  # arise from {#states}, {#actions} and {#next_states}, the result is
  # undefined. Note that {HashModel#reward} and {TableModel#reward} return
  # <tt>nil</tt> in this case, but this is not part of the contract.
  #
  # @param [state] state
  #
  # @param [action] action
  #
  # @param [state] next_state
  #
  # @return [Float, nil] nil only if the transition is not in the model (but
  #         the result is undefined in this case -- it need not be nil; see
  #         notes above)
  #
  # @abstract
  #
  def reward state, action, next_state
    raise NotImplementedError
  end

  #
  # Raise an error if the sum of the transition probabilities for any (state,
  # action) pair is not sufficiently close to 1.
  #
  # @param [Float] tol numerical tolerance
  #
  # @return [nil]
  #
  def check_transition_probabilities_sum tol=1e-6
    states.each do |state|
      actions(state).each do |action|
        pr = next_states(state, action).map{|next_state|
          transition_probability(state, action, next_state)}.inject(:+)
        raise "transition probabilities for state #{state.inspect} and
          action #{action.inspect} sum to #{pr}" if pr < 1 - tol
      end
    end
    nil
  end
end