test-mdp 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test_mdp-0.0.1/PKG-INFO +22 -0
- test_mdp-0.0.1/envs/__init__.py +1 -0
- test_mdp-0.0.1/envs/baird.py +125 -0
- test_mdp-0.0.1/envs/base_mdp.py +101 -0
- test_mdp-0.0.1/envs/env_utils.py +17 -0
- test_mdp-0.0.1/envs/random_walk_dependent.py +131 -0
- test_mdp-0.0.1/envs/random_walk_tabular.py +134 -0
- test_mdp-0.0.1/setup.cfg +4 -0
- test_mdp-0.0.1/setup.py +22 -0
- test_mdp-0.0.1/test_mdp.egg-info/PKG-INFO +22 -0
- test_mdp-0.0.1/test_mdp.egg-info/SOURCES.txt +13 -0
- test_mdp-0.0.1/test_mdp.egg-info/dependency_links.txt +1 -0
- test_mdp-0.0.1/test_mdp.egg-info/not-zip-safe +1 -0
- test_mdp-0.0.1/test_mdp.egg-info/requires.txt +1 -0
- test_mdp-0.0.1/test_mdp.egg-info/top_level.txt +1 -0
test_mdp-0.0.1/PKG-INFO
ADDED
@@ -0,0 +1,22 @@
Metadata-Version: 2.4
Name: test-mdp
Version: 0.0.1
Summary: test for policy evaluation examples
Home-page: https://github.com/limaries30/test-mdp
Author: teddylee777
Author-email: limaries30@kaist.ac.kr
Keywords: policy evaluation
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Requires-Python: >=3.6
Requires-Dist: tqdm
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: home-page
Dynamic: keywords
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary

test_mdp-0.0.1/envs/__init__.py
ADDED
@@ -0,0 +1 @@
__version__ = '0.0.1'

test_mdp-0.0.1/envs/baird.py
ADDED
@@ -0,0 +1,125 @@
'''
Baird's counterexample environment
'''
from typing import Tuple

import numpy as np
from nptyping import NDArray

from envs.base_mdp import BaseMDP
from utils.math_utils import rowwise_kron, compute_stationary_dist

class Baird(BaseMDP):

    NUM_STATES = 7
    NUM_ACTIONS = 2
    num_features = 8

    DASH = 0
    SOLID = 1

    def __init__(self, gamma:float=0.9):

        self.gamma = gamma
        self.current_state = None
        self.num_steps = 0

        self.phi = self.construct_feature()
        self.pi_beta, self.pi_target = self.construct_target_behavior_policy()
        self.transition_mat = self.construct_transition()

        self.p_pi = self.pi_target@self.transition_mat
        self.p_beta = self.pi_beta@self.transition_mat
        self.d_mu = np.diag(compute_stationary_dist(self.p_beta))

        self.rewards = self.construct_reward()
        self.expected_rewards = np.sum(self.p_pi*self.rewards, axis=1)

        self.proj = self.phi @ np.linalg.pinv(self.phi.T@self.d_mu@self.phi) @ self.phi.T @ self.d_mu

        self.sol = np.linalg.pinv(self.phi.T @ self.d_mu @ (np.eye(self.NUM_STATES) - self.gamma*self.p_pi) @ self.phi) @ (self.phi.T @ self.d_mu @ self.expected_rewards)

    def construct_reward(self)->NDArray:
        '''Construct reward matrix'''
        self.rewards = np.zeros((self.NUM_STATES, self.NUM_STATES))
        return self.rewards

    def construct_target_behavior_policy(self)->Tuple[NDArray, NDArray]:
        '''Construct target and behavior policy'''
        self.target_policy = np.array([0, 1])
        self.behavior_policy = np.array([1/6, 5/6])
        self.target_policy = np.tile(self.target_policy, (self.NUM_STATES, 1))
        self.behavior_policy = np.tile(self.behavior_policy, (self.NUM_STATES, 1))
        self.pi_begta = rowwise_kron(np.eye(self.NUM_STATES), self.behavior_policy)
        self.pi_target = rowwise_kron(np.eye(self.NUM_STATES), self.target_policy)

        return self.pi_begta, self.pi_target

    def construct_transition(self)->NDArray:
        '''Construct transition matrix size of SA times S'''
        self.transition_mat = np.zeros((self.NUM_STATES*self.NUM_ACTIONS, self.NUM_STATES))
        for s in range(0, self.NUM_STATES):
            self.transition_mat[s*self.NUM_ACTIONS + self.SOLID, -1] = 1.0
            self.transition_mat[s*self.NUM_ACTIONS + self.DASH, :-1] = 1/6
        return self.transition_mat

    def construct_feature(self)->NDArray:
        '''Construct baird feature matrix'''
        self.phi = np.zeros((self.NUM_STATES, self.num_features))
        for s in range(self.NUM_STATES):
            self.phi[s, s] = 2
            self.phi[s, -1] = 1
        self.phi[-1, -1] = 2
        self.phi[-1, -2] = 1
        return self.phi

    def reset(self)->Tuple[NDArray, dict]:
        '''Return initial state'''

        state = np.random.randint(0, self.NUM_STATES)
        self.num_steps = 0

        current_phi = self.phi[state, :]
        self.current_state = state

        info = {'rho': 0}  # null value

        return state, current_phi, info

    def sample_action(self, state=None)->int:
        '''Select action according to behavior policy'''

        action = np.random.choice(np.arange(self.NUM_ACTIONS), 1, p=self.behavior_policy[state, :])[0]
        return action

    def step(self, state:int, action:int)->Tuple[NDArray, float, bool, bool, dict]:
        '''Take action, return next state, reward, done, truncated, info'''
        done = False
        truncated = False
        self.num_steps += 1

        next_state = np.random.choice(np.arange(self.NUM_STATES), p=self.transition_mat[state*self.NUM_ACTIONS+action, :])
        next_phi = self.phi[next_state, :]
        reward = self.rewards[state, next_state]

        info = {'rho': self.target_policy[state, action]/self.behavior_policy[state, action]}

        self.current_state = next_state
        return next_state, next_phi, reward, done, truncated, info

    def get_bellman_error(self, weight: np.ndarray) -> float:
        be = self.proj @ (self.expected_rewards + self.gamma*self.p_pi @ self.phi @ weight - self.phi @ weight)
        return np.linalg.norm(np.sqrt(self.d_mu)@be)

    def get_error(self, weight)->float:
        error = np.linalg.norm(weight - self.sol)
        return error
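
The following is a minimal usage sketch, not part of the package: an off-policy linear TD(0) loop driving the Baird environment API defined above. It assumes the utils.math_utils helpers imported by envs/baird.py (rowwise_kron, compute_stationary_dist) are available on the import path, since they are not shipped under the envs package in this archive; the step size, weight initialization, and step count are arbitrary.

import numpy as np
from envs.baird import Baird

env = Baird(gamma=0.9)                 # assumes utils.math_utils is importable
state, phi, info = env.reset()
w = np.ones(env.num_features)          # arbitrary initial weights
alpha = 0.01                           # arbitrary step size
for _ in range(1000):
    action = env.sample_action(state)
    next_state, next_phi, reward, done, truncated, info = env.step(state, action)
    rho = info['rho']                  # importance sampling ratio pi(a|s) / beta(a|s)
    delta = reward + env.gamma * (next_phi @ w) - phi @ w
    w = w + alpha * rho * delta * phi  # off-policy linear TD(0) update
    state, phi = next_state, next_phi
print(env.get_error(w))                # distance to the precomputed fixed point env.sol

On Baird's counterexample this kind of off-policy TD(0) update is known to be unstable, so the reported error may grow rather than shrink; exhibiting that behavior is the point of the environment.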

test_mdp-0.0.1/envs/base_mdp.py
ADDED
@@ -0,0 +1,101 @@
'''
Abstract Class for MDP
'''

from abc import ABC, abstractmethod
from typing import Tuple, Dict, Any

import numpy as np
from nptyping import NDArray


class BaseMDP(ABC):
    '''
    Base class for Markov Decision Process (MDP) environments.

    Subclasses must implement the following methods:
    - reset()
    - step()
    - get_error()
    '''

    @abstractmethod
    def construct_feature(self) -> NDArray:
        '''
        Construct the feature matrix for the MDP.

        Returns:
            NDArray: The feature matrix.
        '''

    @abstractmethod
    def construct_transition(self) -> NDArray:
        '''
        Construct the transition matrix for the MDP.

        Returns:
            NDArray: The transition matrix.
        '''

    @abstractmethod
    def construct_target_behavior_policy(self) -> Tuple[NDArray, NDArray]:
        '''
        Construct the behavior and target policy matrices.

        Returns:
            Tuple[NDArray, NDArray]: The behavior and target policy matrices.
        '''

    @abstractmethod
    def reset(self) -> Tuple[int, NDArray, dict]:
        '''
        Resets the environment to an initial state.

        Returns:
            Tuple: (state, feature, info)
                - state: The initial state (int).
                - feature: The feature representation of the initial state (NDArray).
                - info: Additional information (dict).
        '''

    @abstractmethod
    def step(self, action: int) -> Tuple[NDArray, float, bool, bool, Dict[str, Any]]:
        '''
        Takes an action and returns the next state, reward, and other information.

        Args:
            action (int): The action taken by the agent.

        Returns:
            Tuple: (next_state, reward, done, truncated, info)
                - next_state: State after the action (NDArray).
                - reward: Reward after the action (float).
                - done: Boolean indicating whether the episode has ended.
                - truncated: Boolean indicating if the episode was truncated.
                - info: Additional information (e.g., importance sampling ratio).
        '''

    @abstractmethod
    def get_error(self, weight: np.ndarray) -> float:
        '''
        Calculate the error (e.g., in value function approximation).

        Args:
            weight (np.ndarray): The weight parameter (e.g., for value function).

        Returns:
            float: The error based on the weight.
        '''

    @abstractmethod
    def sample_action(self, state: int) -> int:
        '''
        Select an action based on the current state.

        Args:
            state (int): The current state.

        Returns:
            int: The action selected by the agent.
        '''

test_mdp-0.0.1/envs/env_utils.py
ADDED
@@ -0,0 +1,17 @@
'''
utils related to environments
'''


def get_env(name:str, **kwargs):
    if name == "Baird":
        from envs.baird import Baird
        return Baird(kwargs.get('gamma', 0.9))
    elif name == "RandomWalkTabular":
        from envs.random_walk_tabular import RandomWalkTabular
        return RandomWalkTabular(gamma=kwargs.get('gamma', 0.9))
    elif name == "RandomWalkDependent":
        from envs.random_walk_dependent import RandomWalkDependent
        return RandomWalkDependent(gamma=kwargs.get('gamma', 0.9))
    else:
        raise NotImplementedError(f"Environment {name} not implemented")
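
A short sketch, not part of the package, of how this factory might be called; the environment name and gamma value are illustrative, and the same caveat about the external utils.math_utils dependency applies.

from envs.env_utils import get_env

env = get_env("RandomWalkTabular", gamma=0.95)   # also accepts "Baird" and "RandomWalkDependent"
state, phi, info = env.reset()
action = env.sample_action(state)
next_state, next_phi, reward, done, truncated, info = env.step(state, action)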

test_mdp-0.0.1/envs/random_walk_dependent.py
ADDED
@@ -0,0 +1,131 @@
from envs.base_mdp import BaseMDP
import numpy as np
from nptyping import NDArray
from typing import Tuple
import copy
from utils.math_utils import rowwise_kron, compute_stationary_dist

class RandomWalkDependent(BaseMDP):

    NUM_STATES = 7
    NUM_ACTIONS = 2
    num_features = 3

    LEFT = 0
    RIGHT = 1

    END_0 = 0
    END_1 = NUM_STATES-1
    START = int((NUM_STATES-1)/2)

    def __init__(self, gamma):

        self.reward = 0
        self.current_state = None
        self.num_steps = 0
        self.phi = self.construct_feature()
        # Define target and behavior policy
        # Target policy: go right with prob 0.6, left with prob 0.4

        self.target_policy = np.array([0.4, 0.6])
        self.behavior_policy = np.array([0.5, 0.5])
        self.target_policy = np.tile(self.target_policy, (self.NUM_STATES, 1))
        self.behavior_policy = np.tile(self.behavior_policy, (self.NUM_STATES, 1))

        self.pi_beta, self.pi_target = self.construct_target_behavior_policy()
        self.transition_mat = self.construct_transition()

        self.p_pi = self.pi_target@self.transition_mat
        self.p_beta = self.pi_beta@self.transition_mat
        self.d_mu = np.diag(compute_stationary_dist(self.p_beta))

        self.gamma = gamma
        self.rewards = self.construct_reward()
        self.expected_rewards = np.sum(self.p_pi*self.rewards, axis=1)

        self.proj = self.phi @ np.linalg.pinv(self.phi.T@self.d_mu@self.phi) @ self.phi.T @ self.d_mu

        self.sol = np.linalg.pinv(self.phi.T @ self.d_mu @ (np.eye(self.NUM_STATES) - self.gamma*self.p_pi) @ self.phi) @ (self.phi.T @ self.d_mu @ self.expected_rewards)

    def construct_reward(self):
        self.rewards = np.zeros((self.NUM_STATES, self.NUM_STATES))
        self.rewards[self.END_0+1, self.END_0] = -1.0
        self.rewards[self.END_1-1, self.END_1] = 1.0
        return self.rewards

    def construct_feature(self):
        self.phi = np.array([[1, 0, 0], [1, 0, 0], [1/np.sqrt(2), 1/np.sqrt(2), 0],
                             [1/np.sqrt(3), 1/np.sqrt(3), 1/np.sqrt(3)],
                             [0, 1/np.sqrt(2), 1/np.sqrt(2)], [0, 0, 1], [0, 0, 1]])
        return self.phi

    def construct_target_behavior_policy(self)->Tuple[NDArray, NDArray]:

        self.target_policy = np.array([0.4, 0.6])
        self.behavior_policy = np.array([0.5, 0.5])
        self.target_policy = np.tile(self.target_policy, (self.NUM_STATES, 1))
        self.behavior_policy = np.tile(self.behavior_policy, (self.NUM_STATES, 1))

        self.pi_beta = rowwise_kron(np.eye(self.NUM_STATES), self.behavior_policy)
        self.pi_target = rowwise_kron(np.eye(self.NUM_STATES), self.target_policy)

        return self.pi_beta, self.pi_target

    def construct_transition(self):

        self.transition_mat = np.zeros((self.NUM_STATES*self.NUM_ACTIONS, self.NUM_STATES))
        for s in range(1, self.NUM_STATES-1):
            self.transition_mat[s*self.NUM_ACTIONS + self.LEFT, s-1] = 1.0
            self.transition_mat[s*self.NUM_ACTIONS + self.RIGHT, s+1] = 1.0
        self.transition_mat[0, self.START] = 1.0
        self.transition_mat[1, self.START] = 1.0
        self.transition_mat[(self.NUM_STATES-1)*self.NUM_ACTIONS + self.LEFT, self.START] = 1.0
        self.transition_mat[(self.NUM_STATES-1)*self.NUM_ACTIONS + self.RIGHT, self.START] = 1.0

        return self.transition_mat

    def reset(self)->Tuple[NDArray, dict]:
        '''Return initial state'''

        state = np.random.randint(0, self.NUM_STATES)
        self.num_steps = 0

        current_phi = self.phi[state, :]

        info = {'rho': 0}

        return state, current_phi, info

    def sample_action(self, state=None)->int:
        '''Select action according to behavior policy'''

        action = np.random.choice(np.arange(self.NUM_ACTIONS), 1, p=self.behavior_policy[state, :])[0]
        return action

    def step(self, state:int, action:int)->Tuple[NDArray, float, bool, bool, dict]:

        done = False
        truncated = False
        self.num_steps += 1

        next_state = np.random.choice(np.arange(self.NUM_STATES), p=self.transition_mat[state*self.NUM_ACTIONS+action, :])
        next_phi = self.phi[next_state, :]
        reward = self.rewards[state, next_state]

        info = {'rho': self.target_policy[state, action]/self.behavior_policy[state, action]}
        self.current_state = next_state
        return next_state, next_phi, reward, done, truncated, info

    def get_bellman_error(self, weight: np.ndarray) -> float:
        be = self.proj @ (self.expected_rewards + self.gamma*self.p_pi @ self.phi @ weight - self.phi @ weight)
        return np.linalg.norm(np.sqrt(self.d_mu)@be)

    def get_error(self, weight)->float:
        error = np.linalg.norm(weight - self.sol)
        return error
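
As a hedged illustration, not part of the package, of how get_bellman_error relates to the precomputed solution self.sol: at the TD fixed point the projected Bellman error should be numerically close to zero, while a generic weight vector gives a nonzero value. This assumes the matrix inverted for self.sol is nonsingular and that utils.math_utils behaves as its imported names suggest.

import numpy as np
from envs.random_walk_dependent import RandomWalkDependent

env = RandomWalkDependent(gamma=0.9)
print(env.get_bellman_error(env.sol))                     # ~0: sol solves the projected Bellman equation
print(env.get_bellman_error(np.zeros(env.num_features)))  # generally nonzero for arbitrary weights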

test_mdp-0.0.1/envs/random_walk_tabular.py
ADDED
@@ -0,0 +1,134 @@
import numpy as np
from nptyping import NDArray
from typing import Tuple

from envs.base_mdp import BaseMDP
from utils.math_utils import rowwise_kron, compute_stationary_dist

class RandomWalkTabular(BaseMDP):

    NUM_STATES = 7
    NUM_ACTIONS = 2
    num_features = 7

    LEFT = 0
    RIGHT = 1

    END_0 = 0
    END_1 = NUM_STATES-1
    START = int((NUM_STATES-1)/2)

    def __init__(self, gamma:float):

        self.gamma = gamma
        self.reward = 0
        self.current_state = None
        self.num_steps = 0
        self.phi = self.construct_feature()

        self.target_policy = np.array([0.4, 0.6])
        self.behavior_policy = np.array([0.5, 0.5])
        self.target_policy = np.tile(self.target_policy, (self.NUM_STATES, 1))
        self.behavior_policy = np.tile(self.behavior_policy, (self.NUM_STATES, 1))

        self.pi_beta, self.pi_target = self.construct_target_behavior_policy()
        self.transition_mat = self.construct_transition()

        self.p_pi = self.pi_target@self.transition_mat
        self.p_beta = self.pi_beta@self.transition_mat
        self.d_mu = np.diag(compute_stationary_dist(self.p_beta))

        self.rewards = self.construct_reward()
        self.expected_rewards = np.sum(self.p_pi*self.rewards, axis=1)

        self.proj = self.phi @ np.linalg.pinv(self.phi.T@self.d_mu@self.phi) @ self.phi.T @ self.d_mu

        self.sol = np.linalg.pinv(self.phi.T @ self.d_mu @ (np.eye(self.NUM_STATES) - self.gamma*self.p_pi) @ self.phi) @ (self.phi.T @ self.d_mu @ self.expected_rewards)

    def construct_feature(self):
        self.phi = np.eye(self.NUM_STATES)
        return self.phi

    def construct_target_behavior_policy(self)->Tuple[NDArray, NDArray]:

        self.target_policy = np.array([0.4, 0.6])
        self.behavior_policy = np.array([0.5, 0.5])
        self.target_policy = np.tile(self.target_policy, (self.NUM_STATES, 1))
        self.behavior_policy = np.tile(self.behavior_policy, (self.NUM_STATES, 1))

        self.pi_beta = rowwise_kron(np.eye(self.NUM_STATES), self.behavior_policy)
        self.pi_target = rowwise_kron(np.eye(self.NUM_STATES), self.target_policy)

        return self.pi_beta, self.pi_target

    def construct_transition(self):

        self.transition_mat = np.zeros((self.NUM_STATES*self.NUM_ACTIONS, self.NUM_STATES))
        for s in range(1, self.NUM_STATES-1):
            self.transition_mat[s*self.NUM_ACTIONS + self.LEFT, s-1] = 1.0
            self.transition_mat[s*self.NUM_ACTIONS + self.RIGHT, s+1] = 1.0
        self.transition_mat[0, self.START] = 1.0
        self.transition_mat[1, self.START] = 1.0
        self.transition_mat[(self.NUM_STATES-1)*self.NUM_ACTIONS + self.LEFT, self.START] = 1.0
        self.transition_mat[(self.NUM_STATES-1)*self.NUM_ACTIONS + self.RIGHT, self.START] = 1.0

        return self.transition_mat

    def construct_reward(self):
        self.rewards = np.zeros((self.NUM_STATES, self.NUM_STATES))
        self.rewards[self.END_0+1, self.END_0] = -1.0
        self.rewards[self.END_1-1, self.END_1] = 1.0
        self.expected_rewards = np.sum(self.p_pi*self.rewards, axis=1)
        return self.rewards

    def reset(self)->Tuple[NDArray, dict]:
        '''Return initial state'''

        state = np.random.randint(0, self.NUM_STATES)
        self.num_steps = 0

        current_phi = self.phi[state, :]

        info = {'rho': 1}

        self.current_state = state
        self.current_phi = current_phi

        return state, current_phi, info

    def sample_action(self, state=None)->int:
        '''Select action according to behavior policy'''

        action = np.random.choice(np.arange(self.NUM_ACTIONS), 1, p=self.behavior_policy[state, :])[0]
        return action

    def step(self, state:int, action:int)->Tuple[NDArray, float, bool, bool, dict]:

        done = False
        truncated = False
        self.num_steps += 1

        next_state = np.random.choice(np.arange(self.NUM_STATES),
                                      p=self.transition_mat[state*self.NUM_ACTIONS+action, :])
        next_phi = self.phi[next_state, :]
        reward = self.rewards[state, next_state]

        info = {'rho': self.target_policy[state, action]/self.behavior_policy[state, action]}
        self.current_state = next_state
        return next_state, next_phi, reward, done, truncated, info

    def get_bellman_error(self, weight: np.ndarray) -> float:
        be = self.proj @ (self.expected_rewards + self.gamma*self.p_pi @ self.phi @ weight - self.phi @ weight)
        return np.linalg.norm(np.sqrt(self.d_mu)@be)

    def get_error(self, weight)->float:
        error = np.linalg.norm(weight - self.sol)
        return error
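
Because this environment uses identity (tabular) features, the precomputed self.sol should coincide with the true value function of the target policy, v = (I - gamma * P_pi)^{-1} r_pi. The check below is a sketch, not part of the package, and carries the same utils.math_utils assumption noted earlier.

import numpy as np
from envs.random_walk_tabular import RandomWalkTabular

env = RandomWalkTabular(gamma=0.9)
# With phi = I, the projected TD fixed point equals the true value function.
v_true = np.linalg.solve(np.eye(env.NUM_STATES) - env.gamma * env.p_pi, env.expected_rewards)
print(np.allclose(env.sol, v_true))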
test_mdp-0.0.1/setup.cfg
ADDED
test_mdp-0.0.1/setup.py
ADDED
@@ -0,0 +1,22 @@
from setuptools import setup, find_packages

setup(
    name='test-mdp',
    version='0.0.1',
    description='test for policy evaluation examples',
    author='teddylee777',
    author_email='limaries30@kaist.ac.kr',
    url='https://github.com/limaries30/test-mdp',
    install_requires=['tqdm',],
    packages=find_packages(exclude=[]),
    keywords=['policy evaluation'],
    python_requires='>=3.6',
    package_data={},
    zip_safe=False,
    classifiers=[
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
    ],
)

test_mdp-0.0.1/test_mdp.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,22 @@
Metadata-Version: 2.4
Name: test-mdp
Version: 0.0.1
Summary: test for policy evaluation examples
Home-page: https://github.com/limaries30/test-mdp
Author: teddylee777
Author-email: limaries30@kaist.ac.kr
Keywords: policy evaluation
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Requires-Python: >=3.6
Requires-Dist: tqdm
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: home-page
Dynamic: keywords
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary

test_mdp-0.0.1/test_mdp.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,13 @@
setup.py
envs/__init__.py
envs/baird.py
envs/base_mdp.py
envs/env_utils.py
envs/random_walk_dependent.py
envs/random_walk_tabular.py
test_mdp.egg-info/PKG-INFO
test_mdp.egg-info/SOURCES.txt
test_mdp.egg-info/dependency_links.txt
test_mdp.egg-info/not-zip-safe
test_mdp.egg-info/requires.txt
test_mdp.egg-info/top_level.txt

test_mdp-0.0.1/test_mdp.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@


test_mdp-0.0.1/test_mdp.egg-info/not-zip-safe
ADDED
@@ -0,0 +1 @@


test_mdp-0.0.1/test_mdp.egg-info/requires.txt
ADDED
@@ -0,0 +1 @@
tqdm

test_mdp-0.0.1/test_mdp.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
envs