test-mdp 0.0.1__tar.gz

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in those public registries.
@@ -0,0 +1,22 @@
+ Metadata-Version: 2.4
+ Name: test-mdp
+ Version: 0.0.1
+ Summary: test for policy evaluation examples
+ Home-page: https://github.com/limaries30/test-mdp
+ Author: teddylee777
+ Author-email: limaries30@kaist.ac.kr
+ Keywords: policy evaluation
+ Classifier: Programming Language :: Python :: 3.6
+ Classifier: Programming Language :: Python :: 3.7
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Requires-Python: >=3.6
+ Requires-Dist: tqdm
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
@@ -0,0 +1 @@
+ __version__ = '0.0.1'
@@ -0,0 +1,125 @@
+ '''
+ Baird's counterexample environment
+ '''
+ from typing import Tuple
+
+ import numpy as np
+ from nptyping import NDArray
+
+ from envs.base_mdp import BaseMDP
+ from utils.math_utils import rowwise_kron, compute_stationary_dist
+
+
+ class Baird(BaseMDP):
+
+     NUM_STATES = 7
+     NUM_ACTIONS = 2
+     num_features = 8
+
+     DASH = 0
+     SOLID = 1
+
+     def __init__(self, gamma: float = 0.9):
+         self.gamma = gamma
+         self.current_state = None
+         self.num_steps = 0
+
+         self.phi = self.construct_feature()
+         self.pi_beta, self.pi_target = self.construct_target_behavior_policy()
+         self.transition_mat = self.construct_transition()
+
+         self.p_pi = self.pi_target @ self.transition_mat
+         self.p_beta = self.pi_beta @ self.transition_mat
+         self.d_mu = np.diag(compute_stationary_dist(self.p_beta))
+
+         self.rewards = self.construct_reward()
+         self.expected_rewards = np.sum(self.p_pi * self.rewards, axis=1)
+
+         self.proj = self.phi @ np.linalg.pinv(self.phi.T @ self.d_mu @ self.phi) @ self.phi.T @ self.d_mu
+
+         self.sol = np.linalg.pinv(self.phi.T @ self.d_mu @ (np.eye(self.NUM_STATES) - self.gamma * self.p_pi) @ self.phi) @ (self.phi.T @ self.d_mu @ self.expected_rewards)
+
+     def construct_reward(self) -> NDArray:
+         '''Construct reward matrix'''
+         self.rewards = np.zeros((self.NUM_STATES, self.NUM_STATES))
+         return self.rewards
+
+     def construct_target_behavior_policy(self) -> Tuple[NDArray, NDArray]:
+         '''Construct behavior and target policies'''
+         self.target_policy = np.array([0, 1])
+         self.behavior_policy = np.array([1/6, 5/6])
+         self.target_policy = np.tile(self.target_policy, (self.NUM_STATES, 1))
+         self.behavior_policy = np.tile(self.behavior_policy, (self.NUM_STATES, 1))
+         self.pi_beta = rowwise_kron(np.eye(self.NUM_STATES), self.behavior_policy)
+         self.pi_target = rowwise_kron(np.eye(self.NUM_STATES), self.target_policy)
+
+         return self.pi_beta, self.pi_target
+
+     def construct_transition(self) -> NDArray:
+         '''Construct transition matrix of size SA x S'''
+         self.transition_mat = np.zeros((self.NUM_STATES * self.NUM_ACTIONS, self.NUM_STATES))
+         for s in range(self.NUM_STATES):
+             self.transition_mat[s * self.NUM_ACTIONS + self.SOLID, -1] = 1.0
+             self.transition_mat[s * self.NUM_ACTIONS + self.DASH, :-1] = 1/6
+         return self.transition_mat
+
+     def construct_feature(self) -> NDArray:
+         '''Construct Baird feature matrix'''
+         self.phi = np.zeros((self.NUM_STATES, self.num_features))
+         for s in range(self.NUM_STATES):
+             self.phi[s, s] = 2
+             self.phi[s, -1] = 1
+         self.phi[-1, -1] = 2
+         self.phi[-1, -2] = 1
+         return self.phi
+
+     def reset(self) -> Tuple[int, NDArray, dict]:
+         '''Return initial state'''
+         state = np.random.randint(0, self.NUM_STATES)
+         self.num_steps = 0
+
+         current_phi = self.phi[state, :]
+         self.current_state = state
+
+         info = {'rho': 0}  # null value
+
+         return state, current_phi, info
+
+     def sample_action(self, state=None) -> int:
+         '''Select action according to behavior policy'''
+         action = np.random.choice(np.arange(self.NUM_ACTIONS), 1, p=self.behavior_policy[state, :])[0]
+         return action
+
+     def step(self, state: int, action: int) -> Tuple[int, NDArray, float, bool, bool, dict]:
+         '''Take action, return next state, next feature, reward, done, truncated, info'''
+         done = False
+         truncated = False
+         self.num_steps += 1
+
+         next_state = np.random.choice(np.arange(self.NUM_STATES), p=self.transition_mat[state * self.NUM_ACTIONS + action, :])
+         next_phi = self.phi[next_state, :]
+         reward = self.rewards[state, next_state]
+
+         info = {'rho': self.target_policy[state, action] / self.behavior_policy[state, action]}
+
+         self.current_state = next_state
+         return next_state, next_phi, reward, done, truncated, info
+
+     def get_bellman_error(self, weight: np.ndarray) -> float:
+         be = self.proj @ (self.expected_rewards + self.gamma * self.p_pi @ self.phi @ weight - self.phi @ weight)
+         return np.linalg.norm(np.sqrt(self.d_mu) @ be)
+
+     def get_error(self, weight) -> float:
+         error = np.linalg.norm(weight - self.sol)
+         return error
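For reference, the matrices built in envs/baird.py above (and reused verbatim in the two random-walk environments later in this diff) are the standard objects of linear off-policy evaluation. Reading the code back into notation, with feature matrix \Phi, behavior stationary distribution D_\mu, target transition matrix P_\pi, discount \gamma, and expected reward vector r_\pi:

\[
\Pi = \Phi \left(\Phi^{\top} D_\mu \Phi\right)^{\dagger} \Phi^{\top} D_\mu,
\qquad
\theta^{*} = \left(\Phi^{\top} D_\mu (I - \gamma P_\pi) \Phi\right)^{\dagger} \Phi^{\top} D_\mu r_\pi,
\]
\[
\texttt{get\_bellman\_error}(\theta) = \left\lVert \Pi \left(r_\pi + \gamma P_\pi \Phi \theta - \Phi \theta\right) \right\rVert_{D_\mu},
\qquad
\lVert x \rVert_{D} = \sqrt{x^{\top} D x}.
\]

Here \dagger is the Moore-Penrose pseudoinverse (np.linalg.pinv); it matters for Baird because \Phi is 7 x 8, so \Phi^{\top} D_\mu \Phi is singular. In the code, self.proj is \Pi, self.sol is \theta^{*}, and get_error returns the Euclidean distance to \theta^{*}.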
@@ -0,0 +1,101 @@
+ '''
+ Abstract Class for MDP
+ '''
+
+ from abc import ABC, abstractmethod
+ from typing import Tuple, Dict, Any
+
+ import numpy as np
+ from nptyping import NDArray
+
+
+ class BaseMDP(ABC):
+     '''
+     Base class for Markov Decision Process (MDP) environments.
+
+     Subclasses must implement the following methods:
+     - construct_feature()
+     - construct_transition()
+     - construct_target_behavior_policy()
+     - reset()
+     - step()
+     - sample_action()
+     - get_error()
+     '''
+
+     @abstractmethod
+     def construct_feature(self) -> NDArray:
+         '''
+         Construct the feature matrix for the MDP.
+
+         Returns:
+             NDArray: The feature matrix.
+         '''
+
+     @abstractmethod
+     def construct_transition(self) -> NDArray:
+         '''
+         Construct the transition matrix for the MDP.
+
+         Returns:
+             NDArray: The transition matrix.
+         '''
+
+     @abstractmethod
+     def construct_target_behavior_policy(self) -> Tuple[NDArray, NDArray]:
+         '''
+         Construct the behavior and target policy matrices.
+
+         Returns:
+             Tuple[NDArray, NDArray]: The behavior and target policy matrices.
+         '''
+
+     @abstractmethod
+     def reset(self) -> Tuple[int, NDArray, dict]:
+         '''
+         Resets the environment to an initial state.
+
+         Returns:
+             Tuple: (state, feature, info)
+                 - state: The initial state (int).
+                 - feature: The feature representation of the initial state (NDArray).
+                 - info: Additional information (dict).
+         '''
+
+     @abstractmethod
+     def step(self, state: int, action: int) -> Tuple[int, NDArray, float, bool, bool, Dict[str, Any]]:
+         '''
+         Takes an action in the given state and returns the next state, reward, and other information.
+
+         Args:
+             state (int): The current state.
+             action (int): The action taken by the agent.
+
+         Returns:
+             Tuple: (next_state, next_phi, reward, done, truncated, info)
+                 - next_state: State after the action (int).
+                 - next_phi: Feature representation of the next state (NDArray).
+                 - reward: Reward after the action (float).
+                 - done: Boolean indicating whether the episode has ended.
+                 - truncated: Boolean indicating if the episode was truncated.
+                 - info: Additional information (e.g., importance sampling ratio).
+         '''
+
+     @abstractmethod
+     def get_error(self, weight: np.ndarray) -> float:
+         '''
+         Calculate the error (e.g., in value function approximation).
+
+         Args:
+             weight (np.ndarray): The weight parameter (e.g., for value function).
+
+         Returns:
+             float: The error based on the weight.
+         '''
+
+     @abstractmethod
+     def sample_action(self, state: int) -> int:
+         '''
+         Select an action based on the current state.
+
+         Args:
+             state (int): The current state.
+
+         Returns:
+             int: The action selected by the agent.
+         '''
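The interface above is enough to drive a standard semi-gradient off-policy TD(0) evaluation loop. The sketch below is illustrative rather than part of the package: the step size, horizon, and weight initialization are assumed values, and on Baird's counterexample the weights are expected to grow rather than converge, which is exactly what that environment is meant to demonstrate.

import numpy as np
from envs.baird import Baird

env = Baird(gamma=0.9)
theta = np.ones(env.num_features)   # linear value-function weights (assumed initialization)
alpha = 0.01                        # step size (assumed)

state, phi, _ = env.reset()
for _ in range(2000):
    action = env.sample_action(state)
    next_state, next_phi, reward, done, truncated, info = env.step(state, action)
    rho = info['rho']               # importance-sampling ratio pi(a|s) / beta(a|s)
    td_error = reward + env.gamma * next_phi @ theta - phi @ theta
    theta = theta + alpha * rho * td_error * phi
    state, phi = next_state, next_phi

print(env.get_error(theta), env.get_bellman_error(theta))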
@@ -0,0 +1,17 @@
+ '''
+ utils related to environments
+ '''
+
+
+ def get_env(name: str, **kwargs):
+     if name == "Baird":
+         from envs.baird import Baird
+         return Baird(kwargs.get('gamma', 0.9))
+     elif name == "RandomWalkTabular":
+         from envs.random_walk_tabular import RandomWalkTabular
+         return RandomWalkTabular(gamma=kwargs.get('gamma', 0.9))
+     elif name == "RandomWalkDependent":
+         from envs.random_walk_dependent import RandomWalkDependent
+         return RandomWalkDependent(gamma=kwargs.get('gamma', 0.9))
+     else:
+         raise NotImplementedError(f"Environment {name} not implemented")
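The factory above lets experiment scripts construct an environment from its name and a discount factor, for example:

from envs.env_utils import get_env

env = get_env("Baird", gamma=0.9)   # or "RandomWalkTabular", "RandomWalkDependent"
state, phi, info = env.reset()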
@@ -0,0 +1,131 @@
+ from typing import Tuple
+
+ import numpy as np
+ from nptyping import NDArray
+
+ from envs.base_mdp import BaseMDP
+ from utils.math_utils import rowwise_kron, compute_stationary_dist
+
+
+ class RandomWalkDependent(BaseMDP):
+
+     NUM_STATES = 7
+     NUM_ACTIONS = 2
+     num_features = 3
+
+     LEFT = 0
+     RIGHT = 1
+
+     END_0 = 0
+     END_1 = NUM_STATES - 1
+     START = int((NUM_STATES - 1) / 2)
+
+     def __init__(self, gamma: float):
+         self.reward = 0
+         self.current_state = None
+         self.num_steps = 0
+         self.phi = self.construct_feature()
+
+         # Define target and behavior policy.
+         # Target policy: go right with prob 0.6, left with prob 0.4; behavior policy is uniform.
+         self.target_policy = np.array([0.4, 0.6])
+         self.behavior_policy = np.array([0.5, 0.5])
+         self.target_policy = np.tile(self.target_policy, (self.NUM_STATES, 1))
+         self.behavior_policy = np.tile(self.behavior_policy, (self.NUM_STATES, 1))
+
+         self.pi_beta, self.pi_target = self.construct_target_behavior_policy()
+         self.transition_mat = self.construct_transition()
+
+         self.p_pi = self.pi_target @ self.transition_mat
+         self.p_beta = self.pi_beta @ self.transition_mat
+         self.d_mu = np.diag(compute_stationary_dist(self.p_beta))
+
+         self.gamma = gamma
+         self.rewards = self.construct_reward()
+         self.expected_rewards = np.sum(self.p_pi * self.rewards, axis=1)
+
+         self.proj = self.phi @ np.linalg.pinv(self.phi.T @ self.d_mu @ self.phi) @ self.phi.T @ self.d_mu
+
+         self.sol = np.linalg.pinv(self.phi.T @ self.d_mu @ (np.eye(self.NUM_STATES) - self.gamma * self.p_pi) @ self.phi) @ (self.phi.T @ self.d_mu @ self.expected_rewards)
+
+     def construct_reward(self):
+         self.rewards = np.zeros((self.NUM_STATES, self.NUM_STATES))
+         self.rewards[self.END_0 + 1, self.END_0] = -1.0
+         self.rewards[self.END_1 - 1, self.END_1] = 1.0
+         return self.rewards
+
+     def construct_feature(self):
+         self.phi = np.array([[1, 0, 0],
+                              [1, 0, 0],
+                              [1/np.sqrt(2), 1/np.sqrt(2), 0],
+                              [1/np.sqrt(3), 1/np.sqrt(3), 1/np.sqrt(3)],
+                              [0, 1/np.sqrt(2), 1/np.sqrt(2)],
+                              [0, 0, 1],
+                              [0, 0, 1]])
+         return self.phi
+
+     def construct_target_behavior_policy(self) -> Tuple[NDArray, NDArray]:
+         self.target_policy = np.array([0.4, 0.6])
+         self.behavior_policy = np.array([0.5, 0.5])
+         self.target_policy = np.tile(self.target_policy, (self.NUM_STATES, 1))
+         self.behavior_policy = np.tile(self.behavior_policy, (self.NUM_STATES, 1))
+
+         self.pi_beta = rowwise_kron(np.eye(self.NUM_STATES), self.behavior_policy)
+         self.pi_target = rowwise_kron(np.eye(self.NUM_STATES), self.target_policy)
+
+         return self.pi_beta, self.pi_target
+
+     def construct_transition(self):
+         self.transition_mat = np.zeros((self.NUM_STATES * self.NUM_ACTIONS, self.NUM_STATES))
+         for s in range(1, self.NUM_STATES - 1):
+             self.transition_mat[s * self.NUM_ACTIONS + self.LEFT, s - 1] = 1.0
+             self.transition_mat[s * self.NUM_ACTIONS + self.RIGHT, s + 1] = 1.0
+         # Both end states transition back to the start state for either action.
+         self.transition_mat[0, self.START] = 1.0
+         self.transition_mat[1, self.START] = 1.0
+         self.transition_mat[(self.NUM_STATES - 1) * self.NUM_ACTIONS + self.LEFT, self.START] = 1.0
+         self.transition_mat[(self.NUM_STATES - 1) * self.NUM_ACTIONS + self.RIGHT, self.START] = 1.0
+
+         return self.transition_mat
+
+     def reset(self) -> Tuple[int, NDArray, dict]:
+         '''Return initial state'''
+         state = np.random.randint(0, self.NUM_STATES)
+         self.num_steps = 0
+
+         current_phi = self.phi[state, :]
+         self.current_state = state
+
+         info = {'rho': 0}
+
+         return state, current_phi, info
+
+     def sample_action(self, state=None) -> int:
+         '''Select action according to behavior policy'''
+         action = np.random.choice(np.arange(self.NUM_ACTIONS), 1, p=self.behavior_policy[state, :])[0]
+         return action
+
+     def step(self, state: int, action: int) -> Tuple[int, NDArray, float, bool, bool, dict]:
+         done = False
+         truncated = False
+         self.num_steps += 1
+
+         next_state = np.random.choice(np.arange(self.NUM_STATES), p=self.transition_mat[state * self.NUM_ACTIONS + action, :])
+         next_phi = self.phi[next_state, :]
+         reward = self.rewards[state, next_state]
+
+         info = {'rho': self.target_policy[state, action] / self.behavior_policy[state, action]}
+         self.current_state = next_state
+         return next_state, next_phi, reward, done, truncated, info
+
+     def get_bellman_error(self, weight: np.ndarray) -> float:
+         be = self.proj @ (self.expected_rewards + self.gamma * self.p_pi @ self.phi @ weight - self.phi @ weight)
+         return np.linalg.norm(np.sqrt(self.d_mu) @ be)
+
+     def get_error(self, weight) -> float:
+         error = np.linalg.norm(weight - self.sol)
+         return error
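The helpers rowwise_kron and compute_stationary_dist are imported from utils.math_utils, which is not included in this diff. Based purely on how the three environments use them (expanding an S x A policy table into an S x SA matrix with pi(a|s) at column s*A + a, and computing the stationary distribution of the S x S behavior-policy chain), a plausible reading is sketched below; the actual implementation in the package may differ.

# Hypothetical stand-ins for utils.math_utils (that module is not part of this diff).
import numpy as np

def rowwise_kron(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    '''Row-wise Kronecker product: row s of the result is kron(a[s], b[s]).
    With a = I_S and b an S x A policy table this yields the S x SA matrix
    used above to form P_pi = Pi @ P.'''
    return np.vstack([np.kron(a[s, :], b[s, :]) for s in range(a.shape[0])])

def compute_stationary_dist(p: np.ndarray) -> np.ndarray:
    '''Stationary distribution of a row-stochastic matrix p, taken as the
    left eigenvector for eigenvalue 1, normalized to sum to one.'''
    eigvals, eigvecs = np.linalg.eig(p.T)
    v = np.real(eigvecs[:, np.argmin(np.abs(eigvals - 1.0))])
    return v / v.sum()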
@@ -0,0 +1,134 @@
+ from typing import Tuple
+
+ import numpy as np
+ from nptyping import NDArray
+
+ from envs.base_mdp import BaseMDP
+ from utils.math_utils import rowwise_kron, compute_stationary_dist
+
+
+ class RandomWalkTabular(BaseMDP):
+
+     NUM_STATES = 7
+     NUM_ACTIONS = 2
+     num_features = 7
+
+     LEFT = 0
+     RIGHT = 1
+
+     END_0 = 0
+     END_1 = NUM_STATES - 1
+     START = int((NUM_STATES - 1) / 2)
+
+     def __init__(self, gamma: float):
+         self.gamma = gamma
+         self.reward = 0
+         self.current_state = None
+         self.num_steps = 0
+         self.phi = self.construct_feature()
+
+         self.target_policy = np.array([0.4, 0.6])
+         self.behavior_policy = np.array([0.5, 0.5])
+         self.target_policy = np.tile(self.target_policy, (self.NUM_STATES, 1))
+         self.behavior_policy = np.tile(self.behavior_policy, (self.NUM_STATES, 1))
+
+         self.pi_beta, self.pi_target = self.construct_target_behavior_policy()
+         self.transition_mat = self.construct_transition()
+
+         self.p_pi = self.pi_target @ self.transition_mat
+         self.p_beta = self.pi_beta @ self.transition_mat
+         self.d_mu = np.diag(compute_stationary_dist(self.p_beta))
+
+         self.rewards = self.construct_reward()
+         self.expected_rewards = np.sum(self.p_pi * self.rewards, axis=1)
+
+         self.proj = self.phi @ np.linalg.pinv(self.phi.T @ self.d_mu @ self.phi) @ self.phi.T @ self.d_mu
+
+         self.sol = np.linalg.pinv(self.phi.T @ self.d_mu @ (np.eye(self.NUM_STATES) - self.gamma * self.p_pi) @ self.phi) @ (self.phi.T @ self.d_mu @ self.expected_rewards)
+
+     def construct_feature(self):
+         self.phi = np.eye(self.NUM_STATES)
+         return self.phi
+
+     def construct_target_behavior_policy(self) -> Tuple[NDArray, NDArray]:
+         self.target_policy = np.array([0.4, 0.6])
+         self.behavior_policy = np.array([0.5, 0.5])
+         self.target_policy = np.tile(self.target_policy, (self.NUM_STATES, 1))
+         self.behavior_policy = np.tile(self.behavior_policy, (self.NUM_STATES, 1))
+
+         self.pi_beta = rowwise_kron(np.eye(self.NUM_STATES), self.behavior_policy)
+         self.pi_target = rowwise_kron(np.eye(self.NUM_STATES), self.target_policy)
+
+         return self.pi_beta, self.pi_target
+
+     def construct_transition(self):
+         self.transition_mat = np.zeros((self.NUM_STATES * self.NUM_ACTIONS, self.NUM_STATES))
+         for s in range(1, self.NUM_STATES - 1):
+             self.transition_mat[s * self.NUM_ACTIONS + self.LEFT, s - 1] = 1.0
+             self.transition_mat[s * self.NUM_ACTIONS + self.RIGHT, s + 1] = 1.0
+         # Both end states transition back to the start state for either action.
+         self.transition_mat[0, self.START] = 1.0
+         self.transition_mat[1, self.START] = 1.0
+         self.transition_mat[(self.NUM_STATES - 1) * self.NUM_ACTIONS + self.LEFT, self.START] = 1.0
+         self.transition_mat[(self.NUM_STATES - 1) * self.NUM_ACTIONS + self.RIGHT, self.START] = 1.0
+
+         return self.transition_mat
+
+     def construct_reward(self):
+         self.rewards = np.zeros((self.NUM_STATES, self.NUM_STATES))
+         self.rewards[self.END_0 + 1, self.END_0] = -1.0
+         self.rewards[self.END_1 - 1, self.END_1] = 1.0
+         self.expected_rewards = np.sum(self.p_pi * self.rewards, axis=1)
+         return self.rewards
+
+     def reset(self) -> Tuple[int, NDArray, dict]:
+         '''Return initial state'''
+         state = np.random.randint(0, self.NUM_STATES)
+         self.num_steps = 0
+
+         current_phi = self.phi[state, :]
+
+         info = {'rho': 1}
+
+         self.current_state = state
+         self.current_phi = current_phi
+
+         return state, current_phi, info
+
+     def sample_action(self, state=None) -> int:
+         '''Select action according to behavior policy'''
+         action = np.random.choice(np.arange(self.NUM_ACTIONS), 1, p=self.behavior_policy[state, :])[0]
+         return action
+
+     def step(self, state: int, action: int) -> Tuple[int, NDArray, float, bool, bool, dict]:
+         done = False
+         truncated = False
+         self.num_steps += 1
+
+         next_state = np.random.choice(np.arange(self.NUM_STATES),
+                                       p=self.transition_mat[state * self.NUM_ACTIONS + action, :])
+         next_phi = self.phi[next_state, :]
+         reward = self.rewards[state, next_state]
+
+         info = {'rho': self.target_policy[state, action] / self.behavior_policy[state, action]}
+         self.current_state = next_state
+         return next_state, next_phi, reward, done, truncated, info
+
+     def get_bellman_error(self, weight: np.ndarray) -> float:
+         be = self.proj @ (self.expected_rewards + self.gamma * self.p_pi @ self.phi @ weight - self.phi @ weight)
+         return np.linalg.norm(np.sqrt(self.d_mu) @ be)
+
+     def get_error(self, weight) -> float:
+         error = np.linalg.norm(weight - self.sol)
+         return error
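Because the features here are tabular (phi is the identity) and the behavior policy's stationary distribution is strictly positive on this chain, the projected fixed point self.sol reduces to the true value function of the target policy, (I - gamma * P_pi)^{-1} r_pi. That gives a quick consistency check:

import numpy as np
from envs.random_walk_tabular import RandomWalkTabular

env = RandomWalkTabular(gamma=0.9)
v_true = np.linalg.solve(np.eye(env.NUM_STATES) - env.gamma * env.p_pi, env.expected_rewards)
assert np.allclose(env.sol, v_true)   # tabular case: projected fixed point equals v_pi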
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build = 
+ tag_date = 0
+ 
@@ -0,0 +1,22 @@
+ from setuptools import setup, find_packages
+ 
+ setup(
+     name='test-mdp',
+     version='0.0.1',
+     description='test for policy evaluation examples',
+     author='teddylee777',
+     author_email='limaries30@kaist.ac.kr',
+     url='https://github.com/limaries30/test-mdp',
+     install_requires=['tqdm'],
+     packages=find_packages(exclude=[]),
+     keywords=['policy evaluation'],
+     python_requires='>=3.6',
+     package_data={},
+     zip_safe=False,
+     classifiers=[
+         'Programming Language :: Python :: 3.6',
+         'Programming Language :: Python :: 3.7',
+         'Programming Language :: Python :: 3.8',
+         'Programming Language :: Python :: 3.9',
+     ],
+ )
@@ -0,0 +1,22 @@
+ Metadata-Version: 2.4
+ Name: test-mdp
+ Version: 0.0.1
+ Summary: test for policy evaluation examples
+ Home-page: https://github.com/limaries30/test-mdp
+ Author: teddylee777
+ Author-email: limaries30@kaist.ac.kr
+ Keywords: policy evaluation
+ Classifier: Programming Language :: Python :: 3.6
+ Classifier: Programming Language :: Python :: 3.7
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Requires-Python: >=3.6
+ Requires-Dist: tqdm
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
@@ -0,0 +1,13 @@
+ setup.py
+ envs/__init__.py
+ envs/baird.py
+ envs/base_mdp.py
+ envs/env_utils.py
+ envs/random_walk_dependent.py
+ envs/random_walk_tabular.py
+ test_mdp.egg-info/PKG-INFO
+ test_mdp.egg-info/SOURCES.txt
+ test_mdp.egg-info/dependency_links.txt
+ test_mdp.egg-info/not-zip-safe
+ test_mdp.egg-info/requires.txt
+ test_mdp.egg-info/top_level.txt
@@ -0,0 +1 @@
+ tqdm
@@ -0,0 +1 @@
+ envs