pyrlutils 0.0.1__tar.gz → 0.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyrlutils-0.0.1/pyrlutils.egg-info → pyrlutils-0.0.2}/PKG-INFO +2 -2
- pyrlutils-0.0.2/pyrlutils/policy.py +84 -0
- pyrlutils-0.0.1/pyrlutils/values.py → pyrlutils-0.0.2/pyrlutils/reward.py +2 -0
- {pyrlutils-0.0.1 → pyrlutils-0.0.2}/pyrlutils/state.py +28 -2
- {pyrlutils-0.0.1 → pyrlutils-0.0.2}/pyrlutils/transition.py +66 -26
- pyrlutils-0.0.2/pyrlutils/valuefcns.py +144 -0
- {pyrlutils-0.0.1 → pyrlutils-0.0.2/pyrlutils.egg-info}/PKG-INFO +2 -2
- {pyrlutils-0.0.1 → pyrlutils-0.0.2}/pyrlutils.egg-info/SOURCES.txt +5 -1
- {pyrlutils-0.0.1 → pyrlutils-0.0.2}/pyrlutils.egg-info/requires.txt +1 -0
- {pyrlutils-0.0.1 → pyrlutils-0.0.2}/setup.py +2 -2
- pyrlutils-0.0.2/test/test_2ddiscrete.py +20 -0
- pyrlutils-0.0.2/test/test_2dmaze.py +341 -0
- {pyrlutils-0.0.1 → pyrlutils-0.0.2}/test/test_action.py +0 -2
- pyrlutils-0.0.2/test/test_frozenlake.py +29 -0
- pyrlutils-0.0.1/pyrlutils/policy.py +0 -34
- {pyrlutils-0.0.1 → pyrlutils-0.0.2}/LICENSE +0 -0
- {pyrlutils-0.0.1 → pyrlutils-0.0.2}/MANIFEST.in +0 -0
- {pyrlutils-0.0.1 → pyrlutils-0.0.2}/README.md +0 -0
- {pyrlutils-0.0.1 → pyrlutils-0.0.2}/pyrlutils/__init__.py +0 -0
- {pyrlutils-0.0.1 → pyrlutils-0.0.2}/pyrlutils/action.py +0 -0
- {pyrlutils-0.0.1 → pyrlutils-0.0.2}/pyrlutils.egg-info/dependency_links.txt +0 -0
- {pyrlutils-0.0.1 → pyrlutils-0.0.2}/pyrlutils.egg-info/not-zip-safe +0 -0
- {pyrlutils-0.0.1 → pyrlutils-0.0.2}/pyrlutils.egg-info/top_level.txt +0 -0
- {pyrlutils-0.0.1 → pyrlutils-0.0.2}/setup.cfg +0 -0
- {pyrlutils-0.0.1 → pyrlutils-0.0.2}/test/test_continous_state_actions.py +0 -0
- {pyrlutils-0.0.1 → pyrlutils-0.0.2}/test/test_state.py +0 -0
- {pyrlutils-0.0.1 → pyrlutils-0.0.2}/test/test_transprobs.py +0 -0
{pyrlutils-0.0.1/pyrlutils.egg-info → pyrlutils-0.0.2}/PKG-INFO
@@ -1,11 +1,11 @@
 Metadata-Version: 2.1
 Name: pyrlutils
-Version: 0.0.1
+Version: 0.0.2
 Summary: Utility and Helpers for Reinformcement Learning
 Home-page: https://github.com/stephenhky/PyRLUtils
 Author: Kwan-Yuet Ho
 Author-email: stephenhky@yahoo.com.hk
-License:
+License: MIT
 Keywords: machine learning,reinforcement leaning,artifiial intelligence
 Platform: UNKNOWN
 Classifier: Topic :: Scientific/Engineering :: Mathematics
pyrlutils-0.0.2/pyrlutils/policy.py (new file)
@@ -0,0 +1,84 @@
+
+from abc import ABC, abstractmethod
+from typing import Union, Dict
+from warnings import warn
+
+import numpy as np
+
+from .state import State, DiscreteState, DiscreteStateValueType
+from .action import Action, DiscreteActionValueType
+
+
+class Policy(ABC):
+    @abstractmethod
+    def get_action(self, state: State) -> Action:
+        pass
+
+    def __call__(self, state: State) -> Action:
+        return self.get_action(state)
+
+    @property
+    def is_stochastic(self) -> bool:
+        raise NotImplemented()
+
+
+class DeterministicPolicy(Policy):
+    @abstractmethod
+    def add_deterministic_rule(self, *args, **kwargs):
+        pass
+
+    @property
+    def is_stochastic(self) -> bool:
+        return False
+
+
+class DiscreteDeterminsticPolicy(DeterministicPolicy):
+    def __init__(self, actions_dict: Dict[DiscreteActionValueType, Action]):
+        self._state_to_action = {}
+        self._actions_dict = actions_dict
+
+    def add_deterministic_rule(self, state_value: DiscreteStateValueType, action_value: DiscreteActionValueType):
+        if state_value in self._state_to_action:
+            warn('State value {} exists in rule; it will be replaced.'.format(state_value))
+        self._state_to_action[state_value] = action_value
+
+    def get_action_value(self, state_value: DiscreteStateValueType) -> DiscreteActionValueType:
+        return self._state_to_action.get(state_value)
+
+    def get_action(self, state: DiscreteState) -> Action:
+        return self._actions_dict[self.get_action_value(state.state_value)]
+
+    def __eq__(self, other) -> bool:
+        if len(self._state_to_action) != len(set(self._state_to_action.keys()).union(other._state_to_action.keys())):
+            return False
+        if len(self._actions_dict) != len(set(self._actions_dict.keys()).union(other._actions_dict.keys())):
+            return False
+        for action in self._actions_dict.keys():
+            if self._actions_dict[action] != other._actions_dict[action]:
+                return False
+        for state in self._state_to_action.keys():
+            if self._state_to_action[state] != other._state_to_action[state]:
+                return False
+        return True
+
+
+class StochasticPolicy(Policy):
+    @abstractmethod
+    def get_probability(self, *args, **kwargs) -> float:
+        pass
+
+    @property
+    def is_stochastic(self) -> bool:
+        return True
+
+
+class DiscreteStochasticPolicy(StochasticPolicy):
+    @abstractmethod
+    def get_probability(self, state_value: DiscreteStateValueType, action_value: DiscreteActionValueType) -> float:
+        pass
+
+
+class ContinuousStochasticPolicy(StochasticPolicy):
+    @abstractmethod
+    def get_probability(self, state_value: Union[float, np.ndarray], action_value: DiscreteActionValueType, value: Union[float, np.ndarray]) -> float:
+        pass
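
For orientation, the new DiscreteDeterminsticPolicy is a plain lookup table from state values to action values; its __eq__ is what the new value-function solver (valuefcns.py, below) uses to detect that policy iteration has converged. The following is a minimal, hypothetical usage sketch, not part of the diff; it assumes a populated TransitionProbabilityFactory named transprobfactory and the string action values ('right', etc.) used in test_2dmaze.py further down.

    from pyrlutils.policy import DiscreteDeterminsticPolicy

    # generate_mdp_objects() returns (DiscreteState, {action_value: Action}, IndividualRewardFunction)
    state, actions_dict, _ = transprobfactory.generate_mdp_objects()   # factory assumed to exist

    policy = DiscreteDeterminsticPolicy(actions_dict)
    policy.add_deterministic_rule(state.state_value, 'right')   # always go 'right' in the current state

    action = policy.get_action(state)   # looks up the Action registered under 'right'
    state = action(state)               # Action objects are callable on a state (see test_2dmaze.py below)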
{pyrlutils-0.0.1 → pyrlutils-0.0.2}/pyrlutils/state.py
@@ -1,6 +1,6 @@
 
 from abc import ABC, abstractmethod
-from typing import List, Optional, Union
+from typing import Tuple, List, Optional, Union
 
 import numpy as np
 
@@ -23,7 +23,7 @@ class State(ABC):
         self.set_state_value(new_state_value)
 
 
-DiscreteStateValueType = Union[float, str]
+DiscreteStateValueType = Union[float, str, Tuple[int]]
 
 
 class DiscreteState(State):
@@ -52,6 +52,10 @@ class DiscreteState(State):
     def state_value(self, new_state_value: DiscreteStateValueType):
         self.set_state_value(new_state_value)
 
+    @property
+    def state_space_size(self):
+        return len(self._all_state_values)
+
 
 class InvalidRangeError(Exception):
     def __init__(self, message=None):
@@ -168,3 +172,25 @@ class ContinuousState(State):
         return self._nbdims
 
 
+class Discrete2DCartesianState(DiscreteState):
+    def __init__(self, x_lowlim: int, x_hilim: int, y_lowlim: int, y_hilim: int, initial_coordinate: List[int]=None):
+        self._x_lowlim = x_lowlim
+        self._x_hilim = x_hilim
+        self._y_lowlim = y_lowlim
+        self._y_hilim = y_hilim
+        self._countx = self._x_hilim - self._x_lowlim + 1
+        self._county = self._y_hilim - self._y_lowlim + 1
+        if initial_coordinate is None:
+            initial_coordinate = [self._x_lowlim, self._y_lowlim]
+        initial_value = (initial_coordinate[1] - self._y_lowlim) * self._countx + (initial_coordinate[0] - self._x_lowlim)
+        super().__init__(list(range(self._countx*self._county)), initial_values=initial_value)
+
+    def _encode_coordinates(self, x, y) -> int:
+        return (y - self._y_lowlim) * self._countx + (x - self._x_lowlim)
+
+    def encode_coordinates(self, coordinates: List[int]) -> int:
+        assert len(coordinates) == 2
+        return self._encode_coordinates(coordinates[0], coordinates[1])
+
+    def decode_coordinates(self, hashcode) -> List[int]:
+        return [hashcode % self._countx, hashcode // self._countx]
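
The new Discrete2DCartesianState flattens an (x, y) grid cell into a single integer hash, (y - y_lowlim) * countx + (x - x_lowlim), and decodes it back with modulo and integer division. A quick illustrative check (not part of the diff), mirroring test_2ddiscrete.py below:

    from pyrlutils.state import Discrete2DCartesianState

    # a 2x3 grid: x in {0, 1}, y in {0, 1, 2}, so countx = 2 and there are 6 states
    state = Discrete2DCartesianState(0, 1, 0, 2)
    print(state.state_space_size)            # 6
    print(state.encode_coordinates([1, 2]))  # (2 - 0)*2 + (1 - 0) = 5
    print(state.decode_coordinates(5))       # [1, 2]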
{pyrlutils-0.0.1 → pyrlutils-0.0.2}/pyrlutils/transition.py
@@ -3,9 +3,10 @@ from types import LambdaType
 from typing import Tuple, Dict
 
 import numpy as np
+import gym
 
 from .state import DiscreteState, DiscreteStateValueType
-from .
+from .reward import IndividualRewardFunction
 from .action import Action, DiscreteActionValueType
 
 
@@ -35,22 +36,22 @@ class NextStateTuple:
 
 class TransitionProbabilityFactory:
     def __init__(self):
-        self.
-        self.
-        self.
-        self.
+        self._transprobs = {}
+        self._all_state_values = []
+        self._all_action_values = []
+        self._objects_generated = False
 
     def add_state_transitions(self, state_value: DiscreteStateValueType, action_values_to_next_state: dict):
-        if state_value not in self.
-            self.
+        if state_value not in self._all_state_values:
+            self._all_state_values.append(state_value)
 
         this_state_transition_dict = {}
 
         for action_value, next_state_tuples in action_values_to_next_state.items():
             this_state_transition_dict[action_value] = []
             for next_state_tuple in next_state_tuples:
-                if action_value not in self.
-                    self.
+                if action_value not in self._all_action_values:
+                    self._all_action_values.append(action_value)
                 if not isinstance(next_state_tuple, NextStateTuple):
                     if isinstance(next_state_tuple, dict):
                         next_state_tuple = NextStateTuple(
@@ -62,16 +63,16 @@ class TransitionProbabilityFactory:
                     else:
                         raise TypeError('"action_values_to_next_state" has to be a dictionary or NextStateTuple instance.')
 
-                if next_state_tuple.next_state_value not in self.
-                    self.
+                if next_state_tuple.next_state_value not in self._all_state_values:
+                    self._all_state_values.append(next_state_tuple.next_state_value)
 
                 this_state_transition_dict[action_value].append(next_state_tuple)
 
-        self.
+        self._transprobs[state_value] = this_state_transition_dict
 
     def _get_probs_for_eachstate(self, action_value: DiscreteActionValueType) -> Dict[DiscreteStateValueType, NextStateTuple]:
         state_nexttuples = {}
-        for state_value, action_nexttuples_pair in self.
+        for state_value, action_nexttuples_pair in self._transprobs.items():
             for this_action_value, nexttuples in action_nexttuples_pair.items():
                 if this_action_value == action_value:
                     state_nexttuples[state_value] = nexttuples
@@ -92,16 +93,17 @@ class TransitionProbabilityFactory:
     def _generate_individual_reward_function(self) -> IndividualRewardFunction:
 
         def _individual_reward_function(state_value, action_value, next_state_value) -> float:
-            if state_value in self.
-
-
-
-
-
-
-
-
-
+            if state_value not in self._transprobs.keys():
+                return 0.
+
+            if action_value not in self._transprobs[state_value].keys():
+                return 0.
+
+            reward = 0.
+            for next_tuple in self._transprobs[state_value][action_value]:
+                if next_tuple.next_state_value == next_state_value:
+                    reward += next_tuple.reward
+            return reward
 
         class ThisIndividualRewardFunction(IndividualRewardFunction):
             def __init__(self):
@@ -112,10 +114,27 @@ class TransitionProbabilityFactory:
 
         return ThisIndividualRewardFunction()
 
-    def 
-
+    def get_probability(self, state_value, action_value, new_state_value) -> float:
+        if state_value not in self._transprobs.keys():
+            return 0.
+
+        if action_value not in self._transprobs[state_value]:
+            return 0.
+
+        probs = 0.
+        for next_state_tuple in self._transprobs[state_value][action_value]:
+            if next_state_tuple.next_state_value == new_state_value:
+                probs += next_state_tuple.probability
+        return probs
+
+    @property
+    def transition_probabilities(self) -> dict:
+        return self._transprobs
+
+    def generate_mdp_objects(self) -> Tuple[DiscreteState, Dict[DiscreteActionValueType, Action], IndividualRewardFunction]:
+        state = DiscreteState(self._all_state_values)
         actions_dict = {}
-        for action_value in self.
+        for action_value in self._all_action_values:
             state_nexttuple = self._get_probs_for_eachstate(action_value)
             actions_dict[action_value] = Action(self._generate_action_function(state_nexttuple))
 
@@ -123,3 +142,24 @@ class TransitionProbabilityFactory:
 
         return state, actions_dict, individual_reward_fcn
 
+    @property
+    def objects_generated(self) -> bool:
+        return self._objects_generated
+
+
+class OpenAIGymDiscreteEnvironmentTransitionProbabilityFactory(TransitionProbabilityFactory):
+    def __init__(self, envname):
+        super().__init__()
+        self.gymenv = gym.make(envname)
+        self._convert_openai_gymenv_to_transprob()
+
+    def _convert_openai_gymenv_to_transprob(self):
+        P = self.gymenv.env.P
+        for state_value, trans_dict in P.items():
+            new_trans_dict = {}
+            for action_value, next_state_list in trans_dict.items():
+                new_trans_dict[action_value] = [
+                    NextStateTuple(next_state[1], next_state[0], next_state[2], next_state[3])
+                    for next_state in next_state_list
+                ]
+            self.add_state_transitions(state_value, new_trans_dict)
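
For orientation, the reworked TransitionProbabilityFactory can be exercised on a toy two-state MDP. The sketch below is illustrative and not part of the diff; the NextStateTuple(next_state_value, probability, reward, terminal) argument order is inferred from the gym conversion above and from test_2dmaze.py further down.

    from pyrlutils.transition import TransitionProbabilityFactory, NextStateTuple

    factory = TransitionProbabilityFactory()
    # From state 0, action 'go' lands in state 1 with probability 1 and reward 1, ending the episode;
    # 'stay' keeps the agent in state 0 with no reward.
    factory.add_state_transitions(0, {'go': [NextStateTuple(1, 1., 1., True)],
                                      'stay': [NextStateTuple(0, 1., 0., False)]})
    factory.add_state_transitions(1, {'go': [NextStateTuple(1, 1., 0., True)],
                                      'stay': [NextStateTuple(1, 1., 0., True)]})

    state, actions_dict, reward_fcn = factory.generate_mdp_objects()
    print(factory.get_probability(0, 'go', 1))   # 1.0
    print(reward_fcn(0, 'go', 1))                # 1.0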
pyrlutils-0.0.2/pyrlutils/valuefcns.py (new file)
@@ -0,0 +1,144 @@
+
+import random
+from copy import copy
+from typing import Tuple, Dict
+from itertools import product
+
+import numpy as np
+
+from .state import DiscreteStateValueType
+from .transition import TransitionProbabilityFactory
+from .policy import DiscreteDeterminsticPolicy
+
+
+class OptimalPolicyOnValueFunctions:
+    def __init__(self, discount_factor: float, transprobfac: TransitionProbabilityFactory):
+        try:
+            assert discount_factor >= 0. and discount_factor <= 1.
+        except AssertionError:
+            raise ValueError('Discount factor must be between 0 and 1.')
+        self._gamma = discount_factor
+        self._transprobfac = transprobfac
+        self._states, self._actions_dict, self._indrewardfcn = self._transprobfac.generate_mdp_objects()
+        self._state_names = self._states.get_all_possible_state_values()
+        self._states_to_indices = {state: idx for idx, state in enumerate(self._state_names)}
+        self._action_names = list(self._actions_dict.keys())
+        self._actions_to_indices = {action_value: idx for idx, action_value in enumerate(self._action_names)}
+
+        self._evaluated = False
+        self._improved = False
+
+        self._theta = 1e-10
+        self._policy_evaluation_maxiter = 10000
+
+    def _policy_evaluation(self, policy: DiscreteDeterminsticPolicy) -> np.ndarray:
+        prev_V = np.zeros(len(self._states_to_indices))
+
+        for _ in range(self._policy_evaluation_maxiter):
+            V = np.zeros(len(self._states_to_indices))
+            for state_value in self._state_names:
+                state_index = self._states_to_indices[state_value]
+                action_value = policy.get_action_value(state_value)
+                for next_state_tuple in self._transprobfac.transition_probabilities[state_value][action_value]:
+                    prob = next_state_tuple.probability
+                    reward = next_state_tuple.reward
+                    next_state_value = next_state_tuple.next_state_value
+                    next_state_index = self._states_to_indices[next_state_value]
+                    terminal = next_state_tuple.terminal
+
+                    V[state_index] += prob * (reward + (self._gamma*prev_V[next_state_index] if not terminal else 0.))
+
+            if np.max(np.abs(V-prev_V)) < self._theta:
+                break
+
+            prev_V = V.copy()
+
+        return V
+
+    def _policy_improvement(self, V: np.ndarray) -> DiscreteDeterminsticPolicy:
+        Q = np.zeros((len(self._states_to_indices), len(self._actions_to_indices)))
+
+        for state_value in self._state_names:
+            state_index = self._states_to_indices[state_value]
+            for action_value in self._action_names:
+                action_index = self._actions_to_indices[action_value]
+                for next_state_tuple in self._transprobfac.transition_probabilities[state_value][action_value]:
+                    prob = next_state_tuple.probability
+                    reward = next_state_tuple.reward
+                    next_state_value = next_state_tuple.next_state_value
+                    next_state_index = self._states_to_indices[next_state_value]
+                    terminal = next_state_tuple.terminal
+
+                    Q[state_index, action_index] += prob * (reward + (self._gamma*V[next_state_index] if not terminal else 0.))
+
+        optimal_policy = DiscreteDeterminsticPolicy(self._actions_dict)
+        optimal_action_indices = np.argmax(Q, axis=1)
+        for state_value, action_index in zip(self._state_names, optimal_action_indices):
+            action_value = self._action_names[action_index]
+            optimal_policy.add_deterministic_rule(state_value, action_value)
+        return optimal_policy
+
+    def _policy_iteration(self) -> Tuple[np.ndarray, DiscreteDeterminsticPolicy]:
+        policy = DiscreteDeterminsticPolicy(self._actions_dict)
+        for state_value in self._state_names:
+            policy.add_deterministic_rule(state_value, random.choice(self._action_names))
+        V = None
+
+        done = False
+        while not done:
+            old_policy = copy(policy)
+
+            V = self._policy_evaluation(policy)
+            policy = self._policy_improvement(V)
+
+            if policy == old_policy:
+                done = True
+
+        return V, policy
+
+
+    def _value_iteration(self) -> Tuple[np.ndarray, DiscreteDeterminsticPolicy]:
+        V = np.zeros(len(self._state_names))
+
+        for _ in range(self._policy_evaluation_maxiter):
+            Q = np.zeros((len(self._state_names), len(self._action_names)))
+            for state_value, action_value in product(self._state_names, self._action_names):
+                state_index = self._states_to_indices[state_value]
+                action_index = self._actions_to_indices[action_value]
+                for next_state_tuple in self._transprobfac.transition_probabilities[state_value][action_value]:
+                    prob = next_state_tuple.probability
+                    reward = next_state_tuple.reward
+                    next_state_value = next_state_tuple.next_state_value
+                    next_state_index = self._states_to_indices[next_state_value]
+                    terminal = next_state_tuple.terminal
+
+                    Q[state_index, action_index] += prob * (reward + (self._gamma * V[next_state_index] if not terminal else 0.))
+
+            if np.max(np.abs(V-np.max(Q, axis=1))) < self._theta:
+                break
+
+            V = np.max(Q, axis=1)
+
+        Qmaxj = np.argmax(Q, axis=1)
+
+        policy = DiscreteDeterminsticPolicy(self._actions_dict)
+        for state_value, action_index in zip(self._state_names, Qmaxj):
+            policy.add_deterministic_rule(state_value, self._action_names[action_index])
+
+        return V, policy
+
+    def policy_iteration(self) -> Tuple[Dict[DiscreteStateValueType, float], DiscreteDeterminsticPolicy]:
+        V, policy = self._policy_iteration()
+        state_values_dict = {
+            self._state_names[i]: V[i]
+            for i in range(V.shape[0])
+        }
+        return state_values_dict, policy
+
+    def value_iteration(self) -> Tuple[Dict[DiscreteStateValueType, float], DiscreteDeterminsticPolicy]:
+        V, policy = self._value_iteration()
+        state_values_dict = {
+            self._state_names[i]: V[i]
+            for i in range(V.shape[0])
+        }
+        return state_values_dict, policy
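
Continuing the toy factory from the sketch after the transition.py hunk (illustrative only, not part of the diff), the new OptimalPolicyOnValueFunctions ties a factory and a discount factor to either solver, just as the maze test below does:

    from pyrlutils.valuefcns import OptimalPolicyOnValueFunctions

    solver = OptimalPolicyOnValueFunctions(0.9, factory)   # discount factor must lie in [0, 1]
    state_values, policy = solver.value_iteration()        # ({state_value: V(s)}, DiscreteDeterminsticPolicy)
    print(state_values)                                    # e.g. {0: 1.0, 1: 0.0}
    print(policy.get_action_value(0))                      # 'go'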
{pyrlutils-0.0.1 → pyrlutils-0.0.2/pyrlutils.egg-info}/PKG-INFO
@@ -1,11 +1,11 @@
 Metadata-Version: 2.1
 Name: pyrlutils
-Version: 0.0.1
+Version: 0.0.2
 Summary: Utility and Helpers for Reinformcement Learning
 Home-page: https://github.com/stephenhky/PyRLUtils
 Author: Kwan-Yuet Ho
 Author-email: stephenhky@yahoo.com.hk
-License:
+License: MIT
 Keywords: machine learning,reinforcement leaning,artifiial intelligence
 Platform: UNKNOWN
 Classifier: Topic :: Scientific/Engineering :: Mathematics
{pyrlutils-0.0.1 → pyrlutils-0.0.2}/pyrlutils.egg-info/SOURCES.txt
@@ -5,16 +5,20 @@ setup.py
 pyrlutils/__init__.py
 pyrlutils/action.py
 pyrlutils/policy.py
+pyrlutils/reward.py
 pyrlutils/state.py
 pyrlutils/transition.py
-pyrlutils/
+pyrlutils/valuefcns.py
 pyrlutils.egg-info/PKG-INFO
 pyrlutils.egg-info/SOURCES.txt
 pyrlutils.egg-info/dependency_links.txt
 pyrlutils.egg-info/not-zip-safe
 pyrlutils.egg-info/requires.txt
 pyrlutils.egg-info/top_level.txt
+test/test_2ddiscrete.py
+test/test_2dmaze.py
 test/test_action.py
 test/test_continous_state_actions.py
+test/test_frozenlake.py
 test/test_state.py
 test/test_transprobs.py
{pyrlutils-0.0.1 → pyrlutils-0.0.2}/setup.py
@@ -18,7 +18,7 @@ def package_description():
 
 setup(
     name='pyrlutils',
-    version="0.0.1",
+    version="0.0.2",
     description="Utility and Helpers for Reinformcement Learning",
     long_description=package_description(),
     long_description_content_type='text/markdown',
@@ -38,7 +38,7 @@ setup(
     url="https://github.com/stephenhky/PyRLUtils",
     author="Kwan-Yuet Ho",
     author_email="stephenhky@yahoo.com.hk",
-    license='
+    license='MIT',
     packages=[
         'pyrlutils'
     ],
pyrlutils-0.0.2/test/test_2ddiscrete.py (new file)
@@ -0,0 +1,20 @@
+
+import unittest
+
+from pyrlutils.state import Discrete2DCartesianState
+
+
+class Test2DDiscreteState(unittest.TestCase):
+    def test_twobythree(self):
+        state = Discrete2DCartesianState(0, 1, 0, 2)
+
+        assert state.state_space_size == 6
+        assert state.state_value == 0
+
+        state.set_state_value(5)
+        assert state.decode_coordinates(state.state_value) == [1, 2]
+
+
+
+if __name__ == '__main__':
+    unittest.main()
pyrlutils-0.0.2/test/test_2dmaze.py (new file)
@@ -0,0 +1,341 @@
+
+import unittest
+
+from pyrlutils.transition import TransitionProbabilityFactory, NextStateTuple
+from pyrlutils.valuefcns import OptimalPolicyOnValueFunctions
+from pyrlutils.state import Discrete2DCartesianState
+
+
+class Test2DMaze(unittest.TestCase):
+    def setUp(self):
+        maze_state = Discrete2DCartesianState(0, 5, 0, 4, initial_coordinate=[0, 0])
+
+        transprobfactory = TransitionProbabilityFactory()
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([0, 0]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([0, 0]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([0, 0]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([0, 0]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([0, 1]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([0, 1]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([0, 1]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([1, 1]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([0, 0]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([0, 2]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([0, 2]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([0, 2]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([0, 2]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([0, 1]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([0, 3]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([0, 3]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([0, 3]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([1, 3]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([0, 2]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([0, 4]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([0, 4]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([0, 4]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([1, 4]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([0, 3]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([0, 4]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([1, 0]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([1, 0]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([2, 0]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([1, 0]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([1, 1]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([1, 1]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([0, 1]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([1, 1]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([1, 0]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([1, 1]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([1, 2]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([1, 2]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([2, 2]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([1, 2]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([1, 3]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([1, 3]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([0, 3]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([1, 3]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([1, 2]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([1, 4]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([1, 4]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([0, 4]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([1, 4]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([1, 4]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([1, 4]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([2, 0]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([1, 0]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([3, 0]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([2, 0]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([2, 1]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([2, 1]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([2, 1]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([2, 1]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([2, 0]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([2, 2]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([2, 2]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([1, 2]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([2, 2]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([2, 1]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([2, 3]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([2, 3]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([2, 3]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([3, 3]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([2, 2]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([2, 3]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([2, 4]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([2, 4]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([3, 4]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([2, 4]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([2, 4]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([3, 0]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([2, 0]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([4, 0]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([3, 0]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([3, 1]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([3, 1]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([3, 1]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([3, 1]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([3, 0]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([3, 2]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([3, 2]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([3, 2]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([4, 2]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([3, 1]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([3, 3]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([3, 3]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([2, 3]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([4, 3]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([3, 3]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([3, 4]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([3, 4]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([2, 4]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([3, 4]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([3, 3]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([3, 4]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([4, 0]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([3, 0]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([5, 0]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([4, 0]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([4, 0]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([4, 1]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([4, 1]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([5, 1]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([4, 1]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([4, 1]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([4, 2]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([3, 2]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([4, 2]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([4, 2]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([4, 3]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([4, 3]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([3, 3]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([4, 3]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([4, 2]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([4, 4]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([4, 4]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([4, 4]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([5, 4]), 1., 1., True)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([4, 3]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([4, 4]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([5, 0]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([4, 0]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([5, 0]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([5, 0]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([5, 1]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([5, 1]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([4, 1]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([5, 1]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([5, 0]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([5, 1]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([5, 2]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([5, 2]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([5, 2]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([5, 2]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([5, 3]), 1., 0., False)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([5, 3]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([5, 3]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([5, 3]), 1., 0., False)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([5, 2]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([5, 4]), 1., 1., True)]
+            }
+        )
+        transprobfactory.add_state_transitions(
+            maze_state.encode_coordinates([5, 4]),
+            {
+                'up': [NextStateTuple(maze_state.encode_coordinates([4, 4]), 1., 0., False)],
+                'down': [NextStateTuple(maze_state.encode_coordinates([5, 4]), 1., 1., True)],
+                'left': [NextStateTuple(maze_state.encode_coordinates([5, 3]), 1., 0., False)],
+                'right': [NextStateTuple(maze_state.encode_coordinates([5, 4]), 1., 1., True)]
+            }
+        )
+
+        self.transprobfactory = transprobfactory
+        self.maze_state = maze_state
+
+    def test_policy_iteration(self):
+        policy_finder = OptimalPolicyOnValueFunctions(0.85, self.transprobfactory)
+        values_dict, policy = policy_finder.policy_iteration()
+
+        for state_value, value in values_dict.items():
+            [x, y] = self.maze_state.decode_coordinates(state_value)
+            print('({}, {}): {}'.format(x, y, value))
+
+        state, actions_dict, _ = self.transprobfactory.generate_mdp_objects()
+
+        arrived_destination = False
+        for _ in range(state.state_space_size*2):
+            action_value = policy.get_action_value(state)
+            print('Action value: {}'.format(action_value))
+            action = policy.get_action(state)
+            state = action(state)
+
+            coordinates = self.maze_state.decode_coordinates(state.state_value)
+            print('at: {}, {}'.format(coordinates[0], coordinates[1]))
+            if coordinates[0] == 5 and coordinates[1] == 4:
+                arrived_destination = True
+                break
+
+        assert arrived_destination
+
+    def test_value_iteration(self):
+        policy_finder = OptimalPolicyOnValueFunctions(0.85, self.transprobfactory)
+        values_dict, policy = policy_finder.value_iteration()
+
+        for state_value, value in values_dict.items():
+            [x, y] = self.maze_state.decode_coordinates(state_value)
+            print('({}, {}): {}'.format(x, y, value))
+
+        state, actions_dict, _ = self.transprobfactory.generate_mdp_objects()
+
+        arrived_destination = False
+        for _ in range(state.state_space_size*2):
+            action_value = policy.get_action_value(state)
+            print('Action value: {}'.format(action_value))
+            action = policy.get_action(state)
+            state = action(state)
+
+            coordinates = self.maze_state.decode_coordinates(state.state_value)
+            print('at: {}, {}'.format(coordinates[0], coordinates[1]))
+            if coordinates[0] == 5 and coordinates[1] == 4:
+                arrived_destination = True
+                break
+
+        assert arrived_destination
+
+
+
+if __name__ == '__main__':
+    unittest.main()
pyrlutils-0.0.2/test/test_frozenlake.py (new file)
@@ -0,0 +1,29 @@
+
+import unittest
+
+from pyrlutils.transition import OpenAIGymDiscreteEnvironmentTransitionProbabilityFactory
+
+class TestFrozenLake(unittest.TestCase):
+    def test_factory(self):
+        tranprobfactory = OpenAIGymDiscreteEnvironmentTransitionProbabilityFactory('FrozenLake-v1')
+        state, actions_dict, ind_reward_fcn = tranprobfactory.generate_mdp_objects()
+
+        assert len(state.get_all_possible_state_values()) == 16
+        assert state.state_value == 0
+
+        actions_dict[0](state)
+        assert state.state_value in {0, 4}
+
+        state.state_value = 15
+        actions_dict[2](state)
+        assert state.state_value == 15
+
+        assert ind_reward_fcn(0, 0, 0) == 0.0
+        assert ind_reward_fcn(14, 3, 15) == 1.0
+
+        assert abs(tranprobfactory.get_probability(0, 0, 0) - 0.66667) < 1e-4
+        assert abs(tranprobfactory.get_probability(14, 3, 15) - 0.33333) < 1e-4
+
+
+if __name__ == '__main__':
+    unittest.main()
pyrlutils-0.0.1/pyrlutils/policy.py (deleted file)
@@ -1,34 +0,0 @@
-
-from abc import ABC, abstractmethod
-
-from .state import State
-from .action import Action
-
-
-class Policy(ABC):
-    @abstractmethod
-    def get_action(self, state: State) -> Action:
-        pass
-
-    def __call__(self, state: State) -> Action:
-        return self.get_action(state)
-
-    @property
-    def is_stochastic(self) -> bool:
-        pass
-
-
-class DeterministicPolicy(Policy):
-    @property
-    def is_stochastic(self) -> bool:
-        return False
-
-
-class StochasticPolicy(Policy):
-    @abstractmethod
-    def get_probability(self, state: State, action: Action) -> float:
-        pass
-
-    @property
-    def is_stochastic(self) -> bool:
-        return True
The remaining files are unchanged between 0.0.1 and 0.0.2: LICENSE, MANIFEST.in, README.md, pyrlutils/__init__.py, pyrlutils/action.py, pyrlutils.egg-info/dependency_links.txt, pyrlutils.egg-info/not-zip-safe, pyrlutils.egg-info/top_level.txt, setup.cfg, test/test_continous_state_actions.py, test/test_state.py, and test/test_transprobs.py.