pyrlutils 0.0.4__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff shows the contents of publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Potentially problematic release: this version of pyrlutils has been flagged as possibly problematic.
- pyrlutils/action.py +10 -2
- pyrlutils/bandit/reward.py +3 -2
- pyrlutils/dp/__init__.py +0 -0
- pyrlutils/{valuefcns.py → dp/valuefcns.py} +16 -11
- pyrlutils/helpers/__init__.py +0 -0
- pyrlutils/helpers/exceptions.py +5 -0
- pyrlutils/openai/utils.py +3 -3
- pyrlutils/policy.py +79 -12
- pyrlutils/state.py +169 -74
- pyrlutils/td/__init__.py +0 -0
- pyrlutils/td/qlearn.py +86 -0
- pyrlutils/td/sarsa.py +86 -0
- pyrlutils/td/state_td.py +111 -0
- pyrlutils/td/utils.py +258 -0
- pyrlutils/transition.py +44 -35
- {pyrlutils-0.0.4.dist-info → pyrlutils-0.1.1.dist-info}/METADATA +7 -6
- pyrlutils-0.1.1.dist-info/RECORD +25 -0
- {pyrlutils-0.0.4.dist-info → pyrlutils-0.1.1.dist-info}/WHEEL +1 -1
- pyrlutils-0.0.4.dist-info/RECORD +0 -17
- {pyrlutils-0.0.4.dist-info → pyrlutils-0.1.1.dist-info/licenses}/LICENSE +0 -0
- {pyrlutils-0.0.4.dist-info → pyrlutils-0.1.1.dist-info}/top_level.txt +0 -0
pyrlutils/td/qlearn.py
ADDED
@@ -0,0 +1,86 @@

from typing import Annotated

import numpy as np
from npdict import NumpyNDArrayWrappedDict

from .utils import AbstractStateActionValueFunctionTemporalDifferenceLearner, decay_schedule, select_action
from ..policy import DiscreteDeterminsticPolicy


class QLearner(AbstractStateActionValueFunctionTemporalDifferenceLearner):
    def learn(
        self,
        episodes: int
    ) -> tuple[
        Annotated[NumpyNDArrayWrappedDict, "2D array"],
        Annotated[NumpyNDArrayWrappedDict, "1D array"],
        DiscreteDeterminsticPolicy,
        Annotated[NumpyNDArrayWrappedDict, "3D array"],
        list[DiscreteDeterminsticPolicy]
    ]:
        Q = NumpyNDArrayWrappedDict(
            [
                self._state.get_all_possible_state_values(),
                self._action_names
            ],
            default_initial_value=0.0
        )
        Q_track = NumpyNDArrayWrappedDict(
            [
                list(range(episodes)),
                self._state.get_all_possible_state_values(),
                self._action_names
            ],
            default_initial_value=0.0
        )
        pi_track = []

        Q_array, Q_track_array = Q.to_numpy(), Q_track.to_numpy()
        alphas = decay_schedule(
            self.init_alpha, self.min_alpha, self.alpha_decay_ratio, episodes
        )
        epsilons = decay_schedule(
            self.init_epsilon, self.min_epsilon, self.epsilon_decay_ratio, episodes
        )

        for i in range(episodes):
            self._state.state_index = self.initial_state_index
            done = False
            action_value = select_action(self._state.state_value, Q, epsilons[i])
            while not done:
                old_state_value = self._state.state_value
                new_action_value = select_action(self._state.state_value, Q, epsilons[i])
                new_action_func = self._actions_dict[new_action_value]
                self._state = new_action_func(self._state)
                new_state_value = self._state.state_value
                reward = self._indrewardfcn(old_state_value, action_value, new_state_value)
                done = self._state.is_terminal

                new_state_index = Q.get_key_index(0, new_state_value)
                max_Q_given_state = Q.to_numpy()[new_state_index, :].max()
                td_target = reward + self.gamma * max_Q_given_state * (not done)
                td_error = td_target - Q[old_state_value, action_value]
                Q[old_state_value, action_value] = Q[old_state_value, action_value] + alphas[i] * td_error

            Q_track_array[i, :, :] = Q_array
            pi_track.append(DiscreteDeterminsticPolicy(
                {
                    state_value: select_action(state_value, Q, epsilon=0.0)
                    for state_value in self._state.get_all_possible_state_values()
                }
            ))

        V_array = np.max(Q_array, axis=1)
        V = NumpyNDArrayWrappedDict.from_numpyarray_given_keywords(
            [self._state.get_all_possible_state_values()],
            V_array
        )
        pi = DiscreteDeterminsticPolicy(
            {
                state_value: select_action(state_value, Q, epsilon=0.0)
                for state_value in self._state.get_all_possible_state_values()
            }
        )

        return Q, V, pi, Q_track, pi_track
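The learner is driven entirely by a TransitionProbabilityFactory (see pyrlutils/td/utils.py and pyrlutils/transition.py below), and learn() returns the wrapped Q-table, the derived state-value function, the greedy policy, and per-episode tracks of both. A minimal usage sketch, not taken from the package itself: the two-state MDP, its action names, rewards, and episode count are hypothetical, for illustration only.

from pyrlutils.transition import TransitionProbabilityFactory, NextStateTuple
from pyrlutils.td.qlearn import QLearner

# Hypothetical two-state MDP: 'go' moves from 's0' to the terminal state 's1'
# with reward 1.0; 'stay' loops on the current state with no reward.
factory = TransitionProbabilityFactory()
factory.add_state_transitions('s0', {
    'go':   [NextStateTuple(next_state_value='s1', probability=1.0, reward=1.0, terminal=True)],
    'stay': [NextStateTuple(next_state_value='s0', probability=1.0, reward=0.0, terminal=False)],
})
factory.add_state_transitions('s1', {
    'go':   [NextStateTuple(next_state_value='s1', probability=1.0, reward=0.0, terminal=True)],
    'stay': [NextStateTuple(next_state_value='s1', probability=1.0, reward=0.0, terminal=True)],
})

learner = QLearner(factory, gamma=0.99)
Q, V, pi, Q_track, pi_track = learner.learn(episodes=100)   # pi is the greedy policy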
pyrlutils/td/sarsa.py
ADDED
@@ -0,0 +1,86 @@

from typing import Annotated

import numpy as np
from npdict import NumpyNDArrayWrappedDict

from .utils import AbstractStateActionValueFunctionTemporalDifferenceLearner, decay_schedule, select_action
from ..policy import DiscreteDeterminsticPolicy


class SARSALearner(AbstractStateActionValueFunctionTemporalDifferenceLearner):
    def learn(
        self,
        episodes: int
    ) -> tuple[
        Annotated[NumpyNDArrayWrappedDict, "2D array"],
        Annotated[NumpyNDArrayWrappedDict, "1D array"],
        DiscreteDeterminsticPolicy,
        Annotated[NumpyNDArrayWrappedDict, "3D array"],
        list[DiscreteDeterminsticPolicy]
    ]:
        Q = NumpyNDArrayWrappedDict(
            [
                self._state.get_all_possible_state_values(),
                self._action_names
            ],
            default_initial_value=0.0
        )
        Q_track = NumpyNDArrayWrappedDict(
            [
                list(range(episodes)),
                self._state.get_all_possible_state_values(),
                self._action_names
            ],
            default_initial_value=0.0
        )
        pi_track = []

        Q_array, Q_track_array = Q.to_numpy(), Q_track.to_numpy()
        alphas = decay_schedule(
            self.init_alpha, self.min_alpha, self.alpha_decay_ratio, episodes
        )
        epsilons = decay_schedule(
            self.init_epsilon, self.min_epsilon, self.epsilon_decay_ratio, episodes
        )

        for i in range(episodes):
            self._state.state_index = self.initial_state_index
            done = False
            action_value = select_action(self._state.state_value, Q, epsilons[i])
            while not done:
                old_state_value = self._state.state_value
                action_func = self._actions_dict[action_value]
                self._state = action_func(self._state)
                new_state_value = self._state.state_value
                reward = self._indrewardfcn(old_state_value, action_value, new_state_value)
                done = self._state.is_terminal
                new_action_value = select_action(new_state_value, Q, epsilons[i])

                td_target = reward + self.gamma * Q[new_state_value, new_action_value] * (not done)
                td_error = td_target - Q[old_state_value, action_value]
                Q[old_state_value, action_value] = Q[old_state_value, action_value] + alphas[i] * td_error

                action_value = new_action_value

            Q_track_array[i, :, :] = Q_array
            pi_track.append(DiscreteDeterminsticPolicy(
                {
                    state_value: select_action(state_value, Q, epsilon=0.0)
                    for state_value in self._state.get_all_possible_state_values()
                }
            ))

        V_array = np.max(Q_array, axis=1)
        V = NumpyNDArrayWrappedDict.from_numpyarray_given_keywords(
            [self._state.get_all_possible_state_values()],
            V_array
        )
        pi = DiscreteDeterminsticPolicy(
            {
                state_value: select_action(state_value, Q, epsilon=0.0)
                for state_value in self._state.get_all_possible_state_values()
            }
        )

        return Q, V, pi, Q_track, pi_track
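SARSALearner shares the constructor and return signature with QLearner above; the substantive difference is the bootstrap term in td_target, which uses Q[new_state_value, new_action_value], the action actually chosen by the epsilon-greedy behaviour (on-policy), whereas qlearn.py bootstraps on the greedy maximum over the next state's row. A short sketch reusing the hypothetical factory from the QLearner note above (that factory and the episode count are illustrative, not from the package):

from pyrlutils.td.sarsa import SARSALearner

# `factory` is the hypothetical TransitionProbabilityFactory built in the
# QLearner sketch above; the epsilon settings shown match the defaults in
# td/utils.py, while gamma is illustrative.
sarsa = SARSALearner(factory, gamma=0.99, init_epsilon=1.0, min_epsilon=0.1, epsilon_decay_ratio=0.9)
Q_s, V_s, pi_s, Q_track_s, pi_track_s = sarsa.learn(episodes=100)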
pyrlutils/td/state_td.py
ADDED
@@ -0,0 +1,111 @@

from typing import Annotated

import numpy as np
from npdict import NumpyNDArrayWrappedDict

from .utils import decay_schedule, TimeDifferencePathElements, AbstractStateValueFunctionTemporalDifferenceLearner


class SingleStepTemporalDifferenceLearner(AbstractStateValueFunctionTemporalDifferenceLearner):
    def learn(
        self,
        episodes: int
    ) -> tuple[Annotated[NumpyNDArrayWrappedDict, "1D Array"], Annotated[NumpyNDArrayWrappedDict, "2D Array"]]:
        V = NumpyNDArrayWrappedDict(
            [self._state.get_all_possible_state_values()],
            default_initial_value=0.0
        )
        V_track = NumpyNDArrayWrappedDict(
            [list(range(episodes)), self._state.get_all_possible_state_values()],
            default_initial_value=0.0
        )
        V_array, V_track_array = V.to_numpy(), V_track.to_numpy()
        alphas = decay_schedule(
            self.init_alpha, self.min_alpha, self.alpha_decay_ratio, episodes
        )

        for i in range(episodes):
            self._state.state_index = self.initial_state_index
            done = False
            while not done:
                old_state_value = self._state.state_value
                action_value = self._policy.get_action_value(self._state.state_value)
                action_func = self._actions_dict[action_value]
                self._state = action_func(self._state)
                new_state_value = self._state.state_value
                reward = self._indrewardfcn(old_state_value, action_value, new_state_value)
                done = self._state.is_terminal

                td_target = reward + self.gamma * V[new_state_value] * (not done)
                td_error = td_target - V[old_state_value]
                V[old_state_value] = V[old_state_value] + alphas[i] * td_error

            V_track_array[i, :] = V_array

        return V, V_track


class MultipleStepTemporalDifferenceLearner(AbstractStateValueFunctionTemporalDifferenceLearner):
    def learn(
        self,
        episodes: int,
        n_steps: int=3
    ) -> tuple[Annotated[NumpyNDArrayWrappedDict, "1D Array"], Annotated[NumpyNDArrayWrappedDict, "2D Array"]]:
        V = NumpyNDArrayWrappedDict(
            [self._state.get_all_possible_state_values()],
            default_initial_value=0.0
        )
        V_track = NumpyNDArrayWrappedDict(
            [list(range(episodes)), self._state.get_all_possible_state_values()],
            default_initial_value=0.0
        )
        V_array, V_track_array = V.to_numpy(), V_track.to_numpy()
        alphas = decay_schedule(
            self.init_alpha, self.min_alpha, self.alpha_decay_ratio, episodes
        )
        discounts = np.logspace(0, n_steps-1, num=n_steps+1, base=self.gamma, endpoint=False)

        for i in range(episodes):
            self._state.state_index = self.initial_state_index
            done = False
            path = []

            while not done or path is not None:
                path = path[1:]   # worth revisiting this line

                new_state_value = self._state._get_state_value_from_index(self._state.nb_state_values-1)
                while not done and len(path) < n_steps:
                    old_state_value = self._state.state_value
                    action_value = self._policy.get_action_value(self._state.state_value)
                    action_func = self._actions_dict[action_value]
                    self._state = action_func(self._state)
                    new_state_value = self._state.state_value
                    reward = self._indrewardfcn(old_state_value, action_value, new_state_value)
                    done = self._state.is_terminal

                    path.append(
                        TimeDifferencePathElements(
                            this_state_value=old_state_value,
                            reward=reward,
                            next_state_value=new_state_value,
                            done=done
                        )
                    )
                    if done:
                        break

                n = len(path)
                estimated_state_value = path[0].this_state_value
                rewards = np.array([this_moment.reward for this_moment in path])
                partial_return = discounts[n:] * rewards
                bs_val = discounts[-1] * V[new_state_value] * (not done)
                ntd_target = np.sum(np.append(partial_return, bs_val))
                ntd_error = ntd_target - V[estimated_state_value]
                V[(estimated_state_value,)] = V[estimated_state_value] + alphas[i] * ntd_error
                if len(path) == 1 and path[0].done:
                    path = None

            V_track_array[i, :] = V_array

        return V, V_track
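Unlike the two control learners above, both classes here evaluate a fixed policy: learn() reads actions from self._policy, which defaults to None in the inherited constructor, so a policy must be supplied. A minimal sketch, reusing the hypothetical factory from the QLearner note and assuming that DiscreteDeterminsticPolicy (from pyrlutils.policy) provides the get_action_value lookup used in the loop; the toy policy and episode count are illustrative only.

from pyrlutils.policy import DiscreteDeterminsticPolicy
from pyrlutils.td.state_td import SingleStepTemporalDifferenceLearner

# Hypothetical fixed policy on the toy MDP from the QLearner sketch: always take 'go'.
fixed_policy = DiscreteDeterminsticPolicy({'s0': 'go', 's1': 'go'})

td0 = SingleStepTemporalDifferenceLearner(factory, gamma=0.99, policy=fixed_policy)
V, V_track = td0.learn(episodes=100)   # V: one value per state; V_track: episodes x states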
pyrlutils/td/utils.py
ADDED
@@ -0,0 +1,258 @@

from typing import Annotated, Union, Optional
from dataclasses import dataclass
from abc import ABC, abstractmethod

import numpy as np
from numpy.typing import NDArray
from npdict import NumpyNDArrayWrappedDict

from ..state import DiscreteStateValueType
from ..action import DiscreteActionValueType
from ..policy import DiscretePolicy
from ..transition import TransitionProbabilityFactory


def decay_schedule(
    init_value: float,
    min_value: float,
    decay_ratio: float,
    max_steps: int,
    log_start: int=-2,
    log_base: int=10
) -> Annotated[NDArray[np.float64], "1D Array"]:
    decay_steps = int(max_steps*decay_ratio)
    rem_steps = max_steps - decay_steps

    values = np.logspace(log_start, 0, decay_steps, base=log_base, endpoint=True)[::-1]
    values = (values - values.min()) / (values.max() - values.min())
    values = (init_value - min_value) * values + min_value
    values = np.pad(values, (0, rem_steps), 'edge')
    return values


def select_action(
    state_value: DiscreteStateValueType,
    Q: Union[Annotated[NDArray[np.float64], "2D Array"], NumpyNDArrayWrappedDict],
    epsilon: float,
) -> Union[DiscreteActionValueType, int]:
    if np.random.random() <= epsilon:
        if isinstance(Q, NumpyNDArrayWrappedDict):
            return np.random.choice(Q._lists_keystrings[1])
        else:
            return np.random.choice(np.arange(Q.shape[1]))

    q_matrix = Q.to_numpy() if isinstance(Q, NumpyNDArrayWrappedDict) else Q
    state_index = Q.get_key_index(0, state_value) if isinstance(Q, NumpyNDArrayWrappedDict) else state_value
    max_index = np.argmax(q_matrix[state_index, :])

    if isinstance(Q, NumpyNDArrayWrappedDict):
        return Q._lists_keystrings[1][max_index]
    else:
        return max_index


@dataclass
class TimeDifferencePathElements:
    this_state_value: DiscreteStateValueType
    reward: float
    next_state_value: DiscreteStateValueType
    done: bool


class AbstractStateValueFunctionTemporalDifferenceLearner(ABC):
    def __init__(
        self,
        transprobfac: TransitionProbabilityFactory,
        gamma: float=1.0,
        init_alpha: float=0.5,
        min_alpha: float=0.01,
        alpha_decay_ratio: float=0.3,
        policy: Optional[DiscretePolicy]=None,
        initial_state_index: int=0
    ):
        self._gamma = gamma
        self._init_alpha = init_alpha
        self._min_alpha = min_alpha
        try:
            assert 0.0 <= alpha_decay_ratio <= 1.0
        except AssertionError:
            raise ValueError("alpha_decay_ratio must be between 0 and 1!")
        self._alpha_decay_ratio = alpha_decay_ratio
        self._transprobfac = transprobfac
        self._state, self._actions_dict, self._indrewardfcn = self._transprobfac.generate_mdp_objects()
        self._action_names = list(self._actions_dict.keys())
        self._actions_to_indices = {action_value: idx for idx, action_value in enumerate(self._action_names)}
        self._policy = policy
        try:
            assert 0 <= initial_state_index < self._state.nb_state_values
        except AssertionError:
            raise ValueError(f"Initial state index must be between 0 and {self._state.nb_state_values}")
        self._init_state_index = initial_state_index

    @abstractmethod
    def learn(self, *args, **kwargs) -> tuple[Annotated[NDArray[np.float64], "1D Array"], Annotated[NDArray[np.float64], "2D Array"]]:
        raise NotImplementedError()

    @property
    def nb_states(self) -> int:
        return self._state.nb_state_values

    @property
    def policy(self) -> DiscretePolicy:
        return self._policy

    @policy.setter
    def policy(self, val: DiscretePolicy):
        self._policy = val

    @property
    def gamma(self) -> float:
        return self._gamma

    @gamma.setter
    def gamma(self, val: float):
        self._gamma = val

    @property
    def init_alpha(self) -> float:
        return self._init_alpha

    @init_alpha.setter
    def init_alpha(self, val: float):
        self._init_alpha = val

    @property
    def min_alpha(self) -> float:
        return self._min_alpha

    @min_alpha.setter
    def min_alpha(self, val: float):
        self._min_alpha = val

    @property
    def alpha_decay_ratio(self) -> float:
        return self._alpha_decay_ratio

    @property
    def initial_state_index(self) -> int:
        return self._init_state_index

    @initial_state_index.setter
    def initial_state_index(self, val: int):
        self._init_state_index = val


class AbstractStateActionValueFunctionTemporalDifferenceLearner(ABC):
    def __init__(
        self,
        transprobfac: TransitionProbabilityFactory,
        gamma: float=1.0,
        init_alpha: float=0.5,
        min_alpha: float=0.01,
        alpha_decay_ratio: float=0.3,
        init_epsilon: float=1.0,
        min_epsilon: float=0.1,
        epsilon_decay_ratio: float=0.9,
        policy: Optional[DiscretePolicy]=None,
        initial_state_index: int=0
    ):
        self._gamma = gamma
        self._init_alpha = init_alpha
        self._min_alpha = min_alpha
        try:
            assert 0.0 <= alpha_decay_ratio <= 1.0
        except AssertionError:
            raise ValueError("alpha_decay_ratio must be between 0 and 1!")
        self._alpha_decay_ratio = alpha_decay_ratio
        self._init_epsilon = init_epsilon
        self._min_epsilon = min_epsilon
        self._epsilon_decay_ratio = epsilon_decay_ratio

        self._transprobfac = transprobfac
        self._state, self._actions_dict, self._indrewardfcn = self._transprobfac.generate_mdp_objects()
        self._action_names = list(self._actions_dict.keys())
        self._actions_to_indices = {action_value: idx for idx, action_value in enumerate(self._action_names)}
        self._policy = policy
        try:
            assert 0 <= initial_state_index < self._state.nb_state_values
        except AssertionError:
            raise ValueError(f"Initial state index must be between 0 and {self._state.nb_state_values}")
        self._init_state_index = initial_state_index

    @abstractmethod
    def learn(self, *args, **kwargs) -> tuple[Annotated[NDArray[np.float64], "1D Array"], Annotated[NDArray[np.float64], "2D Array"]]:
        raise NotImplementedError()

    @property
    def nb_states(self) -> int:
        return self._state.nb_state_values

    @property
    def policy(self) -> DiscretePolicy:
        return self._policy

    @policy.setter
    def policy(self, val: DiscretePolicy):
        self._policy = val

    @property
    def gamma(self) -> float:
        return self._gamma

    @gamma.setter
    def gamma(self, val: float):
        self._gamma = val

    @property
    def init_alpha(self) -> float:
        return self._init_alpha

    @init_alpha.setter
    def init_alpha(self, val: float):
        self._init_alpha = val

    @property
    def min_alpha(self) -> float:
        return self._min_alpha

    @min_alpha.setter
    def min_alpha(self, val: float):
        self._min_alpha = val

    @property
    def alpha_decay_ratio(self) -> float:
        return self._alpha_decay_ratio

    @property
    def init_epsilon(self) -> float:
        return self._init_epsilon

    @init_epsilon.setter
    def init_epsilon(self, val: float):
        self._init_epsilon = val

    @property
    def min_epsilon(self) -> float:
        return self._min_epsilon

    @min_epsilon.setter
    def min_epsilon(self, val: float):
        self._min_epsilon = val

    @property
    def epsilon_decay_ratio(self) -> float:
        return self._epsilon_decay_ratio

    @epsilon_decay_ratio.setter
    def epsilon_decay_ratio(self, val: float):
        self._epsilon_decay_ratio = val

    @property
    def initial_state_index(self) -> int:
        return self._init_state_index

    @initial_state_index.setter
    def initial_state_index(self, val: int):
        self._init_state_index = val
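decay_schedule is the only piece here with non-obvious numerics: it builds a reversed log-spaced curve over the first decay_ratio fraction of the steps, rescales it to run from init_value down to min_value, and edge-pads the remainder. A small standalone check, based only on the function above (the concrete numbers are illustrative):

import numpy as np
from pyrlutils.td.utils import decay_schedule, select_action

alphas = decay_schedule(init_value=1.0, min_value=0.1, decay_ratio=0.5, max_steps=10)
# The first 5 entries fall from 1.0 to 0.1 along a reversed log curve; the
# remaining 5 entries are 'edge'-padded at 0.1.
print(alphas.round(3))

# select_action also accepts a plain 2D numpy Q-table (states x actions);
# with epsilon=0.0 it reduces to a greedy argmax over the given state's row.
Q = np.zeros((3, 2))
Q[1, 1] = 2.0
print(select_action(1, Q, epsilon=0.0))   # greedy column index for state 1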
pyrlutils/transition.py
CHANGED
@@ -1,6 +1,7 @@
 
-from types import LambdaType
-from typing import
+from types import LambdaType, FunctionType
+from typing import Union
+from dataclasses import dataclass
 
 import numpy as np
 
@@ -9,28 +10,12 @@ from .reward import IndividualRewardFunction
 from .action import Action, DiscreteActionValueType
 
 
+@dataclass
 class NextStateTuple:
-    def __init__(self, next_state_value, probability, reward, terminal):
-        self._next_state_value = next_state_value
-        self._probability = probability
-        self._reward = reward
-        self._terminal = terminal
-
-    @property
-    def next_state_value(self) -> DiscreteStateValueType:
-        return self._next_state_value
-
-    @property
-    def probability(self) -> float:
-        return self._probability
-
-    @property
-    def reward(self) -> float:
-        return self._reward
-
-    @property
-    def terminal(self) -> bool:
-        return self._terminal
+    next_state_value: DiscreteStateValueType
+    probability: float
+    reward: float
+    terminal: bool
 
 
 class TransitionProbabilityFactory:
@@ -40,7 +25,11 @@ class TransitionProbabilityFactory:
         self._all_action_values = []
         self._objects_generated = False
 
-    def add_state_transitions(
+    def add_state_transitions(
+        self,
+        state_value: DiscreteStateValueType,
+        action_values_to_next_state: dict[DiscreteActionValueType, Union[list[NextStateTuple], dict]]
+    ):
        if state_value not in self._all_state_values:
            self._all_state_values.append(state_value)
 
@@ -69,7 +58,10 @@ class TransitionProbabilityFactory:
 
        self._transprobs[state_value] = this_state_transition_dict
 
-    def _get_probs_for_eachstate(
+    def _get_probs_for_eachstate(
+        self,
+        action_value: DiscreteActionValueType
+    ) -> dict[DiscreteStateValueType, list[NextStateTuple]]:
        state_nexttuples = {}
        for state_value, action_nexttuples_pair in self._transprobs.items():
            for this_action_value, nexttuples in action_nexttuples_pair.items():
@@ -77,7 +69,10 @@ class TransitionProbabilityFactory:
                    state_nexttuples[state_value] = nexttuples
        return state_nexttuples
 
-    def _generate_action_function(
+    def _generate_action_function(
+        self,
+        state_nexttuples: dict[DiscreteStateValueType, list[NextStateTuple]]
+    ) -> Union[FunctionType, LambdaType]:
 
        def _action_function(state: DiscreteState) -> DiscreteState:
            nexttuples = state_nexttuples[state.state_value]
@@ -91,7 +86,11 @@ class TransitionProbabilityFactory:
 
    def _generate_individual_reward_function(self) -> IndividualRewardFunction:
 
-        def _individual_reward_function(
+        def _individual_reward_function(
+            state_value: DiscreteStateValueType,
+            action_value: DiscreteActionValueType,
+            next_state_value: DiscreteStateValueType
+        ) -> float:
            if state_value not in self._transprobs.keys():
                return 0.
 
@@ -105,15 +104,22 @@ class TransitionProbabilityFactory:
            return reward
 
        class ThisIndividualRewardFunction(IndividualRewardFunction):
-            def reward(self,
-                       state_value,
-                       action_value,
-                       next_state_value) -> float:
+            def reward(
+                self,
+                state_value: DiscreteStateValueType,
+                action_value: DiscreteActionValueType,
+                next_state_value: DiscreteStateValueType
+            ) -> float:
                return _individual_reward_function(state_value, action_value, next_state_value)
 
        return ThisIndividualRewardFunction()
 
-    def get_probability(
+    def get_probability(
+        self,
+        state_value: DiscreteStateValueType,
+        action_value: DiscreteActionValueType,
+        new_state_value: DiscreteStateValueType
+    ) -> float:
        if state_value not in self._transprobs.keys():
            return 0.
 
@@ -127,18 +133,21 @@ class TransitionProbabilityFactory:
        return probs
 
    @property
-    def transition_probabilities(self) -> dict:
+    def transition_probabilities(self) -> dict[DiscreteStateValueType, dict[DiscreteActionValueType, list[NextStateTuple]]]:
        return self._transprobs
 
-    def generate_mdp_objects(self) ->
+    def generate_mdp_objects(self) -> tuple[DiscreteState, dict[DiscreteActionValueType, Action], IndividualRewardFunction]:
        state = DiscreteState(self._all_state_values)
        actions_dict = {}
        for action_value in self._all_action_values:
            state_nexttuple = self._get_probs_for_eachstate(action_value)
            actions_dict[action_value] = Action(self._generate_action_function(state_nexttuple))
+            for next_tuples in state_nexttuple.values():
+                for next_tuple in next_tuples:
+                    state._terminal_dict[next_tuple.next_state_value] = next_tuple.terminal
 
        individual_reward_fcn = self._generate_individual_reward_function()
-
+        self._objects_generated = True
        return state, actions_dict, individual_reward_fcn
 
    @property