pyrlutils 0.0.4__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pyrlutils might be problematic.
- pyrlutils/action.py +10 -2
- pyrlutils/bandit/reward.py +3 -2
- pyrlutils/dp/__init__.py +0 -0
- pyrlutils/{valuefcns.py → dp/valuefcns.py} +16 -11
- pyrlutils/helpers/__init__.py +0 -0
- pyrlutils/helpers/exceptions.py +5 -0
- pyrlutils/openai/utils.py +3 -3
- pyrlutils/policy.py +79 -12
- pyrlutils/state.py +166 -74
- pyrlutils/td/__init__.py +0 -0
- pyrlutils/td/td.py +101 -0
- pyrlutils/td/utils.py +119 -0
- pyrlutils/transition.py +44 -35
- {pyrlutils-0.0.4.dist-info → pyrlutils-0.1.0.dist-info}/METADATA +6 -6
- pyrlutils-0.1.0.dist-info/RECORD +23 -0
- {pyrlutils-0.0.4.dist-info → pyrlutils-0.1.0.dist-info}/WHEEL +1 -1
- pyrlutils-0.0.4.dist-info/RECORD +0 -17
- {pyrlutils-0.0.4.dist-info → pyrlutils-0.1.0.dist-info/licenses}/LICENSE +0 -0
- {pyrlutils-0.0.4.dist-info → pyrlutils-0.1.0.dist-info}/top_level.txt +0 -0
pyrlutils/action.py
CHANGED
@@ -1,5 +1,5 @@
 
-from types import LambdaType
+from types import LambdaType, FunctionType
 from typing import Union
 
 from .state import State
@@ -8,7 +8,7 @@ from .state import State
 DiscreteActionValueType = Union[float, str]
 
 class Action:
-    def __init__(self, actionfunc: LambdaType):
+    def __init__(self, actionfunc: Union[FunctionType, LambdaType]):
         self._actionfunc = actionfunc
 
     def act(self, state: State, *args, **kwargs) -> State:
@@ -17,3 +17,11 @@ class Action:
 
     def __call__(self, state: State) -> State:
         return self.act(state)
+
+    @property
+    def action_function(self) -> Union[FunctionType, LambdaType]:
+        return self._actionfunc
+
+    @action_function.setter
+    def action_function(self, new_func: Union[FunctionType, LambdaType]) -> None:
+        self._actionfunc = new_func
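As a usage sketch (not part of the diff; the two-value state space below is made up): plain functions are now accepted alongside lambdas, and the wrapped callable can be swapped through the new action_function property.

from pyrlutils.action import Action
from pyrlutils.state import DiscreteState

state = DiscreteState(['home', 'work'])

def stay(s: DiscreteState) -> DiscreteState:
    return s                              # identity transition

def go_to_work(s: DiscreteState) -> DiscreteState:
    s.set_state_value('work')             # move the discrete state to 'work'
    return s

action = Action(stay)                     # a plain FunctionType, not only a lambda
action.action_function = go_to_work       # new setter introduced in 0.1.0
print(action(state).state_value)          # 'work'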
pyrlutils/bandit/reward.py
CHANGED
@@ -1,11 +1,12 @@
 
 from abc import ABC, abstractmethod
+from typing import Any
 
 
 class IndividualBanditRewardFunction(ABC):
     @abstractmethod
-    def reward(self, action_value) -> float:
+    def reward(self, action_value: Any) -> float:
         pass
 
-    def __call__(self, action_value) -> float:
+    def __call__(self, action_value: Any) -> float:
         return self.reward(action_value)
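A hedged sketch of a concrete subclass (the one-armed payout rule is invented): the only change above is the Any annotation, so subclasses still just implement reward() and let __call__ delegate.

from pyrlutils.bandit.reward import IndividualBanditRewardFunction

class FirstArmReward(IndividualBanditRewardFunction):   # hypothetical subclass
    def reward(self, action_value) -> float:
        return 1.0 if action_value == 0 else 0.0        # pay only for arm 0

print(FirstArmReward()(0))   # 1.0, via __call__ -> reward()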
pyrlutils/dp/__init__.py
ADDED
File without changes
pyrlutils/{valuefcns.py → dp/valuefcns.py}
CHANGED
@@ -1,18 +1,23 @@
 
 import random
 from copy import copy
-from typing import Tuple, Dict
 from itertools import product
+from typing import Annotated
 
 import numpy as np
+from numpy.typing import NDArray
 
-from
-from
-from
+from ..state import DiscreteStateValueType
+from ..transition import TransitionProbabilityFactory
+from ..policy import DiscreteDeterminsticPolicy
 
 
 class OptimalPolicyOnValueFunctions:
-    def __init__(
+    def __init__(
+        self,
+        discount_factor: float,
+        transprobfac: TransitionProbabilityFactory
+    ):
         try:
             assert 0. <= discount_factor <= 1.
         except AssertionError:
@@ -31,7 +36,7 @@ class OptimalPolicyOnValueFunctions:
         self._theta = 1e-10
         self._policy_evaluation_maxiter = 10000
 
-    def _policy_evaluation(self, policy: DiscreteDeterminsticPolicy) -> np.
+    def _policy_evaluation(self, policy: DiscreteDeterminsticPolicy) -> Annotated[NDArray[np.float64], "1D Array"]:
         prev_V = np.zeros(len(self._states_to_indices))
 
         for _ in range(self._policy_evaluation_maxiter):
@@ -55,7 +60,7 @@ class OptimalPolicyOnValueFunctions:
 
         return V
 
-    def _policy_improvement(self, V: np.
+    def _policy_improvement(self, V: Annotated[NDArray[np.float64], "1D Array"]) -> DiscreteDeterminsticPolicy:
         Q = np.zeros((len(self._states_to_indices), len(self._actions_to_indices)))
 
         for state_value in self._state_names:
@@ -78,7 +83,7 @@ class OptimalPolicyOnValueFunctions:
             optimal_policy.add_deterministic_rule(state_value, action_value)
         return optimal_policy
 
-    def _policy_iteration(self) ->
+    def _policy_iteration(self) -> tuple[Annotated[NDArray[np.float64], "1D Array"], DiscreteDeterminsticPolicy]:
         policy = DiscreteDeterminsticPolicy(self._actions_dict)
         for state_value in self._state_names:
             policy.add_deterministic_rule(state_value, random.choice(self._action_names))
@@ -97,7 +102,7 @@ class OptimalPolicyOnValueFunctions:
         return V, policy
 
 
-    def _value_iteration(self) ->
+    def _value_iteration(self) -> tuple[Annotated[NDArray[np.float64], "1D Array"], DiscreteDeterminsticPolicy]:
         V = np.zeros(len(self._state_names))
 
         for _ in range(self._policy_evaluation_maxiter):
@@ -127,7 +132,7 @@ class OptimalPolicyOnValueFunctions:
 
         return V, policy
 
-    def policy_iteration(self) ->
+    def policy_iteration(self) -> tuple[dict[DiscreteStateValueType, float], DiscreteDeterminsticPolicy]:
         V, policy = self._policy_iteration()
         state_values_dict = {
             self._state_names[i]: V[i]
@@ -135,7 +140,7 @@ class OptimalPolicyOnValueFunctions:
         }
         return state_values_dict, policy
 
-    def value_iteration(self) ->
+    def value_iteration(self) -> tuple[dict[DiscreteStateValueType, float], DiscreteDeterminsticPolicy]:
         V, policy = self._value_iteration()
         state_values_dict = {
             self._state_names[i]: V[i]
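A sketch under assumptions (the two-state MDP is made up, and only the constructor signature and the public policy_iteration/value_iteration methods are guaranteed by the hunks above): the dynamic-programming solver now lives in pyrlutils.dp.valuefcns and is fed a TransitionProbabilityFactory.

from pyrlutils.transition import TransitionProbabilityFactory, NextStateTuple
from pyrlutils.dp.valuefcns import OptimalPolicyOnValueFunctions

factory = TransitionProbabilityFactory()
factory.add_state_transitions('s0', {
    'go':   [NextStateTuple('s1', 1.0, 1.0, True)],   # deterministic, rewarded, terminal
    'stay': [NextStateTuple('s0', 1.0, 0.0, False)],
})
factory.add_state_transitions('s1', {
    'go':   [NextStateTuple('s1', 1.0, 0.0, True)],
    'stay': [NextStateTuple('s1', 1.0, 0.0, True)],
})

solver = OptimalPolicyOnValueFunctions(0.9, factory)
state_values, policy = solver.policy_iteration()      # dict of state values + optimal policy
print(state_values['s0'], policy.get_action_value('s0'))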
pyrlutils/helpers/__init__.py
ADDED
File without changes
pyrlutils/openai/utils.py
CHANGED
@@ -5,7 +5,7 @@ from ..transition import TransitionProbabilityFactory, NextStateTuple
 
 
 class OpenAIGymDiscreteEnvironmentTransitionProbabilityFactory(TransitionProbabilityFactory):
-    def __init__(self, envname):
+    def __init__(self, envname: str):
         super().__init__()
         self._envname = envname
         self._gymenv = gym.make(envname)
@@ -23,9 +23,9 @@ class OpenAIGymDiscreteEnvironmentTransitionProbabilityFactory(TransitionProbabi
         self.add_state_transitions(state_value, new_trans_dict)
 
     @property
-    def envname(self):
+    def envname(self) -> str:
         return self._envname
 
     @property
-    def gymenv(self):
+    def gymenv(self) -> gym.Env:
         return self._gymenv
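A hedged sketch (assumes the optional gymnasium extra is installed and that the 'FrozenLake-v1' environment is available): the factory converts a discrete Gym environment into the package's transition-probability representation, and the two properties above are now annotated.

from pyrlutils.openai.utils import OpenAIGymDiscreteEnvironmentTransitionProbabilityFactory

factory = OpenAIGymDiscreteEnvironmentTransitionProbabilityFactory('FrozenLake-v1')
print(factory.envname)                                 # 'FrozenLake-v1' (str)
state, actions_dict, reward_fcn = factory.generate_mdp_objects()
print(state.nb_state_values, sorted(actions_dict.keys()))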
pyrlutils/policy.py
CHANGED
@@ -1,9 +1,10 @@
 
 from abc import ABC, abstractmethod
-from typing import Union,
+from typing import Union, Annotated
 from warnings import warn
 
 import numpy as np
+from numpy.typing import NDArray
 
 from .state import State, DiscreteState, DiscreteStateValueType
 from .action import Action, DiscreteActionValueType
@@ -12,7 +13,11 @@ from .action import Action, DiscreteActionValueType
 class Policy(ABC):
     @abstractmethod
     def get_action(self, state: State) -> Action:
-
+        raise NotImplemented()
+
+    @abstractmethod
+    def get_action_value(self, state: State) -> DiscreteActionValueType:
+        raise NotImplemented()
 
     def __call__(self, state: State) -> Action:
         return self.get_action(state)
@@ -25,7 +30,7 @@ class Policy(ABC):
 class DeterministicPolicy(Policy):
     @abstractmethod
     def add_deterministic_rule(self, *args, **kwargs):
-
+        raise NotImplemented()
 
     @property
     def is_stochastic(self) -> bool:
@@ -33,16 +38,23 @@ class DeterministicPolicy(Policy):
 
 
 class DiscreteDeterminsticPolicy(DeterministicPolicy):
-    def __init__(self, actions_dict:
+    def __init__(self, actions_dict: dict[DiscreteActionValueType, Action]):
         self._state_to_action = {}
         self._actions_dict = actions_dict
 
-    def add_deterministic_rule(
+    def add_deterministic_rule(
+        self,
+        state_value: DiscreteStateValueType,
+        action_value: DiscreteActionValueType
+    ) -> None:
         if state_value in self._state_to_action:
             warn('State value {} exists in rule; it will be replaced.'.format(state_value))
         self._state_to_action[state_value] = action_value
 
-    def get_action_value(
+    def get_action_value(
+        self,
+        state_value: DiscreteStateValueType
+    ) -> DiscreteActionValueType:
         return self._state_to_action.get(state_value)
 
     def get_action(self, state: DiscreteState) -> Action:
@@ -62,10 +74,16 @@ class DiscreteDeterminsticPolicy(DeterministicPolicy):
         return True
 
 
+class DiscreteContinuousPolicy(DeterministicPolicy):
+    @abstractmethod
+    def get_action(self, state: State) -> Action:
+        raise NotImplemented()
+
+
 class StochasticPolicy(Policy):
     @abstractmethod
     def get_probability(self, *args, **kwargs) -> float:
-
+        raise NotImplemented()
 
     @property
     def is_stochastic(self) -> bool:
@@ -73,12 +91,61 @@ class StochasticPolicy(StochasticPolicy):
 
 
 class DiscreteStochasticPolicy(StochasticPolicy):
-
-
-
+    def __init__(self, actions_dict: dict[DiscreteActionValueType, Action]):
+        self._state_to_action = {}
+        self._actions_dict = actions_dict
+
+    def add_stochastic_rule(
+        self,
+        state_value: DiscreteStateValueType,
+        action_values: list[DiscreteActionValueType],
+        probs: Union[list[float], Annotated[NDArray[np.float64], "1D Array"]] = None
+    ):
+        if probs is not None:
+            assert len(action_values) == len(probs)
+            probs = np.array(probs)
+        else:
+            probs = np.repeat(1./len(action_values), len(action_values))
+
+        if state_value in self._state_to_action:
+            warn('State value {} exists in rule; it will be replaced.'.format(state_value))
+        self._state_to_action[state_value] = {
+            action_value: prob
+            for action_value, prob in zip(action_values, probs)
+        }
+
+    def get_probability(
+        self,
+        state_value: DiscreteStateValueType,
+        action_value: DiscreteActionValueType
+    ) -> float:
+        if state_value not in self._state_to_action:
+            return 0.0
+        if action_value in self._state_to_action[state_value]:
+            return self._state_to_action[state_value][action_value]
+        else:
+            return 0.0
+
+    def get_action_value(self, state: State) -> DiscreteActionValueType:
+        allowed_actions = list(self._state_to_action[state].keys())
+        probs = np.array(list(self._state_to_action[state].values()))
+        sumprobs = np.sum(probs)
+        return np.random.choice(allowed_actions, p=probs/sumprobs)
+
+    def get_action(self, state: DiscreteState) -> Action:
+        return self._actions_dict[self.get_action_value(state.state_value)]
 
 
 class ContinuousStochasticPolicy(StochasticPolicy):
     @abstractmethod
-    def get_probability(
-
+    def get_probability(
+        self,
+        state_value: Union[float, Annotated[NDArray[np.float64], "1D Array"]],
+        action_value: DiscreteActionValueType,
+        value: Union[float, Annotated[NDArray[np.float64], "1D Array"]]
+    ) -> float:
+        raise NotImplemented()
+
+
+DiscretePolicy = Union[DiscreteDeterminsticPolicy, DiscreteStochasticPolicy]
+ContinuousPolicy = Union[ContinuousStochasticPolicy]
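A minimal sketch of the new DiscreteStochasticPolicy (the two no-op actions and the state names are hypothetical): each rule maps a state value to a distribution over action values, with a uniform fallback when probs is omitted.

from pyrlutils.action import Action
from pyrlutils.policy import DiscreteStochasticPolicy

actions_dict = {'left': Action(lambda s: s), 'right': Action(lambda s: s)}
policy = DiscreteStochasticPolicy(actions_dict)
policy.add_stochastic_rule('s0', ['left', 'right'], probs=[0.25, 0.75])
policy.add_stochastic_rule('s1', ['left', 'right'])      # no probs -> uniform

print(policy.get_probability('s0', 'right'))   # 0.75
print(policy.get_probability('s2', 'left'))    # 0.0 for an unknown state
print(policy.get_action_value('s0'))           # 'left' or 'right', sampled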
pyrlutils/state.py
CHANGED
@@ -1,81 +1,84 @@
 
-
+import sys
+from abc import ABC
 from enum import Enum
-from
-from typing import Tuple, List, Optional, Union
+from typing import Optional, Union, Annotated, Literal
 
 import numpy as np
+from numpy.typing import NDArray
+if sys.version_info < (3, 11):
+    from typing_extensions import Self
+else:
+    from typing import Self
 
-
-class StateValue(ABC):
-    @property
-    @abstractmethod
-    def value(self):
-        pass
-
-
-@dataclass
-class DiscreteStateValue(StateValue):
-    enum: Enum
-
-    @property
-    def value(self):
-        return self.enum.value
-
-    def name(self):
-        return self.enum.name
-
-
-class ContinuousStateValue(StateValue):
-    _value: float
-
-    @property
-    def value(self) -> float:
-        return self._value
+from .helpers.exceptions import InvalidRangeError
 
 
 class State(ABC):
     @property
     def state_value(self):
-
-
-    @abstractmethod
-    def set_state_value(self, state_value):
-        pass
+        raise NotImplemented()
 
-    @abstractmethod
-    def get_state_value(self):
-        pass
-
-    @state_value.setter
-    def state_value(self, new_state_value):
-        self.set_state_value(new_state_value)
 
-
-DiscreteStateValueType = Union[float, str, Tuple[int], Enum]
+DiscreteStateValueType = Union[str, int, tuple[int], Enum]
 
 
 class DiscreteState(State):
-    def __init__(
+    def __init__(
+        self,
+        all_state_values: list[DiscreteStateValueType],
+        initial_value: Optional[DiscreteStateValueType] = None,
+        terminals: Optional[dict[DiscreteStateValueType, bool]]=None
+    ):
         super().__init__()
         self._all_state_values = all_state_values
-        self.
+        self._state_values_to_indices = {
+            state_value: idx
+            for idx, state_value in enumerate(self._all_state_values)
+        }
+        if initial_value is not None:
+            self._current_index = self._state_values_to_indices[initial_value]
+        else:
+            self._current_index = 0
+        if terminals is None:
+            self._terminal_dict = {
+                state_value: False
+                for state_value in self._all_state_values
+            }
+        else:
+            self._terminal_dict = terminals.copy()
+            for state_value in self._all_state_values:
+                if self._terminal_dict.get(state_value) is None:
+                    self._terminal_dict[state_value] = False
+
+    def _get_state_value_from_index(self, index: int) -> DiscreteStateValueType:
+        return self._all_state_values[index]
 
     def get_state_value(self) -> DiscreteStateValueType:
-        return self.
+        return self._get_state_value_from_index(self._current_index)
 
-    def set_state_value(self, state_value: DiscreteStateValueType):
+    def set_state_value(self, state_value: DiscreteStateValueType) -> None:
         if state_value in self._all_state_values:
-            self.
+            self._current_index = self._state_values_to_indices[state_value]
         else:
             raise ValueError('State value {} is invalid.'.format(state_value))
 
-    def get_all_possible_state_values(self) ->
+    def get_all_possible_state_values(self) -> list[DiscreteStateValueType]:
         return self._all_state_values
 
+    @property
+    def state_index(self) -> int:
+        return self._current_index
+
+    @state_index.setter
+    def state_index(self, new_index: int) -> None:
+        if new_index >= len(self._all_state_values):
+            raise ValueError(f"Invalid index {new_index}; it must be less than {len(self._all_state_values)}.")
+        self._current_index = new_index
+
     @property
     def state_value(self) -> DiscreteStateValueType:
-        return self.
+        return self._all_state_values[self._current_index]
 
     @state_value.setter
     def state_value(self, new_state_value: DiscreteStateValueType):
@@ -85,22 +88,53 @@ class DiscreteState(State):
     def state_space_size(self):
         return len(self._all_state_values)
 
+    @property
+    def nb_state_values(self) -> int:
+        return len(self._all_state_values)
+
+    @property
+    def is_terminal(self) -> bool:
+        return self._terminal_dict[self._all_state_values[self._current_index]]
 
-
-
-
-
+    def __hash__(self):
+        return self._current_index
+
+    def __eq__(self, other: Self) -> bool:
+        return self._current_index == other._current_index
 
 
 class ContinuousState(State):
-    def __init__(
+    def __init__(
+        self,
+        nbdims: int,
+        ranges: Union[Annotated[NDArray[np.float64], Literal["2"]], Annotated[NDArray[np.float64], Literal["*", "2"]]],
+        init_value: Optional[Union[float, Annotated[NDArray[np.float64], "1D Array"]]] = None
+    ):
+        super().__init__()
         self._nbdims = nbdims
 
+        try:
+            assert isinstance(ranges, np.ndarray)
+        except AssertionError:
+            raise TypeError('Range must be a numpy array.')
+
         try:
             assert (ranges.dtype == np.float64) or (ranges.dtype == np.float32) or (ranges.dtype == np.float16)
         except AssertionError:
             raise TypeError('It has to be floating type numpy.ndarray.')
 
+        try:
+            assert ranges.ndim == 1 or ranges.ndim == 2
+            match ranges.ndim:
+                case 1:
+                    assert ranges.shape[0] == 2
+                case 2:
+                    assert ranges.shape[1] == 2
+                case _:
+                    raise ValueError("Ranges must be of shape (2, ) or (*, 2).")
+        except AssertionError:
+            raise ValueError("Ranges must be of shape (2, ) or (*, 2).")
+
         try:
             assert self._nbdims > 0
         except AssertionError:
@@ -146,50 +180,53 @@ class ContinuousState(State):
                raise ValueError('Initialized value does not have the right dimension.')
            for i in range(self._nbdims):
                try:
-                    assert
+                    assert self._ranges[i, 0] <= init_value[i] <= self.ranges[i, 1]
                except AssertionError:
                    raise InvalidRangeError('Initialized value at dimension {} (value: {}) is not within the permitted range ({} -> {})!'.format(i, init_value[i], self._ranges[i, 0], self._ranges[i, 1]))
        else:
            try:
-                assert
+                assert self._ranges[0, 0] <= init_value <= self.ranges[0, 1]
            except AssertionError:
                raise InvalidRangeError('Initialized value is out of range.')
        self._state_value = init_value
 
-    def set_state_value(self, state_value: Union[float, np.
-        if self.
+    def set_state_value(self, state_value: Union[float, Annotated[NDArray[np.float64], "1D Array"]]):
+        if self._nbdims > 1:
            try:
                assert state_value.shape[0] == self._nbdims
            except AssertionError:
                raise ValueError('Given value does not have the right dimension.')
-            for i in range(self.
+            for i in range(self._nbdims):
                try:
-                    assert
+                    assert self.ranges[i, 0] <= state_value[i] <= self.ranges[i, 1]
                except AssertionError:
                    raise InvalidRangeError()
        else:
            try:
-                assert
+                assert self.ranges[0, 0] <= state_value <= self.ranges[0, 1]
            except AssertionError:
                raise InvalidRangeError()
 
        self._state_value = state_value
 
-    def get_state_value(self) -> np.
+    def get_state_value(self) -> Annotated[NDArray[np.float64], "1D Array"]:
        return self._state_value
 
-    def get_state_value_ranges(self) -> np.
+    def get_state_value_ranges(self) -> Union[Annotated[NDArray[np.float64], Literal["2"]], Annotated[NDArray[np.float64], Literal["*", "2"]]]:
        return self._ranges
 
-    def get_state_value_range_at_dimension(self, dimension: int) -> np.
-
+    def get_state_value_range_at_dimension(self, dimension: int) -> Annotated[NDArray[np.float64], Literal["2"]]:
+        if dimension < self._nbdims:
+            return self._ranges[dimension]
+        else:
+            raise ValueError(f"There are only {self._nbdims} dimensions!")
 
    @property
-    def ranges(self) -> np.
+    def ranges(self) -> Union[Annotated[NDArray[np.float64], Literal["2"]], Annotated[NDArray[np.float64], Literal["*", "2"]]]:
        return self.get_state_value_ranges()
 
    @property
-    def state_value(self) -> Union[float, np.
+    def state_value(self) -> Union[float, NDArray[np.float64]]:
        return self.get_state_value()
 
    @state_value.setter
@@ -200,9 +237,28 @@ class ContinuousState(State):
    def nbdims(self) -> int:
        return self._nbdims
 
+    def __hash__(self):
+        return hash(tuple(self._state_value))
+
+    def __eq__(self, other: Self):
+        if self.nbdims != other.nbdims:
+            raise ValueError(f"The two states have two different dimensions. ({self.nbdims} vs. {other.nbdims})")
+        for i in range(self.nbdims):
+            if self.state_value[i] != other.state_value[i]:
+                return False
+        return True
+
 
 class Discrete2DCartesianState(DiscreteState):
-    def __init__(
+    def __init__(
+        self,
+        x_lowlim: int,
+        x_hilim: int,
+        y_lowlim: int,
+        y_hilim: int,
+        initial_coordinate: list[int]=None,
+        terminals: Optional[dict[DiscreteStateValueType, bool]] = None
+    ):
        self._x_lowlim = x_lowlim
        self._x_hilim = x_hilim
        self._y_lowlim = y_lowlim
@@ -212,14 +268,50 @@ class Discrete2DCartesianState(DiscreteState):
        if initial_coordinate is None:
            initial_coordinate = [self._x_lowlim, self._y_lowlim]
        initial_value = (initial_coordinate[1] - self._y_lowlim) * self._countx + (initial_coordinate[0] - self._x_lowlim)
-        super().__init__(list(range(self._countx*self._county)),
+        super().__init__(list(range(self._countx*self._county)), initial_value=initial_value, terminals=terminals)
 
    def _encode_coordinates(self, x, y) -> int:
        return (y - self._y_lowlim) * self._countx + (x - self._x_lowlim)
 
-    def encode_coordinates(self, coordinates:
-
+    def encode_coordinates(self, coordinates: Union[list[int], Annotated[NDArray[np.int64], Literal["2"]]]) -> int:
+        if isinstance(coordinates, list):
+            assert len(coordinates) == 2
        return self._encode_coordinates(coordinates[0], coordinates[1])
 
-    def decode_coordinates(self, hashcode) ->
-        return [hashcode % self._countx, hashcode // self._countx]
+    def decode_coordinates(self, hashcode) -> list[int]:
+        return [hashcode % self._countx + self._x_lowlim, hashcode // self._countx + self._y_lowlim]
+
+    def get_whether_terminal_given_coordinates(
+        self,
+        coordinates: Union[list[int], Annotated[NDArray[np.int64], Literal["2"]]]
+    ) -> bool:
+        if isinstance(coordinates, list):
+            assert len(coordinates) == 2
+        hashcode = self._encode_coordinates(coordinates[0], coordinates[1])
+        return self._terminal_dict.get(hashcode, False)
+
+    def set_terminal_given_coordinates(
+        self,
+        coordinates: Union[list[int], Annotated[NDArray[np.int64], Literal["2"]]],
+        terminal_value: bool
+    ) -> None:
+        if isinstance(coordinates, list):
+            assert len(coordinates) == 2
+        hashcode = self._encode_coordinates(coordinates[0], coordinates[1])
+        self._terminal_dict[hashcode] = terminal_value
+
+    @property
+    def x_lowlim(self) -> int:
+        return self._x_lowlim
+
+    @property
+    def x_hilim(self) -> int:
+        return self._x_hilim
+
+    @property
+    def y_lowlim(self) -> int:
+        return self._y_lowlim
+
+    @property
+    def y_hilim(self) -> int:
+        return self._y_hilim
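A short sketch of the expanded grid-state API (assuming inclusive limits, i.e. a 3x3 grid with an invented terminal corner): coordinates are encoded to a flat index, decoding now restores the lower-limit offsets, and terminal flags can be queried and set per cell.

from pyrlutils.state import Discrete2DCartesianState

grid = Discrete2DCartesianState(0, 2, 0, 2)            # x in [0, 2], y in [0, 2]
grid.set_terminal_given_coordinates([2, 2], True)      # new in 0.1.0

code = grid.encode_coordinates([2, 2])                 # flat index of the corner cell
print(grid.decode_coordinates(code))                   # [2, 2]
print(grid.get_whether_terminal_given_coordinates([2, 2]))   # True
print(grid.state_index, grid.is_terminal)              # 0 False at the start cell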
pyrlutils/td/__init__.py
ADDED
File without changes
pyrlutils/td/td.py
ADDED
@@ -0,0 +1,101 @@
+
+from typing import Annotated
+
+import numpy as np
+from numpy.typing import NDArray
+
+from .utils import decay_schedule, AbstractTemporalDifferenceLearner, TimeDifferencePathElements
+
+
+class SingleStepTemporalDifferenceLearner(AbstractTemporalDifferenceLearner):
+    def learn(
+        self,
+        episodes: int
+    ) -> tuple[Annotated[NDArray[np.float64], "1D Array"], Annotated[NDArray[np.float64], "2D Array"]]:
+        V = np.zeros(self.nb_states)
+        V_track = np.zeros((episodes, self.nb_states))
+        alphas = decay_schedule(
+            self.init_alpha, self.min_alpha, self.alpha_decay_ratio, episodes
+        )
+
+        for i in range(episodes):
+            self._state.set_state_value(self.initial_state_index)
+            done = False
+            while not done:
+                old_state_index = self._state.state_index
+                old_state_value = self._state.state_value
+                action_value = self._policy.get_action_value(self._state.state_value)
+                action_func = self._actions_dict[action_value]
+                self._state = action_func(self._state)
+                new_state_index = self._state.state_index
+                new_state_value = self._state.state_value
+                reward = self._indrewardfcn(old_state_value, action_value, new_state_value)
+                done = self._state.is_terminal
+
+                td_target = reward + self.gamma * V[new_state_index] * (not done)
+                td_error = td_target - V[old_state_index]
+                V[old_state_index] = V[old_state_index] + alphas[i] * td_error
+
+            V_track[i, :] = V
+
+        return V, V_track
+
+
+class MultipleStepTemporalDifferenceLearner(AbstractTemporalDifferenceLearner):
+    def learn(
+        self,
+        episodes: int,
+        n_steps: int=3
+    ) -> tuple[Annotated[NDArray[np.float64], "1D Array"], Annotated[NDArray[np.float64], "2D Array"]]:
+        V = np.zeros(self.nb_states)
+        V_track = np.zeros((episodes, self.nb_states))
+        alphas = decay_schedule(
+            self.init_alpha, self.min_alpha, self.alpha_decay_ratio, episodes
+        )
+        discounts = np.logspace(0, n_steps-1, num=n_steps+1, base=self.gamma, endpoint=False)
+
+        for i in range(episodes):
+            self._state.set_state_value(self.initial_state_index)
+            done = False
+            path = []
+
+            while not done or path is not None:
+                path = path[1:]  # worth revisiting this line
+
+                next_state_index = -1
+                while not done and len(path) < n_steps:
+                    old_state_index = self._state.state_index
+                    old_state_value = self._state.state_value
+                    action_value = self._policy.get_action_value(self._state.state_value)
+                    action_func = self._actions_dict[action_value]
+                    self._state = action_func(self._state)
+                    new_state_index = self._state.state_index
+                    new_state_value = self._state.state_value
+                    reward = self._indrewardfcn(old_state_value, action_value, new_state_value)
+                    done = self._state.is_terminal
+
+                    path.append(
+                        TimeDifferencePathElements(
+                            this_state_index=old_state_index,
+                            reward=reward,
+                            next_state_index=new_state_index,
+                            done=done
+                        )
+                    )
+                    if done:
+                        break
+
+                n = len(path)
+                estimated_state_index = path[0].this_state_index
+                rewards = np.array([this_moment.reward for this_moment in path])
+                partial_return = discounts[n:] * rewards
+                bs_val = discounts[-1] * V[next_state_index] * (not done)
+                ntd_target = np.sum(np.append(partial_return, bs_val))
+                ntd_error = ntd_target - V[estimated_state_index]
+                V[estimated_state_index] = V[estimated_state_index] + alphas[i] * ntd_error
+                if len(path) == 1 and path[0].done:
+                    path = None
+
+            V_track[i, :] = V
+
+        return V, V_track
pyrlutils/td/utils.py
ADDED
@@ -0,0 +1,119 @@
+
+from abc import ABC, abstractmethod
+from typing import Optional, Annotated
+from dataclasses import dataclass
+
+import numpy as np
+from numpy.typing import NDArray
+
+from ..policy import DiscretePolicy
+from ..transition import TransitionProbabilityFactory
+
+
+def decay_schedule(
+    init_value: float,
+    min_value: float,
+    decay_ratio: float,
+    max_steps: int,
+    log_start: int=-2,
+    log_base: int=10
+) -> Annotated[NDArray[np.float64], "1D Array"]:
+    decay_steps = int(max_steps*decay_ratio)
+    rem_steps = max_steps - decay_steps
+
+    values = np.logspace(log_start, 0, decay_steps, base=log_base, endpoint=True)[::-1]
+    values = (values - values.min()) / (values.max() - values.min())
+    values = (init_value - min_value) * values + min_value
+    values = np.pad(values, (0, rem_steps), 'edge')
+    return values
+
+
+class AbstractTemporalDifferenceLearner(ABC):
+    def __init__(
+        self,
+        transprobfac: TransitionProbabilityFactory,
+        gamma: float=1.0,
+        init_alpha: float=0.5,
+        min_alpha: float=0.01,
+        alpha_decay_ratio: float=0.3,
+        policy: Optional[DiscretePolicy]=None,
+        initial_state_index: int=0
+    ):
+        self._gamma = gamma
+        self._init_alpha = init_alpha
+        self._min_alpha = min_alpha
+        try:
+            assert 0.0 <= alpha_decay_ratio <= 1.0
+        except AssertionError:
+            raise ValueError("alpha_decay_ratio must be between 0 and 1!")
+        self._alpha_decay_ratio = alpha_decay_ratio
+        self._transprobfac = transprobfac
+        self._state, self._actions_dict, self._indrewardfcn = self._transprobfac.generate_mdp_objects()
+        self._action_names = list(self._actions_dict.keys())
+        self._actions_to_indices = {action_value: idx for idx, action_value in enumerate(self._action_names)}
+        self._policy = policy
+        try:
+            assert 0 <= initial_state_index < self._state.nb_state_values
+        except AssertionError:
+            raise ValueError("Initial state index must be between 0 and {}".format(len(self._state_names)))
+        self._init_state_index = initial_state_index
+
+    @abstractmethod
+    def learn(self, *args, **kwargs) -> tuple[Annotated[NDArray[np.float64], "1D Array"], Annotated[NDArray[np.float64], "2D Array"]]:
+        raise NotImplementedError()
+
+    @property
+    def nb_states(self) -> int:
+        return self._state.nb_state_values
+
+    @property
+    def policy(self) -> DiscretePolicy:
+        return self._policy
+
+    @policy.setter
+    def policy(self, val: DiscretePolicy):
+        self._policy = val
+
+    @property
+    def gamma(self) -> float:
+        return self._gamma
+
+    @gamma.setter
+    def gamma(self, val: float):
+        self._gamma = val
+
+    @property
+    def init_alpha(self) -> float:
+        return self._init_alpha
+
+    @init_alpha.setter
+    def init_alpha(self, val: float):
+        self._init_alpha = val
+
+    @property
+    def min_alpha(self) -> float:
+        return self._min_alpha
+
+    @min_alpha.setter
+    def min_alpha(self, val: float):
+        self._min_alpha = val
+
+    @property
+    def alpha_decay_ratio(self) -> float:
+        return self._alpha_decay_ratio
+
+    @property
+    def initial_state_index(self) -> int:
+        return self._init_state_index
+
+    @initial_state_index.setter
+    def initial_state_index(self, val: int):
+        self._init_state_index = val
+
+
+@dataclass
+class TimeDifferencePathElements:
+    this_state_index: int
+    reward: float
+    next_state_index: int
+    done: bool
pyrlutils/transition.py
CHANGED
@@ -1,6 +1,7 @@
 
-from types import LambdaType
-from typing import
+from types import LambdaType, FunctionType
+from typing import Union
+from dataclasses import dataclass
 
 import numpy as np
 
@@ -9,28 +10,12 @@ from .reward import IndividualRewardFunction
 from .action import Action, DiscreteActionValueType
 
 
+@dataclass
 class NextStateTuple:
-
-
-
-
-        self._terminal = terminal
-
-    @property
-    def next_state_value(self) -> DiscreteStateValueType:
-        return self._next_state_value
-
-    @property
-    def probability(self) -> float:
-        return self._probability
-
-    @property
-    def reward(self) -> float:
-        return self._reward
-
-    @property
-    def terminal(self) -> bool:
-        return self._terminal
+    next_state_value: DiscreteStateValueType
+    probability: float
+    reward: float
+    terminal: bool
 
 
 class TransitionProbabilityFactory:
@@ -40,7 +25,11 @@ class TransitionProbabilityFactory:
         self._all_action_values = []
         self._objects_generated = False
 
-    def add_state_transitions(
+    def add_state_transitions(
+        self,
+        state_value: DiscreteStateValueType,
+        action_values_to_next_state: dict[DiscreteActionValueType, Union[list[NextStateTuple], dict]]
+    ):
         if state_value not in self._all_state_values:
             self._all_state_values.append(state_value)
 
@@ -69,7 +58,10 @@ class TransitionProbabilityFactory:
 
         self._transprobs[state_value] = this_state_transition_dict
 
-    def _get_probs_for_eachstate(
+    def _get_probs_for_eachstate(
+        self,
+        action_value: DiscreteActionValueType
+    ) -> dict[DiscreteStateValueType, list[NextStateTuple]]:
         state_nexttuples = {}
         for state_value, action_nexttuples_pair in self._transprobs.items():
             for this_action_value, nexttuples in action_nexttuples_pair.items():
@@ -77,7 +69,10 @@ class TransitionProbabilityFactory:
                 state_nexttuples[state_value] = nexttuples
         return state_nexttuples
 
-    def _generate_action_function(
+    def _generate_action_function(
+        self,
+        state_nexttuples: dict[DiscreteStateValueType, list[NextStateTuple]]
+    ) -> Union[FunctionType, LambdaType]:
 
         def _action_function(state: DiscreteState) -> DiscreteState:
             nexttuples = state_nexttuples[state.state_value]
@@ -91,7 +86,11 @@ class TransitionProbabilityFactory:
 
     def _generate_individual_reward_function(self) -> IndividualRewardFunction:
 
-        def _individual_reward_function(
+        def _individual_reward_function(
+            state_value: DiscreteStateValueType,
+            action_value: DiscreteActionValueType,
+            next_state_value: DiscreteStateValueType
+        ) -> float:
             if state_value not in self._transprobs.keys():
                 return 0.
 
@@ -105,15 +104,22 @@ class TransitionProbabilityFactory:
             return reward
 
         class ThisIndividualRewardFunction(IndividualRewardFunction):
-            def
-
-
+            def reward(
+                self,
+                state_value: DiscreteStateValueType,
+                action_value: DiscreteActionValueType,
+                next_state_value: DiscreteStateValueType
+            ) -> float:
                 return _individual_reward_function(state_value, action_value, next_state_value)
 
         return ThisIndividualRewardFunction()
 
-    def get_probability(
+    def get_probability(
+        self,
+        state_value: DiscreteStateValueType,
+        action_value: DiscreteActionValueType,
+        new_state_value: DiscreteStateValueType
+    ) -> float:
         if state_value not in self._transprobs.keys():
             return 0.
 
@@ -127,18 +133,21 @@ class TransitionProbabilityFactory:
         return probs
 
     @property
-    def transition_probabilities(self) -> dict:
+    def transition_probabilities(self) -> dict[DiscreteStateValueType, dict[DiscreteActionValueType, list[NextStateTuple]]]:
         return self._transprobs
 
-    def generate_mdp_objects(self) ->
+    def generate_mdp_objects(self) -> tuple[DiscreteState, dict[DiscreteActionValueType, Action], IndividualRewardFunction]:
         state = DiscreteState(self._all_state_values)
         actions_dict = {}
         for action_value in self._all_action_values:
             state_nexttuple = self._get_probs_for_eachstate(action_value)
             actions_dict[action_value] = Action(self._generate_action_function(state_nexttuple))
+            for next_tuples in state_nexttuple.values():
+                for next_tuple in next_tuples:
+                    state._terminal_dict[next_tuple.next_state_value] = next_tuple.terminal
 
         individual_reward_fcn = self._generate_individual_reward_function()
-
+        self._objects_generated = True
         return state, actions_dict, individual_reward_fcn
 
     @property
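A small sketch of the slimmed-down NextStateTuple (the tiny walk-to-goal MDP is invented): as a dataclass it is constructed positionally or by field name instead of through the removed hand-written properties, and generate_mdp_objects now propagates the terminal flags into the state it returns.

from pyrlutils.transition import TransitionProbabilityFactory, NextStateTuple

step = NextStateTuple(next_state_value=1, probability=1.0, reward=1.0, terminal=True)
print(step.reward, step.terminal)          # 1.0 True, plain dataclass fields

factory = TransitionProbabilityFactory()
factory.add_state_transitions(0, {'go': [step]})
factory.add_state_transitions(1, {'go': [NextStateTuple(1, 1.0, 0.0, True)]})

state, actions_dict, reward_fcn = factory.generate_mdp_objects()
print(reward_fcn.reward(0, 'go', 1))       # 1.0
state.set_state_value(1)
print(state.is_terminal)                   # True, taken from NextStateTuple.terminal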
{pyrlutils-0.0.4.dist-info → pyrlutils-0.1.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: pyrlutils
-Version: 0.0
+Version: 0.1.0
 Summary: Utility and Helpers for Reinformcement Learning
 Author-email: Kwan Yuet Stephen Ho <stephenhky@yahoo.com.hk>
 License: MIT
@@ -11,22 +11,22 @@ Classifier: Topic :: Scientific/Engineering :: Mathematics
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Topic :: Software Development :: Version Control :: Git
-Classifier: Programming Language :: Python :: 3.7
-Classifier: Programming Language :: Python :: 3.8
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Classifier: Intended Audience :: Science/Research
 Classifier: Intended Audience :: Developers
-Requires-Python: >=3.
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: numpy
+Requires-Dist: typing-extensions
 Provides-Extra: openaigym
 Requires-Dist: gymnasium; extra == "openaigym"
 Provides-Extra: test
 Requires-Dist: unittest; extra == "test"
+Dynamic: license-file
 
 # PyRLUtils
 
pyrlutils-0.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,23 @@
+pyrlutils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pyrlutils/action.py,sha256=QoBdtcGtK_EkYAjb50bruhoB_XIz0agLpQjdGFnGbRQ,732
+pyrlutils/policy.py,sha256=A9bj2eVd6XjNNkClSYVJDoxoGuGkyoYVr1DpVdI0wzs,5120
+pyrlutils/reward.py,sha256=are0swsobMqI1IbrBVBaPMYXWpJnp6lZwAyfgBEm2zg,1211
+pyrlutils/state.py,sha256=A3XJSjNJrsInXUWsUvb1GE7Oq-CY6DNEB-ulrVa1rR4,11774
+pyrlutils/transition.py,sha256=_32jxeYbsiKyaHR9Y2XceUQYbb1jslLCQO2AWL61_EU,6260
+pyrlutils/bandit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pyrlutils/bandit/algo.py,sha256=X2Pn4DOi-RXWz5CNg1h0RJCoV3VlAwEGHRMjkfbckfw,3969
+pyrlutils/bandit/reward.py,sha256=l2H_gZk2qqDxZioHe1M28pD8N47fgSR-K0Q6muchVd0,282
+pyrlutils/dp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pyrlutils/dp/valuefcns.py,sha256=0T7vzdKRIKhLMsaq7JgPqONMmq4lWRGPj7xPtuxVtbE,6546
+pyrlutils/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pyrlutils/helpers/exceptions.py,sha256=4fPGW839BChfap-Gd7b-75Dz-Ed3foqbJQ1lg15TZ-4,192
+pyrlutils/openai/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pyrlutils/openai/utils.py,sha256=PJc9WHZM8aM4Z9MlACUxUC8TO7VARp8taatba_ikhew,1056
+pyrlutils/td/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pyrlutils/td/td.py,sha256=EnecL84yyUm7rO2idaHgVfvtWW5LYPxEkefHhI1SVPQ,4269
+pyrlutils/td/utils.py,sha256=PALXGaDLd3PjFh8qDV9DY_MkaBuj3_GpfVWJOb424vE,3571
+pyrlutils-0.1.0.dist-info/licenses/LICENSE,sha256=bnQPjIcaeBdr2ZofX-_j-nELs8pAx5fQ4Cdfgeaspew,1063
+pyrlutils-0.1.0.dist-info/METADATA,sha256=qKVydib9iWVw-NXgMnB3y0JtDibVQcvclyc7zP2PYH0,2185
+pyrlutils-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+pyrlutils-0.1.0.dist-info/top_level.txt,sha256=gOBuxugE2MA4WDXlLhzkQh_rUonZU6nvJnMuomeHMCU,10
+pyrlutils-0.1.0.dist-info/RECORD,,
pyrlutils-0.0.4.dist-info/RECORD
DELETED
@@ -1,17 +0,0 @@
-pyrlutils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pyrlutils/action.py,sha256=2kJqNZxsLOV8yOTl-RcpM8b0zu-WXNREJCrl49uZi2c,437
-pyrlutils/policy.py,sha256=Cx4vsIXzFZi_KEgI06S378Y5E6g-AfK90skDYoGsfOI,2794
-pyrlutils/reward.py,sha256=are0swsobMqI1IbrBVBaPMYXWpJnp6lZwAyfgBEm2zg,1211
-pyrlutils/state.py,sha256=w0YJ50FUyNboPoYduLMX1xaBJJHAOaSlsr3Og1dd0dY,7840
-pyrlutils/transition.py,sha256=lgh4YfOi-YjSIyymWfrXe-ugDWpZYK3MvjdeehgcQhk,5816
-pyrlutils/valuefcns.py,sha256=CJxu0EIFgrdbP0n0x6nzs3X08accFsuJW71tv1rMTkQ,6342
-pyrlutils/bandit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pyrlutils/bandit/algo.py,sha256=X2Pn4DOi-RXWz5CNg1h0RJCoV3VlAwEGHRMjkfbckfw,3969
-pyrlutils/bandit/reward.py,sha256=S_uECjMOg3cmK24J-5uPcckLvtxmU4yllR7JEvMwAQE,249
-pyrlutils/openai/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pyrlutils/openai/utils.py,sha256=ehj1cGlDYjQLno3pKMCS3CzZwbZGSTmjxDlU07aSBFo,1033
-pyrlutils-0.0.4.dist-info/LICENSE,sha256=bnQPjIcaeBdr2ZofX-_j-nELs8pAx5fQ4Cdfgeaspew,1063
-pyrlutils-0.0.4.dist-info/METADATA,sha256=7ncLjVrpqIZpdMFMrRjqRNgfZl9LUKc_SZFkw_CoTFc,2228
-pyrlutils-0.0.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-pyrlutils-0.0.4.dist-info/top_level.txt,sha256=gOBuxugE2MA4WDXlLhzkQh_rUonZU6nvJnMuomeHMCU,10
-pyrlutils-0.0.4.dist-info/RECORD,,
{pyrlutils-0.0.4.dist-info → pyrlutils-0.1.0.dist-info/licenses}/LICENSE
File without changes
{pyrlutils-0.0.4.dist-info → pyrlutils-0.1.0.dist-info}/top_level.txt
File without changes