pyrlutils-0.0.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pyrlutils might be problematic.

pyrlutils/__init__.py ADDED
File without changes
pyrlutils/action.py ADDED
@@ -0,0 +1,19 @@
+
+from types import LambdaType
+from typing import Union
+
+from .state import State
+
+
+DiscreteActionValueType = Union[float, str]
+
+class Action:
+    def __init__(self, actionfunc: LambdaType):
+        self._actionfunc = actionfunc
+
+    def act(self, state: State, *args, **kwargs) -> State:
+        self._actionfunc(state, *args, **kwargs)
+        return state
+
+    def __call__(self, state: State) -> State:
+        return self.act(state)
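
A minimal usage sketch of the Action wrapper above; the state values 'off' and 'on' are illustrative assumptions, not part of the package:

    from pyrlutils.state import DiscreteState
    from pyrlutils.action import Action

    # an Action wraps any callable that mutates a State in place
    switch_on = Action(lambda state: state.set_state_value('on'))

    light = DiscreteState(['off', 'on'])   # defaults to the first value, 'off'
    light = switch_on(light)               # __call__ delegates to act()
    print(light.state_value)               # -> 'on'
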
pyrlutils/bandit/__init__.py ADDED
File without changes
pyrlutils/bandit/algo.py ADDED
@@ -0,0 +1,128 @@
+
+from abc import ABC, abstractmethod
+
+import numpy as np
+
+from .reward import IndividualBanditRewardFunction
+
+
+class BanditAlgorithm(ABC):
+    def __init__(self, action_values: list, reward_function: IndividualBanditRewardFunction):
+        self._action_values = action_values
+        self._reward_function = reward_function
+
+    @abstractmethod
+    def _go_one_loop(self):
+        pass
+
+    def loop(self, nbiterations: int):
+        for _ in range(nbiterations):
+            self._go_one_loop()
+
+    def reward(self, action_value) -> float:
+        return self._reward_function(action_value)
+
+    @abstractmethod
+    def get_action(self):
+        pass
+
+    @property
+    def action_values(self):
+        return self._action_values
+
+    @property
+    def reward_function(self) -> IndividualBanditRewardFunction:
+        return self._reward_function
+
+
+class SimpleBandit(BanditAlgorithm):
+    def __init__(
+            self,
+            action_values: list,
+            reward_function: IndividualBanditRewardFunction,
+            epsilon: float=0.05
+    ):
+        super().__init__(action_values, reward_function)
+        self._epsilon = epsilon
+        self._initialize()
+
+    def _initialize(self):
+        self._Q = np.zeros(len(self._action_values))
+        self._N = np.zeros(len(self._action_values), dtype=np.int32)
+
+    def _go_one_loop(self):
+        r = np.random.uniform()
+        if r < self.epsilon:
+            selected_action_idx = np.argmax(self._Q)
+        else:
+            selected_action_idx = np.random.choice(range(len(self._action_values)))
+        reward = self._reward_function(self._action_values[selected_action_idx])
+        self._N[selected_action_idx] += 1
+        self._Q[selected_action_idx] += (reward - self._Q[selected_action_idx]) / self._N[selected_action_idx]
+
+    def get_action(self):
+        selected_action_idx = np.argmax(self._Q)
+        return self._action_values[selected_action_idx]
+
+    @property
+    def epsilon(self) -> float:
+        return self._epsilon
+
+    @epsilon.setter
+    def epsilon(self, val: float):
+        self._epsilon = val
+
+
+class GradientBandit(BanditAlgorithm):
+    def __init__(self, action_values: list, reward_function: IndividualBanditRewardFunction, temperature: float=1.0, alpha: float=0.1):
+        super().__init__(action_values, reward_function)
+        self._T = temperature
+        self._alpha = alpha
+        self._initialize()
+
+    def _initialize(self):
+        self._preferences = np.zeros(len(self._action_values))
+        self._rewards_over_time = []
+
+    def _get_probs(self) -> np.ndarray:
+        # getting probabilities using softmax
+        exp_preferences = np.exp(self._preferences / self.T)
+        sum_exp_preferences = np.sum(exp_preferences)
+        return exp_preferences / sum_exp_preferences
+
+    def get_action(self):
+        selected_action_idx = np.argmax(self._preferences)
+        return self._action_values[selected_action_idx]
+
+    def _go_one_loop(self):
+        probs = self._get_probs()
+        selected_action_idx = np.random.choice(range(self._preferences.shape[0]), p=probs)
+        reward = self._reward_function(self._action_values[selected_action_idx])
+        self._rewards_over_time.append(reward)
+        average_reward = np.mean(self._rewards_over_time) if len(self._rewards_over_time) > 0 else 0.
+
+        for i in range(len(self._action_values)):
+            if i == selected_action_idx:
+                self._preferences[i] += self.alpha * (reward - average_reward) * (1 - probs[i])
+            else:
+                self._preferences[i] -= self.alpha * (reward - average_reward) * probs[i]
+
+    @property
+    def alpha(self) -> float:
+        return self._alpha
+
+    @alpha.setter
+    def alpha(self, val: float):
+        self._alpha = val
+
+    @property
+    def T(self) -> float:
+        return self._T
+
+    @T.setter
+    def T(self, val: float):
+        self._T = val
+
+    @property
+    def temperature(self) -> float:
+        return self._T
pyrlutils/bandit/reward.py ADDED
@@ -0,0 +1,11 @@
+
+from abc import ABC, abstractmethod
+
+
+class IndividualBanditRewardFunction(ABC):
+    @abstractmethod
+    def reward(self, action_value) -> float:
+        pass
+
+    def __call__(self, action_value) -> float:
+        return self.reward(action_value)
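
A hedged usage sketch combining bandit/algo.py and bandit/reward.py above; the two-armed Bernoulli payouts are made-up assumptions. Note that, as written, SimpleBandit._go_one_loop takes the greedy (argmax) branch when the uniform draw falls below epsilon, so with epsilon=0.1 most iterations pick an arm at random, which is still enough here for the value estimates to separate the two arms:

    import numpy as np
    from pyrlutils.bandit.reward import IndividualBanditRewardFunction
    from pyrlutils.bandit.algo import SimpleBandit

    class BernoulliBanditReward(IndividualBanditRewardFunction):
        # hypothetical reward: each arm pays 1 with its own probability, otherwise 0
        def __init__(self, payout_probs):
            self._payout_probs = payout_probs

        def reward(self, action_value) -> float:
            return float(np.random.uniform() < self._payout_probs[action_value])

    bandit = SimpleBandit(
        action_values=['left', 'right'],
        reward_function=BernoulliBanditReward({'left': 0.3, 'right': 0.7}),
        epsilon=0.1
    )
    bandit.loop(2000)
    print(bandit.get_action())   # 'right' with high probability
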
pyrlutils/openai/__init__.py ADDED
File without changes
pyrlutils/openai/utils.py ADDED
@@ -0,0 +1,31 @@
+
+import gymnasium as gym
+
+from ..transition import TransitionProbabilityFactory, NextStateTuple
+
+
+class OpenAIGymDiscreteEnvironmentTransitionProbabilityFactory(TransitionProbabilityFactory):
+    def __init__(self, envname):
+        super().__init__()
+        self._envname = envname
+        self._gymenv = gym.make(envname)
+        self._convert_openai_gymenv_to_transprob()
+
+    def _convert_openai_gymenv_to_transprob(self):
+        P = self._gymenv.env.env.env.P
+        for state_value, trans_dict in P.items():
+            new_trans_dict = {}
+            for action_value, next_state_list in trans_dict.items():
+                new_trans_dict[action_value] = [
+                    NextStateTuple(next_state[1], next_state[0], next_state[2], next_state[3])
+                    for next_state in next_state_list
+                ]
+            self.add_state_transitions(state_value, new_trans_dict)
+
+    @property
+    def envname(self):
+        return self._envname
+
+    @property
+    def gymenv(self):
+        return self._gymenv
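
A brief sketch of how this factory might be used, assuming gymnasium is installed and the standard 'FrozenLake-v1' environment (a discrete environment exposing a transition table P) is available:

    from pyrlutils.openai.utils import OpenAIGymDiscreteEnvironmentTransitionProbabilityFactory

    factory = OpenAIGymDiscreteEnvironmentTransitionProbabilityFactory('FrozenLake-v1')
    # the gymnasium transition table has been converted into NextStateTuple records
    state, actions_dict, reward_fcn = factory.generate_mdp_objects()
    print(state.state_space_size)       # 16 cells on the default 4x4 map
    print(sorted(actions_dict.keys()))  # the four discrete actions 0..3
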
pyrlutils/policy.py ADDED
@@ -0,0 +1,84 @@
+
+from abc import ABC, abstractmethod
+from typing import Union, Dict
+from warnings import warn
+
+import numpy as np
+
+from .state import State, DiscreteState, DiscreteStateValueType
+from .action import Action, DiscreteActionValueType
+
+
+class Policy(ABC):
+    @abstractmethod
+    def get_action(self, state: State) -> Action:
+        pass
+
+    def __call__(self, state: State) -> Action:
+        return self.get_action(state)
+
+    @property
+    def is_stochastic(self) -> bool:
+        raise NotImplemented()
+
+
+class DeterministicPolicy(Policy):
+    @abstractmethod
+    def add_deterministic_rule(self, *args, **kwargs):
+        pass
+
+    @property
+    def is_stochastic(self) -> bool:
+        return False
+
+
+class DiscreteDeterminsticPolicy(DeterministicPolicy):
+    def __init__(self, actions_dict: Dict[DiscreteActionValueType, Action]):
+        self._state_to_action = {}
+        self._actions_dict = actions_dict
+
+    def add_deterministic_rule(self, state_value: DiscreteStateValueType, action_value: DiscreteActionValueType):
+        if state_value in self._state_to_action:
+            warn('State value {} exists in rule; it will be replaced.'.format(state_value))
+        self._state_to_action[state_value] = action_value
+
+    def get_action_value(self, state_value: DiscreteStateValueType) -> DiscreteActionValueType:
+        return self._state_to_action.get(state_value)
+
+    def get_action(self, state: DiscreteState) -> Action:
+        return self._actions_dict[self.get_action_value(state.state_value)]
+
+    def __eq__(self, other) -> bool:
+        if len(self._state_to_action) != len(set(self._state_to_action.keys()).union(other._state_to_action.keys())):
+            return False
+        if len(self._actions_dict) != len(set(self._actions_dict.keys()).union(other._actions_dict.keys())):
+            return False
+        for action in self._actions_dict.keys():
+            if self._actions_dict[action] != other._actions_dict[action]:
+                return False
+        for state in self._state_to_action.keys():
+            if self._state_to_action[state] != other._state_to_action[state]:
+                return False
+        return True
+
+
+class StochasticPolicy(Policy):
+    @abstractmethod
+    def get_probability(self, *args, **kwargs) -> float:
+        pass
+
+    @property
+    def is_stochastic(self) -> bool:
+        return True
+
+
+class DiscreteStochasticPolicy(StochasticPolicy):
+    @abstractmethod
+    def get_probability(self, state_value: DiscreteStateValueType, action_value: DiscreteActionValueType) -> float:
+        pass
+
+
+class ContinuousStochasticPolicy(StochasticPolicy):
+    @abstractmethod
+    def get_probability(self, state_value: Union[float, np.ndarray], action_value: DiscreteActionValueType, value: Union[float, np.ndarray]) -> float:
+        pass
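
A short illustrative sketch of a deterministic tabular policy; the state and action names are assumptions for the example only:

    from pyrlutils.state import DiscreteState
    from pyrlutils.action import Action
    from pyrlutils.policy import DiscreteDeterminsticPolicy

    walk = Action(lambda s: s.set_state_value('outside'))
    stay = Action(lambda s: s.set_state_value('home'))

    policy = DiscreteDeterminsticPolicy({'walk': walk, 'stay': stay})
    policy.add_deterministic_rule('home', 'walk')
    policy.add_deterministic_rule('outside', 'stay')

    state = DiscreteState(['home', 'outside'])
    action = policy(state)      # looks up the rule for the current value 'home'
    state = action(state)
    print(state.state_value)    # -> 'outside'
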
pyrlutils/reward.py ADDED
@@ -0,0 +1,37 @@
+
+from abc import ABC, abstractmethod
+
+
+class IndividualRewardFunction(ABC):
+    @abstractmethod
+    def reward(self, state_value, action_value, next_state_value) -> float:
+        pass
+
+    def __call__(self, state_value, action_value, next_state_value) -> float:
+        return self.reward(state_value, action_value, next_state_value)
+
+
+class RewardFunction(ABC):
+    def __init__(self, discount_factor: float, individual_reward_function: IndividualRewardFunction):
+        self._discount_factor = discount_factor
+        self._individual_reward_function = individual_reward_function
+
+    @property
+    def discount_factor(self) -> float:
+        return self._discount_factor
+
+    @discount_factor.setter
+    def discount_factor(self, discount_factor: float):
+        self._discount_factor = discount_factor
+
+    def individual_reward(self, state_value, action_value, next_state_value) -> float:
+        return self._individual_reward_function(state_value, action_value, next_state_value)
+
+    @abstractmethod
+    def total_reward(self, state_value, action_value) -> float:
+        pass
+
+    def __call__(self, state_value, action_value) -> float:
+        return self.total_reward(state_value, action_value)
+
+
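
A minimal sketch of subclassing the two abstract classes above; the goal-based reward and the one-step "total" are illustrative assumptions, not the package's prescription:

    from pyrlutils.reward import IndividualRewardFunction, RewardFunction

    class GoalReward(IndividualRewardFunction):
        # pays 1 whenever the transition lands on the (hypothetical) state 'goal'
        def reward(self, state_value, action_value, next_state_value) -> float:
            return 1.0 if next_state_value == 'goal' else 0.0

    class OneStepRewardFunction(RewardFunction):
        # a trivial total: the immediate reward of jumping straight to 'goal', discounted once
        def total_reward(self, state_value, action_value) -> float:
            return self.discount_factor * self.individual_reward(state_value, action_value, 'goal')

    total = OneStepRewardFunction(0.9, GoalReward())
    print(total('start', 'move'))   # -> 0.9
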
pyrlutils/state.py ADDED
@@ -0,0 +1,225 @@
+
+from abc import ABC, abstractmethod
+from enum import Enum
+from dataclasses import dataclass
+from typing import Tuple, List, Optional, Union
+
+import numpy as np
+
+
+class StateValue(ABC):
+    @property
+    @abstractmethod
+    def value(self):
+        pass
+
+
+@dataclass
+class DiscreteStateValue(StateValue):
+    enum: Enum
+
+    @property
+    def value(self):
+        return self.enum.value
+
+    def name(self):
+        return self.enum.name
+
+
+class ContinuousStateValue(StateValue):
+    _value: float
+
+    @property
+    def value(self) -> float:
+        return self._value
+
+
+class State(ABC):
+    @property
+    def state_value(self):
+        return self.get_state_value()
+
+    @abstractmethod
+    def set_state_value(self, state_value):
+        pass
+
+    @abstractmethod
+    def get_state_value(self):
+        pass
+
+    @state_value.setter
+    def state_value(self, new_state_value):
+        self.set_state_value(new_state_value)
+
+
+DiscreteStateValueType = Union[float, str, Tuple[int], Enum]
+
+
+class DiscreteState(State):
+    def __init__(self, all_state_values: List[DiscreteStateValueType], initial_values: Optional[List[DiscreteStateValueType]] = None):
+        super().__init__()
+        self._all_state_values = all_state_values
+        self._state_value = initial_values if initial_values is not None and initial_values in self._all_state_values else self._all_state_values[0]
+
+    def get_state_value(self) -> DiscreteStateValueType:
+        return self._state_value
+
+    def set_state_value(self, state_value: DiscreteStateValueType):
+        if state_value in self._all_state_values:
+            self._state_value = state_value
+        else:
+            raise ValueError('State value {} is invalid.'.format(state_value))
+
+    def get_all_possible_state_values(self) -> List[DiscreteStateValueType]:
+        return self._all_state_values
+
+    @property
+    def state_value(self) -> DiscreteStateValueType:
+        return self._state_value
+
+    @state_value.setter
+    def state_value(self, new_state_value: DiscreteStateValueType):
+        self.set_state_value(new_state_value)
+
+    @property
+    def state_space_size(self):
+        return len(self._all_state_values)
+
+
+class InvalidRangeError(Exception):
+    def __init__(self, message=None):
+        self.message = "Invalid range error!" if message is None else message
+        super().__init__(self.message)
+
+
+class ContinuousState(State):
+    def __init__(self, nbdims: int, ranges: np.array, init_value: Optional[Union[float, np.ndarray]] = None):
+        self._nbdims = nbdims
+
+        try:
+            assert (ranges.dtype == np.float64) or (ranges.dtype == np.float32) or (ranges.dtype == np.float16)
+        except AssertionError:
+            raise TypeError('It has to be floating type numpy.ndarray.')
+
+        try:
+            assert self._nbdims > 0
+        except AssertionError:
+            raise ValueError('Number of dimensions must be positive.')
+
+        if self._nbdims > 1:
+            try:
+                assert self._nbdims == ranges.shape[0]
+            except AssertionError:
+                raise ValueError('Number of ranges does not meet the number of dimensions.')
+            try:
+                assert ranges.shape[1] == 2
+            except AssertionError:
+                raise ValueError("Only the smallest and largest values in `ranges'.")
+        else:
+            try:
+                assert ranges.shape[0] == 2
+            except AssertionError:
+                raise ValueError("Only the smallest and largest values in `ranges'.")
+
+        if self._nbdims > 1:
+            try:
+                for i in range(ranges.shape[0]):
+                    assert ranges[i, 0] <= ranges[i, 1]
+            except AssertionError:
+                raise InvalidRangeError()
+        else:
+            try:
+                assert ranges[0] <= ranges[1]
+            except AssertionError:
+                raise InvalidRangeError()
+
+        self._ranges = ranges if self._nbdims > 1 else np.expand_dims(ranges, axis=0)
+        if init_value is None:
+            self._state_value = np.zeros(self._nbdims)
+            for i in range(self._nbdims):
+                self._state_value[i] = np.random.uniform(self._ranges[i, 0], self._ranges[i, 1])
+        else:
+            if self._nbdims > 1:
+                try:
+                    assert init_value.shape[0] == self._nbdims
+                except AssertionError:
+                    raise ValueError('Initialized value does not have the right dimension.')
+                for i in range(self._nbdims):
+                    try:
+                        assert (init_value[i] >= self._ranges[i, 0]) and (init_value[i] <= self.ranges[i, 1])
+                    except AssertionError:
+                        raise InvalidRangeError('Initialized value at dimension {} (value: {}) is not within the permitted range ({} -> {})!'.format(i, init_value[i], self._ranges[i, 0], self._ranges[i, 1]))
+            else:
+                try:
+                    assert (init_value >= self._ranges[0, 0]) and (init_value <= self.ranges[0, 1])
+                except AssertionError:
+                    raise InvalidRangeError('Initialized value is out of range.')
+            self._state_value = init_value
+
+    def set_state_value(self, state_value: Union[float, np.ndarray]):
+        if self.nbdims > 1:
+            try:
+                assert state_value.shape[0] == self._nbdims
+            except AssertionError:
+                raise ValueError('Given value does not have the right dimension.')
+            for i in range(self.nbdims):
+                try:
+                    assert state_value[i] >= self.ranges[i, 0] and state_value[i] <= self.ranges[i, 1]
+                except AssertionError:
+                    raise InvalidRangeError()
+        else:
+            try:
+                assert state_value >= self.ranges[0, 0] and state_value <= self.ranges[0, 1]
+            except AssertionError:
+                raise InvalidRangeError()
+
+        self._state_value = state_value
+
+    def get_state_value(self) -> np.ndarray:
+        return self._state_value
+
+    def get_state_value_ranges(self) -> np.ndarray:
+        return self._ranges
+
+    def get_state_value_range_at_dimension(self, dimension: int) -> np.ndarray:
+        return self._ranges[dimension]
+
+    @property
+    def ranges(self) -> np.ndarray:
+        return self.get_state_value_ranges()
+
+    @property
+    def state_value(self) -> Union[float, np.ndarray]:
+        return self.get_state_value()
+
+    @state_value.setter
+    def state_value(self, new_state_value):
+        self.set_state_value(new_state_value)
+
+    @property
+    def nbdims(self) -> int:
+        return self._nbdims
+
+
+class Discrete2DCartesianState(DiscreteState):
+    def __init__(self, x_lowlim: int, x_hilim: int, y_lowlim: int, y_hilim: int, initial_coordinate: List[int]=None):
+        self._x_lowlim = x_lowlim
+        self._x_hilim = x_hilim
+        self._y_lowlim = y_lowlim
+        self._y_hilim = y_hilim
+        self._countx = self._x_hilim - self._x_lowlim + 1
+        self._county = self._y_hilim - self._y_lowlim + 1
+        if initial_coordinate is None:
+            initial_coordinate = [self._x_lowlim, self._y_lowlim]
+        initial_value = (initial_coordinate[1] - self._y_lowlim) * self._countx + (initial_coordinate[0] - self._x_lowlim)
+        super().__init__(list(range(self._countx*self._county)), initial_values=initial_value)
+
+    def _encode_coordinates(self, x, y) -> int:
+        return (y - self._y_lowlim) * self._countx + (x - self._x_lowlim)
+
+    def encode_coordinates(self, coordinates: List[int]) -> int:
+        assert len(coordinates) == 2
+        return self._encode_coordinates(coordinates[0], coordinates[1])
+
+    def decode_coordinates(self, hashcode) -> List[int]:
+        return [hashcode % self._countx, hashcode // self._countx]
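
Two small usage sketches for the state classes above; the grid bounds and ranges are arbitrary assumptions:

    import numpy as np
    from pyrlutils.state import Discrete2DCartesianState, ContinuousState

    # a 3x2 grid (x in 0..2, y in 0..1), encoded as integer states 0..5
    grid = Discrete2DCartesianState(0, 2, 0, 1)
    code = grid.encode_coordinates([2, 1])
    print(code)                           # -> 5
    print(grid.decode_coordinates(code))  # -> [2, 1]

    # a two-dimensional continuous state confined to the unit square
    point = ContinuousState(2, np.array([[0., 1.], [0., 1.]]), init_value=np.array([0.25, 0.75]))
    print(point.state_value)              # -> [0.25 0.75]
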
pyrlutils/transition.py ADDED
@@ -0,0 +1,146 @@
+
+from types import LambdaType
+from typing import Tuple, Dict
+
+import numpy as np
+
+from .state import DiscreteState, DiscreteStateValueType
+from .reward import IndividualRewardFunction
+from .action import Action, DiscreteActionValueType
+
+
+class NextStateTuple:
+    def __init__(self, next_state_value: DiscreteStateValueType, probability: float, reward: float, terminal: bool):
+        self._next_state_value = next_state_value
+        self._probability = probability
+        self._reward = reward
+        self._terminal = terminal
+
+    @property
+    def next_state_value(self) -> DiscreteStateValueType:
+        return self._next_state_value
+
+    @property
+    def probability(self) -> float:
+        return self._probability
+
+    @property
+    def reward(self) -> float:
+        return self._reward
+
+    @property
+    def terminal(self) -> bool:
+        return self._terminal
+
+
+class TransitionProbabilityFactory:
+    def __init__(self):
+        self._transprobs = {}
+        self._all_state_values = []
+        self._all_action_values = []
+        self._objects_generated = False
+
+    def add_state_transitions(self, state_value: DiscreteStateValueType, action_values_to_next_state: dict):
+        if state_value not in self._all_state_values:
+            self._all_state_values.append(state_value)
+
+        this_state_transition_dict = {}
+
+        for action_value, next_state_tuples in action_values_to_next_state.items():
+            this_state_transition_dict[action_value] = []
+            for next_state_tuple in next_state_tuples:
+                if action_value not in self._all_action_values:
+                    self._all_action_values.append(action_value)
+                if not isinstance(next_state_tuple, NextStateTuple):
+                    if isinstance(next_state_tuple, dict):
+                        next_state_tuple = NextStateTuple(
+                            next_state_tuple['next_state_value'],
+                            next_state_tuple['probability'],
+                            next_state_tuple['reward'],
+                            next_state_tuple['terminal']
+                        )
+                    else:
+                        raise TypeError('"action_values_to_next_state" has to be a dictionary or NextStateTuple instance.')
+
+                if next_state_tuple.next_state_value not in self._all_state_values:
+                    self._all_state_values.append(next_state_tuple.next_state_value)
+
+                this_state_transition_dict[action_value].append(next_state_tuple)
+
+        self._transprobs[state_value] = this_state_transition_dict
+
+    def _get_probs_for_eachstate(self, action_value: DiscreteActionValueType) -> Dict[DiscreteStateValueType, NextStateTuple]:
+        state_nexttuples = {}
+        for state_value, action_nexttuples_pair in self._transprobs.items():
+            for this_action_value, nexttuples in action_nexttuples_pair.items():
+                if this_action_value == action_value:
+                    state_nexttuples[state_value] = nexttuples
+        return state_nexttuples
+
+    def _generate_action_function(self, state_nexttuples: dict) -> LambdaType:
+
+        def _action_function(state: DiscreteState) -> DiscreteState:
+            nexttuples = state_nexttuples[state.state_value]
+            nextstates = [nexttuple.next_state_value for nexttuple in nexttuples]
+            probs = [nexttuple.probability for nexttuple in nexttuples]
+            next_state_value = np.random.choice(nextstates, p=probs)
+            state.set_state_value(next_state_value)
+            return state
+
+        return _action_function
+
+    def _generate_individual_reward_function(self) -> IndividualRewardFunction:
+
+        def _individual_reward_function(state_value, action_value, next_state_value) -> float:
+            if state_value not in self._transprobs.keys():
+                return 0.
+
+            if action_value not in self._transprobs[state_value].keys():
+                return 0.
+
+            reward = 0.
+            for next_tuple in self._transprobs[state_value][action_value]:
+                if next_tuple.next_state_value == next_state_value:
+                    reward += next_tuple.reward
+            return reward
+
+        class ThisIndividualRewardFunction(IndividualRewardFunction):
+            def __init__(self):
+                super().__init__()
+
+            def reward(self, state_value, action_value, next_state_value) -> float:
+                return _individual_reward_function(state_value, action_value, next_state_value)
+
+        return ThisIndividualRewardFunction()
+
+    def get_probability(self, state_value, action_value, new_state_value) -> float:
+        if state_value not in self._transprobs.keys():
+            return 0.
+
+        if action_value not in self._transprobs[state_value]:
+            return 0.
+
+        probs = 0.
+        for next_state_tuple in self._transprobs[state_value][action_value]:
+            if next_state_tuple.next_state_value == new_state_value:
+                probs += next_state_tuple.probability
+        return probs
+
+    @property
+    def transition_probabilities(self) -> dict:
+        return self._transprobs
+
+    def generate_mdp_objects(self) -> Tuple[DiscreteState, Dict[DiscreteActionValueType, Action], IndividualRewardFunction]:
+        state = DiscreteState(self._all_state_values)
+        actions_dict = {}
+        for action_value in self._all_action_values:
+            state_nexttuple = self._get_probs_for_eachstate(action_value)
+            actions_dict[action_value] = Action(self._generate_action_function(state_nexttuple))
+
+        individual_reward_fcn = self._generate_individual_reward_function()
+
+        return state, actions_dict, individual_reward_fcn
+
+    @property
+    def objects_generated(self) -> bool:
+        return self._objects_generated
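
A hedged sketch of assembling a tiny two-state MDP by hand with the factory above; the state names, action names, probabilities and rewards are all made up for illustration:

    from pyrlutils.transition import TransitionProbabilityFactory, NextStateTuple

    factory = TransitionProbabilityFactory()
    factory.add_state_transitions('cold', {
        'heat': [NextStateTuple('warm', 0.9, 1.0, False), NextStateTuple('cold', 0.1, 0.0, False)],
        'wait': [NextStateTuple('cold', 1.0, 0.0, False)]
    })
    factory.add_state_transitions('warm', {
        'heat': [NextStateTuple('warm', 1.0, 1.0, True)],
        'wait': [NextStateTuple('cold', 1.0, 0.0, False)]
    })

    state, actions_dict, reward_fcn = factory.generate_mdp_objects()
    print(factory.get_probability('cold', 'heat', 'warm'))   # -> 0.9
    print(reward_fcn('cold', 'heat', 'warm'))                # -> 1.0
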
pyrlutils/valuefcns.py ADDED
@@ -0,0 +1,144 @@
+
+import random
+from copy import copy
+from typing import Tuple, Dict
+from itertools import product
+
+import numpy as np
+
+from .state import DiscreteStateValueType
+from .transition import TransitionProbabilityFactory
+from .policy import DiscreteDeterminsticPolicy
+
+
+class OptimalPolicyOnValueFunctions:
+    def __init__(self, discount_factor: float, transprobfac: TransitionProbabilityFactory):
+        try:
+            assert 0. <= discount_factor <= 1.
+        except AssertionError:
+            raise ValueError('Discount factor must be between 0 and 1.')
+        self._gamma = discount_factor
+        self._transprobfac = transprobfac
+        self._states, self._actions_dict, self._indrewardfcn = self._transprobfac.generate_mdp_objects()
+        self._state_names = self._states.get_all_possible_state_values()
+        self._states_to_indices = {state: idx for idx, state in enumerate(self._state_names)}
+        self._action_names = list(self._actions_dict.keys())
+        self._actions_to_indices = {action_value: idx for idx, action_value in enumerate(self._action_names)}
+
+        self._evaluated = False
+        self._improved = False
+
+        self._theta = 1e-10
+        self._policy_evaluation_maxiter = 10000
+
+    def _policy_evaluation(self, policy: DiscreteDeterminsticPolicy) -> np.ndarray:
+        prev_V = np.zeros(len(self._states_to_indices))
+
+        for _ in range(self._policy_evaluation_maxiter):
+            V = np.zeros(len(self._states_to_indices))
+            for state_value in self._state_names:
+                state_index = self._states_to_indices[state_value]
+                action_value = policy.get_action_value(state_value)
+                for next_state_tuple in self._transprobfac.transition_probabilities[state_value][action_value]:
+                    prob = next_state_tuple.probability
+                    reward = next_state_tuple.reward
+                    next_state_value = next_state_tuple.next_state_value
+                    next_state_index = self._states_to_indices[next_state_value]
+                    terminal = next_state_tuple.terminal
+
+                    V[state_index] += prob * (reward + (self._gamma*prev_V[next_state_index] if not terminal else 0.))
+
+            if np.max(np.abs(V-prev_V)) < self._theta:
+                break
+
+            prev_V = V.copy()
+
+        return V
+
+    def _policy_improvement(self, V: np.ndarray) -> DiscreteDeterminsticPolicy:
+        Q = np.zeros((len(self._states_to_indices), len(self._actions_to_indices)))
+
+        for state_value in self._state_names:
+            state_index = self._states_to_indices[state_value]
+            for action_value in self._action_names:
+                action_index = self._actions_to_indices[action_value]
+                for next_state_tuple in self._transprobfac.transition_probabilities[state_value][action_value]:
+                    prob = next_state_tuple.probability
+                    reward = next_state_tuple.reward
+                    next_state_value = next_state_tuple.next_state_value
+                    next_state_index = self._states_to_indices[next_state_value]
+                    terminal = next_state_tuple.terminal
+
+                    Q[state_index, action_index] += prob * (reward + (self._gamma*V[next_state_index] if not terminal else 0.))
+
+        optimal_policy = DiscreteDeterminsticPolicy(self._actions_dict)
+        optimal_action_indices = np.argmax(Q, axis=1)
+        for state_value, action_index in zip(self._state_names, optimal_action_indices):
+            action_value = self._action_names[action_index]
+            optimal_policy.add_deterministic_rule(state_value, action_value)
+        return optimal_policy
+
+    def _policy_iteration(self) -> Tuple[np.ndarray, DiscreteDeterminsticPolicy]:
+        policy = DiscreteDeterminsticPolicy(self._actions_dict)
+        for state_value in self._state_names:
+            policy.add_deterministic_rule(state_value, random.choice(self._action_names))
+        V = None
+
+        done = False
+        while not done:
+            old_policy = copy(policy)
+
+            V = self._policy_evaluation(policy)
+            policy = self._policy_improvement(V)
+
+            if policy == old_policy:
+                done = True
+
+        return V, policy
+
+
+    def _value_iteration(self) -> Tuple[np.ndarray, DiscreteDeterminsticPolicy]:
+        V = np.zeros(len(self._state_names))
+
+        for _ in range(self._policy_evaluation_maxiter):
+            Q = np.zeros((len(self._state_names), len(self._action_names)))
+            for state_value, action_value in product(self._state_names, self._action_names):
+                state_index = self._states_to_indices[state_value]
+                action_index = self._actions_to_indices[action_value]
+                for next_state_tuple in self._transprobfac.transition_probabilities[state_value][action_value]:
+                    prob = next_state_tuple.probability
+                    reward = next_state_tuple.reward
+                    next_state_value = next_state_tuple.next_state_value
+                    next_state_index = self._states_to_indices[next_state_value]
+                    terminal = next_state_tuple.terminal
+
+                    Q[state_index, action_index] += prob * (reward + (self._gamma * V[next_state_index] if not terminal else 0.))
+
+            if np.max(np.abs(V-np.max(Q, axis=1))) < self._theta:
+                break
+
+            V = np.max(Q, axis=1)
+
+        Qmaxj = np.argmax(Q, axis=1)
+
+        policy = DiscreteDeterminsticPolicy(self._actions_dict)
+        for state_value, action_index in zip(self._state_names, Qmaxj):
+            policy.add_deterministic_rule(state_value, self._action_names[action_index])
+
+        return V, policy
+
+    def policy_iteration(self) -> Tuple[Dict[DiscreteStateValueType, float], DiscreteDeterminsticPolicy]:
+        V, policy = self._policy_iteration()
+        state_values_dict = {
+            self._state_names[i]: V[i]
+            for i in range(V.shape[0])
+        }
+        return state_values_dict, policy
+
+    def value_iteration(self) -> Tuple[Dict[DiscreteStateValueType, float], DiscreteDeterminsticPolicy]:
+        V, policy = self._value_iteration()
+        state_values_dict = {
+            self._state_names[i]: V[i]
+            for i in range(V.shape[0])
+        }
+        return state_values_dict, policy
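
Continuing the hypothetical two-state MDP sketched after transition.py above, the dynamic-programming solvers might be invoked like this:

    from pyrlutils.valuefcns import OptimalPolicyOnValueFunctions

    solver = OptimalPolicyOnValueFunctions(discount_factor=0.9, transprobfac=factory)
    state_values, policy = solver.value_iteration()
    print(state_values)                      # a dict of expected return per state value
    print(policy.get_action_value('cold'))   # -> 'heat'
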
pyrlutils-0.0.4.dist-info/LICENSE ADDED
@@ -0,0 +1,19 @@
+Copyright (c) 2023 Kwan Yuet Stephen Ho
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
pyrlutils-0.0.4.dist-info/METADATA ADDED
@@ -0,0 +1,42 @@
+Metadata-Version: 2.2
+Name: pyrlutils
+Version: 0.0.4
+Summary: Utility and Helpers for Reinforcement Learning
+Author-email: Kwan Yuet Stephen Ho <stephenhky@yahoo.com.hk>
+License: MIT
+Project-URL: Repository, https://github.com/stephenhky/PyRLUtils
+Project-URL: Issues, https://github.com/stephenhky/PyRLUtils/issues
+Keywords: machine learning,reinforcement learning,artificial intelligence
+Classifier: Topic :: Scientific/Engineering :: Mathematics
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Software Development :: Version Control :: Git
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Intended Audience :: Science/Research
+Classifier: Intended Audience :: Developers
+Requires-Python: >=3.7
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy
+Provides-Extra: openaigym
+Requires-Dist: gymnasium; extra == "openaigym"
+Provides-Extra: test
+Requires-Dist: unittest; extra == "test"
+
+# PyRLUtils
+
+[![CircleCI](https://circleci.com/gh/stephenhky/PyRLUtils.svg?style=svg)](https://circleci.com/gh/stephenhky/PyRLUtils)
+[![GitHub release](https://img.shields.io/github/release/stephenhky/PyRLUtils.svg?maxAge=3600)](https://github.com/stephenhky/PyRLUtils/releases)
+[![pypi](https://img.shields.io/pypi/v/PyRLUtils.svg?maxAge=3600)](https://pypi.org/project/pyrlutils/)
+[![download](https://img.shields.io/pypi/dm/PyRLUtils.svg?maxAge=2592000&label=installs&color=%2327B1FF)](https://pypi.org/project/PyRLUtils/)
+[![Updates](https://pyup.io/repos/github/stephenhky/PyRLUtils/shield.svg)](https://pyup.io/repos/github/stephenhky/PyRLUtils/)
+[![Python 3](https://pyup.io/repos/github/stephenhky/PyRLUtils/python-3-shield.svg)](https://pyup.io/repos/github/stephenhky/PyRLUtils/)
+
+
+This is a Python package with utility classes and helper functions
+that facilitate the development of reinforcement learning projects.
pyrlutils-0.0.4.dist-info/RECORD ADDED
@@ -0,0 +1,17 @@
+pyrlutils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pyrlutils/action.py,sha256=2kJqNZxsLOV8yOTl-RcpM8b0zu-WXNREJCrl49uZi2c,437
+pyrlutils/policy.py,sha256=Cx4vsIXzFZi_KEgI06S378Y5E6g-AfK90skDYoGsfOI,2794
+pyrlutils/reward.py,sha256=are0swsobMqI1IbrBVBaPMYXWpJnp6lZwAyfgBEm2zg,1211
+pyrlutils/state.py,sha256=w0YJ50FUyNboPoYduLMX1xaBJJHAOaSlsr3Og1dd0dY,7840
+pyrlutils/transition.py,sha256=lgh4YfOi-YjSIyymWfrXe-ugDWpZYK3MvjdeehgcQhk,5816
+pyrlutils/valuefcns.py,sha256=CJxu0EIFgrdbP0n0x6nzs3X08accFsuJW71tv1rMTkQ,6342
+pyrlutils/bandit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pyrlutils/bandit/algo.py,sha256=X2Pn4DOi-RXWz5CNg1h0RJCoV3VlAwEGHRMjkfbckfw,3969
+pyrlutils/bandit/reward.py,sha256=S_uECjMOg3cmK24J-5uPcckLvtxmU4yllR7JEvMwAQE,249
+pyrlutils/openai/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pyrlutils/openai/utils.py,sha256=ehj1cGlDYjQLno3pKMCS3CzZwbZGSTmjxDlU07aSBFo,1033
+pyrlutils-0.0.4.dist-info/LICENSE,sha256=bnQPjIcaeBdr2ZofX-_j-nELs8pAx5fQ4Cdfgeaspew,1063
+pyrlutils-0.0.4.dist-info/METADATA,sha256=7ncLjVrpqIZpdMFMrRjqRNgfZl9LUKc_SZFkw_CoTFc,2228
+pyrlutils-0.0.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+pyrlutils-0.0.4.dist-info/top_level.txt,sha256=gOBuxugE2MA4WDXlLhzkQh_rUonZU6nvJnMuomeHMCU,10
+pyrlutils-0.0.4.dist-info/RECORD,,
pyrlutils-0.0.4.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (75.8.0)
+Root-Is-Purelib: true
+Tag: py3-none-any
+
pyrlutils-0.0.4.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+pyrlutils