pyrlutils 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pyrlutils might be problematic.
- pyrlutils/__init__.py +0 -0
- pyrlutils/action.py +19 -0
- pyrlutils/bandit/__init__.py +0 -0
- pyrlutils/bandit/algo.py +128 -0
- pyrlutils/bandit/reward.py +11 -0
- pyrlutils/openai/__init__.py +0 -0
- pyrlutils/openai/utils.py +31 -0
- pyrlutils/policy.py +84 -0
- pyrlutils/reward.py +37 -0
- pyrlutils/state.py +225 -0
- pyrlutils/transition.py +146 -0
- pyrlutils/valuefcns.py +144 -0
- pyrlutils-0.0.4.dist-info/LICENSE +19 -0
- pyrlutils-0.0.4.dist-info/METADATA +42 -0
- pyrlutils-0.0.4.dist-info/RECORD +17 -0
- pyrlutils-0.0.4.dist-info/WHEEL +5 -0
- pyrlutils-0.0.4.dist-info/top_level.txt +1 -0
pyrlutils/__init__.py
ADDED
File without changes

pyrlutils/action.py
ADDED
@@ -0,0 +1,19 @@

from types import LambdaType
from typing import Union

from .state import State


DiscreteActionValueType = Union[float, str]

class Action:
    def __init__(self, actionfunc: LambdaType):
        self._actionfunc = actionfunc

    def act(self, state: State, *args, **kwargs) -> State:
        self._actionfunc(state, *args, **kwargs)
        return state

    def __call__(self, state: State) -> State:
        return self.act(state)
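
Action is a thin wrapper around a state-mutating callable. A minimal usage sketch (hypothetical, not part of the package; it assumes the wheel is installed and importable as pyrlutils):

# Hypothetical example: toggling a two-valued DiscreteState through an Action.
from pyrlutils.state import DiscreteState
from pyrlutils.action import Action

light = DiscreteState(['red', 'green'])    # defaults to the first listed value, 'red'
toggle = Action(lambda s: s.set_state_value('green' if s.state_value == 'red' else 'red'))

toggle(light)        # __call__ delegates to act(), which mutates and returns the state
print(light.state_value)   # 'green'
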
pyrlutils/bandit/__init__.py
ADDED
File without changes

pyrlutils/bandit/algo.py
ADDED
@@ -0,0 +1,128 @@

from abc import ABC, abstractmethod

import numpy as np

from .reward import IndividualBanditRewardFunction


class BanditAlgorithm(ABC):
    def __init__(self, action_values: list, reward_function: IndividualBanditRewardFunction):
        self._action_values = action_values
        self._reward_function = reward_function

    @abstractmethod
    def _go_one_loop(self):
        pass

    def loop(self, nbiterations: int):
        for _ in range(nbiterations):
            self._go_one_loop()

    def reward(self, action_value) -> float:
        return self._reward_function(action_value)

    @abstractmethod
    def get_action(self):
        pass

    @property
    def action_values(self):
        return self._action_values

    @property
    def reward_function(self) -> IndividualBanditRewardFunction:
        return self._reward_function


class SimpleBandit(BanditAlgorithm):
    def __init__(
        self,
        action_values: list,
        reward_function: IndividualBanditRewardFunction,
        epsilon: float=0.05
    ):
        super().__init__(action_values, reward_function)
        self._epsilon = epsilon
        self._initialize()

    def _initialize(self):
        self._Q = np.zeros(len(self._action_values))
        self._N = np.zeros(len(self._action_values), dtype=np.int32)

    def _go_one_loop(self):
        r = np.random.uniform()
        if r < self.epsilon:
            selected_action_idx = np.argmax(self._Q)
        else:
            selected_action_idx = np.random.choice(range(len(self._action_values)))
        reward = self._reward_function(self._action_values[selected_action_idx])
        self._N[selected_action_idx] += 1
        self._Q[selected_action_idx] += (reward - self._Q[selected_action_idx]) / self._N[selected_action_idx]

    def get_action(self):
        selected_action_idx = np.argmax(self._Q)
        return self._action_values[selected_action_idx]

    @property
    def epsilon(self) -> float:
        return self._epsilon

    @epsilon.setter
    def epsilon(self, val: float):
        self._epsilon = val


class GradientBandit(BanditAlgorithm):
    def __init__(self, action_values: list, reward_function: IndividualBanditRewardFunction, temperature: float=1.0, alpha: float=0.1):
        super().__init__(action_values, reward_function)
        self._T = temperature
        self._alpha = alpha
        self._initialize()

    def _initialize(self):
        self._preferences = np.zeros(len(self._action_values))
        self._rewards_over_time = []

    def _get_probs(self) -> np.ndarray:
        # getting probabilities using softmax
        exp_preferences = np.exp(self._preferences / self.T)
        sum_exp_preferences = np.sum(exp_preferences)
        return exp_preferences / sum_exp_preferences

    def get_action(self):
        selected_action_idx = np.argmax(self._preferences)
        return self._action_values[selected_action_idx]

    def _go_one_loop(self):
        probs = self._get_probs()
        selected_action_idx = np.random.choice(range(self._preferences.shape[0]), p=probs)
        reward = self._reward_function(self._action_values[selected_action_idx])
        self._rewards_over_time.append(reward)
        average_reward = np.mean(self._rewards_over_time) if len(self._rewards_over_time) > 0 else 0.

        for i in range(len(self._action_values)):
            if i == selected_action_idx:
                self._preferences[i] += self.alpha * (reward - average_reward) * (1 - probs[i])
            else:
                self._preferences[i] -= self.alpha * (reward - average_reward) * probs[i]

    @property
    def alpha(self) -> float:
        return self._alpha

    @alpha.setter
    def alpha(self, val: float):
        self._alpha = val

    @property
    def T(self) -> float:
        return self._T

    @T.setter
    def T(self, val: float):
        self._T = val

    @property
    def temperature(self) -> float:
        return self._T
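
Note that _go_one_loop in SimpleBandit takes the greedy branch when the uniform draw is below epsilon, so a small epsilon means arms are sampled mostly at random while the Q estimates converge on the sample means. A hypothetical usage sketch (not from the package docs): bandit/reward.py is not reproduced in this diff, so instead of subclassing IndividualBanditRewardFunction the sketch passes a plain callable, which suffices here because the algorithm only ever calls the reward function with an action value.

import numpy as np
from pyrlutils.bandit.algo import SimpleBandit

# Hypothetical 3-armed bandit; the lambda stands in for an
# IndividualBanditRewardFunction instance (its definition is not shown above).
true_means = {'a': 0.1, 'b': 0.5, 'c': 0.9}
pull = lambda action_value: np.random.normal(true_means[action_value], 0.1)

bandit = SimpleBandit(['a', 'b', 'c'], pull, epsilon=0.1)
bandit.loop(1000)
print(bandit.get_action())   # 'c' once the Q estimates have settled
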
pyrlutils/openai/__init__.py
ADDED
File without changes

pyrlutils/openai/utils.py
ADDED

@@ -0,0 +1,31 @@

import gymnasium as gym

from ..transition import TransitionProbabilityFactory, NextStateTuple


class OpenAIGymDiscreteEnvironmentTransitionProbabilityFactory(TransitionProbabilityFactory):
    def __init__(self, envname):
        super().__init__()
        self._envname = envname
        self._gymenv = gym.make(envname)
        self._convert_openai_gymenv_to_transprob()

    def _convert_openai_gymenv_to_transprob(self):
        P = self._gymenv.env.env.env.P
        for state_value, trans_dict in P.items():
            new_trans_dict = {}
            for action_value, next_state_list in trans_dict.items():
                new_trans_dict[action_value] = [
                    NextStateTuple(next_state[1], next_state[0], next_state[2], next_state[3])
                    for next_state in next_state_list
                ]
            self.add_state_transitions(state_value, new_trans_dict)

    @property
    def envname(self):
        return self._envname

    @property
    def gymenv(self):
        return self._gymenv
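
A hypothetical sketch of how this factory might be used with a tabular gymnasium environment (requires the optional gymnasium dependency; FrozenLake-v1 is used purely as an illustration, and whether the triple .env unwrapping reaches the P table depends on the gymnasium version):

from pyrlutils.openai.utils import OpenAIGymDiscreteEnvironmentTransitionProbabilityFactory

factory = OpenAIGymDiscreteEnvironmentTransitionProbabilityFactory('FrozenLake-v1')
state, actions_dict, reward_fcn = factory.generate_mdp_objects()
print(factory.envname)      # 'FrozenLake-v1'
print(len(actions_dict))    # number of discrete actions exposed by the environment
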
pyrlutils/policy.py
ADDED
@@ -0,0 +1,84 @@

from abc import ABC, abstractmethod
from typing import Union, Dict
from warnings import warn

import numpy as np

from .state import State, DiscreteState, DiscreteStateValueType
from .action import Action, DiscreteActionValueType


class Policy(ABC):
    @abstractmethod
    def get_action(self, state: State) -> Action:
        pass

    def __call__(self, state: State) -> Action:
        return self.get_action(state)

    @property
    def is_stochastic(self) -> bool:
        raise NotImplemented()


class DeterministicPolicy(Policy):
    @abstractmethod
    def add_deterministic_rule(self, *args, **kwargs):
        pass

    @property
    def is_stochastic(self) -> bool:
        return False


class DiscreteDeterminsticPolicy(DeterministicPolicy):
    def __init__(self, actions_dict: Dict[DiscreteActionValueType, Action]):
        self._state_to_action = {}
        self._actions_dict = actions_dict

    def add_deterministic_rule(self, state_value: DiscreteStateValueType, action_value: DiscreteActionValueType):
        if state_value in self._state_to_action:
            warn('State value {} exists in rule; it will be replaced.'.format(state_value))
        self._state_to_action[state_value] = action_value

    def get_action_value(self, state_value: DiscreteStateValueType) -> DiscreteActionValueType:
        return self._state_to_action.get(state_value)

    def get_action(self, state: DiscreteState) -> Action:
        return self._actions_dict[self.get_action_value(state.state_value)]

    def __eq__(self, other) -> bool:
        if len(self._state_to_action) != len(set(self._state_to_action.keys()).union(other._state_to_action.keys())):
            return False
        if len(self._actions_dict) != len(set(self._actions_dict.keys()).union(other._actions_dict.keys())):
            return False
        for action in self._actions_dict.keys():
            if self._actions_dict[action] != other._actions_dict[action]:
                return False
        for state in self._state_to_action.keys():
            if self._state_to_action[state] != other._state_to_action[state]:
                return False
        return True


class StochasticPolicy(Policy):
    @abstractmethod
    def get_probability(self, *args, **kwargs) -> float:
        pass

    @property
    def is_stochastic(self) -> bool:
        return True


class DiscreteStochasticPolicy(StochasticPolicy):
    @abstractmethod
    def get_probability(self, state_value: DiscreteStateValueType, action_value: DiscreteActionValueType) -> float:
        pass


class ContinuousStochasticPolicy(StochasticPolicy):
    @abstractmethod
    def get_probability(self, state_value: Union[float, np.ndarray], action_value: DiscreteActionValueType, value: Union[float, np.ndarray]) -> float:
        pass
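
A minimal sketch of the deterministic tabular policy (class name reproduced as shipped, including the "Determinstic" spelling); it maps state values to action values and looks the Action object up on demand. The states and actions below are hypothetical.

from pyrlutils.state import DiscreteState
from pyrlutils.action import Action
from pyrlutils.policy import DiscreteDeterminsticPolicy

# Hypothetical two-state thermostat.
room = DiscreteState(['cold', 'hot'])
actions = {
    'heat': Action(lambda s: s.set_state_value('hot')),
    'cool': Action(lambda s: s.set_state_value('cold')),
}

policy = DiscreteDeterminsticPolicy(actions)
policy.add_deterministic_rule('cold', 'heat')
policy.add_deterministic_rule('hot', 'cool')

chosen = policy(room)     # Policy.__call__ -> get_action(room) -> the 'heat' Action
chosen(room)
print(room.state_value)   # 'hot'
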
pyrlutils/reward.py
ADDED
@@ -0,0 +1,37 @@

from abc import ABC, abstractmethod


class IndividualRewardFunction(ABC):
    @abstractmethod
    def reward(self, state_value, action_value, next_state_value) -> float:
        pass

    def __call__(self, state_value, action_value, next_state_value) -> float:
        return self.reward(state_value, action_value, next_state_value)


class RewardFunction(ABC):
    def __init__(self, discount_factor: float, individual_reward_function: IndividualRewardFunction):
        self._discount_factor = discount_factor
        self._individual_reward_function = individual_reward_function

    @property
    def discount_factor(self) -> float:
        return self._discount_factor

    @discount_factor.setter
    def discount_factor(self, discount_factor: float):
        self._discount_factor = discount_factor

    def individual_reward(self, state_value, action_value, next_state_value) -> float:
        return self._individual_reward_function(state_value, action_value, next_state_value)

    @abstractmethod
    def total_reward(self, state_value, action_value) -> float:
        pass

    def __call__(self, state_value, action_value) -> float:
        return self.total_reward(state_value, action_value)
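
A short sketch of a concrete IndividualRewardFunction, using a hypothetical 'goal' state value:

from pyrlutils.reward import IndividualRewardFunction

class GoalReward(IndividualRewardFunction):
    # Pays 1 for any transition that lands in the (hypothetical) 'goal' state.
    def reward(self, state_value, action_value, next_state_value) -> float:
        return 1.0 if next_state_value == 'goal' else 0.0

r = GoalReward()
print(r('start', 'move', 'goal'))   # 1.0, dispatched through __call__
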
pyrlutils/state.py
ADDED
@@ -0,0 +1,225 @@

from abc import ABC, abstractmethod
from enum import Enum
from dataclasses import dataclass
from typing import Tuple, List, Optional, Union

import numpy as np


class StateValue(ABC):
    @property
    @abstractmethod
    def value(self):
        pass


@dataclass
class DiscreteStateValue(StateValue):
    enum: Enum

    @property
    def value(self):
        return self.enum.value

    def name(self):
        return self.enum.name


class ContinuousStateValue(StateValue):
    _value: float

    @property
    def value(self) -> float:
        return self._value


class State(ABC):
    @property
    def state_value(self):
        return self.get_state_value()

    @abstractmethod
    def set_state_value(self, state_value):
        pass

    @abstractmethod
    def get_state_value(self):
        pass

    @state_value.setter
    def state_value(self, new_state_value):
        self.set_state_value(new_state_value)


DiscreteStateValueType = Union[float, str, Tuple[int], Enum]


class DiscreteState(State):
    def __init__(self, all_state_values: List[DiscreteStateValueType], initial_values: Optional[List[DiscreteStateValueType]] = None):
        super().__init__()
        self._all_state_values = all_state_values
        self._state_value = initial_values if initial_values is not None and initial_values in self._all_state_values else self._all_state_values[0]

    def get_state_value(self) -> DiscreteStateValueType:
        return self._state_value

    def set_state_value(self, state_value: DiscreteStateValueType):
        if state_value in self._all_state_values:
            self._state_value = state_value
        else:
            raise ValueError('State value {} is invalid.'.format(state_value))

    def get_all_possible_state_values(self) -> List[DiscreteStateValueType]:
        return self._all_state_values

    @property
    def state_value(self) -> DiscreteStateValueType:
        return self._state_value

    @state_value.setter
    def state_value(self, new_state_value: DiscreteStateValueType):
        self.set_state_value(new_state_value)

    @property
    def state_space_size(self):
        return len(self._all_state_values)


class InvalidRangeError(Exception):
    def __init__(self, message=None):
        self.message = "Invalid range error!" if message is None else message
        super().__init__(self.message)


class ContinuousState(State):
    def __init__(self, nbdims: int, ranges: np.array, init_value: Optional[Union[float, np.ndarray]] = None):
        self._nbdims = nbdims

        try:
            assert (ranges.dtype == np.float64) or (ranges.dtype == np.float32) or (ranges.dtype == np.float16)
        except AssertionError:
            raise TypeError('It has to be floating type numpy.ndarray.')

        try:
            assert self._nbdims > 0
        except AssertionError:
            raise ValueError('Number of dimensions must be positive.')

        if self._nbdims > 1:
            try:
                assert self._nbdims == ranges.shape[0]
            except AssertionError:
                raise ValueError('Number of ranges does not meet the number of dimensions.')
            try:
                assert ranges.shape[1] == 2
            except AssertionError:
                raise ValueError("Only the smallest and largest values in `ranges'.")
        else:
            try:
                assert ranges.shape[0] == 2
            except AssertionError:
                raise ValueError("Only the smallest and largest values in `ranges'.")

        if self._nbdims > 1:
            try:
                for i in range(ranges.shape[0]):
                    assert ranges[i, 0] <= ranges[i, 1]
            except AssertionError:
                raise InvalidRangeError()
        else:
            try:
                assert ranges[0] <= ranges[1]
            except AssertionError:
                raise InvalidRangeError()

        self._ranges = ranges if self._nbdims > 1 else np.expand_dims(ranges, axis=0)
        if init_value is None:
            self._state_value = np.zeros(self._nbdims)
            for i in range(self._nbdims):
                self._state_value[i] = np.random.uniform(self._ranges[i, 0], self._ranges[i, 1])
        else:
            if self._nbdims > 1:
                try:
                    assert init_value.shape[0] == self._nbdims
                except AssertionError:
                    raise ValueError('Initialized value does not have the right dimension.')
                for i in range(self._nbdims):
                    try:
                        assert (init_value[i] >= self._ranges[i, 0]) and (init_value[i] <= self.ranges[i, 1])
                    except AssertionError:
                        raise InvalidRangeError('Initialized value at dimension {} (value: {}) is not within the permitted range ({} -> {})!'.format(i, init_value[i], self._ranges[i, 0], self._ranges[i, 1]))
            else:
                try:
                    assert (init_value >= self._ranges[0, 0]) and (init_value <= self.ranges[0, 1])
                except AssertionError:
                    raise InvalidRangeError('Initialized value is out of range.')
            self._state_value = init_value

    def set_state_value(self, state_value: Union[float, np.ndarray]):
        if self.nbdims > 1:
            try:
                assert state_value.shape[0] == self._nbdims
            except AssertionError:
                raise ValueError('Given value does not have the right dimension.')
            for i in range(self.nbdims):
                try:
                    assert state_value[i] >= self.ranges[i, 0] and state_value[i] <= self.ranges[i, 1]
                except AssertionError:
                    raise InvalidRangeError()
        else:
            try:
                assert state_value >= self.ranges[0, 0] and state_value <= self.ranges[0, 1]
            except AssertionError:
                raise InvalidRangeError()

        self._state_value = state_value

    def get_state_value(self) -> np.ndarray:
        return self._state_value

    def get_state_value_ranges(self) -> np.ndarray:
        return self._ranges

    def get_state_value_range_at_dimension(self, dimension: int) -> np.ndarray:
        return self._ranges[dimension]

    @property
    def ranges(self) -> np.ndarray:
        return self.get_state_value_ranges()

    @property
    def state_value(self) -> Union[float, np.ndarray]:
        return self.get_state_value()

    @state_value.setter
    def state_value(self, new_state_value):
        self.set_state_value(new_state_value)

    @property
    def nbdims(self) -> int:
        return self._nbdims


class Discrete2DCartesianState(DiscreteState):
    def __init__(self, x_lowlim: int, x_hilim: int, y_lowlim: int, y_hilim: int, initial_coordinate: List[int]=None):
        self._x_lowlim = x_lowlim
        self._x_hilim = x_hilim
        self._y_lowlim = y_lowlim
        self._y_hilim = y_hilim
        self._countx = self._x_hilim - self._x_lowlim + 1
        self._county = self._y_hilim - self._y_lowlim + 1
        if initial_coordinate is None:
            initial_coordinate = [self._x_lowlim, self._y_lowlim]
        initial_value = (initial_coordinate[1] - self._y_lowlim) * self._countx + (initial_coordinate[0] - self._x_lowlim)
        super().__init__(list(range(self._countx*self._county)), initial_values=initial_value)

    def _encode_coordinates(self, x, y) -> int:
        return (y - self._y_lowlim) * self._countx + (x - self._x_lowlim)

    def encode_coordinates(self, coordinates: List[int]) -> int:
        assert len(coordinates) == 2
        return self._encode_coordinates(coordinates[0], coordinates[1])

    def decode_coordinates(self, hashcode) -> List[int]:
        return [hashcode % self._countx, hashcode // self._countx]
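
A usage sketch for the two concrete state classes (values are hypothetical, not from the package docs):

import numpy as np
from pyrlutils.state import ContinuousState, Discrete2DCartesianState

# 4 x 3 grid: x in [0, 3], y in [0, 2], encoded row-major into 12 integer states.
grid = Discrete2DCartesianState(0, 3, 0, 2)
print(grid.state_space_size)            # 12
print(grid.encode_coordinates([2, 1]))  # 6  (= 1 * 4 + 2)
print(grid.decode_coordinates(6))       # [2, 1]

# A 2-D continuous state; the setter validates new values against the ranges.
pos = ContinuousState(2, np.array([[-1.0, 1.0], [0.0, 5.0]]))
pos.state_value = np.array([0.5, 2.0])
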
pyrlutils/transition.py
ADDED
@@ -0,0 +1,146 @@

from types import LambdaType
from typing import Tuple, Dict

import numpy as np

from .state import DiscreteState, DiscreteStateValueType
from .reward import IndividualRewardFunction
from .action import Action, DiscreteActionValueType


class NextStateTuple:
    def __init__(self, next_state_value: DiscreteStateValueType, probability: float, reward: float, terminal: bool):
        self._next_state_value = next_state_value
        self._probability = probability
        self._reward = reward
        self._terminal = terminal

    @property
    def next_state_value(self) -> DiscreteStateValueType:
        return self._next_state_value

    @property
    def probability(self) -> float:
        return self._probability

    @property
    def reward(self) -> float:
        return self._reward

    @property
    def terminal(self) -> bool:
        return self._terminal


class TransitionProbabilityFactory:
    def __init__(self):
        self._transprobs = {}
        self._all_state_values = []
        self._all_action_values = []
        self._objects_generated = False

    def add_state_transitions(self, state_value: DiscreteStateValueType, action_values_to_next_state: dict):
        if state_value not in self._all_state_values:
            self._all_state_values.append(state_value)

        this_state_transition_dict = {}

        for action_value, next_state_tuples in action_values_to_next_state.items():
            this_state_transition_dict[action_value] = []
            for next_state_tuple in next_state_tuples:
                if action_value not in self._all_action_values:
                    self._all_action_values.append(action_value)
                if not isinstance(next_state_tuple, NextStateTuple):
                    if isinstance(next_state_tuple, dict):
                        next_state_tuple = NextStateTuple(
                            next_state_tuple['next_state_value'],
                            next_state_tuple['probability'],
                            next_state_tuple['reward'],
                            next_state_tuple['terminal']
                        )
                    else:
                        raise TypeError('"action_values_to_next_state" has to be a dictionary or NextStateTuple instance.')

                if next_state_tuple.next_state_value not in self._all_state_values:
                    self._all_state_values.append(next_state_tuple.next_state_value)

                this_state_transition_dict[action_value].append(next_state_tuple)

        self._transprobs[state_value] = this_state_transition_dict

    def _get_probs_for_eachstate(self, action_value: DiscreteActionValueType) -> Dict[DiscreteStateValueType, NextStateTuple]:
        state_nexttuples = {}
        for state_value, action_nexttuples_pair in self._transprobs.items():
            for this_action_value, nexttuples in action_nexttuples_pair.items():
                if this_action_value == action_value:
                    state_nexttuples[state_value] = nexttuples
        return state_nexttuples

    def _generate_action_function(self, state_nexttuples: dict) -> LambdaType:

        def _action_function(state: DiscreteState) -> DiscreteState:
            nexttuples = state_nexttuples[state.state_value]
            nextstates = [nexttuple.next_state_value for nexttuple in nexttuples]
            probs = [nexttuple.probability for nexttuple in nexttuples]
            next_state_value = np.random.choice(nextstates, p=probs)
            state.set_state_value(next_state_value)
            return state

        return _action_function

    def _generate_individual_reward_function(self) -> IndividualRewardFunction:

        def _individual_reward_function(state_value, action_value, next_state_value) -> float:
            if state_value not in self._transprobs.keys():
                return 0.

            if action_value not in self._transprobs[state_value].keys():
                return 0.

            reward = 0.
            for next_tuple in self._transprobs[state_value][action_value]:
                if next_tuple.next_state_value == next_state_value:
                    reward += next_tuple.reward
            return reward

        class ThisIndividualRewardFunction(IndividualRewardFunction):
            def __init__(self):
                super().__init__()

            def reward(self, state_value, action_value, next_state_value) -> float:
                return _individual_reward_function(state_value, action_value, next_state_value)

        return ThisIndividualRewardFunction()

    def get_probability(self, state_value, action_value, new_state_value) -> float:
        if state_value not in self._transprobs.keys():
            return 0.

        if action_value not in self._transprobs[state_value]:
            return 0.

        probs = 0.
        for next_state_tuple in self._transprobs[state_value][action_value]:
            if next_state_tuple.next_state_value == new_state_value:
                probs += next_state_tuple.probability
        return probs

    @property
    def transition_probabilities(self) -> dict:
        return self._transprobs

    def generate_mdp_objects(self) -> Tuple[DiscreteState, Dict[DiscreteActionValueType, Action], IndividualRewardFunction]:
        state = DiscreteState(self._all_state_values)
        actions_dict = {}
        for action_value in self._all_action_values:
            state_nexttuple = self._get_probs_for_eachstate(action_value)
            actions_dict[action_value] = Action(self._generate_action_function(state_nexttuple))

        individual_reward_fcn = self._generate_individual_reward_function()

        return state, actions_dict, individual_reward_fcn

    @property
    def objects_generated(self) -> bool:
        return self._objects_generated
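
A hypothetical two-state MDP wired through the factory; the state names, probabilities, and rewards below are illustrative only:

from pyrlutils.transition import TransitionProbabilityFactory, NextStateTuple

factory = TransitionProbabilityFactory()
factory.add_state_transitions('left', {
    'stay': [NextStateTuple('left', 1.0, 0.0, False)],
    'move': [NextStateTuple('right', 0.9, 1.0, False),
             NextStateTuple('left', 0.1, 0.0, False)],
})
factory.add_state_transitions('right', {
    'stay': [NextStateTuple('right', 1.0, 1.0, False)],
    'move': [NextStateTuple('left', 1.0, 0.0, False)],
})

state, actions_dict, reward_fcn = factory.generate_mdp_objects()
print(factory.get_probability('left', 'move', 'right'))   # 0.9
print(reward_fcn('left', 'move', 'right'))                 # 1.0
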
pyrlutils/valuefcns.py
ADDED
@@ -0,0 +1,144 @@

import random
from copy import copy
from typing import Tuple, Dict
from itertools import product

import numpy as np

from .state import DiscreteStateValueType
from .transition import TransitionProbabilityFactory
from .policy import DiscreteDeterminsticPolicy


class OptimalPolicyOnValueFunctions:
    def __init__(self, discount_factor: float, transprobfac: TransitionProbabilityFactory):
        try:
            assert 0. <= discount_factor <= 1.
        except AssertionError:
            raise ValueError('Discount factor must be between 0 and 1.')
        self._gamma = discount_factor
        self._transprobfac = transprobfac
        self._states, self._actions_dict, self._indrewardfcn = self._transprobfac.generate_mdp_objects()
        self._state_names = self._states.get_all_possible_state_values()
        self._states_to_indices = {state: idx for idx, state in enumerate(self._state_names)}
        self._action_names = list(self._actions_dict.keys())
        self._actions_to_indices = {action_value: idx for idx, action_value in enumerate(self._action_names)}

        self._evaluated = False
        self._improved = False

        self._theta = 1e-10
        self._policy_evaluation_maxiter = 10000

    def _policy_evaluation(self, policy: DiscreteDeterminsticPolicy) -> np.ndarray:
        prev_V = np.zeros(len(self._states_to_indices))

        for _ in range(self._policy_evaluation_maxiter):
            V = np.zeros(len(self._states_to_indices))
            for state_value in self._state_names:
                state_index = self._states_to_indices[state_value]
                action_value = policy.get_action_value(state_value)
                for next_state_tuple in self._transprobfac.transition_probabilities[state_value][action_value]:
                    prob = next_state_tuple.probability
                    reward = next_state_tuple.reward
                    next_state_value = next_state_tuple.next_state_value
                    next_state_index = self._states_to_indices[next_state_value]
                    terminal = next_state_tuple.terminal

                    V[state_index] += prob * (reward + (self._gamma*prev_V[next_state_index] if not terminal else 0.))

            if np.max(np.abs(V-prev_V)) < self._theta:
                break

            prev_V = V.copy()

        return V

    def _policy_improvement(self, V: np.ndarray) -> DiscreteDeterminsticPolicy:
        Q = np.zeros((len(self._states_to_indices), len(self._actions_to_indices)))

        for state_value in self._state_names:
            state_index = self._states_to_indices[state_value]
            for action_value in self._action_names:
                action_index = self._actions_to_indices[action_value]
                for next_state_tuple in self._transprobfac.transition_probabilities[state_value][action_value]:
                    prob = next_state_tuple.probability
                    reward = next_state_tuple.reward
                    next_state_value = next_state_tuple.next_state_value
                    next_state_index = self._states_to_indices[next_state_value]
                    terminal = next_state_tuple.terminal

                    Q[state_index, action_index] += prob * (reward + (self._gamma*V[next_state_index] if not terminal else 0.))

        optimal_policy = DiscreteDeterminsticPolicy(self._actions_dict)
        optimal_action_indices = np.argmax(Q, axis=1)
        for state_value, action_index in zip(self._state_names, optimal_action_indices):
            action_value = self._action_names[action_index]
            optimal_policy.add_deterministic_rule(state_value, action_value)
        return optimal_policy

    def _policy_iteration(self) -> Tuple[np.ndarray, DiscreteDeterminsticPolicy]:
        policy = DiscreteDeterminsticPolicy(self._actions_dict)
        for state_value in self._state_names:
            policy.add_deterministic_rule(state_value, random.choice(self._action_names))
        V = None

        done = False
        while not done:
            old_policy = copy(policy)

            V = self._policy_evaluation(policy)
            policy = self._policy_improvement(V)

            if policy == old_policy:
                done = True

        return V, policy


    def _value_iteration(self) -> Tuple[np.ndarray, DiscreteDeterminsticPolicy]:
        V = np.zeros(len(self._state_names))

        for _ in range(self._policy_evaluation_maxiter):
            Q = np.zeros((len(self._state_names), len(self._action_names)))
            for state_value, action_value in product(self._state_names, self._action_names):
                state_index = self._states_to_indices[state_value]
                action_index = self._actions_to_indices[action_value]
                for next_state_tuple in self._transprobfac.transition_probabilities[state_value][action_value]:
                    prob = next_state_tuple.probability
                    reward = next_state_tuple.reward
                    next_state_value = next_state_tuple.next_state_value
                    next_state_index = self._states_to_indices[next_state_value]
                    terminal = next_state_tuple.terminal

                    Q[state_index, action_index] += prob * (reward + (self._gamma * V[next_state_index] if not terminal else 0.))

            if np.max(np.abs(V-np.max(Q, axis=1))) < self._theta:
                break

            V = np.max(Q, axis=1)

        Qmaxj = np.argmax(Q, axis=1)

        policy = DiscreteDeterminsticPolicy(self._actions_dict)
        for state_value, action_index in zip(self._state_names, Qmaxj):
            policy.add_deterministic_rule(state_value, self._action_names[action_index])

        return V, policy

    def policy_iteration(self) -> Tuple[Dict[DiscreteStateValueType, float], DiscreteDeterminsticPolicy]:
        V, policy = self._policy_iteration()
        state_values_dict = {
            self._state_names[i]: V[i]
            for i in range(V.shape[0])
        }
        return state_values_dict, policy

    def value_iteration(self) -> Tuple[Dict[DiscreteStateValueType, float], DiscreteDeterminsticPolicy]:
        V, policy = self._value_iteration()
        state_values_dict = {
            self._state_names[i]: V[i]
            for i in range(V.shape[0])
        }
        return state_values_dict, policy
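
A sketch of solving a small hypothetical MDP with the value-iteration entry point; note that every (state, action) pair must appear in the factory, since both iteration routines index transition_probabilities[state][action] directly:

from pyrlutils.transition import TransitionProbabilityFactory, NextStateTuple
from pyrlutils.valuefcns import OptimalPolicyOnValueFunctions

# Hypothetical MDP: staying in 'right' pays 1 per step, 'left' pays nothing.
factory = TransitionProbabilityFactory()
factory.add_state_transitions('left', {
    'stay': [NextStateTuple('left', 1.0, 0.0, False)],
    'move': [NextStateTuple('right', 1.0, 0.0, False)],
})
factory.add_state_transitions('right', {
    'stay': [NextStateTuple('right', 1.0, 1.0, False)],
    'move': [NextStateTuple('left', 1.0, 0.0, False)],
})

solver = OptimalPolicyOnValueFunctions(0.9, factory)
state_values, policy = solver.value_iteration()
print(policy.get_action_value('left'))   # 'move': head for the rewarding state
print(state_values)                      # V('right') > V('left')
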
pyrlutils-0.0.4.dist-info/LICENSE
ADDED

@@ -0,0 +1,19 @@
Copyright (c) 2023 Kwan Yuet Stephen Ho

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

pyrlutils-0.0.4.dist-info/METADATA
ADDED

@@ -0,0 +1,42 @@
Metadata-Version: 2.2
Name: pyrlutils
Version: 0.0.4
Summary: Utility and Helpers for Reinformcement Learning
Author-email: Kwan Yuet Stephen Ho <stephenhky@yahoo.com.hk>
License: MIT
Project-URL: Repository, https://github.com/stephenhky/PyRLUtils
Project-URL: Issues, https://github.com/stephenhky/PyRLUtils/issues
Keywords: machine learning,reinforcement leaning,artificial intelligence
Classifier: Topic :: Scientific/Engineering :: Mathematics
Classifier: License :: OSI Approved :: MIT License
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Software Development :: Version Control :: Git
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Intended Audience :: Science/Research
Classifier: Intended Audience :: Developers
Requires-Python: >=3.7
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: numpy
Provides-Extra: openaigym
Requires-Dist: gymnasium; extra == "openaigym"
Provides-Extra: test
Requires-Dist: unittest; extra == "test"

# PyRLUtils

[](https://circleci.com/gh/stephenhky/PyRLUtils.svg)
[](https://github.com/stephenhky/pyqentangle/PyRLUtils)
[](https://pypi.org/project/pyqentangle/)
[](https://pypi.org/project/PyRLUtils/)
[](https://pyup.io/repos/github/stephenhky/PyRLUtils/)
[](https://pyup.io/repos/github/stephenhky/PyRLUtils/)


This is a Python package with utility classes and helper functions for
that facilitates the development of any reinformecement learning projects.

pyrlutils-0.0.4.dist-info/RECORD
ADDED

@@ -0,0 +1,17 @@
pyrlutils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
pyrlutils/action.py,sha256=2kJqNZxsLOV8yOTl-RcpM8b0zu-WXNREJCrl49uZi2c,437
pyrlutils/policy.py,sha256=Cx4vsIXzFZi_KEgI06S378Y5E6g-AfK90skDYoGsfOI,2794
pyrlutils/reward.py,sha256=are0swsobMqI1IbrBVBaPMYXWpJnp6lZwAyfgBEm2zg,1211
pyrlutils/state.py,sha256=w0YJ50FUyNboPoYduLMX1xaBJJHAOaSlsr3Og1dd0dY,7840
pyrlutils/transition.py,sha256=lgh4YfOi-YjSIyymWfrXe-ugDWpZYK3MvjdeehgcQhk,5816
pyrlutils/valuefcns.py,sha256=CJxu0EIFgrdbP0n0x6nzs3X08accFsuJW71tv1rMTkQ,6342
pyrlutils/bandit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
pyrlutils/bandit/algo.py,sha256=X2Pn4DOi-RXWz5CNg1h0RJCoV3VlAwEGHRMjkfbckfw,3969
pyrlutils/bandit/reward.py,sha256=S_uECjMOg3cmK24J-5uPcckLvtxmU4yllR7JEvMwAQE,249
pyrlutils/openai/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
pyrlutils/openai/utils.py,sha256=ehj1cGlDYjQLno3pKMCS3CzZwbZGSTmjxDlU07aSBFo,1033
pyrlutils-0.0.4.dist-info/LICENSE,sha256=bnQPjIcaeBdr2ZofX-_j-nELs8pAx5fQ4Cdfgeaspew,1063
pyrlutils-0.0.4.dist-info/METADATA,sha256=7ncLjVrpqIZpdMFMrRjqRNgfZl9LUKc_SZFkw_CoTFc,2228
pyrlutils-0.0.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
pyrlutils-0.0.4.dist-info/top_level.txt,sha256=gOBuxugE2MA4WDXlLhzkQh_rUonZU6nvJnMuomeHMCU,10
pyrlutils-0.0.4.dist-info/RECORD,,

pyrlutils-0.0.4.dist-info/top_level.txt
ADDED

@@ -0,0 +1 @@
pyrlutils