gr-libs 0.1.7.post0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- gr_libs/__init__.py +4 -1
- gr_libs/_evaluation/__init__.py +1 -0
- gr_libs/_evaluation/_analyze_results_cross_alg_cross_domain.py +260 -0
- gr_libs/_evaluation/_generate_experiments_results.py +141 -0
- gr_libs/_evaluation/_generate_task_specific_statistics_plots.py +497 -0
- gr_libs/_evaluation/_get_plans_images.py +61 -0
- gr_libs/_evaluation/_increasing_and_decreasing_.py +106 -0
- gr_libs/_version.py +2 -2
- gr_libs/all_experiments.py +294 -0
- gr_libs/environment/__init__.py +30 -9
- gr_libs/environment/_utils/utils.py +27 -0
- gr_libs/environment/environment.py +417 -54
- gr_libs/metrics/__init__.py +7 -0
- gr_libs/metrics/metrics.py +231 -54
- gr_libs/ml/__init__.py +2 -5
- gr_libs/ml/agent.py +21 -6
- gr_libs/ml/base/__init__.py +3 -1
- gr_libs/ml/base/rl_agent.py +81 -13
- gr_libs/ml/consts.py +1 -1
- gr_libs/ml/neural/__init__.py +1 -3
- gr_libs/ml/neural/deep_rl_learner.py +619 -378
- gr_libs/ml/neural/utils/__init__.py +1 -2
- gr_libs/ml/neural/utils/dictlist.py +3 -3
- gr_libs/ml/planner/mcts/{utils → _utils}/__init__.py +1 -1
- gr_libs/ml/planner/mcts/{utils → _utils}/node.py +11 -7
- gr_libs/ml/planner/mcts/{utils → _utils}/tree.py +15 -11
- gr_libs/ml/planner/mcts/mcts_model.py +571 -312
- gr_libs/ml/sequential/__init__.py +0 -1
- gr_libs/ml/sequential/_lstm_model.py +270 -0
- gr_libs/ml/tabular/__init__.py +1 -3
- gr_libs/ml/tabular/state.py +7 -7
- gr_libs/ml/tabular/tabular_q_learner.py +150 -82
- gr_libs/ml/tabular/tabular_rl_agent.py +42 -28
- gr_libs/ml/utils/__init__.py +2 -3
- gr_libs/ml/utils/format.py +28 -97
- gr_libs/ml/utils/math.py +5 -3
- gr_libs/ml/utils/other.py +3 -3
- gr_libs/ml/utils/storage.py +88 -81
- gr_libs/odgr_executor.py +268 -0
- gr_libs/problems/consts.py +1549 -1227
- gr_libs/recognizer/_utils/__init__.py +0 -0
- gr_libs/recognizer/_utils/format.py +18 -0
- gr_libs/recognizer/gr_as_rl/gr_as_rl_recognizer.py +233 -88
- gr_libs/recognizer/graml/_gr_dataset.py +233 -0
- gr_libs/recognizer/graml/graml_recognizer.py +586 -252
- gr_libs/recognizer/recognizer.py +90 -30
- gr_libs/tutorials/draco_panda_tutorial.py +58 -0
- gr_libs/tutorials/draco_parking_tutorial.py +56 -0
- gr_libs/tutorials/gcdraco_panda_tutorial.py +62 -0
- gr_libs/tutorials/gcdraco_parking_tutorial.py +57 -0
- gr_libs/tutorials/graml_minigrid_tutorial.py +64 -0
- gr_libs/tutorials/graml_panda_tutorial.py +57 -0
- gr_libs/tutorials/graml_parking_tutorial.py +52 -0
- gr_libs/tutorials/graml_point_maze_tutorial.py +60 -0
- gr_libs/tutorials/graql_minigrid_tutorial.py +50 -0
- {gr_libs-0.1.7.post0.dist-info → gr_libs-0.2.2.dist-info}/METADATA +84 -29
- gr_libs-0.2.2.dist-info/RECORD +71 -0
- {gr_libs-0.1.7.post0.dist-info → gr_libs-0.2.2.dist-info}/WHEEL +1 -1
- gr_libs-0.2.2.dist-info/top_level.txt +2 -0
- tests/test_draco.py +14 -0
- tests/test_gcdraco.py +10 -0
- tests/test_graml.py +12 -8
- tests/test_graql.py +3 -2
- evaluation/analyze_results_cross_alg_cross_domain.py +0 -277
- evaluation/create_minigrid_map_image.py +0 -34
- evaluation/file_system.py +0 -42
- evaluation/generate_experiments_results.py +0 -92
- evaluation/generate_experiments_results_new_ver1.py +0 -254
- evaluation/generate_experiments_results_new_ver2.py +0 -331
- evaluation/generate_task_specific_statistics_plots.py +0 -272
- evaluation/get_plans_images.py +0 -47
- evaluation/increasing_and_decreasing_.py +0 -63
- gr_libs/environment/utils/utils.py +0 -17
- gr_libs/ml/neural/utils/penv.py +0 -57
- gr_libs/ml/sequential/lstm_model.py +0 -192
- gr_libs/recognizer/graml/gr_dataset.py +0 -134
- gr_libs/recognizer/utils/__init__.py +0 -1
- gr_libs/recognizer/utils/format.py +0 -13
- gr_libs-0.1.7.post0.dist-info/RECORD +0 -67
- gr_libs-0.1.7.post0.dist-info/top_level.txt +0 -4
- tutorials/graml_minigrid_tutorial.py +0 -34
- tutorials/graml_panda_tutorial.py +0 -41
- tutorials/graml_parking_tutorial.py +0 -39
- tutorials/graml_point_maze_tutorial.py +0 -39
- tutorials/graql_minigrid_tutorial.py +0 -34
- /gr_libs/environment/{utils → _utils}/__init__.py +0 -0
gr_libs/metrics/metrics.py
CHANGED
@@ -1,22 +1,24 @@
+""" metrics for GR algorithms, to perform distance, similarity, likelihood and other measurements and metrics. """
+
 import math
+from collections.abc import Callable, Generator
+from math import log2
+from typing import Any
+
 import dill
 import gymnasium
 import numpy as np
-
-from typing import Callable, Generator, List, Dict, Tuple, Any
-from math import log2
-from scipy.stats import wasserstein_distance
 from gymnasium.spaces.discrete import Discrete
-
-# from torch.distributions.categorical import Categorical
+from scipy.stats import wasserstein_distance
 
 from ..ml.base import State
 from ..ml.base.rl_agent import RLAgent
 from ..ml.neural.deep_rl_learner import DeepRLAgent
 
 
-def kl_divergence(p1: List[float], p2: List[float]) -> float:
-    """
+def kl_divergence(p1: list[float], p2: list[float]) -> float:
+    """
+    Computes Kullback–Leibler divergence from two probabilities distributions p1 and p2.
     We follow the formula in Wikipedia https://en.wikipedia.org/wiki/Kullback–Leibler_divergence
 
     Args:
@@ -26,21 +28,35 @@ def kl_divergence(p1: List[float], p2: List[float]) -> float:
     Returns:
         float: The KL-divergence between p1 and p2
     """
-    assert
+    assert len(p1) == len(p2)
     return sum(p1[i] * log2(p1[i] / p2[i]) for i in range(len(p1)))
 
 
-def kl_divergence_norm_softmax(
+def kl_divergence_norm_softmax(
+    observations: list[tuple[State, Any]], agent, actions: Discrete
+):
+    """
+    Calculates the Kullback-Leibler (KL) divergence between two probability distributions.
+
+    Args:
+        observations (list[tuple[State, Any]]): List of observations and corresponding actions.
+        agent: The agent object.
+        actions: The discrete actions.
+
+    Returns:
+        float: The mean KL divergence between the two distributions.
+    """
     distances = []
     p_traj = traj_to_policy(observations=observations, actions=actions)
 
     for (observation, agent_pos), action in observations:
-        state = observation[
+        state = observation["image"]
        state_pickled = dill.dumps(state)
 
         qp1 = p_traj[state_pickled]
-        qp2_flatten_distribution_list:
-            observation=(observation, agent_pos)
+        qp2_flatten_distribution_list: list[float] = agent.get_actions_probabilities(
+            observation=(observation, agent_pos)
+        )
         distances.append(kl_divergence(qp1, qp2_flatten_distribution_list))
     return np.mean(distances)
 
@@ -53,22 +69,58 @@ def amplify(values, alpha=1.0):
     Returns:
         np.array: amplified softmax probabilities
     """
-    values = values[:3]**alpha
+    values = values[:3] ** alpha  # currently only choose to turn or move forward
     return values / np.sum(values)
 
+
 def stochastic_amplified_selection(actions_probs, alpha=8.0):
+    """
+    Selects an action based on the given action probabilities, with amplification using the specified alpha value.
+
+    Parameters:
+        actions_probs (list): A list of action probabilities.
+        alpha (float): Amplification factor (default: 8.0).
+
+    Returns:
+        int: The selected action.
+
+    """
     action_probs_amplified = amplify(actions_probs, alpha)
     choice = np.random.choice(len(action_probs_amplified), p=action_probs_amplified)
     if choice == 3:
         choice = 6
     return choice
 
+
+import numpy as np
+
+
 def stochastic_selection(actions_probs):
+    """
+    Selects an action based on the given probabilities using a stochastic selection method.
+
+    Parameters:
+        actions_probs (list): A list of probabilities for each action.
+
+    Returns:
+        int: The index of the selected action.
+    """
     return np.random.choice(len(actions_probs), p=actions_probs)
 
+
 def greedy_selection(actions_probs):
+    """
+    Selects the action with the highest probability.
+
+    Args:
+        actions_probs (numpy.ndarray): Array of action probabilities.
+
+    Returns:
+        int: Index of the selected action.
+    """
     return np.argmax(actions_probs)
 
+
 def measure_average_sequence_distance(seq1, seq2):
     """Measures the sequence similarity between two sequences of observations and actions.
 
@@ -82,45 +134,89 @@ def measure_average_sequence_distance(seq1, seq2):
 
     # Ensure both sequences have the same length
     min_seq_len = np.min([len(seq1), len(seq2)])
-    assert
+    assert (
+        np.max([len(seq1), len(seq2)]) <= 30 * min_seq_len
+    ), "We can't really measure similarity in case the sequences are really not the same... maybe just return a default NOT_SIMILAR here."
 
     # Calculate the Euclidean distance between corresponding elements in the sequences
     distances = []
     for i in range(0, min_seq_len):
-        distances.append(np.sum(np.abs(np.array(seq1[i])-np.array(seq2[i]))))
+        distances.append(np.sum(np.abs(np.array(seq1[i]) - np.array(seq2[i]))))
 
     # Calculate the average distance over all elements
     return np.mean(np.array(distances))
 
 
-def traj_to_policy(
-
-
-
+def traj_to_policy(
+    observations: list[tuple[State, Any]], actions: Discrete, epsilon: float = 0.0
+) -> dict[str, list[float]]:
+    """
+    Converts a trajectory from a planner to a policy.
+
+    Args:
+        observations (list[tuple[State, Any]]): List of tuples containing the observation and the corresponding action.
+        actions (Discrete): Discrete action space.
+        epsilon (float, optional): Exploration parameter. Defaults to 0.0.
+
+    Returns:
+        dict[str, list[float]]: Dictionary mapping serialized states to action probabilities.
+    """
     trajectory_as_policy = {}
-    for (observation,
-        # in the discrete world the action is the index
+    for (observation, _agent_pos), action in observations:
         action_index = action
 
         actions_len = actions.n
         qs = [1e-6 + epsilon / actions_len for _ in range(actions_len)]
-        qs[action_index] = 1. - 1e-6 * (actions_len - 1) - epsilon
+        qs[action_index] = 1.0 - 1e-6 * (actions_len - 1) - epsilon
 
-        state = observation[
+        state = observation["image"]
         state_pickled = dill.dumps(state)
         trajectory_as_policy[state_pickled] = qs
     return trajectory_as_policy
 
-
-
-
+
+from collections.abc import Generator
+from typing import Any
+
+
+def pass_observation_patcher(
+    observations: list[Any], agent: RLAgent
+) -> Generator[None, None, None]:
+    """
+    Generator function that yields observations.
+
+    Args:
+        observations (list): List of observations.
+        agent (RLAgent): RL agent object.
+
+    Yields:
+        None: Yields each observation from the list.
+
+    """
+    yield from observations
+
 
 def mean_wasserstein_distance(
-
-
-
-
+    observations: list[tuple[State, Any]],
+    agent: DeepRLAgent,
+    actions: gymnasium.spaces.Box,
+    observation_patcher: Callable[
+        [list[Any], RLAgent], Generator[None, None, None]
+    ] = pass_observation_patcher,
 ):
+    """
+    Calculates the mean Wasserstein distance between observed actions and actor means.
+
+    Args:
+        observations (list[tuple[State, Any]]): List of observations and corresponding actions.
+        agent (DeepRLAgent): The deep reinforcement learning agent.
+        actions (gymnasium.spaces.Box): The action space.
+        observation_patcher (Callable[[list[Any], RLAgent], Generator[None, None, None]], optional):
+            A function that patches the observations. Defaults to pass_observation_patcher.
+
+    Returns:
+        float: The mean Wasserstein distance between observed actions and actor means.
+    """
     distances = []
 
     for observation, observed_action in observation_patcher(observations, agent):
@@ -141,49 +237,98 @@ def mean_wasserstein_distance(
         wasserstein_distances.append(
             wasserstein_distance([observation_action], [actor_mean])
         )
-        distances.append(mean(wasserstein_distances))
-    return mean(distances)
+        distances.append(np.mean(wasserstein_distances))
+    return np.mean(distances)
 
 
-def mean_action_distance_continuous(
+def mean_action_distance_continuous(
+    observations: list[tuple[State, Any]],
+    agent: DeepRLAgent,
+    actions: gymnasium.spaces.Box,
+):
+    """
+    Calculates the mean distance between the predicted actions and the actual actions for a continuous action space.
+
+    Args:
+        observations (list[tuple[State, Any]]): A list of tuples containing the observations and corresponding actions.
+        agent (DeepRLAgent): The deep reinforcement learning agent used to predict actions.
+        actions (gymnasium.spaces.Box): The action space.
+
+    Returns:
+        float: The mean distance between the predicted actions and the actual actions.
+    """
     distances = []
     for observation, action in observations:
         action2, _ = agent.model.predict(
             observation,
             state=None,
             deterministic=True,
-            episode_start=np.ones((1,), dtype=bool)
+            episode_start=np.ones((1,), dtype=bool),
         )
         action_arr, action2_arr = action[0], action2[0]
         print(f"actor means:{action2}")
-        assert len(action_arr) == len(
+        assert len(action_arr) == len(
+            action2_arr
+        ), f"Actions should be on the same length:{action},{action2}"
 
         total_diff = 0
-        # total_diff = []
         for action1, action2 in zip(action_arr, action2_arr):
             total_diff += math.fabs(action1 - action2)
-        # distances.append(statistics.mean(total_diff))
         distances.append(total_diff)
-
-
+    return np.mean(distances)
+
+
+from collections.abc import Generator
+from typing import Any
 
 
-def set_agent_goal_observation(
+def set_agent_goal_observation(
+    observations: list[Any], agent: RLAgent
+) -> Generator[None, None, None]:
+    """
+    Sets the desired goal in each observation to the agent's goal.
+
+    Args:
+        observations (list): List of observations.
+        agent (RLAgent): The RL agent.
+
+    Yields:
+        tuple: A tuple containing the modified observation and the corresponding action.
+    """
     copy_observation = observations.copy()
     for observation, action in copy_observation:
-        observation[
+        observation["desired_goal"] = agent.goal
         yield observation, action
 
 
 def z_score(x, mean_action: float, std_dev: float):
     return (x - mean_action) / std_dev
 
+
 def mean_p_value(
-
-
-
-
+    observations: list[tuple[State, Any]],
+    agent: DeepRLAgent,
+    actions: gymnasium.spaces.Box,
+    observation_patcher: Callable[
+        [list[Any], RLAgent], Generator[None, None, None]
+    ] = pass_observation_patcher,
 ):
+    """
+    Calculate the mean p-value for a given set of observations.
+
+    Args:
+        observations (list[tuple[State, Any]]): List of observations and corresponding actions.
+        agent (DeepRLAgent): The deep reinforcement learning agent.
+        actions (gymnasium.spaces.Box): The action space.
+        observation_patcher (Callable[[list[Any], RLAgent], Generator[None, None, None]], optional):
+            A function that patches the observations. Defaults to pass_observation_patcher.
+
+    Returns:
+        float: The mean p-value.
+
+    Raises:
+        Exception: If the lengths of observed actions, actor mean, and std-dev are not equal.
+    """
     distances = []
     for observation, observed_action in observation_patcher(observations, agent):
         # execute prediction X times and add to list (observed_action * X) |X| Len
@@ -194,30 +339,62 @@ def mean_p_value(
         observed_actions = observed_action[0]
         log_std_dev = log_std_dev[0]
 
-        if
+        if (
+            len(actor_means) != len(observed_actions)
+            or len(actor_means) != len(log_std_dev)
+            or len(observed_actions) != len(log_std_dev)
+        ):
             raise Exception(
                 f"Length of observed actions, actor mean and std-dev should be equal! "
                 f"{len(observed_actions)},{len(actor_means)},{len(log_std_dev)}"
             )
         z_scores = []
-        for actor_mean, observation_action, action_log_std_dev in zip(
+        for actor_mean, observation_action, action_log_std_dev in zip(
+            actor_means, observed_actions, log_std_dev
+        ):
             z_scores.append(
-                math.fabs(
+                math.fabs(
+                    z_score(
+                        observation_action,
+                        actor_mean,
+                        math.pow(2, math.fabs(action_log_std_dev)),
+                    )
+                )
             )
-        mean_distances = mean(z_scores)
+        mean_distances = np.mean(z_scores)
 
         distances.append(mean_distances)
-    return mean(distances)
+    return np.mean(distances)
+
 
-def normalize(values:
+def normalize(values: list[float]) -> list[float]:
+    """
+    Normalize a list of values by dividing each value by the sum of all values.
+
+    Args:
+        values (list[float]): The list of values to be normalized.
+
+    Returns:
+        list[float]: The normalized list of values.
+    """
     values /= sum(values)
     return values
 
-
+
+def maximum(values: list[float]) -> list[float]:
+    """
+    Returns a list with the same length as the input list, where the maximum value is set to 1.0 and all other values are set to 0.0.
+
+    Args:
+        values (list[float]): The input list of values.
+
+    Returns:
+        list[float]: A list with the same length as the input list, where the maximum value is set to 1.0 and all other values are set to 0.0.
+    """
    if not len(values):
         return values
     vals = np.array(values)
     argmax = vals.argmax()
     vals[:] = 0.0
     vals[argmax] = 1.0
-    return vals
+    return vals
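For orientation, here is a minimal usage sketch of the reworked metrics helpers. Only the function names and signatures come from the diff above; the gr_libs.metrics.metrics import path mirrors the file path shown here, and the toy inputs are made up for illustration.

import numpy as np

from gr_libs.metrics.metrics import (
    kl_divergence,
    measure_average_sequence_distance,
    stochastic_amplified_selection,
)

# Two discrete distributions over the same support; KL is sum_i p1[i] * log2(p1[i] / p2[i]).
p1 = [0.7, 0.2, 0.1]
p2 = [0.6, 0.3, 0.1]
print(kl_divergence(p1, p2))

# amplify() exponentiates only the first three probabilities, so pass a numpy array.
action = stochastic_amplified_selection(np.array([0.25, 0.25, 0.25, 0.25]), alpha=8.0)

# Average element-wise L1 distance between two observation sequences of comparable length.
seq_a = [[0.0, 1.0], [1.0, 1.0], [2.0, 1.0]]
seq_b = [[0.0, 0.5], [1.5, 1.0], [2.0, 2.0]]
print(measure_average_sequence_distance(seq_a, seq_b))

Note that kl_divergence divides by p2[i], so p2 must be strictly positive wherever p1 is.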
gr_libs/ml/__init__.py
CHANGED
@@ -1,6 +1,3 @@
-from ..ml.utils import device, seed, synthesize
-# from ml.neural import PPOAlgo
-from ..ml.tabular import TabularQLearner
 # from ml.neural import ACModel, RecurrentACModel
-
-from
+
+# from ml.neural import PPOAlgo
gr_libs/ml/agent.py
CHANGED
@@ -2,6 +2,7 @@ import torch
 
 from gr_libs.ml import utils
 from gr_libs.ml.utils.other import device
+
 # from ml.neural import ACModel
 
 
@@ -12,15 +13,27 @@ class Agent:
     - to choose an action given an observation,
     - to analyze the feedback (i.e. reward and done state) of its action."""
 
-    def __init__(
-
+    def __init__(
+        self,
+        obs_space,
+        action_space,
+        model_dir,
+        argmax=False,
+        num_envs=1,
+        use_memory=True,
+        use_text=False,
+    ):
         obs_space, self.preprocess_obss = utils.get_obss_preprocessor(obs_space)
-        self.acmodel = ACModel(
+        self.acmodel = ACModel(
+            obs_space, action_space, use_memory=use_memory, use_text=use_text
+        )
         self.argmax = argmax
         self.num_envs = num_envs
 
         if self.acmodel.recurrent:
-            self.memories = torch.zeros(
+            self.memories = torch.zeros(
+                self.num_envs, self.acmodel.memory_size, device=device
+            )
 
         self.acmodel.load_state_dict(utils.get_model_state(model_dir))
         self.acmodel.to(device)
@@ -49,8 +62,10 @@ class Agent:
 
     def analyze_feedbacks(self, rewards, dones):
         if self.acmodel.recurrent:
-            masks = 1 - torch.tensor(dones, dtype=torch.float, device=device).unsqueeze(
+            masks = 1 - torch.tensor(dones, dtype=torch.float, device=device).unsqueeze(
+                1
+            )
             self.memories *= masks
 
     def analyze_feedback(self, reward, done):
-        return self.analyze_feedbacks([reward], [done])
+        return self.analyze_feedbacks([reward], [done])
gr_libs/ml/base/__init__.py
CHANGED
gr_libs/ml/base/rl_agent.py
CHANGED
@@ -1,26 +1,61 @@
-from typing import Any
 from abc import ABC, abstractmethod
-
+from typing import Any
 
 State = Any
 
+
 class ContextualAgent:
+    """
+    A class representing a contextual agent for reinforcement learning, including gym properties.
+
+    Args:
+        problem_name (str): The name of the problem the agent is designed to solve.
+        problem_goal (str): The goal of the problem the agent is designed to achieve.
+        agent: The underlying agent implementation.
+
+    Attributes:
+        problem_name (str): The name of the problem the agent is designed to solve.
+        problem_goal (str): The goal of the problem the agent is designed to achieve.
+        agent: The underlying agent implementation.
+    """
+
     def __init__(self, problem_name, problem_goal, agent):
+        """
+        Initializes a reinforcement learning agent.
+
+        Args:
+            problem_name (str): The name of the problem.
+            problem_goal (str): The goal of the problem.
+            agent: The agent object.
+        """
         self.problem_name = problem_name
         self.problem_goal = problem_goal
         self.agent = agent
 
+
 class RLAgent(ABC):
     def __init__(
-
-
-
-
-
-
-
-
+        self,
+        episodes: int,
+        decaying_eps: bool,
+        epsilon: float,
+        learning_rate: float,
+        gamma: float,
+        problem_name: str,
+        domain_name: str,
     ):
+        """
+        Initializes a reinforcement learning agent.
+
+        Args:
+            episodes (int): The number of episodes to train the agent.
+            decaying_eps (bool): Whether to use decaying epsilon-greedy exploration.
+            epsilon (float): The exploration rate.
+            learning_rate (float): The learning rate.
+            gamma (float): The discount factor.
+            problem_name (str): The name of the problem.
+            domain_name (str): The name of the domain.
+        """
         self.episodes = episodes
         self.decaying_eps = decaying_eps
         self.epsilon = epsilon
@@ -33,22 +68,55 @@ class RLAgent(ABC):
 
     @abstractmethod
     def learn(self):
-
+        """
+        Abstract method for the agent to learn from the environment.
+        """
 
     def class_name(self):
+        """
+        Returns the name of the agent's class.
+
+        Returns:
+            str: The name of the agent's class.
+        """
         return self.__class__.__name__
 
     def get_actions_probabilities(self, observation):
+        """
+        Get the probabilities of available actions given an observation.
+
+        Args:
+            observation: The observation from the environment.
+
+        Raises:
+            Exception: This function is unimplemented.
+
+        Returns:
+            Any: The probabilities of available actions.
+        """
         raise Exception("function get_actions_probabilities is unimplemented")
 
     def get_number_of_unique_states(self):
+        """
+        Get the number of unique states encountered by the agent.
+
+        Returns:
+            int: The number of unique states encountered.
+        """
         return len(self.states_counter)
 
     def update_states_counter(self, observation_str: str):
+        """
+        Update the counter for the number of times each observation state is encountered.
+
+        Args:
+            observation_str (str): The string representation of the observation state.
+        """
         if observation_str in self.states_counter:
-            self.states_counter[observation_str] =
+            self.states_counter[observation_str] = (
+                self.states_counter[observation_str] + 1
+            )
         else:
             self.states_counter[observation_str] = 1
         if len(self.states_counter) % 10000 == 0:
             print(f"probably error to many {len(self.states_counter)}")
-
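A minimal sketch of how a concrete agent might subclass the now-documented RLAgent base. It assumes the part of RLAgent.__init__ not shown in this hunk also initializes self.states_counter, as the helper methods above imply; the problem and domain names below are placeholders, not identifiers from the package.

from gr_libs.ml.base.rl_agent import RLAgent


class CountingAgent(RLAgent):
    """Toy subclass that only exercises the base-class bookkeeping API."""

    def learn(self):
        # A real agent would interact with its environment here; we just
        # record one synthetic state per episode via the base-class counter.
        for episode in range(self.episodes):
            self.update_states_counter(f"episode-{episode}")


agent = CountingAgent(
    episodes=10,
    decaying_eps=True,
    epsilon=0.1,
    learning_rate=0.001,
    gamma=0.99,
    problem_name="example-problem",  # placeholder, not a package identifier
    domain_name="example-domain",  # placeholder, not a package identifier
)
agent.learn()
print(agent.class_name(), agent.get_number_of_unique_states())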
gr_libs/ml/consts.py
CHANGED
gr_libs/ml/neural/__init__.py
CHANGED