gr-libs 0.1.7.post0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- gr_libs/__init__.py +4 -1
- gr_libs/_evaluation/__init__.py +1 -0
- gr_libs/_evaluation/_analyze_results_cross_alg_cross_domain.py +260 -0
- gr_libs/_evaluation/_generate_experiments_results.py +141 -0
- gr_libs/_evaluation/_generate_task_specific_statistics_plots.py +497 -0
- gr_libs/_evaluation/_get_plans_images.py +61 -0
- gr_libs/_evaluation/_increasing_and_decreasing_.py +106 -0
- gr_libs/_version.py +2 -2
- gr_libs/all_experiments.py +294 -0
- gr_libs/environment/__init__.py +30 -9
- gr_libs/environment/_utils/utils.py +27 -0
- gr_libs/environment/environment.py +417 -54
- gr_libs/metrics/__init__.py +7 -0
- gr_libs/metrics/metrics.py +231 -54
- gr_libs/ml/__init__.py +2 -5
- gr_libs/ml/agent.py +21 -6
- gr_libs/ml/base/__init__.py +3 -1
- gr_libs/ml/base/rl_agent.py +81 -13
- gr_libs/ml/consts.py +1 -1
- gr_libs/ml/neural/__init__.py +1 -3
- gr_libs/ml/neural/deep_rl_learner.py +619 -378
- gr_libs/ml/neural/utils/__init__.py +1 -2
- gr_libs/ml/neural/utils/dictlist.py +3 -3
- gr_libs/ml/planner/mcts/{utils → _utils}/__init__.py +1 -1
- gr_libs/ml/planner/mcts/{utils → _utils}/node.py +11 -7
- gr_libs/ml/planner/mcts/{utils → _utils}/tree.py +15 -11
- gr_libs/ml/planner/mcts/mcts_model.py +571 -312
- gr_libs/ml/sequential/__init__.py +0 -1
- gr_libs/ml/sequential/_lstm_model.py +270 -0
- gr_libs/ml/tabular/__init__.py +1 -3
- gr_libs/ml/tabular/state.py +7 -7
- gr_libs/ml/tabular/tabular_q_learner.py +150 -82
- gr_libs/ml/tabular/tabular_rl_agent.py +42 -28
- gr_libs/ml/utils/__init__.py +2 -3
- gr_libs/ml/utils/format.py +28 -97
- gr_libs/ml/utils/math.py +5 -3
- gr_libs/ml/utils/other.py +3 -3
- gr_libs/ml/utils/storage.py +88 -81
- gr_libs/odgr_executor.py +268 -0
- gr_libs/problems/consts.py +1549 -1227
- gr_libs/recognizer/_utils/__init__.py +0 -0
- gr_libs/recognizer/_utils/format.py +18 -0
- gr_libs/recognizer/gr_as_rl/gr_as_rl_recognizer.py +233 -88
- gr_libs/recognizer/graml/_gr_dataset.py +233 -0
- gr_libs/recognizer/graml/graml_recognizer.py +586 -252
- gr_libs/recognizer/recognizer.py +90 -30
- gr_libs/tutorials/draco_panda_tutorial.py +58 -0
- gr_libs/tutorials/draco_parking_tutorial.py +56 -0
- gr_libs/tutorials/gcdraco_panda_tutorial.py +62 -0
- gr_libs/tutorials/gcdraco_parking_tutorial.py +57 -0
- gr_libs/tutorials/graml_minigrid_tutorial.py +64 -0
- gr_libs/tutorials/graml_panda_tutorial.py +57 -0
- gr_libs/tutorials/graml_parking_tutorial.py +52 -0
- gr_libs/tutorials/graml_point_maze_tutorial.py +60 -0
- gr_libs/tutorials/graql_minigrid_tutorial.py +50 -0
- {gr_libs-0.1.7.post0.dist-info → gr_libs-0.2.2.dist-info}/METADATA +84 -29
- gr_libs-0.2.2.dist-info/RECORD +71 -0
- {gr_libs-0.1.7.post0.dist-info → gr_libs-0.2.2.dist-info}/WHEEL +1 -1
- gr_libs-0.2.2.dist-info/top_level.txt +2 -0
- tests/test_draco.py +14 -0
- tests/test_gcdraco.py +10 -0
- tests/test_graml.py +12 -8
- tests/test_graql.py +3 -2
- evaluation/analyze_results_cross_alg_cross_domain.py +0 -277
- evaluation/create_minigrid_map_image.py +0 -34
- evaluation/file_system.py +0 -42
- evaluation/generate_experiments_results.py +0 -92
- evaluation/generate_experiments_results_new_ver1.py +0 -254
- evaluation/generate_experiments_results_new_ver2.py +0 -331
- evaluation/generate_task_specific_statistics_plots.py +0 -272
- evaluation/get_plans_images.py +0 -47
- evaluation/increasing_and_decreasing_.py +0 -63
- gr_libs/environment/utils/utils.py +0 -17
- gr_libs/ml/neural/utils/penv.py +0 -57
- gr_libs/ml/sequential/lstm_model.py +0 -192
- gr_libs/recognizer/graml/gr_dataset.py +0 -134
- gr_libs/recognizer/utils/__init__.py +0 -1
- gr_libs/recognizer/utils/format.py +0 -13
- gr_libs-0.1.7.post0.dist-info/RECORD +0 -67
- gr_libs-0.1.7.post0.dist-info/top_level.txt +0 -4
- tutorials/graml_minigrid_tutorial.py +0 -34
- tutorials/graml_panda_tutorial.py +0 -41
- tutorials/graml_parking_tutorial.py +0 -39
- tutorials/graml_point_maze_tutorial.py +0 -39
- tutorials/graql_minigrid_tutorial.py +0 -34
- /gr_libs/environment/{utils → _utils}/__init__.py +0 -0
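
The most visible structural change in the file list above is packaging: the top-level `tutorials/` and `evaluation/` directories moved inside the package (as `gr_libs.tutorials` and the private `gr_libs._evaluation`), and several `utils` packages were renamed to `_utils`. A minimal sketch of what that means for downstream imports; the module paths come from the file list, while the commented entry point is an assumption not shown in this diff:

```python
# 0.1.7.post0: tutorials shipped as a separate top-level package
# from tutorials import graml_minigrid_tutorial

# 0.2.2: tutorials live inside the gr_libs package itself
from gr_libs.tutorials import graml_minigrid_tutorial

# Hypothetical entry point -- the function names inside the tutorial module
# are not visible in this diff:
# graml_minigrid_tutorial.run()
```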
gr_libs/ml/neural/deep_rl_learner.py (+619 -378):

```diff
@@ -1,393 +1,634 @@
-from collections import OrderedDict
 import gc
+from collections import OrderedDict
 from types import MethodType
-
-import gymnasium as gym
-import numpy as np
+
 import cv2
+import numpy as np
 
-
+from gr_libs.environment.environment import EnvProperty
 
 if __name__ != "__main__":
- [... old lines 12-15 deleted; their content is not shown in this rendering ...]
-    from gr_libs.ml.utils import device
+    from gr_libs.ml.utils.storage import get_agent_model_dir
+    from gr_libs.ml.utils.format import random_subset_with_order
+
+import os
 
 # built-in python modules
 import random
-import os
-import sys
-
-def create_vec_env(kwargs):
-    # create the model, it will not be a pretrained one anyway
-    # env = gym.make(**kwargs)
-    env = gym.make(**kwargs)
-    return DummyVecEnv([lambda: env])
-
-def change_goal_to_specific_desired(obs, desired):
-    if desired is not None:
-        obs['desired_goal'] = desired
-    # try:
-    #     if desired!=None: obs['desired_goal'] = desired
-    # except Exception as e:
-    #     try:
-    #         if all(desired!=None): obs['desired_goal'] = desired
-    #     except Exception as e:
-    #         if all([desiredy!=None for desiredish in desired for desiredy in desiredish]): obs['desired_goal'] = desired
 
+import gymnasium as gym
+from stable_baselines3 import PPO, SAC, TD3
+from stable_baselines3.common.base_class import BaseAlgorithm
+
+from gr_libs.ml.utils import device
 
+# TODO do we need this?
 NETWORK_SETUP = {
- [... old lines 42-44 deleted; their content is not shown in this rendering ...]
+    SAC: OrderedDict(
+        [
+            ("batch_size", 512),
+            ("buffer_size", 100000),
+            ("ent_coef", "auto"),
+            ("gamma", 0.95),
+            ("learning_rate", 0.001),
+            ("learning_starts", 5000),
+            ("n_timesteps", 50000.0),
+            ("normalize", "{'norm_obs': False, 'norm_reward': False}"),
+            ("policy", "MultiInputPolicy"),
+            ("policy_kwargs", "dict(net_arch=[64, 64])"),
+            ("replay_buffer_class", "HerReplayBuffer"),
+            (
+                "replay_buffer_kwargs",
+                "dict( goal_selection_strategy='future', n_sampled_goal=4 )",
+            ),
+            ("normalize_kwargs", {"norm_obs": False, "norm_reward": False}),
+        ]
+    ),
+    PPO: OrderedDict(
+        [
+            ("batch_size", 256),
+            ("ent_coef", 0.01),
+            ("gae_lambda", 0.9),
+            ("gamma", 0.99),
+            ("learning_rate", "lin_0.0001"),
+            ("max_grad_norm", 0.5),
+            ("n_envs", 8),
+            ("n_epochs", 20),
+            ("n_steps", 8),
+            ("n_timesteps", 25000.0),
+            ("normalize_advantage", False),
+            ("policy", "MultiInputPolicy"),
+            ("policy_kwargs", "dict(log_std_init=-2, ortho_init=False)"),
+            ("use_sde", True),
+            ("vf_coef", 0.4),
+            ("normalize", False),
+            ("normalize_kwargs", {"norm_obs": False, "norm_reward": False}),
+        ]
+    ),
 }
 
- [... old lines 47-350 deleted; their content is not shown in this rendering ...]
+
+class DeepRLAgent:
+    """
+    Deep Reinforcement Learning Agent, wrapping a SB3 agent and adding functionality,
+    needed for GR framework executions such as observation generation and video recording.
+    Supports SAC, PPO and TD3 algorithms.
+    Can be loaded from rl_zoo or trained from scratch.
+
+    Args:
+        domain_name (str): The domain name.
+        problem_name (str): The problem name.
+        num_timesteps (float): The number of timesteps for training.
+        env_prop (EnvProperty): The environment property.
+        algorithm (BaseAlgorithm, optional): The algorithm to use. Defaults to SAC.
+        reward_threshold (float, optional): The reward threshold. Defaults to 450.
+        exploration_rate (float, optional): The exploration rate. Defaults to None.
+    """
+
+    def __init__(
+        self,
+        domain_name: str,
+        problem_name: str,
+        num_timesteps: float,
+        env_prop: EnvProperty,
+        algorithm: BaseAlgorithm = SAC,
+        reward_threshold: float = 450,
+        exploration_rate=None,
+    ):
+        """
+        Initialize the DeepRLLearner object.
+
+        Args:
+            domain_name (str): The name of the domain.
+            problem_name (str): The name of the problem.
+            num_timesteps (float): The number of timesteps.
+            env_prop (EnvProperty): The environment property.
+            algorithm (BaseAlgorithm, optional): The algorithm to use. Defaults to SAC.
+            reward_threshold (float, optional): The reward threshold. Defaults to 450.
+            exploration_rate (float, optional): The exploration rate. Defaults to None.
+        """
+        env_kwargs = {"id": problem_name, "render_mode": "rgb_array"}
+        assert algorithm in [SAC, PPO, TD3]
+
+        self.domain_name = domain_name
+        self.problem_name = problem_name
+        self.env_prop = env_prop
+        self.exploration_rate = exploration_rate
+
+        self._model_directory = get_agent_model_dir(
+            domain_name=self.domain_name,
+            model_name=problem_name,
+            class_name=algorithm.__name__,
+        )
+        self.env = self.env_prop.create_vec_env(env_kwargs)
+        self._actions_space = self.env.action_space
+
+        # first_support: SB3 models from RL zoo, with the .zip format.
+        if os.path.exists(os.path.join(self._model_directory, "saved_model.zip")):
+            # TODO check if it's ncessary to give these to the model.load if loading from rl zoo
+            self._model_file_path = os.path.join(
+                self._model_directory, "saved_model.zip"
+            )
+            self.model_kwargs = {
+                "custom_objects": {
+                    "learning_rate": 0.0,
+                    "lr_schedule": lambda _: 0.0,
+                    "clip_range": lambda _: 0.0,
+                },
+                "seed": 0,
+                "buffer_size": 1,
+            }
+        # second support: models saved with SB3's model.save, which is saved as a
+        # formatted .pth file.
+        else:
+            self.model_kwargs = {}
+            self._model_file_path = os.path.join(
+                self._model_directory, "saved_model.pth"
+            )
+
+        self.algorithm = algorithm
+        self.reward_threshold = reward_threshold
+        self.num_timesteps = num_timesteps
+
+    def save_model(self):
+        """Save the model to a file."""
+        self._model.save(self._model_file_path)
+
+    def try_recording_video(self, video_path, desired=None):
+        """
+        Try recording a video of the agent's performance.
+
+        Args:
+            video_path (str): The path to save the video.
+            desired (optional): The desired goal. Defaults to None.
+        """
+        num_tries = 0
+        while True:
+            if num_tries >= 10:
+                assert False, "agent keeps failing on recording an optimal obs."
+            try:
+                self.record_video(video_path, desired)
+                break
+            except Exception:
+                num_tries += 1
+        # print(f"sequence to {self.problem_name} is:\n\t{steps}\ngenerating image at {img_path}.")
+        print(f"generated sequence video at {video_path}.")
+
+    def record_video(self, video_path, desired=None):
+        """
+        Record a video of the agent's performance.
+
+        Args:
+            video_path (str): The path to save the video.
+            desired (optional): The desired goal. Defaults to None.
+        """
+        fourcc = cv2.VideoWriter_fourcc("m", "p", "4", "v")
+        fps = 30.0
+        # if is_gc:
+        #     assert goal_idx is not None
+        #     self.reset_with_goal_idx(goal_idx)
+        # else:
+        #     assert goal_idx is None
+        self.env.reset()
+        frame_size = (
+            self.env.render(mode="rgb_array").shape[1],
+            self.env.render(mode="rgb_array").shape[0],
+        )
+        video_path = os.path.join(video_path, "plan_video.mp4")
+        video_writer = cv2.VideoWriter(video_path, fourcc, fps, frame_size)
+        general_done, success_done = False, False
+        gc.collect()
+        obs = self.env.reset()
+        self.env_prop.change_goal_to_specific_desired(obs, desired)
+        counter = 0
+        while not (general_done or success_done):
+            counter += 1
+            action, _states = self._model.predict(obs, deterministic=False)
+            obs, rewards, general_done, info = self.env.step(action)
+            if isinstance(general_done, np.ndarray):
+                general_done = general_done[0]
+            self.env_prop.change_goal_to_specific_desired(obs, desired)
+            if "success" in info[0].keys():
+                success_done = info[0][
+                    "success"
+                ]  # make sure the agent actually reached the goal within the max time
+            elif "is_success" in info[0].keys():
+                success_done = info[0][
+                    "is_success"
+                ]  # make sure the agent actually reached the goal within the max time
+            elif "step_task_completions" in info[0].keys():
+                success_done = (
+                    len(info[0]["step_task_completions"]) == 1
+                )  # bug of dummyVecEnv, it removes the episode_task_completions from the info dict.
+            else:
+                raise NotImplementedError(
+                    "no other option for any of the environments."
+                )
+            frame = self.env.render()
+            success_done = self.env_prop.change_done_by_specific_desired(
+                obs, desired, success_done
+            )
+            video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
+        if general_done == False and success_done == True:
+            assert (
+                desired is not None
+            ), f"general_done is false but success_done is true, and desired is None. \
+                This should never happen, since the environment will say 'done' is false \
+                (general_done) while the observation will be close to the goal (success_done) \
+                only in case we incorporated a 'desired' when generating the observation."
+        elif general_done == True and success_done == False:
+            raise Exception("general_done is true but success_done is false")
+        self.env.close()
+        video_writer.release()
+
+    def load_model(self):
+        """Load the model from a file."""
+        self._model = self.algorithm.load(
+            self._model_file_path, env=self.env, device=device, **self.model_kwargs
+        )
+
+    def learn(self):
+        """Train the agent."""
+        if os.path.exists(self._model_file_path):
+            print(f"Loading pre-existing model in {self._model_file_path}")
+            self.load_model()
+        else:
+            print(f"No existing model in {self._model_file_path}, starting learning")
+            if self.exploration_rate is not None:
+                self._model = self.algorithm(
+                    "MultiInputPolicy",
+                    self.env,
+                    ent_coef=self.exploration_rate,
+                    verbose=1,
+                )
+            else:
+                self._model = self.algorithm("MultiInputPolicy", self.env, verbose=1)
+            self._model.learn(
+                total_timesteps=self.num_timesteps, progress_bar=True
+            )  # comment this in a normal env
+            self.save_model()
+
+    def safe_env_reset(self):
+        """
+        Reset the environment safely.
+
+        Returns:
+            The initial observation.
+        """
+        try:
+            obs = self.env.reset()
+        except Exception:
+            kwargs = {"id": self.problem_name, "render_mode": "rgb_array"}
+            self.env = self.env_prop.create_vec_env(kwargs)
+            obs = self.env.reset()
+        return obs
+
+    def get_mean_and_std_dev(self, observation):
+        """
+        Get the mean and standard deviation of the action distribution.
+
+        Args:
+            observation: The observation.
+
+        Returns:
+            The mean and standard deviation of the action distribution.
+        """
+        if self.algorithm == SAC:
+            tensor_observation, _ = self._model.actor.obs_to_tensor(observation)
+
+            mean_actions, log_std_dev, kwargs = (
+                self._model.actor.get_action_dist_params(tensor_observation)
+            )
+            probability_dist = self._model.actor.action_dist.proba_distribution(
+                mean_actions=mean_actions, log_std=log_std_dev
+            )
+            actor_means = probability_dist.get_actions(True).cpu().detach().numpy()
+            log_std_dev = log_std_dev.cpu().detach().numpy()
+        elif self.algorithm == PPO:
+            self._model.policy.set_training_mode(False)
+            tensor_observation, _ = self._model.policy.obs_to_tensor(observation)
+            distribution = self._model.policy.get_distribution(tensor_observation)
+
+            actor_means = distribution.distribution.mean.cpu().detach().numpy()
+            log_std_dev = distribution.distribution.stddev.cpu().detach().numpy()
+            if isinstance(self._model.policy.action_space, gym.spaces.Box):
+                actor_means = np.clip(
+                    actor_means,
+                    self._model.policy.action_space.low,
+                    self._model.policy.action_space.high,
+                )
+            return actor_means, log_std_dev
+        else:
+            assert False
+        return actor_means, log_std_dev
+
+    def simplify_observation(self, observation):
+        """
+        Simplifies the given observation by concatenating the last dimension of each observation and action.
+        fits agents that generated observations in the form of: list of tuples, each tuple a single
+        step\frame with size 2, comprised of obs and action.
+        the function squashes the 2d array of obs and action in a 1d array, concatenating their
+        values together for training.
+
+        Args:
+            observation (list): List of tuples containing observation and action.
+
+        Returns:
+            list: List of simplified observations.
+        """
+        return [
+            np.concatenate(
+                (
+                    np.array(obs).reshape(obs.shape[-1]),
+                    np.array(action[0]).reshape(action[0].shape[-1]),
+                )
+            )
+            for (obs, action) in observation
+        ]
+
+    def add_random_optimalism(self, observations, action, constant_initial_action):
+        """
+        Adds random optimalism to the given action based on the length of observations.
+
+        Parameters:
+            observations (list): List of observations.
+            action (ndarray): Action to modify.
+            constant_initial_action (float): Initial action value.
+
+        Returns:
+            ndarray: Modified action.
+        """
+        if len(observations) > 3:
+            for i in range(0, len(action[0])):
+                action[0][i] += random.uniform(
+                    -0.01 * action[0][i], 0.01 * action[0][i]
+                )
+        else:  # just walk in a specific random direction to enable diverse plans
+            action = np.array(np.array([constant_initial_action]), None)
+        return action
+
+    def generate_partial_observation(
+        self,
+        action_selection_method,
+        percentage,
+        is_consecutive,
+        save_fig=False,
+        fig_path=None,
+        random_optimalism=True,
+    ):
+        """
+        Generates a partial observation by selecting a subset of steps from a full observation.
+
+        Args:
+            action_selection_method (str): The method used for selecting actions.
+            percentage (float): The percentage of steps to include in the partial observation.
+            is_consecutive (bool): Whether the selected steps should be consecutive or not.
+            save_fig (bool, optional): Whether to save a figure of the observation. Defaults to False.
+            fig_path (str, optional): The path to save the figure. Defaults to None.
+            random_optimalism (bool, optional): Whether to apply random optimalism during observation generation. Defaults to True.
+
+        Returns:
+            list: A partial observation consisting of a subset of steps from the full observation.
+        """
+        steps = self.generate_observation(
+            action_selection_method,
+            save_fig=save_fig,
+            random_optimalism=random_optimalism,
+            fig_path=fig_path,
+        )  # steps are a full observation
+        return random_subset_with_order(
+            steps, (int)(percentage * len(steps)), is_consecutive
+        )
+
+    def generate_observation(
+        self,
+        action_selection_method: MethodType,
+        random_optimalism,
+        save_fig=False,
+        fig_path=None,
+        with_dict=False,
+        desired=None,
+    ) -> list[tuple[np.ndarray, np.ndarray]]:
+        """
+        Generates observations by interacting with the environment.
+
+        Args:
+            action_selection_method (MethodType): The method used for action selection.
+            random_optimalism (bool): Flag indicating whether to add random optimalism to the actions.
+            save_fig (bool, optional): Flag indicating whether to save a figure. Defaults to False.
+            fig_path (str, optional): The path to save the figure. Required if save_fig is True. Defaults to None.
+            with_dict (bool, optional): Flag indicating whether to include the observation as a dictionary. Defaults to False.
+            desired (Any, optional): The desired goal for the observation. Defaults to None.
+
+        Returns:
+            list[tuple[np.ndarray, np.ndarray]]: A list of tuples containing the observation and the corresponding action.
+        """
+        if save_fig is False:
+            assert (
+                fig_path is None
+            ), "You can't specify a vid path when you don't even save the figure."
+        else:
+            assert (
+                fig_path is not None
+            ), "You need to specify a vid path when you save the figure."
+        # The try-except is a bug fix for the env not being reset properly in panda.
+        # If someone wants to check why and provide a robust solution they're welcome.
+        obs = self.safe_env_reset()
+        self.env_prop.change_goal_to_specific_desired(obs, desired)
+        observations = []
+        is_successful_observation_made = False
+        num_of_insuccessful_attempts = 0
+        while not is_successful_observation_made:
+            # start as true, if this isn't the case (crash/death/truncation instead of success)
+            is_successful_observation_made = True
+            if random_optimalism:
+                constant_initial_action = self.env.action_space.sample()
+            while True:
+                from gr_libs.metrics.metrics import stochastic_amplified_selection
+
+                deterministic = (
+                    action_selection_method != stochastic_amplified_selection
+                )
+                action, _states = self._model.predict(obs, deterministic=deterministic)
+                if random_optimalism:
+                    # get the right direction and then start inserting noise to still get a relatively optimal plan
+                    self.add_random_optimalism(obs, action, constant_initial_action)
+                if with_dict:
+                    observations.append((obs, action))
+                else:
+                    observations.append((obs["observation"], action))
+                obs, reward, done, info = self.env.step(action)
+                self.env_prop.change_goal_to_specific_desired(obs, desired)
+                general_done = bool(self.env_prop.is_done(done))
+                success_done = self.env_prop.is_success(info)
+                success_done = bool(
+                    self.env_prop.change_done_by_specific_desired(
+                        obs, desired, success_done
+                    )
+                )
+                if general_done is True and success_done is False:
+                    # it could be that the stochasticity inserted into the actions made the agent die/crash.
+                    # we don't want this observation: it's an insuccessful attempt.
+                    num_of_insuccessful_attempts += 1
+                    # print(f"for agent for problem {self.problem_name}, its done
+                    # {len(observations)} steps, and got to a situation where
+                    # general_done != success_done, for the {num_of_insuccessful_attempts} time.")
+                    if num_of_insuccessful_attempts > 50:
+                        # print(f"got more then 10 insuccessful attempts!")
+                        assert (
+                            general_done
+                            == success_done
+                            # we want to make sure the episode is done only
+                            # when the agent has actually succeeded with the task.
+                        ), f"failed on goal: {obs['desired']}"
+                    else:
+                        # try again by breaking inner loop.
+                        # everything is set up to be like the beginning of the function.
+                        is_successful_observation_made = False
+                        obs = self.safe_env_reset()
+                        self.env_prop.change_goal_to_specific_desired(obs, desired)
+                        observations = (
+                            []
+                        )  # we want to re-accumulate the observations from scratch, have another try
+                        break
+                elif general_done is False and success_done is False:
+                    continue
+                elif general_done is True and success_done is True:
+                    if num_of_insuccessful_attempts > 0:
+                        pass  # print(f"after {num_of_insuccessful_attempts}, finally I succeeded!")
+                    break
+                elif general_done is False and success_done is True:
+                    # The environment will say 'done' is false (general_done) while the observation
+                    # will be close to the goal (success_done) only in case we incorporated a 'desired'
+                    # when generating the observation.
+                    assert (
+                        desired is not None
+                    ), f"general_done is false but success_done is true, and desired is None. This should never happen, since the \
+                        environment will say 'done' is false (general_done) while the observation will be close to the goal (success_done) \
+                        only in case we incorporated a 'desired' when generating the observation."
+                    break
+
+        if save_fig:
+            self.try_recording_video(fig_path, desired)
+
+        self.env.close()
+        return observations
+
+
 class GCDeepRLAgent(DeepRLAgent):
- [... old lines 352-393 deleted; their content is not shown in this rendering ...]
+    """
+    A class representing a Goal Conditioned Deep Reinforcement Learning Agent.
+
+    This agent extends the functionality of the base DeepRLAgent class by providing methods for generating partial observations and observations with goal-directed goals or problems.
+
+    Args:
+        DeepRLAgent (class): The base class for DeepRLAgent.
+
+    Attributes:
+        env (object): The environment in which the agent operates.
+        env_prop (object): The environment properties.
+
+    Methods:
+        generate_partial_observation: Generates a partial observation based on a given percentage of steps.
+        generate_observation: Generates an observation with optional goal-directed goals or problems.
+    """
+
+    def generate_partial_observation(
+        self,
+        action_selection_method,
+        percentage,
+        is_consecutive,
+        goal_directed_problem=None,
+        goal_directed_goal=None,
+        save_fig=False,
+        fig_path=None,
+        random_optimalism=True,
+    ):
+        """
+        Generates a partial observation based on a given percentage of steps.
+
+        Args:
+            action_selection_method (MethodType): The method for selecting actions.
+            percentage (float): The percentage of steps to include in the partial observation.
+            is_consecutive (bool): Whether the steps should be consecutive or randomly selected.
+            goal_directed_problem (str, optional): The goal-directed problem. Defaults to None.
+            goal_directed_goal (object, optional): The goal-directed goal. Defaults to None.
+            save_fig (bool, optional): Whether to save a figure. Defaults to False.
+            fig_path (str, optional): The path to save the figure. Defaults to None.
+            random_optimalism (bool, optional): Whether to use random optimalism. Defaults to True.
+
+        Returns:
+            list: A random subset of steps from the full observation.
+        """
+        steps = self.generate_observation(
+            action_selection_method,
+            save_fig=save_fig,
+            fig_path=fig_path,
+            random_optimalism=random_optimalism,
+            goal_directed_problem=goal_directed_problem,
+            goal_directed_goal=goal_directed_goal,
+        )  # steps are a full observation
+        return random_subset_with_order(
+            steps, (int)(percentage * len(steps)), is_consecutive
+        )
+
+    def generate_observation(
+        self,
+        action_selection_method: MethodType,
+        random_optimalism,
+        goal_directed_problem=None,
+        goal_directed_goal=None,
+        save_fig=False,
+        fig_path=None,
+        with_dict=False,
+    ):
+        """
+        Generates an observation with optional goal-directed goals or problems.
+
+        Args:
+            action_selection_method (MethodType): The method for selecting actions.
+            random_optimalism (bool): Whether to use random optimalism.
+            goal_directed_problem (str, optional): The goal-directed problem. Defaults to None.
+            goal_directed_goal (object, optional): The goal-directed goal. Defaults to None.
+            save_fig (bool, optional): Whether to save a figure. Defaults to False.
+            fig_path (str, optional): The path to save the figure. Defaults to None.
+            with_dict (bool, optional): Whether to include a dictionary in the observation. Defaults to False.
+
+        Returns:
+            list: The generated observation.
+        """
+        if save_fig:
+            assert (
+                fig_path is not None
+            ), "You need to specify a vid path when you save the figure."
+        else:
+            assert fig_path is None
+
+        if goal_directed_problem:
+            assert (
+                goal_directed_goal is None
+            ), "can't give goal directed goal and also goal directed problem for the sake of sequence generation by a general agent"
+            kwargs = {"id": goal_directed_problem, "render_mode": "rgb_array"}
+            self.env = self.env_prop.create_vec_env(kwargs)
+            orig_env = self.env
+            observations = super().generate_observation(
+                action_selection_method=action_selection_method,
+                random_optimalism=random_optimalism,
+                save_fig=save_fig,
+                fig_path=fig_path,
+                with_dict=with_dict,
+            )
+            self.env = orig_env
+        else:
+            assert (
+                goal_directed_problem is None
+            ), "can't give goal directed goal and also goal directed problem for the sake of sequence generation by a general agent"
+            observations = super().generate_observation(
+                action_selection_method=action_selection_method,
+                random_optimalism=random_optimalism,
+                save_fig=save_fig,
+                fig_path=fig_path,
+                with_dict=with_dict,
+                desired=goal_directed_goal,
+            )
+        return observations
```