gr-libs 0.1.8__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gr_libs/__init__.py +3 -1
- gr_libs/_version.py +2 -2
- gr_libs/all_experiments.py +260 -0
- gr_libs/environment/__init__.py +14 -1
- gr_libs/environment/_utils/__init__.py +0 -0
- gr_libs/environment/{utils → _utils}/utils.py +1 -1
- gr_libs/environment/environment.py +278 -23
- gr_libs/evaluation/__init__.py +1 -0
- gr_libs/evaluation/generate_experiments_results.py +100 -0
- gr_libs/metrics/__init__.py +2 -0
- gr_libs/metrics/metrics.py +166 -31
- gr_libs/ml/__init__.py +1 -6
- gr_libs/ml/base/__init__.py +3 -1
- gr_libs/ml/base/rl_agent.py +68 -3
- gr_libs/ml/neural/__init__.py +1 -3
- gr_libs/ml/neural/deep_rl_learner.py +241 -84
- gr_libs/ml/neural/utils/__init__.py +1 -2
- gr_libs/ml/planner/mcts/{utils → _utils}/tree.py +1 -1
- gr_libs/ml/planner/mcts/mcts_model.py +71 -34
- gr_libs/ml/sequential/__init__.py +0 -1
- gr_libs/ml/sequential/{lstm_model.py → _lstm_model.py} +11 -14
- gr_libs/ml/tabular/__init__.py +1 -3
- gr_libs/ml/tabular/tabular_q_learner.py +27 -9
- gr_libs/ml/tabular/tabular_rl_agent.py +22 -9
- gr_libs/ml/utils/__init__.py +2 -9
- gr_libs/ml/utils/format.py +13 -90
- gr_libs/ml/utils/math.py +3 -2
- gr_libs/ml/utils/other.py +2 -2
- gr_libs/ml/utils/storage.py +41 -94
- gr_libs/odgr_executor.py +263 -0
- gr_libs/problems/consts.py +570 -292
- gr_libs/recognizer/{utils → _utils}/format.py +2 -2
- gr_libs/recognizer/gr_as_rl/gr_as_rl_recognizer.py +127 -36
- gr_libs/recognizer/graml/{gr_dataset.py → _gr_dataset.py} +11 -11
- gr_libs/recognizer/graml/graml_recognizer.py +186 -35
- gr_libs/recognizer/recognizer.py +59 -10
- gr_libs/tutorials/draco_panda_tutorial.py +58 -0
- gr_libs/tutorials/draco_parking_tutorial.py +56 -0
- {tutorials → gr_libs/tutorials}/gcdraco_panda_tutorial.py +11 -11
- {tutorials → gr_libs/tutorials}/gcdraco_parking_tutorial.py +6 -8
- {tutorials → gr_libs/tutorials}/graml_minigrid_tutorial.py +18 -14
- {tutorials → gr_libs/tutorials}/graml_panda_tutorial.py +11 -12
- {tutorials → gr_libs/tutorials}/graml_parking_tutorial.py +8 -10
- {tutorials → gr_libs/tutorials}/graml_point_maze_tutorial.py +17 -3
- {tutorials → gr_libs/tutorials}/graql_minigrid_tutorial.py +2 -2
- {gr_libs-0.1.8.dist-info → gr_libs-0.2.5.dist-info}/METADATA +95 -29
- gr_libs-0.2.5.dist-info/RECORD +72 -0
- {gr_libs-0.1.8.dist-info → gr_libs-0.2.5.dist-info}/WHEEL +1 -1
- gr_libs-0.2.5.dist-info/top_level.txt +2 -0
- tests/test_draco.py +14 -0
- tests/test_gcdraco.py +2 -2
- tests/test_graml.py +4 -4
- tests/test_graql.py +1 -1
- tests/test_odgr_executor_expertbasedgraml.py +14 -0
- tests/test_odgr_executor_gcdraco.py +14 -0
- tests/test_odgr_executor_gcgraml.py +14 -0
- tests/test_odgr_executor_graql.py +14 -0
- evaluation/analyze_results_cross_alg_cross_domain.py +0 -267
- evaluation/create_minigrid_map_image.py +0 -38
- evaluation/file_system.py +0 -53
- evaluation/generate_experiments_results.py +0 -141
- evaluation/generate_experiments_results_new_ver1.py +0 -238
- evaluation/generate_experiments_results_new_ver2.py +0 -331
- evaluation/generate_task_specific_statistics_plots.py +0 -500
- evaluation/get_plans_images.py +0 -62
- evaluation/increasing_and_decreasing_.py +0 -104
- gr_libs/ml/neural/utils/penv.py +0 -60
- gr_libs-0.1.8.dist-info/RECORD +0 -70
- gr_libs-0.1.8.dist-info/top_level.txt +0 -4
- /gr_libs/{environment/utils/__init__.py → _evaluation/_generate_experiments_results.py} +0 -0
- /gr_libs/ml/planner/mcts/{utils → _utils}/__init__.py +0 -0
- /gr_libs/ml/planner/mcts/{utils → _utils}/node.py +0 -0
- /gr_libs/recognizer/{utils → _utils}/__init__.py +0 -0
gr_libs/ml/neural/deep_rl_learner.py

@@ -1,24 +1,26 @@
-from collections import OrderedDict
 import gc
+from collections import OrderedDict
 from types import MethodType
-
-import numpy as np
+
 import cv2
+import numpy as np

-from gr_libs.environment.environment import EnvProperty
+from gr_libs.environment.environment import EnvProperty, suppress_output

 if __name__ != "__main__":
     from gr_libs.ml.utils.storage import get_agent_model_dir
     from gr_libs.ml.utils.format import random_subset_with_order
-
-
-    from gr_libs.ml.utils import device
-import gymnasium as gym
+
+import os

 # built-in python modules
 import random
-
-import
+
+import gymnasium as gym
+from stable_baselines3 import PPO, SAC, TD3
+from stable_baselines3.common.base_class import BaseAlgorithm
+
+from gr_libs.ml.utils import device

 # TODO do we need this?
 NETWORK_SETUP = {
@@ -42,7 +44,6 @@ NETWORK_SETUP = {
             ("normalize_kwargs", {"norm_obs": False, "norm_reward": False}),
         ]
     ),
-    # "tqc": OrderedDict([('batch_size', 256), ('buffer_size', 1000000), ('ent_coef', 'auto'), ('env_wrapper', ['sb3_contrib.common.wrappers.TimeFeatureWrapper']), ('gamma', 0.95), ('learning_rate', 0.001), ('learning_starts', 1000), ('n_timesteps', 25000.0), ('normalize', False), ('policy', 'MultiInputPolicy'), ('policy_kwargs', 'dict(net_arch=[64, 64])'), ('replay_buffer_class', 'HerReplayBuffer'), ('replay_buffer_kwargs', "dict( goal_selection_strategy='future', n_sampled_goal=4 )"), ('normalize_kwargs',{'norm_obs':False,'norm_reward':False})]),
     PPO: OrderedDict(
         [
             ("batch_size", 256),
@@ -68,6 +69,22 @@ NETWORK_SETUP = {


 class DeepRLAgent:
+    """
+    Deep Reinforcement Learning Agent, wrapping a SB3 agent and adding functionality,
+    needed for GR framework executions such as observation generation and video recording.
+    Supports SAC, PPO and TD3 algorithms.
+    Can be loaded from rl_zoo or trained from scratch.
+
+    Args:
+        domain_name (str): The domain name.
+        problem_name (str): The problem name.
+        num_timesteps (float): The number of timesteps for training.
+        env_prop (EnvProperty): The environment property.
+        algorithm (BaseAlgorithm, optional): The algorithm to use. Defaults to SAC.
+        reward_threshold (float, optional): The reward threshold. Defaults to 450.
+        exploration_rate (float, optional): The exploration rate. Defaults to None.
+    """
+
     def __init__(
         self,
         domain_name: str,
@@ -78,7 +95,18 @@ class DeepRLAgent:
         reward_threshold: float = 450,
         exploration_rate=None,
     ):
-
+        """
+        Initialize the DeepRLLearner object.
+
+        Args:
+            domain_name (str): The name of the domain.
+            problem_name (str): The name of the problem.
+            num_timesteps (float): The number of timesteps.
+            env_prop (EnvProperty): The environment property.
+            algorithm (BaseAlgorithm, optional): The algorithm to use. Defaults to SAC.
+            reward_threshold (float, optional): The reward threshold. Defaults to 450.
+            exploration_rate (float, optional): The exploration rate. Defaults to None.
+        """
         env_kwargs = {"id": problem_name, "render_mode": "rgb_array"}
         assert algorithm in [SAC, PPO, TD3]

@@ -110,7 +138,8 @@ class DeepRLAgent:
                 "seed": 0,
                 "buffer_size": 1,
             }
-        # second support: models saved with SB3's model.save, which is saved as a
+        # second support: models saved with SB3's model.save, which is saved as a
+        # formatted .pth file.
         else:
             self.model_kwargs = {}
             self._model_file_path = os.path.join(
@@ -122,9 +151,17 @@ class DeepRLAgent:
         self.num_timesteps = num_timesteps

     def save_model(self):
+        """Save the model to a file."""
         self._model.save(self._model_file_path)

     def try_recording_video(self, video_path, desired=None):
+        """
+        Try recording a video of the agent's performance.
+
+        Args:
+            video_path (str): The path to save the video.
+            desired (optional): The desired goal. Defaults to None.
+        """
         num_tries = 0
         while True:
             if num_tries >= 10:
@@ -132,21 +169,22 @@ class DeepRLAgent:
             try:
                 self.record_video(video_path, desired)
                 break
-            except Exception
+            except Exception:
                 num_tries += 1
         # print(f"sequence to {self.problem_name} is:\n\t{steps}\ngenerating image at {img_path}.")
         print(f"generated sequence video at {video_path}.")

     def record_video(self, video_path, desired=None):
-        """
+        """
+        Record a video of the agent's performance.
+
+        Args:
+            video_path (str): The path to save the video.
+            desired (optional): The desired goal. Defaults to None.
+        """
         fourcc = cv2.VideoWriter_fourcc("m", "p", "4", "v")
         fps = 30.0
-
-        # assert goal_idx != None
-        # self.reset_with_goal_idx(goal_idx)
-        # else:
-        # assert goal_idx == None
-        self.env.reset()
+        self.safe_env_reset()
         frame_size = (
             self.env.render(mode="rgb_array").shape[1],
             self.env.render(mode="rgb_array").shape[0],
@@ -155,7 +193,7 @@ class DeepRLAgent:
         video_writer = cv2.VideoWriter(video_path, fourcc, fps, frame_size)
         general_done, success_done = False, False
         gc.collect()
-        obs = self.
+        obs = self.safe_env_reset()
         self.env_prop.change_goal_to_specific_desired(obs, desired)
         counter = 0
         while not (general_done or success_done):
@@ -166,17 +204,11 @@ class DeepRLAgent:
             general_done = general_done[0]
             self.env_prop.change_goal_to_specific_desired(obs, desired)
             if "success" in info[0].keys():
-                success_done = info[0][
-                    "success"
-                ] # make sure the agent actually reached the goal within the max time
+                success_done = info[0]["success"]
             elif "is_success" in info[0].keys():
-                success_done = info[0][
-                    "is_success"
-                ] # make sure the agent actually reached the goal within the max time
+                success_done = info[0]["is_success"]
             elif "step_task_completions" in info[0].keys():
-                success_done = (
-                    len(info[0]["step_task_completions"]) == 1
-                ) # bug of dummyVecEnv, it removes the episode_task_completions from the info dict.
+                success_done = len(info[0]["step_task_completions"]) == 1
             else:
                 raise NotImplementedError(
                     "no other option for any of the environments."
@@ -186,34 +218,32 @@ class DeepRLAgent:
                 obs, desired, success_done
             )
             video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
-            if general_done == False
+            if general_done == False and success_done == True:
                 assert (
                     desired is not None
-                ), f"general_done is false but success_done is true, and desired is None.
-
-
-
+                ), f"general_done is false but success_done is true, and desired is None. \
+                    This should never happen, since the environment will say 'done' is false \
+                    (general_done) while the observation will be close to the goal (success_done) \
+                    only in case we incorporated a 'desired' when generating the observation."
+            elif general_done == True and success_done == False:
                 raise Exception("general_done is true but success_done is false")
         self.env.close()
         video_writer.release()

     def load_model(self):
+        """Load the model from a file."""
         self._model = self.algorithm.load(
             self._model_file_path, env=self.env, device=device, **self.model_kwargs
         )

     def learn(self):
+        """Train the agent."""
         if os.path.exists(self._model_file_path):
             print(f"Loading pre-existing model in {self._model_file_path}")
             self.load_model()
         else:
-            # Stop training when the model reaches the reward threshold
-            # callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=self.reward_threshold, verbose=1)
-            # eval_callback = EvalCallback(self.env, best_model_save_path="./logs/",
-            # log_path="./logs/", eval_freq=500, callback_on_new_best=callback_on_best, verbose=1, render=True)
-            # self._model.learn(total_timesteps=self.num_timesteps, progress_bar=True, callback=eval_callback)
             print(f"No existing model in {self._model_file_path}, starting learning")
-            if self.exploration_rate
+            if self.exploration_rate is not None:
                 self._model = self.algorithm(
                     "MultiInputPolicy",
                     self.env,
@@ -228,15 +258,30 @@ class DeepRLAgent:
             self.save_model()

     def safe_env_reset(self):
+        """
+        Reset the environment safely, suppressing output.
+
+        Returns:
+            The initial observation.
+        """
         try:
-            obs = self.env
-        except Exception
+            obs = suppress_env_reset(self.env)
+        except Exception:
             kwargs = {"id": self.problem_name, "render_mode": "rgb_array"}
             self.env = self.env_prop.create_vec_env(kwargs)
-            obs = self.env
+            obs = suppress_env_reset(self.env)
         return obs

     def get_mean_and_std_dev(self, observation):
+        """
+        Get the mean and standard deviation of the action distribution.
+
+        Args:
+            observation: The observation.
+
+        Returns:
+            The mean and standard deviation of the action distribution.
+        """
         if self.algorithm == SAC:
             tensor_observation, _ = self._model.actor.obs_to_tensor(observation)

@@ -266,9 +311,20 @@ class DeepRLAgent:
             assert False
         return actor_means, log_std_dev

-    # fits agents that generated observations in the form of: list of tuples, each tuple a single step\frame with size 2, comprised of obs and action.
-    # the function squashes the 2d array of obs and action in a 1d array, concatenating their values together for training.
     def simplify_observation(self, observation):
+        """
+        Simplifies the given observation by concatenating the last dimension of each observation and action.
+        fits agents that generated observations in the form of: list of tuples, each tuple a single
+        step\frame with size 2, comprised of obs and action.
+        the function squashes the 2d array of obs and action in a 1d array, concatenating their
+        values together for training.
+
+        Args:
+            observation (list): List of tuples containing observation and action.
+
+        Returns:
+            list: List of simplified observations.
+        """
         return [
             np.concatenate(
                 (
@@ -280,6 +336,17 @@ class DeepRLAgent:
         ]

     def add_random_optimalism(self, observations, action, constant_initial_action):
+        """
+        Adds random optimalism to the given action based on the length of observations.
+
+        Parameters:
+            observations (list): List of observations.
+            action (ndarray): Action to modify.
+            constant_initial_action (float): Initial action value.
+
+        Returns:
+            ndarray: Modified action.
+        """
         if len(observations) > 3:
             for i in range(0, len(action[0])):
                 action[0][i] += random.uniform(
@@ -287,6 +354,7 @@ class DeepRLAgent:
                 )
         else: # just walk in a specific random direction to enable diverse plans
             action = np.array(np.array([constant_initial_action]), None)
+        return action

     def generate_partial_observation(
         self,
@@ -297,6 +365,20 @@ class DeepRLAgent:
         fig_path=None,
         random_optimalism=True,
     ):
+        """
+        Generates a partial observation by selecting a subset of steps from a full observation.
+
+        Args:
+            action_selection_method (str): The method used for selecting actions.
+            percentage (float): The percentage of steps to include in the partial observation.
+            is_consecutive (bool): Whether the selected steps should be consecutive or not.
+            save_fig (bool, optional): Whether to save a figure of the observation. Defaults to False.
+            fig_path (str, optional): The path to save the figure. Defaults to None.
+            random_optimalism (bool, optional): Whether to apply random optimalism during observation generation. Defaults to True.
+
+        Returns:
+            list: A partial observation consisting of a subset of steps from the full observation.
+        """
         steps = self.generate_observation(
             action_selection_method,
             save_fig=save_fig,
@@ -315,25 +397,39 @@ class DeepRLAgent:
         fig_path=None,
         with_dict=False,
         desired=None,
-    ) ->
-
-
-
+    ) -> list[tuple[np.ndarray, np.ndarray]]:
+        """
+        Generates observations by interacting with the environment.
+
+        Args:
+            action_selection_method (MethodType): The method used for action selection.
+            random_optimalism (bool): Flag indicating whether to add random optimalism to the actions.
+            save_fig (bool, optional): Flag indicating whether to save a figure. Defaults to False.
+            fig_path (str, optional): The path to save the figure. Required if save_fig is True. Defaults to None.
+            with_dict (bool, optional): Flag indicating whether to include the observation as a dictionary. Defaults to False.
+            desired (Any, optional): The desired goal for the observation. Defaults to None.
+
+        Returns:
+            list[tuple[np.ndarray, np.ndarray]]: A list of tuples containing the observation and the corresponding action.
+        """
+        if save_fig is False:
             assert (
-                fig_path
+                fig_path is None
             ), "You can't specify a vid path when you don't even save the figure."
         else:
             assert (
-                fig_path
+                fig_path is not None
             ), "You need to specify a vid path when you save the figure."
-        # The try-except is a bug fix for the env not being reset properly in panda.
+        # The try-except is a bug fix for the env not being reset properly in panda.
+        # If someone wants to check why and provide a robust solution they're welcome.
         obs = self.safe_env_reset()
         self.env_prop.change_goal_to_specific_desired(obs, desired)
         observations = []
         is_successful_observation_made = False
         num_of_insuccessful_attempts = 0
         while not is_successful_observation_made:
-
+            # start as true, if this isn't the case (crash/death/truncation instead of success)
+            is_successful_observation_made = True
             if random_optimalism:
                 constant_initial_action = self.env.action_space.sample()
             while True:
@@ -343,9 +439,8 @@ class DeepRLAgent:
                     action_selection_method != stochastic_amplified_selection
                 )
                 action, _states = self._model.predict(obs, deterministic=deterministic)
-                if
-
-                ): # get the right direction and then start inserting noise to still get a relatively optimal plan
+                if random_optimalism:
+                    # get the right direction and then start inserting noise to still get a relatively optimal plan
                     self.add_random_optimalism(obs, action, constant_initial_action)
                 if with_dict:
                     observations.append((obs, action))
@@ -353,22 +448,31 @@ class DeepRLAgent:
                     observations.append((obs["observation"], action))
                 obs, reward, done, info = self.env.step(action)
                 self.env_prop.change_goal_to_specific_desired(obs, desired)
-                general_done = self.env_prop.is_done(done)
+                general_done = bool(self.env_prop.is_done(done))
                 success_done = self.env_prop.is_success(info)
-                success_done =
-
+                success_done = bool(
+                    self.env_prop.change_done_by_specific_desired(
+                        obs, desired, success_done
+                    )
                 )
-                if general_done
-                    # it could be that the stochasticity inserted into the actions made the agent die/crash.
+                if general_done is True and success_done is False:
+                    # it could be that the stochasticity inserted into the actions made the agent die/crash.
+                    # we don't want this observation: it's an insuccessful attempt.
                     num_of_insuccessful_attempts += 1
-                    # print(f"for agent for problem {self.problem_name}, its done
+                    # print(f"for agent for problem {self.problem_name}, its done
+                    # {len(observations)} steps, and got to a situation where
+                    # general_done != success_done, for the {num_of_insuccessful_attempts} time.")
                     if num_of_insuccessful_attempts > 50:
                         # print(f"got more then 10 insuccessful attempts!")
                         assert (
-                            general_done
-
+                            general_done
+                            == success_done
+                            # we want to make sure the episode is done only
+                            # when the agent has actually succeeded with the task.
+                        ), f"failed on goal: {obs['desired']}"
                     else:
-                        # try again by breaking inner loop.
+                        # try again by breaking inner loop.
+                        # everything is set up to be like the beginning of the function.
                         is_successful_observation_made = False
                         obs = self.safe_env_reset()
                         self.env_prop.change_goal_to_specific_desired(obs, desired)
@@ -376,20 +480,21 @@ class DeepRLAgent:
                             []
                         ) # we want to re-accumulate the observations from scratch, have another try
                         break
-                elif general_done
+                elif general_done is False and success_done is False:
                     continue
-                elif general_done
+                elif general_done is True and success_done is True:
                     if num_of_insuccessful_attempts > 0:
                         pass # print(f"after {num_of_insuccessful_attempts}, finally I succeeded!")
                     break
-                elif general_done
-                    # The environment will say 'done' is false (general_done) while the observation
-                    # only in case we incorporated a 'desired'
+                elif general_done is False and success_done is True:
+                    # The environment will say 'done' is false (general_done) while the observation
+                    # will be close to the goal (success_done) only in case we incorporated a 'desired'
+                    # when generating the observation.
                     assert (
                         desired is not None
                     ), f"general_done is false but success_done is true, and desired is None. This should never happen, since the \
-
-
+                        environment will say 'done' is false (general_done) while the observation will be close to the goal (success_done) \
+                        only in case we incorporated a 'desired' when generating the observation."
                     break

         if save_fig:
@@ -400,6 +505,23 @@ class DeepRLAgent:


 class GCDeepRLAgent(DeepRLAgent):
+    """
+    A class representing a Goal Conditioned Deep Reinforcement Learning Agent.
+
+    This agent extends the functionality of the base DeepRLAgent class by providing methods for generating partial observations and observations with goal-directed goals or problems.
+
+    Args:
+        DeepRLAgent (class): The base class for DeepRLAgent.
+
+    Attributes:
+        env (object): The environment in which the agent operates.
+        env_prop (object): The environment properties.
+
+    Methods:
+        generate_partial_observation: Generates a partial observation based on a given percentage of steps.
+        generate_observation: Generates an observation with optional goal-directed goals or problems.
+    """
+
     def generate_partial_observation(
         self,
         action_selection_method,
@@ -411,6 +533,22 @@ class GCDeepRLAgent(DeepRLAgent):
         fig_path=None,
         random_optimalism=True,
     ):
+        """
+        Generates a partial observation based on a given percentage of steps.
+
+        Args:
+            action_selection_method (MethodType): The method for selecting actions.
+            percentage (float): The percentage of steps to include in the partial observation.
+            is_consecutive (bool): Whether the steps should be consecutive or randomly selected.
+            goal_directed_problem (str, optional): The goal-directed problem. Defaults to None.
+            goal_directed_goal (object, optional): The goal-directed goal. Defaults to None.
+            save_fig (bool, optional): Whether to save a figure. Defaults to False.
+            fig_path (str, optional): The path to save the figure. Defaults to None.
+            random_optimalism (bool, optional): Whether to use random optimalism. Defaults to True.
+
+        Returns:
+            list: A random subset of steps from the full observation.
+        """
         steps = self.generate_observation(
             action_selection_method,
             save_fig=save_fig,
@@ -423,8 +561,6 @@ class GCDeepRLAgent(DeepRLAgent):
             steps, (int)(percentage * len(steps)), is_consecutive
         )

-    # TODO move the goal_directed_goal and/or goal_directed_problem mechanism to be a property of the env_property, so deep_rl_learner doesn't depend on it and holds this logic so heavily.
-    # Generate observation with goal_directed_goal or goal_directed_problem is only possible for a GC agent, otherwise - the agent can't act optimally to that new goal.
     def generate_observation(
         self,
         action_selection_method: MethodType,
@@ -435,16 +571,31 @@ class GCDeepRLAgent(DeepRLAgent):
         fig_path=None,
         with_dict=False,
     ):
+        """
+        Generates an observation with optional goal-directed goals or problems.
+
+        Args:
+            action_selection_method (MethodType): The method for selecting actions.
+            random_optimalism (bool): Whether to use random optimalism.
+            goal_directed_problem (str, optional): The goal-directed problem. Defaults to None.
+            goal_directed_goal (object, optional): The goal-directed goal. Defaults to None.
+            save_fig (bool, optional): Whether to save a figure. Defaults to False.
+            fig_path (str, optional): The path to save the figure. Defaults to None.
+            with_dict (bool, optional): Whether to include a dictionary in the observation. Defaults to False.
+
+        Returns:
+            list: The generated observation.
+        """
         if save_fig:
             assert (
-                fig_path
+                fig_path is not None
             ), "You need to specify a vid path when you save the figure."
         else:
-            assert fig_path
-
+            assert fig_path is None
+
         if goal_directed_problem:
             assert (
-                goal_directed_goal
+                goal_directed_goal is None
             ), "can't give goal directed goal and also goal directed problem for the sake of sequence generation by a general agent"
             kwargs = {"id": goal_directed_problem, "render_mode": "rgb_array"}
             self.env = self.env_prop.create_vec_env(kwargs)
@@ -457,11 +608,9 @@ class GCDeepRLAgent(DeepRLAgent):
                 with_dict=with_dict,
             )
             self.env = orig_env
-        # goal_directed_goal employs the agent in the same env on which it trained - with goals that change with every episode sampled from the goal space.
-        # but we manually change the 'desired' part of the observation to be the goal_directed_goal and edit the id_success and is_done accordingly.
         else:
             assert (
-                goal_directed_problem
+                goal_directed_problem is None
             ), "can't give goal directed goal and also goal directed problem for the sake of sequence generation by a general agent"
             observations = super().generate_observation(
                 action_selection_method=action_selection_method,
@@ -470,5 +619,13 @@ class GCDeepRLAgent(DeepRLAgent):
                 fig_path=fig_path,
                 with_dict=with_dict,
                 desired=goal_directed_goal,
-            )
+            )
         return observations
+
+
+def suppress_env_reset(env):
+    """
+    Utility function to suppress prints during env.reset().
+    """
+    with suppress_output():
+        return env.reset()
gr_libs/ml/neural/utils/__init__.py

@@ -1,2 +1 @@
-
-from gr_libs.ml.neural.utils.penv import ParallelEnv
+""" utility functions for GR algorithms that use neural networks """
|