gr-libs 0.1.6.post1__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff compares the contents of these two publicly released package versions as they appear in their public registries; it is provided for informational purposes only.
- evaluation/analyze_results_cross_alg_cross_domain.py +236 -246
- evaluation/create_minigrid_map_image.py +10 -6
- evaluation/file_system.py +16 -5
- evaluation/generate_experiments_results.py +123 -74
- evaluation/generate_experiments_results_new_ver1.py +227 -243
- evaluation/generate_experiments_results_new_ver2.py +317 -317
- evaluation/generate_task_specific_statistics_plots.py +481 -253
- evaluation/get_plans_images.py +41 -26
- evaluation/increasing_and_decreasing_.py +97 -56
- gr_libs/__init__.py +6 -1
- gr_libs/_version.py +2 -2
- gr_libs/environment/__init__.py +17 -9
- gr_libs/environment/environment.py +167 -39
- gr_libs/environment/utils/utils.py +22 -12
- gr_libs/metrics/__init__.py +5 -0
- gr_libs/metrics/metrics.py +76 -34
- gr_libs/ml/__init__.py +2 -0
- gr_libs/ml/agent.py +21 -6
- gr_libs/ml/base/__init__.py +1 -1
- gr_libs/ml/base/rl_agent.py +13 -10
- gr_libs/ml/consts.py +1 -1
- gr_libs/ml/neural/deep_rl_learner.py +433 -352
- gr_libs/ml/neural/utils/__init__.py +1 -1
- gr_libs/ml/neural/utils/dictlist.py +3 -3
- gr_libs/ml/neural/utils/penv.py +5 -2
- gr_libs/ml/planner/mcts/mcts_model.py +524 -302
- gr_libs/ml/planner/mcts/utils/__init__.py +1 -1
- gr_libs/ml/planner/mcts/utils/node.py +11 -7
- gr_libs/ml/planner/mcts/utils/tree.py +14 -10
- gr_libs/ml/sequential/__init__.py +1 -1
- gr_libs/ml/sequential/lstm_model.py +256 -175
- gr_libs/ml/tabular/state.py +7 -7
- gr_libs/ml/tabular/tabular_q_learner.py +123 -73
- gr_libs/ml/tabular/tabular_rl_agent.py +20 -19
- gr_libs/ml/utils/__init__.py +8 -2
- gr_libs/ml/utils/format.py +78 -70
- gr_libs/ml/utils/math.py +2 -1
- gr_libs/ml/utils/other.py +1 -1
- gr_libs/ml/utils/storage.py +95 -28
- gr_libs/problems/consts.py +1549 -1227
- gr_libs/recognizer/gr_as_rl/gr_as_rl_recognizer.py +145 -80
- gr_libs/recognizer/graml/gr_dataset.py +209 -110
- gr_libs/recognizer/graml/graml_recognizer.py +431 -231
- gr_libs/recognizer/recognizer.py +38 -27
- gr_libs/recognizer/utils/__init__.py +1 -1
- gr_libs/recognizer/utils/format.py +8 -3
- {gr_libs-0.1.6.post1.dist-info → gr_libs-0.1.8.dist-info}/METADATA +1 -1
- gr_libs-0.1.8.dist-info/RECORD +70 -0
- {gr_libs-0.1.6.post1.dist-info → gr_libs-0.1.8.dist-info}/WHEEL +1 -1
- {gr_libs-0.1.6.post1.dist-info → gr_libs-0.1.8.dist-info}/top_level.txt +0 -1
- tests/test_gcdraco.py +10 -0
- tests/test_graml.py +8 -4
- tests/test_graql.py +2 -1
- tutorials/gcdraco_panda_tutorial.py +66 -0
- tutorials/gcdraco_parking_tutorial.py +61 -0
- tutorials/graml_minigrid_tutorial.py +42 -12
- tutorials/graml_panda_tutorial.py +35 -14
- tutorials/graml_parking_tutorial.py +37 -19
- tutorials/graml_point_maze_tutorial.py +33 -13
- tutorials/graql_minigrid_tutorial.py +31 -15
- CI/README.md +0 -12
- CI/docker_build_context/Dockerfile +0 -15
- gr_libs/recognizer/recognizer_doc.md +0 -61
- gr_libs-0.1.6.post1.dist-info/RECORD +0 -70
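
The largest behavioral change shown below is the reworked `DeepRLAgent` in `gr_libs/ml/neural/deep_rl_learner.py`: the constructor now takes an `env_prop: EnvProperty`, resolves model paths through `get_agent_model_dir`, accepts `TD3` in addition to `SAC` and `PPO`, and delegates goal handling (`change_goal_to_specific_desired`, `is_done`, `is_success`) to the environment property. The sketch below is a hedged illustration of driving that new API, not code from the package: the domain name, problem id, and timestep budget are placeholders, and only the classes, methods, and imports that appear in the diff itself are assumed to exist.

```python
from stable_baselines3 import SAC

from gr_libs.environment.environment import EnvProperty
from gr_libs.metrics.metrics import stochastic_amplified_selection
from gr_libs.ml.neural.deep_rl_learner import DeepRLAgent


def train_and_collect(env_prop: EnvProperty, problem_name: str):
    """Hedged driver sketch for the 0.1.8 DeepRLAgent API shown in the diff below."""
    agent = DeepRLAgent(
        domain_name="panda",        # placeholder domain name
        problem_name=problem_name,  # a registered gymnasium env id (placeholder)
        num_timesteps=200_000,      # placeholder training budget
        env_prop=env_prop,
        algorithm=SAC,              # SAC, PPO, or (newly allowed in 0.1.8) TD3
    )
    # learn() loads an existing saved model if one is found, otherwise trains and saves one.
    agent.learn()

    # Returns the full (obs, action) sequence, as generate_observation does in the diff below.
    return agent.generate_observation(
        action_selection_method=stochastic_amplified_selection,
        random_optimalism=True,
    )
```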
The hunk below is from `gr_libs/ml/neural/deep_rl_learner.py` (+433, −352). Old-side lines that the diff view did not render are marked with an ellipsis placeholder.

```diff
@@ -2,392 +2,473 @@ from collections import OrderedDict
 import gc
 from types import MethodType
 from typing import List, Tuple
-import gymnasium as gym
 import numpy as np
 import cv2

- … (1 removed line not rendered in this diff view)
+from gr_libs.environment.environment import EnvProperty

 if __name__ != "__main__":
- … (2 removed lines not rendered in this diff view)
-    from stable_baselines3 import SAC, PPO
-    from stable_baselines3.common.
+    from gr_libs.ml.utils.storage import get_agent_model_dir
+    from gr_libs.ml.utils.format import random_subset_with_order
+    from stable_baselines3 import SAC, PPO, TD3
+    from stable_baselines3.common.base_class import BaseAlgorithm
     from gr_libs.ml.utils import device
+    import gymnasium as gym

 # built-in python modules
 import random
 import os
 import sys

- … (5 removed lines not rendered in this diff view)
+# TODO do we need this?
+NETWORK_SETUP = {
+    SAC: OrderedDict(
+        [
+            ("batch_size", 512),
+            ("buffer_size", 100000),
+            ("ent_coef", "auto"),
+            ("gamma", 0.95),
+            ("learning_rate", 0.001),
+            ("learning_starts", 5000),
+            ("n_timesteps", 50000.0),
+            ("normalize", "{'norm_obs': False, 'norm_reward': False}"),
+            ("policy", "MultiInputPolicy"),
+            ("policy_kwargs", "dict(net_arch=[64, 64])"),
+            ("replay_buffer_class", "HerReplayBuffer"),
+            (
+                "replay_buffer_kwargs",
+                "dict( goal_selection_strategy='future', n_sampled_goal=4 )",
+            ),
+            ("normalize_kwargs", {"norm_obs": False, "norm_reward": False}),
+        ]
+    ),
+    # "tqc": OrderedDict([('batch_size', 256), ('buffer_size', 1000000), ('ent_coef', 'auto'), ('env_wrapper', ['sb3_contrib.common.wrappers.TimeFeatureWrapper']), ('gamma', 0.95), ('learning_rate', 0.001), ('learning_starts', 1000), ('n_timesteps', 25000.0), ('normalize', False), ('policy', 'MultiInputPolicy'), ('policy_kwargs', 'dict(net_arch=[64, 64])'), ('replay_buffer_class', 'HerReplayBuffer'), ('replay_buffer_kwargs', "dict( goal_selection_strategy='future', n_sampled_goal=4 )"), ('normalize_kwargs',{'norm_obs':False,'norm_reward':False})]),
+    PPO: OrderedDict(
+        [
+            ("batch_size", 256),
+            ("ent_coef", 0.01),
+            ("gae_lambda", 0.9),
+            ("gamma", 0.99),
+            ("learning_rate", "lin_0.0001"),
+            ("max_grad_norm", 0.5),
+            ("n_envs", 8),
+            ("n_epochs", 20),
+            ("n_steps", 8),
+            ("n_timesteps", 25000.0),
+            ("normalize_advantage", False),
+            ("policy", "MultiInputPolicy"),
+            ("policy_kwargs", "dict(log_std_init=-2, ortho_init=False)"),
+            ("use_sde", True),
+            ("vf_coef", 0.4),
+            ("normalize", False),
+            ("normalize_kwargs", {"norm_obs": False, "norm_reward": False}),
+        ]
+    ),
+}

-def change_goal_to_specific_desired(obs, desired):
-    if desired is not None:
-        obs['desired_goal'] = desired
-    # try:
-    #     if desired!=None: obs['desired_goal'] = desired
-    # except Exception as e:
-    #     try:
-    #         if all(desired!=None): obs['desired_goal'] = desired
-    #     except Exception as e:
-    #         if all([desiredy!=None for desiredish in desired for desiredy in desiredish]): obs['desired_goal'] = desired

+class DeepRLAgent:
+    def __init__(
+        self,
+        domain_name: str,
+        problem_name: str,
+        num_timesteps: float,
+        env_prop: EnvProperty,
+        algorithm: BaseAlgorithm = SAC,
+        reward_threshold: float = 450,
+        exploration_rate=None,
+    ):
+        # Need to change reward threshold to change according to which task the agent is training on, becuase it changes from task to task.
+        env_kwargs = {"id": problem_name, "render_mode": "rgb_array"}
+        assert algorithm in [SAC, PPO, TD3]

- … (4 removed lines not rendered in this diff view)
-}
+        self.domain_name = domain_name
+        self.problem_name = problem_name
+        self.env_prop = env_prop
+        self.exploration_rate = exploration_rate

- … (7 removed lines not rendered in this diff view)
-        self.problem_name = problem_name
+        self._model_directory = get_agent_model_dir(
+            domain_name=self.domain_name,
+            model_name=problem_name,
+            class_name=algorithm.__name__,
+        )
+        self.env = self.env_prop.create_vec_env(env_kwargs)
+        self._actions_space = self.env.action_space

- … (14 removed lines not rendered in this diff view)
+        # first_support: SB3 models from RL zoo, with the .zip format.
+        if os.path.exists(os.path.join(self._model_directory, "saved_model.zip")):
+            # TODO check if it's ncessary to give these to the model.load if loading from rl zoo
+            self._model_file_path = os.path.join(
+                self._model_directory, "saved_model.zip"
+            )
+            self.model_kwargs = {
+                "custom_objects": {
+                    "learning_rate": 0.0,
+                    "lr_schedule": lambda _: 0.0,
+                    "clip_range": lambda _: 0.0,
+                },
+                "seed": 0,
+                "buffer_size": 1,
+            }
+        # second support: models saved with SB3's model.save, which is saved as a formatted .pth file.
+        else:
+            self.model_kwargs = {}
+            self._model_file_path = os.path.join(
+                self._model_directory, "saved_model.pth"
+            )

- … (2 removed lines not rendered in this diff view)
+        self.algorithm = algorithm
+        self.reward_threshold = reward_threshold
+        self.num_timesteps = num_timesteps

- … (2 removed lines not rendered in this diff view)
-        """Record a video of the agent's performance."""
-        fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
-        fps = 30.0
-        # if is_gc:
-        #     assert goal_idx != None
-        #     self.reset_with_goal_idx(goal_idx)
-        # else:
-        #     assert goal_idx == None
-        self.env.reset()
-        frame_size = (self.env.render(mode='rgb_array').shape[1], self.env.render(mode='rgb_array').shape[0])
-        video_path = os.path.join(video_path, "plan_video.mp4")
-        video_writer = cv2.VideoWriter(video_path, fourcc, fps, frame_size)
-        general_done, success_done = False, False
-        gc.collect()
-        obs = self.env.reset()
-        change_goal_to_specific_desired(obs, desired)
-        counter = 0
-        while not (general_done or success_done):
-            counter += 1
-            action, _states = self._model.predict(obs, deterministic=False)
-            obs, rewards, general_done, info = self.env.step(action)
-            if isinstance(general_done, np.ndarray): general_done = general_done[0]
-            change_goal_to_specific_desired(obs, desired)
-            if "success" in info[0].keys(): success_done = info[0]["success"] # make sure the agent actually reached the goal within the max time
-            elif "is_success" in info[0].keys(): success_done = info[0]["is_success"] # make sure the agent actually reached the goal within the max time
-            elif "step_task_completions" in info[0].keys(): success_done = (len(info[0]["step_task_completions"]) == 1) # bug of dummyVecEnv, it removes the episode_task_completions from the info dict.
-            else: raise NotImplementedError("no other option for any of the environments.")
-            frame = self.env.render()
-            success_done = self.change_done_by_specific_desired(obs, desired, success_done)
-            video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
-        if general_done == False != success_done == True:
-            assert HACK_HAPPENED
-        elif general_done == True != success_done == False:
-            raise Exception("general_done is true but success_done is false")
-        self.env.close()
-        video_writer.release()
+    def save_model(self):
+        self._model.save(self._model_file_path)

- … (1 removed line not rendered in this diff view)
+    def try_recording_video(self, video_path, desired=None):
+        num_tries = 0
+        while True:
+            if num_tries >= 10:
+                assert False, "agent keeps failing on recording an optimal obs."
+            try:
+                self.record_video(video_path, desired)
+                break
+            except Exception as e:
+                num_tries += 1
+        # print(f"sequence to {self.problem_name} is:\n\t{steps}\ngenerating image at {img_path}.")
+        print(f"generated sequence video at {video_path}.")

- … (42 removed lines not rendered in this diff view)
+    def record_video(self, video_path, desired=None):
+        """Record a video of the agent's performance."""
+        fourcc = cv2.VideoWriter_fourcc("m", "p", "4", "v")
+        fps = 30.0
+        # if is_gc:
+        #     assert goal_idx != None
+        #     self.reset_with_goal_idx(goal_idx)
+        # else:
+        #     assert goal_idx == None
+        self.env.reset()
+        frame_size = (
+            self.env.render(mode="rgb_array").shape[1],
+            self.env.render(mode="rgb_array").shape[0],
+        )
+        video_path = os.path.join(video_path, "plan_video.mp4")
+        video_writer = cv2.VideoWriter(video_path, fourcc, fps, frame_size)
+        general_done, success_done = False, False
+        gc.collect()
+        obs = self.env.reset()
+        self.env_prop.change_goal_to_specific_desired(obs, desired)
+        counter = 0
+        while not (general_done or success_done):
+            counter += 1
+            action, _states = self._model.predict(obs, deterministic=False)
+            obs, rewards, general_done, info = self.env.step(action)
+            if isinstance(general_done, np.ndarray):
+                general_done = general_done[0]
+            self.env_prop.change_goal_to_specific_desired(obs, desired)
+            if "success" in info[0].keys():
+                success_done = info[0][
+                    "success"
+                ]  # make sure the agent actually reached the goal within the max time
+            elif "is_success" in info[0].keys():
+                success_done = info[0][
+                    "is_success"
+                ]  # make sure the agent actually reached the goal within the max time
+            elif "step_task_completions" in info[0].keys():
+                success_done = (
+                    len(info[0]["step_task_completions"]) == 1
+                )  # bug of dummyVecEnv, it removes the episode_task_completions from the info dict.
+            else:
+                raise NotImplementedError(
+                    "no other option for any of the environments."
+                )
+            frame = self.env.render()
+            success_done = self.env_prop.change_done_by_specific_desired(
+                obs, desired, success_done
+            )
+            video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
+        if general_done == False != success_done == True:
+            assert (
+                desired is not None
+            ), f"general_done is false but success_done is true, and desired is None. This should never happen, since the \
+                environment will say 'done' is false (general_done) while the observation will be close to the goal (success_done) \
+                only in case we incorporated a 'desired' when generating the observation."
+        elif general_done == True != success_done == False:
+            raise Exception("general_done is true but success_done is false")
+        self.env.close()
+        video_writer.release()

- … (2 removed lines not rendered in this diff view)
+    def load_model(self):
+        self._model = self.algorithm.load(
+            self._model_file_path, env=self.env, device=device, **self.model_kwargs
+        )

- … (21 removed lines not rendered in this diff view)
+    def learn(self):
+        if os.path.exists(self._model_file_path):
+            print(f"Loading pre-existing model in {self._model_file_path}")
+            self.load_model()
+        else:
+            # Stop training when the model reaches the reward threshold
+            # callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=self.reward_threshold, verbose=1)
+            # eval_callback = EvalCallback(self.env, best_model_save_path="./logs/",
+            #                              log_path="./logs/", eval_freq=500, callback_on_new_best=callback_on_best, verbose=1, render=True)
+            # self._model.learn(total_timesteps=self.num_timesteps, progress_bar=True, callback=eval_callback)
+            print(f"No existing model in {self._model_file_path}, starting learning")
+            if self.exploration_rate != None:
+                self._model = self.algorithm(
+                    "MultiInputPolicy",
+                    self.env,
+                    ent_coef=self.exploration_rate,
+                    verbose=1,
+                )
+            else:
+                self._model = self.algorithm("MultiInputPolicy", self.env, verbose=1)
+            self._model.learn(
+                total_timesteps=self.num_timesteps, progress_bar=True
+            )  # comment this in a normal env
+            self.save_model()

- … (8 removed lines not rendered in this diff view)
-                break
-            episode_start = done
+    def safe_env_reset(self):
+        try:
+            obs = self.env.reset()
+        except Exception as e:
+            kwargs = {"id": self.problem_name, "render_mode": "rgb_array"}
+            self.env = self.env_prop.create_vec_env(kwargs)
+            obs = self.env.reset()
+        return obs

- … (3 removed lines not rendered in this diff view)
-        custom_objects = {
-            "learning_rate": 0.0,
-            "lr_schedule": lambda _: 0.0,
-            "clip_range": lambda _: 0.0,
-        }
-        kwargs = {"id": self.problem_name, "render_mode": "rgb_array"}
-        self.env = create_vec_env(kwargs)
-        self._actions_space = self.env.action_space
-        kwargs = {'seed': 0, 'buffer_size': 1}
+    def get_mean_and_std_dev(self, observation):
+        if self.algorithm == SAC:
+            tensor_observation, _ = self._model.actor.obs_to_tensor(observation)

- … (12 removed lines not rendered in this diff view)
-        self.save_model()
+            mean_actions, log_std_dev, kwargs = (
+                self._model.actor.get_action_dist_params(tensor_observation)
+            )
+            probability_dist = self._model.actor.action_dist.proba_distribution(
+                mean_actions=mean_actions, log_std=log_std_dev
+            )
+            actor_means = probability_dist.get_actions(True).cpu().detach().numpy()
+            log_std_dev = log_std_dev.cpu().detach().numpy()
+        elif self.algorithm == PPO:
+            self._model.policy.set_training_mode(False)
+            tensor_observation, _ = self._model.policy.obs_to_tensor(observation)
+            distribution = self._model.policy.get_distribution(tensor_observation)

- … (3 removed lines not rendered in this diff view)
+            actor_means = distribution.distribution.mean.cpu().detach().numpy()
+            log_std_dev = distribution.distribution.stddev.cpu().detach().numpy()
+            if isinstance(self._model.policy.action_space, gym.spaces.Box):
+                actor_means = np.clip(
+                    actor_means,
+                    self._model.policy.action_space.low,
+                    self._model.policy.action_space.high,
+                )
+            return actor_means, log_std_dev
+        else:
+            assert False
+        return actor_means, log_std_dev

- … (11 removed lines not rendered in this diff view)
+    # fits agents that generated observations in the form of: list of tuples, each tuple a single step\frame with size 2, comprised of obs and action.
+    # the function squashes the 2d array of obs and action in a 1d array, concatenating their values together for training.
+    def simplify_observation(self, observation):
+        return [
+            np.concatenate(
+                (
+                    np.array(obs).reshape(obs.shape[-1]),
+                    np.array(action[0]).reshape(action[0].shape[-1]),
+                )
+            )
+            for (obs, action) in observation
+        ]

- … (8 removed lines not rendered in this diff view)
-                return actor_means, log_std_dev
-            else:
-                assert False
-        return actor_means, log_std_dev
+    def add_random_optimalism(self, observations, action, constant_initial_action):
+        if len(observations) > 3:
+            for i in range(0, len(action[0])):
+                action[0][i] += random.uniform(
+                    -0.01 * action[0][i], 0.01 * action[0][i]
+                )
+        else:  # just walk in a specific random direction to enable diverse plans
+            action = np.array(np.array([constant_initial_action]), None)

- … (4 removed lines not rendered in this diff view)
+    def generate_partial_observation(
+        self,
+        action_selection_method,
+        percentage,
+        is_consecutive,
+        save_fig=False,
+        fig_path=None,
+        random_optimalism=True,
+    ):
+        steps = self.generate_observation(
+            action_selection_method,
+            save_fig=save_fig,
+            random_optimalism=random_optimalism,
+            fig_path=fig_path,
+        )  # steps are a full observation
+        return random_subset_with_order(
+            steps, (int)(percentage * len(steps)), is_consecutive
+        )

- … (3 removed lines not rendered in this diff view)
+    def generate_observation(
+        self,
+        action_selection_method: MethodType,
+        random_optimalism,
+        save_fig=False,
+        fig_path=None,
+        with_dict=False,
+        desired=None,
+    ) -> List[
+        Tuple[np.ndarray, np.ndarray]
+    ]:  # TODO make sure to add a linter to alert when a method doesn't accept or return the type it should
+        if save_fig == False:
+            assert (
+                fig_path == None
+            ), "You can't specify a vid path when you don't even save the figure."
+        else:
+            assert (
+                fig_path != None
+            ), "You need to specify a vid path when you save the figure."
+        # The try-except is a bug fix for the env not being reset properly in panda. If someone wants to check why and provide a robust solution they're welcome.
+        obs = self.safe_env_reset()
+        self.env_prop.change_goal_to_specific_desired(obs, desired)
+        observations = []
+        is_successful_observation_made = False
+        num_of_insuccessful_attempts = 0
+        while not is_successful_observation_made:
+            is_successful_observation_made = True  # start as true, if this isn't the case (crash/death/truncation instead of success)
+            if random_optimalism:
+                constant_initial_action = self.env.action_space.sample()
+            while True:
+                from gr_libs.metrics.metrics import stochastic_amplified_selection

- … (52 removed lines not rendered in this diff view)
-                    is_successful_observation_made = False
-                    try:
-                        obs = self.env.reset()
-                        change_goal_to_specific_desired(obs, desired)
-                    except Exception as e:
-                        kwargs = {"id": self.problem_name, "render_mode": "rgb_array"}
-                        self.env = create_vec_env(kwargs)
-                        obs = self.env.reset()
-                        change_goal_to_specific_desired(obs, desired)
-                    observations = [] # we want to re-accumulate the observations from scratch, have another try
-                    break
-                elif general_done == False and success_done == False:
-                    continue
-                elif general_done == True and success_done == True:
-                    if num_of_insuccessful_attempts > 0:
-                        pass # print(f"after {num_of_insuccessful_attempts}, finally I succeeded!")
-                    break
-                elif general_done == False and success_done == True:
-                    assert HACK_HAPPENED == True # happens only if hack happened
-                    break
-        # self.env.close()
-        if save_fig:
-            num_tries = 0
-            while True:
-                if num_tries >= 10:
-                    assert False, "agent keeps failing on recording an optimal obs."
-                try:
-                    self.record_video(fig_path, desired)
-                    break
-                except Exception as e:
-                    num_tries += 1
-            #print(f"sequence to {self.problem_name} is:\n\t{steps}\ngenerating image at {img_path}.")
-            print(f"generated sequence video at {fig_path}.")
-        self.env.close()
-        return observations
+                deterministic = (
+                    action_selection_method != stochastic_amplified_selection
+                )
+                action, _states = self._model.predict(obs, deterministic=deterministic)
+                if (
+                    random_optimalism
+                ):  # get the right direction and then start inserting noise to still get a relatively optimal plan
+                    self.add_random_optimalism(obs, action, constant_initial_action)
+                if with_dict:
+                    observations.append((obs, action))
+                else:
+                    observations.append((obs["observation"], action))
+                obs, reward, done, info = self.env.step(action)
+                self.env_prop.change_goal_to_specific_desired(obs, desired)
+                general_done = self.env_prop.is_done(done)
+                success_done = self.env_prop.is_success(info)
+                success_done = self.env_prop.change_done_by_specific_desired(
+                    obs, desired, success_done
+                )
+                if general_done == True and success_done == False:
+                    # it could be that the stochasticity inserted into the actions made the agent die/crash. we don't want this observation: it's an insuccessful attempt.
+                    num_of_insuccessful_attempts += 1
+                    # print(f"for agent for problem {self.problem_name}, its done {len(observations)} steps, and got to a situation where general_done != success_done, for the {num_of_insuccessful_attempts} time.")
+                    if num_of_insuccessful_attempts > 50:
+                        # print(f"got more then 10 insuccessful attempts!")
+                        assert (
+                            general_done == success_done
+                        ), f"failed on goal: {obs['desired']}"  # we want to make sure the episode is done only when the agent has actually succeeded with the task.
+                    else:
+                        # try again by breaking inner loop. everything is set up to be like the beginning of the function.
+                        is_successful_observation_made = False
+                        obs = self.safe_env_reset()
+                        self.env_prop.change_goal_to_specific_desired(obs, desired)
+                        observations = (
+                            []
+                        )  # we want to re-accumulate the observations from scratch, have another try
+                        break
+                elif general_done == False and success_done == False:
+                    continue
+                elif general_done == True and success_done == True:
+                    if num_of_insuccessful_attempts > 0:
+                        pass  # print(f"after {num_of_insuccessful_attempts}, finally I succeeded!")
+                    break
+                elif general_done == False and success_done == True:
+                    # The environment will say 'done' is false (general_done) while the observation will be close to the goal (success_done)
+                    # only in case we incorporated a 'desired' when generating the observation.
+                    assert (
+                        desired is not None
+                    ), f"general_done is false but success_done is true, and desired is None. This should never happen, since the \
+                        environment will say 'done' is false (general_done) while the observation will be close to the goal (success_done) \
+                        only in case we incorporated a 'desired' when generating the observation."
+                    break

- … (2 removed lines not rendered in this diff view)
-        # return self.env.reset()
- … (1 removed line not rendered in this diff view)
-class GCDeepRLAgent(DeepRLAgent):
-    def generate_partial_observation(self, action_selection_method, percentage, is_consecutive, goal_directed_problem=None, goal_directed_goal=None, save_fig=False, fig_path=None, random_optimalism=True):
-        steps = self.generate_observation(action_selection_method, save_fig=save_fig, fig_path=fig_path, random_optimalism=random_optimalism, goal_directed_problem=goal_directed_problem, goal_directed_goal=goal_directed_goal) # steps are a full observation
-        return random_subset_with_order(steps, (int)(percentage * len(steps)), is_consecutive)
+        if save_fig:
+            self.try_recording_video(fig_path, desired)

- … (2 removed lines not rendered in this diff view)
-        # print(f"hyperparams:{hyperparams}")
-        if goal_directed_problem:
-            if save_fig:
-                assert fig_path != None, "You need to specify a vid path when you save the figure."
-            else:
-                assert fig_path == None
-            assert goal_directed_goal == None, "can't give goal directed goal and also goal directed problem for the sake of sequence generation by a general agent"
-            kwargs = {"id": goal_directed_problem, "render_mode": "rgb_array"}
-            self.env = create_vec_env(kwargs)
-            orig_env = self.env
-            observations = super().generate_observation(action_selection_method=action_selection_method, random_optimalism=random_optimalism,
-                                                        save_fig=save_fig, fig_path=fig_path, with_dict=with_dict)
-            self.env = orig_env
-        else: #goal_directed_goal!=None
-            if save_fig:
-                assert fig_path != None, "You need to specify a vid path when you save the figure."
-            else:
-                assert fig_path == None
-            assert goal_directed_problem == None, "can't give goal directed goal and also goal directed problem for the sake of sequence generation by a general agent"
-            observations = super().generate_observation(action_selection_method=action_selection_method, random_optimalism=random_optimalism,
-                                                        save_fig=save_fig, fig_path=fig_path, with_dict=with_dict, desired=goal_directed_goal) # TODO tutorial on how to use the deepRLAgent for sequence generation and examination and plotting of the sequence
-            return observations
- … (1 removed line not rendered in this diff view)
+        self.env.close()
+        return observations

-if __name__ == "__main__":
-    package_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-    print("this is package root:" + package_root)
-    if package_root not in sys.path:
-        sys.path.insert(0, package_root)

- … (1 removed line not rendered in this diff view)
+class GCDeepRLAgent(DeepRLAgent):
+    def generate_partial_observation(
+        self,
+        action_selection_method,
+        percentage,
+        is_consecutive,
+        goal_directed_problem=None,
+        goal_directed_goal=None,
+        save_fig=False,
+        fig_path=None,
+        random_optimalism=True,
+    ):
+        steps = self.generate_observation(
+            action_selection_method,
+            save_fig=save_fig,
+            fig_path=fig_path,
+            random_optimalism=random_optimalism,
+            goal_directed_problem=goal_directed_problem,
+            goal_directed_goal=goal_directed_goal,
+        )  # steps are a full observation
+        return random_subset_with_order(
+            steps, (int)(percentage * len(steps)), is_consecutive
+        )

- … (4 removed lines not rendered in this diff view)
+    # TODO move the goal_directed_goal and/or goal_directed_problem mechanism to be a property of the env_property, so deep_rl_learner doesn't depend on it and holds this logic so heavily.
+    # Generate observation with goal_directed_goal or goal_directed_problem is only possible for a GC agent, otherwise - the agent can't act optimally to that new goal.
+    def generate_observation(
+        self,
+        action_selection_method: MethodType,
+        random_optimalism,
+        goal_directed_problem=None,
+        goal_directed_goal=None,
+        save_fig=False,
+        fig_path=None,
+        with_dict=False,
+    ):
+        if save_fig:
+            assert (
+                fig_path != None
+            ), "You need to specify a vid path when you save the figure."
+        else:
+            assert fig_path == None
+        # goal_directed_problem employs the GC agent in a new env with a static, predefined goal, and has him generate an observation sequence in it.
+        if goal_directed_problem:
+            assert (
+                goal_directed_goal == None
+            ), "can't give goal directed goal and also goal directed problem for the sake of sequence generation by a general agent"
+            kwargs = {"id": goal_directed_problem, "render_mode": "rgb_array"}
+            self.env = self.env_prop.create_vec_env(kwargs)
+            orig_env = self.env
+            observations = super().generate_observation(
+                action_selection_method=action_selection_method,
+                random_optimalism=random_optimalism,
+                save_fig=save_fig,
+                fig_path=fig_path,
+                with_dict=with_dict,
+            )
+            self.env = orig_env
+        # goal_directed_goal employs the agent in the same env on which it trained - with goals that change with every episode sampled from the goal space.
+        # but we manually change the 'desired' part of the observation to be the goal_directed_goal and edit the id_success and is_done accordingly.
+        else:
+            assert (
+                goal_directed_problem == None
+            ), "can't give goal directed goal and also goal directed problem for the sake of sequence generation by a general agent"
+            observations = super().generate_observation(
+                action_selection_method=action_selection_method,
+                random_optimalism=random_optimalism,
+                save_fig=save_fig,
+                fig_path=fig_path,
+                with_dict=with_dict,
+                desired=goal_directed_goal,
+            )  # TODO tutorial on how to use the deepRLAgent for sequence generation and examination and plotting of the sequence
+        return observations
```
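
As the comments in the new `GCDeepRLAgent.generate_observation` explain, the goal-conditioned agent supports two mutually exclusive modes: `goal_directed_problem` re-deploys the agent in a separate environment with a fixed goal, while `goal_directed_goal` keeps the training environment and overwrites the `desired` part of each observation. The sketch below is a hedged illustration of calling both modes; the environment id and goal vector are placeholders, and the agent is assumed to be already constructed and trained as in the earlier sketch.

```python
import numpy as np

from gr_libs.metrics.metrics import stochastic_amplified_selection
from gr_libs.ml.neural.deep_rl_learner import GCDeepRLAgent


def collect_goal_directed_sequences(gc_agent: GCDeepRLAgent):
    """Hedged sketch: gc_agent is a trained goal-conditioned agent supplied by the caller."""
    # Mode 1: run the GC agent on a separate problem whose goal is fixed by the env itself.
    seq_fixed_problem = gc_agent.generate_observation(
        action_selection_method=stochastic_amplified_selection,
        random_optimalism=True,
        goal_directed_problem="<fixed-goal-env-id>",  # placeholder env id
    )
    # Mode 2: keep the training env but overwrite the observation's 'desired' goal.
    seq_fixed_goal = gc_agent.generate_observation(
        action_selection_method=stochastic_amplified_selection,
        random_optimalism=True,
        goal_directed_goal=np.array([0.1, 0.2, 0.3]),  # placeholder goal vector
    )
    return seq_fixed_problem, seq_fixed_goal
```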