gr-libs 0.1.6.post1__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (64)
  1. evaluation/analyze_results_cross_alg_cross_domain.py +236 -246
  2. evaluation/create_minigrid_map_image.py +10 -6
  3. evaluation/file_system.py +16 -5
  4. evaluation/generate_experiments_results.py +123 -74
  5. evaluation/generate_experiments_results_new_ver1.py +227 -243
  6. evaluation/generate_experiments_results_new_ver2.py +317 -317
  7. evaluation/generate_task_specific_statistics_plots.py +481 -253
  8. evaluation/get_plans_images.py +41 -26
  9. evaluation/increasing_and_decreasing_.py +97 -56
  10. gr_libs/__init__.py +6 -1
  11. gr_libs/_version.py +2 -2
  12. gr_libs/environment/__init__.py +17 -9
  13. gr_libs/environment/environment.py +167 -39
  14. gr_libs/environment/utils/utils.py +22 -12
  15. gr_libs/metrics/__init__.py +5 -0
  16. gr_libs/metrics/metrics.py +76 -34
  17. gr_libs/ml/__init__.py +2 -0
  18. gr_libs/ml/agent.py +21 -6
  19. gr_libs/ml/base/__init__.py +1 -1
  20. gr_libs/ml/base/rl_agent.py +13 -10
  21. gr_libs/ml/consts.py +1 -1
  22. gr_libs/ml/neural/deep_rl_learner.py +433 -352
  23. gr_libs/ml/neural/utils/__init__.py +1 -1
  24. gr_libs/ml/neural/utils/dictlist.py +3 -3
  25. gr_libs/ml/neural/utils/penv.py +5 -2
  26. gr_libs/ml/planner/mcts/mcts_model.py +524 -302
  27. gr_libs/ml/planner/mcts/utils/__init__.py +1 -1
  28. gr_libs/ml/planner/mcts/utils/node.py +11 -7
  29. gr_libs/ml/planner/mcts/utils/tree.py +14 -10
  30. gr_libs/ml/sequential/__init__.py +1 -1
  31. gr_libs/ml/sequential/lstm_model.py +256 -175
  32. gr_libs/ml/tabular/state.py +7 -7
  33. gr_libs/ml/tabular/tabular_q_learner.py +123 -73
  34. gr_libs/ml/tabular/tabular_rl_agent.py +20 -19
  35. gr_libs/ml/utils/__init__.py +8 -2
  36. gr_libs/ml/utils/format.py +78 -70
  37. gr_libs/ml/utils/math.py +2 -1
  38. gr_libs/ml/utils/other.py +1 -1
  39. gr_libs/ml/utils/storage.py +95 -28
  40. gr_libs/problems/consts.py +1549 -1227
  41. gr_libs/recognizer/gr_as_rl/gr_as_rl_recognizer.py +145 -80
  42. gr_libs/recognizer/graml/gr_dataset.py +209 -110
  43. gr_libs/recognizer/graml/graml_recognizer.py +431 -231
  44. gr_libs/recognizer/recognizer.py +38 -27
  45. gr_libs/recognizer/utils/__init__.py +1 -1
  46. gr_libs/recognizer/utils/format.py +8 -3
  47. {gr_libs-0.1.6.post1.dist-info → gr_libs-0.1.8.dist-info}/METADATA +1 -1
  48. gr_libs-0.1.8.dist-info/RECORD +70 -0
  49. {gr_libs-0.1.6.post1.dist-info → gr_libs-0.1.8.dist-info}/WHEEL +1 -1
  50. {gr_libs-0.1.6.post1.dist-info → gr_libs-0.1.8.dist-info}/top_level.txt +0 -1
  51. tests/test_gcdraco.py +10 -0
  52. tests/test_graml.py +8 -4
  53. tests/test_graql.py +2 -1
  54. tutorials/gcdraco_panda_tutorial.py +66 -0
  55. tutorials/gcdraco_parking_tutorial.py +61 -0
  56. tutorials/graml_minigrid_tutorial.py +42 -12
  57. tutorials/graml_panda_tutorial.py +35 -14
  58. tutorials/graml_parking_tutorial.py +37 -19
  59. tutorials/graml_point_maze_tutorial.py +33 -13
  60. tutorials/graql_minigrid_tutorial.py +31 -15
  61. CI/README.md +0 -12
  62. CI/docker_build_context/Dockerfile +0 -15
  63. gr_libs/recognizer/recognizer_doc.md +0 -61
  64. gr_libs-0.1.6.post1.dist-info/RECORD +0 -70
gr_libs/ml/neural/deep_rl_learner.py
@@ -2,392 +2,473 @@ from collections import OrderedDict
  import gc
  from types import MethodType
  from typing import List, Tuple
- import gymnasium as gym
  import numpy as np
  import cv2

- HACK_HAPPENED = False
+ from gr_libs.environment.environment import EnvProperty

  if __name__ != "__main__":
-     from gr_libs.ml.utils.storage import get_agent_model_dir
-     from gr_libs.ml.utils.format import random_subset_with_order
-     from stable_baselines3 import SAC, PPO
-     from stable_baselines3.common.vec_env import DummyVecEnv
+     from gr_libs.ml.utils.storage import get_agent_model_dir
+     from gr_libs.ml.utils.format import random_subset_with_order
+     from stable_baselines3 import SAC, PPO, TD3
+     from stable_baselines3.common.base_class import BaseAlgorithm
      from gr_libs.ml.utils import device
+     import gymnasium as gym

  # built-in python modules
  import random
  import os
  import sys

- def create_vec_env(kwargs):
-     # create the model, it will not be a pretrained one anyway
-     # env = gym.make(**kwargs)
-     env = gym.make(**kwargs)
-     return DummyVecEnv([lambda: env])
+ # TODO do we need this?
+ NETWORK_SETUP = {
+     SAC: OrderedDict(
+         [
+             ("batch_size", 512),
+             ("buffer_size", 100000),
+             ("ent_coef", "auto"),
+             ("gamma", 0.95),
+             ("learning_rate", 0.001),
+             ("learning_starts", 5000),
+             ("n_timesteps", 50000.0),
+             ("normalize", "{'norm_obs': False, 'norm_reward': False}"),
+             ("policy", "MultiInputPolicy"),
+             ("policy_kwargs", "dict(net_arch=[64, 64])"),
+             ("replay_buffer_class", "HerReplayBuffer"),
+             (
+                 "replay_buffer_kwargs",
+                 "dict( goal_selection_strategy='future', n_sampled_goal=4 )",
+             ),
+             ("normalize_kwargs", {"norm_obs": False, "norm_reward": False}),
+         ]
+     ),
+     # "tqc": OrderedDict([('batch_size', 256), ('buffer_size', 1000000), ('ent_coef', 'auto'), ('env_wrapper', ['sb3_contrib.common.wrappers.TimeFeatureWrapper']), ('gamma', 0.95), ('learning_rate', 0.001), ('learning_starts', 1000), ('n_timesteps', 25000.0), ('normalize', False), ('policy', 'MultiInputPolicy'), ('policy_kwargs', 'dict(net_arch=[64, 64])'), ('replay_buffer_class', 'HerReplayBuffer'), ('replay_buffer_kwargs', "dict( goal_selection_strategy='future', n_sampled_goal=4 )"), ('normalize_kwargs',{'norm_obs':False,'norm_reward':False})]),
+     PPO: OrderedDict(
+         [
+             ("batch_size", 256),
+             ("ent_coef", 0.01),
+             ("gae_lambda", 0.9),
+             ("gamma", 0.99),
+             ("learning_rate", "lin_0.0001"),
+             ("max_grad_norm", 0.5),
+             ("n_envs", 8),
+             ("n_epochs", 20),
+             ("n_steps", 8),
+             ("n_timesteps", 25000.0),
+             ("normalize_advantage", False),
+             ("policy", "MultiInputPolicy"),
+             ("policy_kwargs", "dict(log_std_init=-2, ortho_init=False)"),
+             ("use_sde", True),
+             ("vf_coef", 0.4),
+             ("normalize", False),
+             ("normalize_kwargs", {"norm_obs": False, "norm_reward": False}),
+         ]
+     ),
+ }

- def change_goal_to_specific_desired(obs, desired):
-     if desired is not None:
-         obs['desired_goal'] = desired
-     # try:
-     # if desired!=None: obs['desired_goal'] = desired
-     # except Exception as e:
-     # try:
-     # if all(desired!=None): obs['desired_goal'] = desired
-     # except Exception as e:
-     # if all([desiredy!=None for desiredish in desired for desiredy in desiredish]): obs['desired_goal'] = desired
 
+ class DeepRLAgent:
+     def __init__(
+         self,
+         domain_name: str,
+         problem_name: str,
+         num_timesteps: float,
+         env_prop: EnvProperty,
+         algorithm: BaseAlgorithm = SAC,
+         reward_threshold: float = 450,
+         exploration_rate=None,
+     ):
+         # Need to change reward threshold to change according to which task the agent is training on, becuase it changes from task to task.
+         env_kwargs = {"id": problem_name, "render_mode": "rgb_array"}
+         assert algorithm in [SAC, PPO, TD3]

- NETWORK_SETUP = {
-     SAC: OrderedDict([('batch_size', 512), ('buffer_size', 100000), ('ent_coef', 'auto'), ('gamma', 0.95), ('learning_rate', 0.001), ('learning_starts', 5000), ('n_timesteps', 50000.0), ('normalize', "{'norm_obs': False, 'norm_reward': False}"), ('policy', 'MultiInputPolicy'), ('policy_kwargs', 'dict(net_arch=[64, 64])'), ('replay_buffer_class', 'HerReplayBuffer'), ('replay_buffer_kwargs', "dict( goal_selection_strategy='future', n_sampled_goal=4 )"), ('normalize_kwargs', {'norm_obs': False, 'norm_reward': False})]),
-     #"tqc": OrderedDict([('batch_size', 256), ('buffer_size', 1000000), ('ent_coef', 'auto'), ('env_wrapper', ['sb3_contrib.common.wrappers.TimeFeatureWrapper']), ('gamma', 0.95), ('learning_rate', 0.001), ('learning_starts', 1000), ('n_timesteps', 25000.0), ('normalize', False), ('policy', 'MultiInputPolicy'), ('policy_kwargs', 'dict(net_arch=[64, 64])'), ('replay_buffer_class', 'HerReplayBuffer'), ('replay_buffer_kwargs', "dict( goal_selection_strategy='future', n_sampled_goal=4 )"), ('normalize_kwargs',{'norm_obs':False,'norm_reward':False})]),
-     PPO: OrderedDict([('batch_size', 256), ('ent_coef', 0.01), ('gae_lambda', 0.9), ('gamma', 0.99), ('learning_rate', 'lin_0.0001'), ('max_grad_norm', 0.5), ('n_envs', 8), ('n_epochs', 20), ('n_steps', 8), ('n_timesteps', 25000.0), ('normalize_advantage', False), ('policy', 'MultiInputPolicy'), ('policy_kwargs', 'dict(log_std_init=-2, ortho_init=False)'), ('use_sde', True), ('vf_coef', 0.4), ('normalize', False), ('normalize_kwargs', {'norm_obs': False, 'norm_reward': False})]),
- }
+         self.domain_name = domain_name
+         self.problem_name = problem_name
+         self.env_prop = env_prop
+         self.exploration_rate = exploration_rate

- class DeepRLAgent():
-     def __init__(self, domain_name: str, problem_name: str, num_timesteps:float, algorithm=SAC, reward_threshold: float=450,
-                  exploration_rate=None):
-         # Need to change reward threshold to change according to which task the agent is training on, becuase it changes from task to task.
-         kwargs = {"id":problem_name, "render_mode":"rgb_array"}
-
-         self.domain_name = domain_name
-         self.problem_name = problem_name
+         self._model_directory = get_agent_model_dir(
+             domain_name=self.domain_name,
+             model_name=problem_name,
+             class_name=algorithm.__name__,
+         )
+         self.env = self.env_prop.create_vec_env(env_kwargs)
+         self._actions_space = self.env.action_space

-         self._model_directory = get_agent_model_dir(domain_name=self.domain_name, model_name=problem_name, class_name=algorithm.__name__)
-         if os.path.exists(os.path.join(self._model_directory, "saved_model.zip")):
-             self.pre_trained_model = True
-             self._model_file_path = os.path.join(self._model_directory, "saved_model.zip")
-         else:
-             self.pre_trained_model = False
-             self.env = create_vec_env(kwargs)
-             self._actions_space = self.env.action_space
-             if exploration_rate != None: self._model = algorithm("MultiInputPolicy", self.env, ent_coef=exploration_rate, verbose=1)
-             else: self._model = algorithm("MultiInputPolicy", self.env, verbose=1)
-             self._model_file_path = os.path.join(self._model_directory, "saved_model.pth")
-         self.algorithm = algorithm
-         self.reward_threshold = reward_threshold
-         self.num_timesteps = num_timesteps
+         # first_support: SB3 models from RL zoo, with the .zip format.
+         if os.path.exists(os.path.join(self._model_directory, "saved_model.zip")):
+             # TODO check if it's ncessary to give these to the model.load if loading from rl zoo
+             self._model_file_path = os.path.join(
+                 self._model_directory, "saved_model.zip"
+             )
+             self.model_kwargs = {
+                 "custom_objects": {
+                     "learning_rate": 0.0,
+                     "lr_schedule": lambda _: 0.0,
+                     "clip_range": lambda _: 0.0,
+                 },
+                 "seed": 0,
+                 "buffer_size": 1,
+             }
+         # second support: models saved with SB3's model.save, which is saved as a formatted .pth file.
+         else:
+             self.model_kwargs = {}
+             self._model_file_path = os.path.join(
+                 self._model_directory, "saved_model.pth"
+             )

-     def save_model(self):
-         self._model.save(self._model_file_path)
+         self.algorithm = algorithm
+         self.reward_threshold = reward_threshold
+         self.num_timesteps = num_timesteps
 
-     def record_video(self, video_path, desired=None):
-         global HACK_HAPPENED
-         """Record a video of the agent's performance."""
-         fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
-         fps = 30.0
-         # if is_gc:
-         # assert goal_idx != None
-         # self.reset_with_goal_idx(goal_idx)
-         # else:
-         # assert goal_idx == None
-         self.env.reset()
-         frame_size = (self.env.render(mode='rgb_array').shape[1], self.env.render(mode='rgb_array').shape[0])
-         video_path = os.path.join(video_path, "plan_video.mp4")
-         video_writer = cv2.VideoWriter(video_path, fourcc, fps, frame_size)
-         general_done, success_done = False, False
-         gc.collect()
-         obs = self.env.reset()
-         change_goal_to_specific_desired(obs, desired)
-         counter = 0
-         while not (general_done or success_done):
-             counter += 1
-             action, _states = self._model.predict(obs, deterministic=False)
-             obs, rewards, general_done, info = self.env.step(action)
-             if isinstance(general_done, np.ndarray): general_done = general_done[0]
-             change_goal_to_specific_desired(obs, desired)
-             if "success" in info[0].keys(): success_done = info[0]["success"] # make sure the agent actually reached the goal within the max time
-             elif "is_success" in info[0].keys(): success_done = info[0]["is_success"] # make sure the agent actually reached the goal within the max time
-             elif "step_task_completions" in info[0].keys(): success_done = (len(info[0]["step_task_completions"]) == 1) # bug of dummyVecEnv, it removes the episode_task_completions from the info dict.
-             else: raise NotImplementedError("no other option for any of the environments.")
-             frame = self.env.render()
-             success_done = self.change_done_by_specific_desired(obs, desired, success_done)
-             video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
-             if general_done == False != success_done == True:
-                 assert HACK_HAPPENED
-             elif general_done == True != success_done == False:
-                 raise Exception("general_done is true but success_done is false")
-         self.env.close()
-         video_writer.release()
+     def save_model(self):
+         self._model.save(self._model_file_path)

-     #def set_success_done(self, success_done, desired, )
+     def try_recording_video(self, video_path, desired=None):
+         num_tries = 0
+         while True:
+             if num_tries >= 10:
+                 assert False, "agent keeps failing on recording an optimal obs."
+             try:
+                 self.record_video(video_path, desired)
+                 break
+             except Exception as e:
+                 num_tries += 1
+         # print(f"sequence to {self.problem_name} is:\n\t{steps}\ngenerating image at {img_path}.")
+         print(f"generated sequence video at {video_path}.")

-     def change_done_by_specific_desired(self, obs, desired, old_success_done):
-         global HACK_HAPPENED
-         try:
-             if desired!=None:
-                 HACK_HAPPENED = True
-                 if 'Panda' in self.problem_name:
-                     assert obs['achieved_goal'].shape == desired.shape
-                     d = np.linalg.norm(obs['achieved_goal'] - desired, axis=-1)
-                     # print(f"achieved_goal:{achieved_goal}, desired_goal:{desired_goal}, distance:{d}, is finished:{d < self.distance_threshold}")
-                     return (d < 0.04)[0]
-                 elif 'Parking' in self.problem_name: # shuoldn't be used for now
-                     # TODO
-                     return self.env.task.is_success()
-                 else:
-                     return old_success_done
-         except Exception as e:
-             try:
-                 if all(desired!=None):
-                     HACK_HAPPENED = True
-                     if 'Panda' in self.problem_name:
-                         assert obs['achieved_goal'].shape == desired.shape
-                         d = np.linalg.norm(obs['achieved_goal'] - desired, axis=-1)
-                         # print(f"achieved_goal:{achieved_goal}, desired_goal:{desired_goal}, distance:{d}, is finished:{d < self.distance_threshold}")
-                         return (d < 0.04)[0]
-                     elif 'Parking' in self.problem_name:
-                         # TODO add all of this to the environment property. recognizer shouldn't know anything about it.
-                         return self.env.task.is_success()
-                     else:
-                         return old_success_done
-             except Exception as e:
-                 if all([desiredy!=None for desiredish in desired for desiredy in desiredish]):
-                     HACK_HAPPENED = True
-                     if 'Panda' in self.problem_name:
-                         assert obs['achieved_goal'].shape == desired.shape
-                         d = np.linalg.norm(obs['achieved_goal'] - desired, axis=-1)
-                         # print(f"achieved_goal:{achieved_goal}, desired_goal:{desired_goal}, distance:{d}, is finished:{d < self.distance_threshold}")
-                         return (d < 0.04)[0]
-                     elif 'Parking' in self.problem_name:
-                         # TODO
-                         return self.env.task.is_success()
-                     else:
-                         return old_success_done
+     def record_video(self, video_path, desired=None):
+         """Record a video of the agent's performance."""
+         fourcc = cv2.VideoWriter_fourcc("m", "p", "4", "v")
+         fps = 30.0
+         # if is_gc:
+         # assert goal_idx != None
+         # self.reset_with_goal_idx(goal_idx)
+         # else:
+         # assert goal_idx == None
+         self.env.reset()
+         frame_size = (
+             self.env.render(mode="rgb_array").shape[1],
+             self.env.render(mode="rgb_array").shape[0],
+         )
+         video_path = os.path.join(video_path, "plan_video.mp4")
+         video_writer = cv2.VideoWriter(video_path, fourcc, fps, frame_size)
+         general_done, success_done = False, False
+         gc.collect()
+         obs = self.env.reset()
+         self.env_prop.change_goal_to_specific_desired(obs, desired)
+         counter = 0
+         while not (general_done or success_done):
+             counter += 1
+             action, _states = self._model.predict(obs, deterministic=False)
+             obs, rewards, general_done, info = self.env.step(action)
+             if isinstance(general_done, np.ndarray):
+                 general_done = general_done[0]
+             self.env_prop.change_goal_to_specific_desired(obs, desired)
+             if "success" in info[0].keys():
+                 success_done = info[0][
+                     "success"
+                 ] # make sure the agent actually reached the goal within the max time
+             elif "is_success" in info[0].keys():
+                 success_done = info[0][
+                     "is_success"
+                 ] # make sure the agent actually reached the goal within the max time
+             elif "step_task_completions" in info[0].keys():
+                 success_done = (
+                     len(info[0]["step_task_completions"]) == 1
+                 ) # bug of dummyVecEnv, it removes the episode_task_completions from the info dict.
+             else:
+                 raise NotImplementedError(
+                     "no other option for any of the environments."
+                 )
+             frame = self.env.render()
+             success_done = self.env_prop.change_done_by_specific_desired(
+                 obs, desired, success_done
+             )
+             video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
+             if general_done == False != success_done == True:
+                 assert (
+                     desired is not None
+                 ), f"general_done is false but success_done is true, and desired is None. This should never happen, since the \
+                     environment will say 'done' is false (general_done) while the observation will be close to the goal (success_done) \
+                     only in case we incorporated a 'desired' when generating the observation."
+             elif general_done == True != success_done == False:
+                 raise Exception("general_done is true but success_done is false")
+         self.env.close()
+         video_writer.release()
 
-     def load_model(self):
-         self._model = self.algorithm.load(self._model_file_path, env=self.env, device=device)
+     def load_model(self):
+         self._model = self.algorithm.load(
+             self._model_file_path, env=self.env, device=device, **self.model_kwargs
+         )

-     def learn(self):
-         if os.path.exists(self._model_file_path):
-             print(f"Loading pre-existing model in {self._model_file_path}")
-             if self.pre_trained_model:
-                 def test(env):
-                     obs = env.reset()
-                     lstm_states = None
-                     episode_start = np.ones((1,), dtype=bool)
-                     deterministic = True
-                     episode_reward = 0.0
-                     ep_len = 0
-                     generator = range(5000)
-                     for i in generator:
-                         # print(f"iteration {i}:{obs=}")
-                         action, lstm_states = self._model.predict(
-                             obs, # type: ignore[arg-type]
-                             state=lstm_states,
-                             episode_start=episode_start,
-                             deterministic=deterministic,
-                         )
-                         obs, reward, done, infos = env.step(action)
+     def learn(self):
+         if os.path.exists(self._model_file_path):
+             print(f"Loading pre-existing model in {self._model_file_path}")
+             self.load_model()
+         else:
+             # Stop training when the model reaches the reward threshold
+             # callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=self.reward_threshold, verbose=1)
+             # eval_callback = EvalCallback(self.env, best_model_save_path="./logs/",
+             # log_path="./logs/", eval_freq=500, callback_on_new_best=callback_on_best, verbose=1, render=True)
+             # self._model.learn(total_timesteps=self.num_timesteps, progress_bar=True, callback=eval_callback)
+             print(f"No existing model in {self._model_file_path}, starting learning")
+             if self.exploration_rate != None:
+                 self._model = self.algorithm(
+                     "MultiInputPolicy",
+                     self.env,
+                     ent_coef=self.exploration_rate,
+                     verbose=1,
+                 )
+             else:
+                 self._model = self.algorithm("MultiInputPolicy", self.env, verbose=1)
+             self._model.learn(
+                 total_timesteps=self.num_timesteps, progress_bar=True
+             ) # comment this in a normal env
+             self.save_model()

-                         assert len(reward) == 1, f"length of rewards list is not 1, rewards:{reward}"
-                         if "success" in infos[0].keys(): is_success = infos[0]["success"] # make sure the agent actually reached the goal within the max time
-                         elif "is_success" in infos[0].keys(): is_success = infos[0]["is_success"] # make sure the agent actually reached the goal within the max time
-                         elif "step_task_completions" in infos[0].keys(): is_success = (len(infos[0]["step_task_completions"]) == 1) # bug of dummyVecEnv, it removes the episode_task_completions from the info dict.
-                         else: raise NotImplementedError("no other option for any of the environments.")
-                         # print(f"(action,is_done,info):({action},{done},{infos})")
-                         if is_success:
-                             #print(f"breaking due to GG, took {i} steps")
-                             break
-                         episode_start = done
+     def safe_env_reset(self):
+         try:
+             obs = self.env.reset()
+         except Exception as e:
+             kwargs = {"id": self.problem_name, "render_mode": "rgb_array"}
+             self.env = self.env_prop.create_vec_env(kwargs)
+             obs = self.env.reset()
+         return obs

-                         episode_reward += reward[0]
-                         ep_len += 1
-                     env.close()
-                 custom_objects = {
-                     "learning_rate": 0.0,
-                     "lr_schedule": lambda _: 0.0,
-                     "clip_range": lambda _: 0.0,
-                 }
-                 kwargs = {"id": self.problem_name, "render_mode": "rgb_array"}
-                 self.env = create_vec_env(kwargs)
-                 self._actions_space = self.env.action_space
-                 kwargs = {'seed': 0, 'buffer_size': 1}
+     def get_mean_and_std_dev(self, observation):
+         if self.algorithm == SAC:
+             tensor_observation, _ = self._model.actor.obs_to_tensor(observation)

-                 self._model = self.algorithm.load(self._model_file_path, env=self.env, custom_objects=custom_objects, device=device, **kwargs)
-                 test(self.env)
-             else:
-                 self.load_model()
-         else:
-             # Stop training when the model reaches the reward threshold
-             # callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=self.reward_threshold, verbose=1)
-             # eval_callback = EvalCallback(self.env, best_model_save_path="./logs/",
-             # log_path="./logs/", eval_freq=500, callback_on_new_best=callback_on_best, verbose=1, render=True)
-             # self._model.learn(total_timesteps=self.num_timesteps, progress_bar=True, callback=eval_callback)
-             print(f"No existing model in {self._model_file_path}, starting learning")
-             self._model.learn(total_timesteps=self.num_timesteps, progress_bar=True) # comment this in a normal env
-             self.save_model()
+             mean_actions, log_std_dev, kwargs = (
+                 self._model.actor.get_action_dist_params(tensor_observation)
+             )
+             probability_dist = self._model.actor.action_dist.proba_distribution(
+                 mean_actions=mean_actions, log_std=log_std_dev
+             )
+             actor_means = probability_dist.get_actions(True).cpu().detach().numpy()
+             log_std_dev = log_std_dev.cpu().detach().numpy()
+         elif self.algorithm == PPO:
+             self._model.policy.set_training_mode(False)
+             tensor_observation, _ = self._model.policy.obs_to_tensor(observation)
+             distribution = self._model.policy.get_distribution(tensor_observation)

-     def get_mean_and_std_dev(self, observation):
-         if self.algorithm == SAC:
-             tensor_observation, _ = self._model.actor.obs_to_tensor(observation)
+             actor_means = distribution.distribution.mean.cpu().detach().numpy()
+             log_std_dev = distribution.distribution.stddev.cpu().detach().numpy()
+             if isinstance(self._model.policy.action_space, gym.spaces.Box):
+                 actor_means = np.clip(
+                     actor_means,
+                     self._model.policy.action_space.low,
+                     self._model.policy.action_space.high,
+                 )
+             return actor_means, log_std_dev
+         else:
+             assert False
+         return actor_means, log_std_dev
 
-             mean_actions, log_std_dev, kwargs = self._model.actor.get_action_dist_params(tensor_observation)
-             probability_dist = self._model.actor.action_dist.proba_distribution(
-                 mean_actions=mean_actions,
-                 log_std=log_std_dev
-             )
-             actor_means = probability_dist.get_actions(True).cpu().detach().numpy()
-             log_std_dev = log_std_dev.cpu().detach().numpy()
-         elif self.algorithm == PPO:
-             self._model.policy.set_training_mode(False)
-             tensor_observation, _ = self._model.policy.obs_to_tensor(observation)
-             distribution = self._model.policy.get_distribution(tensor_observation)
+     # fits agents that generated observations in the form of: list of tuples, each tuple a single step\frame with size 2, comprised of obs and action.
+     # the function squashes the 2d array of obs and action in a 1d array, concatenating their values together for training.
+     def simplify_observation(self, observation):
+         return [
+             np.concatenate(
+                 (
+                     np.array(obs).reshape(obs.shape[-1]),
+                     np.array(action[0]).reshape(action[0].shape[-1]),
+                 )
+             )
+             for (obs, action) in observation
+         ]

-             actor_means = distribution.distribution.mean.cpu().detach().numpy()
-             log_std_dev = distribution.distribution.stddev.cpu().detach().numpy()
-             if isinstance(self._model.policy.action_space, gym.spaces.Box):
-                 actor_means = np.clip(
-                     actor_means,
-                     self._model.policy.action_space.low,
-                     self._model.policy.action_space.high
-                 )
-             return actor_means, log_std_dev
-         else:
-             assert False
-         return actor_means, log_std_dev
+     def add_random_optimalism(self, observations, action, constant_initial_action):
+         if len(observations) > 3:
+             for i in range(0, len(action[0])):
+                 action[0][i] += random.uniform(
+                     -0.01 * action[0][i], 0.01 * action[0][i]
+                 )
+         else: # just walk in a specific random direction to enable diverse plans
+             action = np.array(np.array([constant_initial_action]), None)

-     # fits agents that generated observations in the form of: list of tuples, each tuple a single step\frame with size 2, comprised of obs and action.
-     # the function squashes the 2d array of obs and action in a 1d array, concatenating their values together for training.
-     def simplify_observation(self, observation):
-         return [np.concatenate((np.array(obs).reshape(obs.shape[-1]),np.array(action[0]).reshape(action[0].shape[-1]))) for (obs,action) in observation]
+     def generate_partial_observation(
+         self,
+         action_selection_method,
+         percentage,
+         is_consecutive,
+         save_fig=False,
+         fig_path=None,
+         random_optimalism=True,
+     ):
+         steps = self.generate_observation(
+             action_selection_method,
+             save_fig=save_fig,
+             random_optimalism=random_optimalism,
+             fig_path=fig_path,
+         ) # steps are a full observation
+         return random_subset_with_order(
+             steps, (int)(percentage * len(steps)), is_consecutive
+         )

-     def generate_partial_observation(self, action_selection_method, percentage, is_consecutive, save_fig=False, fig_path=None, random_optimalism=True):
-         steps = self.generate_observation(action_selection_method, save_fig=save_fig, random_optimalism=random_optimalism, fig_path=fig_path) # steps are a full observation
-         return random_subset_with_order(steps, (int)(percentage * len(steps)), is_consecutive)
+     def generate_observation(
+         self,
+         action_selection_method: MethodType,
+         random_optimalism,
+         save_fig=False,
+         fig_path=None,
+         with_dict=False,
+         desired=None,
+     ) -> List[
+         Tuple[np.ndarray, np.ndarray]
+     ]: # TODO make sure to add a linter to alert when a method doesn't accept or return the type it should
+         if save_fig == False:
+             assert (
+                 fig_path == None
+             ), "You can't specify a vid path when you don't even save the figure."
+         else:
+             assert (
+                 fig_path != None
+             ), "You need to specify a vid path when you save the figure."
+         # The try-except is a bug fix for the env not being reset properly in panda. If someone wants to check why and provide a robust solution they're welcome.
+         obs = self.safe_env_reset()
+         self.env_prop.change_goal_to_specific_desired(obs, desired)
+         observations = []
+         is_successful_observation_made = False
+         num_of_insuccessful_attempts = 0
+         while not is_successful_observation_made:
+             is_successful_observation_made = True # start as true, if this isn't the case (crash/death/truncation instead of success)
+             if random_optimalism:
+                 constant_initial_action = self.env.action_space.sample()
+             while True:
+                 from gr_libs.metrics.metrics import stochastic_amplified_selection
 
-     def generate_observation(self, action_selection_method: MethodType, random_optimalism, save_fig=False, env_prop=None,
-                              fig_path=None, with_dict=False, desired=None) -> List[Tuple[np.ndarray, np.ndarray]]: # TODO make sure to add a linter to alert when a method doesn't accept or return the type it should
-         if save_fig == False:
-             assert fig_path == None, "You can't specify a vid path when you don't even save the figure."
-         else:
-             assert fig_path != None, "You need to specify a vid path when you save the figure."
-         # The try-except is a bug fix for the env not being reset properly in panda. If someone wants to check why and provide a robust solution they're welcome.
-         try:
-             obs = self.env.reset()
-             change_goal_to_specific_desired(obs, desired)
-         except Exception as e:
-             kwargs = {"id": self.problem_name, "render_mode": "rgb_array"}
-             self.env = create_vec_env(kwargs)
-             obs = self.env.reset()
-             change_goal_to_specific_desired(obs, desired)
-         observations = []
-         is_successful_observation_made = False
-         num_of_insuccessful_attempts = 0
-         while not is_successful_observation_made:
-             is_successful_observation_made = True # start as true, if this isn't the case (crash/death/truncation instead of success)
-             if random_optimalism:
-                 constant_initial_action = self.env.action_space.sample()
-             while True:
-                 from gr_libs.metrics.metrics import stochastic_amplified_selection
-                 deterministic = action_selection_method != stochastic_amplified_selection
-                 action, _states = self._model.predict(obs, deterministic=deterministic)
-                 if random_optimalism: # get the right direction and then start inserting noise to still get a relatively optimal plan
-                     if len(observations) > 3:
-                         for i in range(0, len(action[0])):
-                             action[0][i] += random.uniform(-0.01 * action[0][i], 0.01 * action[0][i])
-                     else: # just walk in a specific random direction to enable diverse plans
-                         action = np.array(np.array([constant_initial_action]), None)
-                 if with_dict: observations.append((obs, action))
-                 else: observations.append((obs['observation'], action))
-                 obs, reward, done, info = self.env.step(action)
-                 change_goal_to_specific_desired(obs, desired)
-                 if isinstance(done, np.ndarray): general_done = done[0]
-                 else: general_done = done
-                 if "success" in info[0].keys(): success_done = info[0]["success"]
-                 elif "is_success" in info[0].keys(): success_done = info[0]["is_success"]
-                 elif "step_task_completions" in info[0].keys(): success_done = info[0]["step_task_completions"]
-                 else: raise NotImplementedError("no other option for any of the environments.")
-                 success_done = self.change_done_by_specific_desired(obs, desired, success_done)
-                 if general_done == True and success_done == False:
-                     # it could be that the stochasticity inserted into the actions made the agent die/crash. we don't want this observation.
-                     num_of_insuccessful_attempts += 1
-                     # print(f"for agent for problem {self.problem_name}, its done {len(observations)} steps, and got to a situation where general_done != success_done, for the {num_of_insuccessful_attempts} time.")
-                     if num_of_insuccessful_attempts > 50:
-                         # print(f"got more then 10 insuccessful attempts. fuak!")
-                         assert general_done == success_done, f"failed on goal: {obs['desired']}" # we want to make sure the episode is done only when the agent has actually succeeded with the task.
-                     else:
-                         # try again by breaking inner loop. everything is set up to be like the beginning of the function.
-                         is_successful_observation_made = False
-                         try:
-                             obs = self.env.reset()
-                             change_goal_to_specific_desired(obs, desired)
-                         except Exception as e:
-                             kwargs = {"id": self.problem_name, "render_mode": "rgb_array"}
-                             self.env = create_vec_env(kwargs)
-                             obs = self.env.reset()
-                             change_goal_to_specific_desired(obs, desired)
-                         observations = [] # we want to re-accumulate the observations from scratch, have another try
-                         break
-                 elif general_done == False and success_done == False:
-                     continue
-                 elif general_done == True and success_done == True:
-                     if num_of_insuccessful_attempts > 0:
-                         pass # print(f"after {num_of_insuccessful_attempts}, finally I succeeded!")
-                     break
-                 elif general_done == False and success_done == True:
-                     assert HACK_HAPPENED == True # happens only if hack happened
-                     break
-         # self.env.close()
-         if save_fig:
-             num_tries = 0
-             while True:
-                 if num_tries >= 10:
-                     assert False, "agent keeps failing on recording an optimal obs."
-                 try:
-                     self.record_video(fig_path, desired)
-                     break
-                 except Exception as e:
-                     num_tries += 1
-             #print(f"sequence to {self.problem_name} is:\n\t{steps}\ngenerating image at {img_path}.")
-             print(f"generated sequence video at {fig_path}.")
-         self.env.close()
-         return observations
+                 deterministic = (
+                     action_selection_method != stochastic_amplified_selection
+                 )
+                 action, _states = self._model.predict(obs, deterministic=deterministic)
+                 if (
+                     random_optimalism
+                 ): # get the right direction and then start inserting noise to still get a relatively optimal plan
+                     self.add_random_optimalism(obs, action, constant_initial_action)
+                 if with_dict:
+                     observations.append((obs, action))
+                 else:
+                     observations.append((obs["observation"], action))
+                 obs, reward, done, info = self.env.step(action)
+                 self.env_prop.change_goal_to_specific_desired(obs, desired)
+                 general_done = self.env_prop.is_done(done)
+                 success_done = self.env_prop.is_success(info)
+                 success_done = self.env_prop.change_done_by_specific_desired(
+                     obs, desired, success_done
+                 )
+                 if general_done == True and success_done == False:
+                     # it could be that the stochasticity inserted into the actions made the agent die/crash. we don't want this observation: it's an insuccessful attempt.
+                     num_of_insuccessful_attempts += 1
+                     # print(f"for agent for problem {self.problem_name}, its done {len(observations)} steps, and got to a situation where general_done != success_done, for the {num_of_insuccessful_attempts} time.")
+                     if num_of_insuccessful_attempts > 50:
+                         # print(f"got more then 10 insuccessful attempts!")
+                         assert (
+                             general_done == success_done
+                         ), f"failed on goal: {obs['desired']}" # we want to make sure the episode is done only when the agent has actually succeeded with the task.
+                     else:
+                         # try again by breaking inner loop. everything is set up to be like the beginning of the function.
+                         is_successful_observation_made = False
+                         obs = self.safe_env_reset()
+                         self.env_prop.change_goal_to_specific_desired(obs, desired)
+                         observations = (
+                             []
+                         ) # we want to re-accumulate the observations from scratch, have another try
+                         break
+                 elif general_done == False and success_done == False:
+                     continue
+                 elif general_done == True and success_done == True:
+                     if num_of_insuccessful_attempts > 0:
+                         pass # print(f"after {num_of_insuccessful_attempts}, finally I succeeded!")
+                     break
+                 elif general_done == False and success_done == True:
+                     # The environment will say 'done' is false (general_done) while the observation will be close to the goal (success_done)
+                     # only in case we incorporated a 'desired' when generating the observation.
+                     assert (
+                         desired is not None
+                     ), f"general_done is false but success_done is true, and desired is None. This should never happen, since the \
+                         environment will say 'done' is false (general_done) while the observation will be close to the goal (success_done) \
+                         only in case we incorporated a 'desired' when generating the observation."
+                     break
 
-     # def reset_with_goal_idx(self, goal_idx):
-     # self.env.set_options({"goal_idx": goal_idx})
-     # return self.env.reset()
-
- class GCDeepRLAgent(DeepRLAgent):
-     def generate_partial_observation(self, action_selection_method, percentage, is_consecutive, goal_directed_problem=None, goal_directed_goal=None, save_fig=False, fig_path=None, random_optimalism=True):
-         steps = self.generate_observation(action_selection_method, save_fig=save_fig, fig_path=fig_path, random_optimalism=random_optimalism, goal_directed_problem=goal_directed_problem, goal_directed_goal=goal_directed_goal) # steps are a full observation
-         return random_subset_with_order(steps, (int)(percentage * len(steps)), is_consecutive)
+         if save_fig:
+             self.try_recording_video(fig_path, desired)

-     def generate_observation(self, action_selection_method: MethodType, random_optimalism, env_prop=None, goal_directed_problem=None, goal_directed_goal=None,
-                              save_fig = False, fig_path=None, with_dict=False):
-         # print(f"hyperparams:{hyperparams}")
-         if goal_directed_problem:
-             if save_fig:
-                 assert fig_path != None, "You need to specify a vid path when you save the figure."
-             else:
-                 assert fig_path == None
-             assert goal_directed_goal == None, "can't give goal directed goal and also goal directed problem for the sake of sequence generation by a general agent"
-             kwargs = {"id": goal_directed_problem, "render_mode": "rgb_array"}
-             self.env = create_vec_env(kwargs)
-             orig_env = self.env
-             observations = super().generate_observation(action_selection_method=action_selection_method, random_optimalism=random_optimalism,
-                                                          save_fig=save_fig, fig_path=fig_path, with_dict=with_dict)
-             self.env = orig_env
-         else: #goal_directed_goal!=None
-             if save_fig:
-                 assert fig_path != None, "You need to specify a vid path when you save the figure."
-             else:
-                 assert fig_path == None
-             assert goal_directed_problem == None, "can't give goal directed goal and also goal directed problem for the sake of sequence generation by a general agent"
-             observations = super().generate_observation(action_selection_method=action_selection_method, random_optimalism=random_optimalism,
-                                                          save_fig=save_fig, fig_path=fig_path, with_dict=with_dict, desired=goal_directed_goal) # TODO tutorial on how to use the deepRLAgent for sequence generation and examination and plotting of the sequence
-         return observations
-
+         self.env.close()
+         return observations

- if __name__ == "__main__":
-     package_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-     print("this is package root:" + package_root)
-     if package_root not in sys.path:
-         sys.path.insert(0, package_root)

-     from gr_libs.ml.utils.storage import get_agent_model_dir, set_global_storage_configs
+ class GCDeepRLAgent(DeepRLAgent):
+     def generate_partial_observation(
+         self,
+         action_selection_method,
+         percentage,
+         is_consecutive,
+         goal_directed_problem=None,
+         goal_directed_goal=None,
+         save_fig=False,
+         fig_path=None,
+         random_optimalism=True,
+     ):
+         steps = self.generate_observation(
+             action_selection_method,
+             save_fig=save_fig,
+             fig_path=fig_path,
+             random_optimalism=random_optimalism,
+             goal_directed_problem=goal_directed_problem,
+             goal_directed_goal=goal_directed_goal,
+         ) # steps are a full observation
+         return random_subset_with_order(
+             steps, (int)(percentage * len(steps)), is_consecutive
+         )

-     set_global_storage_configs("graml", "fragmented_partial_obs", "inference_same_length", "learn_diff_length")
-     agent = DeepRLAgent(domain_name="point_maze", problem_name="PointMaze-FourRoomsEnvDense-11x11-Goal-9x1", algorithm=SAC, num_timesteps=200000)
-     agent.learn()
-     agent.record_video("")
+     # TODO move the goal_directed_goal and/or goal_directed_problem mechanism to be a property of the env_property, so deep_rl_learner doesn't depend on it and holds this logic so heavily.
+     # Generate observation with goal_directed_goal or goal_directed_problem is only possible for a GC agent, otherwise - the agent can't act optimally to that new goal.
+     def generate_observation(
+         self,
+         action_selection_method: MethodType,
+         random_optimalism,
+         goal_directed_problem=None,
+         goal_directed_goal=None,
+         save_fig=False,
+         fig_path=None,
+         with_dict=False,
+     ):
+         if save_fig:
+             assert (
+                 fig_path != None
+             ), "You need to specify a vid path when you save the figure."
+         else:
+             assert fig_path == None
+         # goal_directed_problem employs the GC agent in a new env with a static, predefined goal, and has him generate an observation sequence in it.
+         if goal_directed_problem:
+             assert (
+                 goal_directed_goal == None
+             ), "can't give goal directed goal and also goal directed problem for the sake of sequence generation by a general agent"
+             kwargs = {"id": goal_directed_problem, "render_mode": "rgb_array"}
+             self.env = self.env_prop.create_vec_env(kwargs)
+             orig_env = self.env
+             observations = super().generate_observation(
+                 action_selection_method=action_selection_method,
+                 random_optimalism=random_optimalism,
+                 save_fig=save_fig,
+                 fig_path=fig_path,
+                 with_dict=with_dict,
+             )
+             self.env = orig_env
+         # goal_directed_goal employs the agent in the same env on which it trained - with goals that change with every episode sampled from the goal space.
+         # but we manually change the 'desired' part of the observation to be the goal_directed_goal and edit the id_success and is_done accordingly.
+         else:
+             assert (
+                 goal_directed_problem == None
+             ), "can't give goal directed goal and also goal directed problem for the sake of sequence generation by a general agent"
+             observations = super().generate_observation(
+                 action_selection_method=action_selection_method,
+                 random_optimalism=random_optimalism,
+                 save_fig=save_fig,
+                 fig_path=fig_path,
+                 with_dict=with_dict,
+                 desired=goal_directed_goal,
+             ) # TODO tutorial on how to use the deepRLAgent for sequence generation and examination and plotting of the sequence
+         return observations
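
A hedged sketch of what this hunk means for call sites (not taken from the package's own documentation): the removed __main__ block in 0.1.6.post1 built the agent as DeepRLAgent(domain_name=..., problem_name=..., algorithm=SAC, num_timesteps=...), while in 0.1.8 the constructor additionally requires env_prop: EnvProperty and asserts algorithm is one of SAC, PPO, TD3. The sketch below reuses the point-maze arguments from the removed block; how to construct a concrete EnvProperty lives in gr_libs/environment/environment.py and is not shown in this diff, so the env_prop assignment is a labeled placeholder.

# Sketch only: `env_prop = ...` is a placeholder, not the library's real API for
# building an EnvProperty (see gr_libs/environment/environment.py, not in this diff).
from stable_baselines3 import SAC

from gr_libs.environment.environment import EnvProperty
from gr_libs.ml.neural.deep_rl_learner import DeepRLAgent

env_prop: EnvProperty = ...  # obtain a point_maze EnvProperty here (placeholder)

agent = DeepRLAgent(
    domain_name="point_maze",
    problem_name="PointMaze-FourRoomsEnvDense-11x11-Goal-9x1",
    num_timesteps=200000,
    env_prop=env_prop,  # new required argument in 0.1.8
    algorithm=SAC,      # 0.1.8 asserts algorithm is one of SAC, PPO, TD3
)
agent.learn()  # loads saved_model.zip/.pth from the model directory if present, otherwise trains and saves

GCDeepRLAgent.generate_observation additionally accepts either goal_directed_problem (a concrete environment id) or goal_directed_goal (a desired goal injected into the observations), but not both, as enforced by the asserts in the hunk above.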