gr-libs 0.1.7.post0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. gr_libs/__init__.py +4 -1
  2. gr_libs/_evaluation/__init__.py +1 -0
  3. gr_libs/_evaluation/_analyze_results_cross_alg_cross_domain.py +260 -0
  4. gr_libs/_evaluation/_generate_experiments_results.py +141 -0
  5. gr_libs/_evaluation/_generate_task_specific_statistics_plots.py +497 -0
  6. gr_libs/_evaluation/_get_plans_images.py +61 -0
  7. gr_libs/_evaluation/_increasing_and_decreasing_.py +106 -0
  8. gr_libs/_version.py +2 -2
  9. gr_libs/all_experiments.py +294 -0
  10. gr_libs/environment/__init__.py +30 -9
  11. gr_libs/environment/_utils/utils.py +27 -0
  12. gr_libs/environment/environment.py +417 -54
  13. gr_libs/metrics/__init__.py +7 -0
  14. gr_libs/metrics/metrics.py +231 -54
  15. gr_libs/ml/__init__.py +2 -5
  16. gr_libs/ml/agent.py +21 -6
  17. gr_libs/ml/base/__init__.py +3 -1
  18. gr_libs/ml/base/rl_agent.py +81 -13
  19. gr_libs/ml/consts.py +1 -1
  20. gr_libs/ml/neural/__init__.py +1 -3
  21. gr_libs/ml/neural/deep_rl_learner.py +619 -378
  22. gr_libs/ml/neural/utils/__init__.py +1 -2
  23. gr_libs/ml/neural/utils/dictlist.py +3 -3
  24. gr_libs/ml/planner/mcts/{utils → _utils}/__init__.py +1 -1
  25. gr_libs/ml/planner/mcts/{utils → _utils}/node.py +11 -7
  26. gr_libs/ml/planner/mcts/{utils → _utils}/tree.py +15 -11
  27. gr_libs/ml/planner/mcts/mcts_model.py +571 -312
  28. gr_libs/ml/sequential/__init__.py +0 -1
  29. gr_libs/ml/sequential/_lstm_model.py +270 -0
  30. gr_libs/ml/tabular/__init__.py +1 -3
  31. gr_libs/ml/tabular/state.py +7 -7
  32. gr_libs/ml/tabular/tabular_q_learner.py +150 -82
  33. gr_libs/ml/tabular/tabular_rl_agent.py +42 -28
  34. gr_libs/ml/utils/__init__.py +2 -3
  35. gr_libs/ml/utils/format.py +28 -97
  36. gr_libs/ml/utils/math.py +5 -3
  37. gr_libs/ml/utils/other.py +3 -3
  38. gr_libs/ml/utils/storage.py +88 -81
  39. gr_libs/odgr_executor.py +268 -0
  40. gr_libs/problems/consts.py +1549 -1227
  41. gr_libs/recognizer/_utils/__init__.py +0 -0
  42. gr_libs/recognizer/_utils/format.py +18 -0
  43. gr_libs/recognizer/gr_as_rl/gr_as_rl_recognizer.py +233 -88
  44. gr_libs/recognizer/graml/_gr_dataset.py +233 -0
  45. gr_libs/recognizer/graml/graml_recognizer.py +586 -252
  46. gr_libs/recognizer/recognizer.py +90 -30
  47. gr_libs/tutorials/draco_panda_tutorial.py +58 -0
  48. gr_libs/tutorials/draco_parking_tutorial.py +56 -0
  49. gr_libs/tutorials/gcdraco_panda_tutorial.py +62 -0
  50. gr_libs/tutorials/gcdraco_parking_tutorial.py +57 -0
  51. gr_libs/tutorials/graml_minigrid_tutorial.py +64 -0
  52. gr_libs/tutorials/graml_panda_tutorial.py +57 -0
  53. gr_libs/tutorials/graml_parking_tutorial.py +52 -0
  54. gr_libs/tutorials/graml_point_maze_tutorial.py +60 -0
  55. gr_libs/tutorials/graql_minigrid_tutorial.py +50 -0
  56. {gr_libs-0.1.7.post0.dist-info → gr_libs-0.2.2.dist-info}/METADATA +84 -29
  57. gr_libs-0.2.2.dist-info/RECORD +71 -0
  58. {gr_libs-0.1.7.post0.dist-info → gr_libs-0.2.2.dist-info}/WHEEL +1 -1
  59. gr_libs-0.2.2.dist-info/top_level.txt +2 -0
  60. tests/test_draco.py +14 -0
  61. tests/test_gcdraco.py +10 -0
  62. tests/test_graml.py +12 -8
  63. tests/test_graql.py +3 -2
  64. evaluation/analyze_results_cross_alg_cross_domain.py +0 -277
  65. evaluation/create_minigrid_map_image.py +0 -34
  66. evaluation/file_system.py +0 -42
  67. evaluation/generate_experiments_results.py +0 -92
  68. evaluation/generate_experiments_results_new_ver1.py +0 -254
  69. evaluation/generate_experiments_results_new_ver2.py +0 -331
  70. evaluation/generate_task_specific_statistics_plots.py +0 -272
  71. evaluation/get_plans_images.py +0 -47
  72. evaluation/increasing_and_decreasing_.py +0 -63
  73. gr_libs/environment/utils/utils.py +0 -17
  74. gr_libs/ml/neural/utils/penv.py +0 -57
  75. gr_libs/ml/sequential/lstm_model.py +0 -192
  76. gr_libs/recognizer/graml/gr_dataset.py +0 -134
  77. gr_libs/recognizer/utils/__init__.py +0 -1
  78. gr_libs/recognizer/utils/format.py +0 -13
  79. gr_libs-0.1.7.post0.dist-info/RECORD +0 -67
  80. gr_libs-0.1.7.post0.dist-info/top_level.txt +0 -4
  81. tutorials/graml_minigrid_tutorial.py +0 -34
  82. tutorials/graml_panda_tutorial.py +0 -41
  83. tutorials/graml_parking_tutorial.py +0 -39
  84. tutorials/graml_point_maze_tutorial.py +0 -39
  85. tutorials/graql_minigrid_tutorial.py +0 -34
  86. /gr_libs/environment/{utils → _utils}/__init__.py +0 -0
@@ -1,393 +1,634 @@
1
- from collections import OrderedDict
2
1
  import gc
2
+ from collections import OrderedDict
3
3
  from types import MethodType
4
- from typing import List, Tuple
5
- import gymnasium as gym
6
- import numpy as np
4
+
7
5
  import cv2
6
+ import numpy as np
8
7
 
9
- HACK_HAPPENED = False
8
+ from gr_libs.environment.environment import EnvProperty
10
9
 
11
10
  if __name__ != "__main__":
12
- from gr_libs.ml.utils.storage import get_agent_model_dir
13
- from gr_libs.ml.utils.format import random_subset_with_order
14
- from stable_baselines3 import SAC, PPO
15
- from stable_baselines3.common.vec_env import DummyVecEnv
16
- from gr_libs.ml.utils import device
11
+ from gr_libs.ml.utils.storage import get_agent_model_dir
12
+ from gr_libs.ml.utils.format import random_subset_with_order
13
+
14
+ import os
17
15
 
18
16
  # built-in python modules
19
17
  import random
20
- import os
21
- import sys
22
-
23
- def create_vec_env(kwargs):
24
- # create the model, it will not be a pretrained one anyway
25
- # env = gym.make(**kwargs)
26
- env = gym.make(**kwargs)
27
- return DummyVecEnv([lambda: env])
28
-
29
- def change_goal_to_specific_desired(obs, desired):
30
- if desired is not None:
31
- obs['desired_goal'] = desired
32
- # try:
33
- # if desired!=None: obs['desired_goal'] = desired
34
- # except Exception as e:
35
- # try:
36
- # if all(desired!=None): obs['desired_goal'] = desired
37
- # except Exception as e:
38
- # if all([desiredy!=None for desiredish in desired for desiredy in desiredish]): obs['desired_goal'] = desired
39
18
 
19
+ import gymnasium as gym
20
+ from stable_baselines3 import PPO, SAC, TD3
21
+ from stable_baselines3.common.base_class import BaseAlgorithm
22
+
23
+ from gr_libs.ml.utils import device
40
24
 
25
+ # TODO do we need this?
41
26
  NETWORK_SETUP = {
42
- SAC: OrderedDict([('batch_size', 512), ('buffer_size', 100000), ('ent_coef', 'auto'), ('gamma', 0.95), ('learning_rate', 0.001), ('learning_starts', 5000), ('n_timesteps', 50000.0), ('normalize', "{'norm_obs': False, 'norm_reward': False}"), ('policy', 'MultiInputPolicy'), ('policy_kwargs', 'dict(net_arch=[64, 64])'), ('replay_buffer_class', 'HerReplayBuffer'), ('replay_buffer_kwargs', "dict( goal_selection_strategy='future', n_sampled_goal=4 )"), ('normalize_kwargs', {'norm_obs': False, 'norm_reward': False})]),
43
- #"tqc": OrderedDict([('batch_size', 256), ('buffer_size', 1000000), ('ent_coef', 'auto'), ('env_wrapper', ['sb3_contrib.common.wrappers.TimeFeatureWrapper']), ('gamma', 0.95), ('learning_rate', 0.001), ('learning_starts', 1000), ('n_timesteps', 25000.0), ('normalize', False), ('policy', 'MultiInputPolicy'), ('policy_kwargs', 'dict(net_arch=[64, 64])'), ('replay_buffer_class', 'HerReplayBuffer'), ('replay_buffer_kwargs', "dict( goal_selection_strategy='future', n_sampled_goal=4 )"), ('normalize_kwargs',{'norm_obs':False,'norm_reward':False})]),
44
- PPO: OrderedDict([('batch_size', 256), ('ent_coef', 0.01), ('gae_lambda', 0.9), ('gamma', 0.99), ('learning_rate', 'lin_0.0001'), ('max_grad_norm', 0.5), ('n_envs', 8), ('n_epochs', 20), ('n_steps', 8), ('n_timesteps', 25000.0), ('normalize_advantage', False), ('policy', 'MultiInputPolicy'), ('policy_kwargs', 'dict(log_std_init=-2, ortho_init=False)'), ('use_sde', True), ('vf_coef', 0.4), ('normalize', False), ('normalize_kwargs', {'norm_obs': False, 'norm_reward': False})]),
27
+ SAC: OrderedDict(
28
+ [
29
+ ("batch_size", 512),
30
+ ("buffer_size", 100000),
31
+ ("ent_coef", "auto"),
32
+ ("gamma", 0.95),
33
+ ("learning_rate", 0.001),
34
+ ("learning_starts", 5000),
35
+ ("n_timesteps", 50000.0),
36
+ ("normalize", "{'norm_obs': False, 'norm_reward': False}"),
37
+ ("policy", "MultiInputPolicy"),
38
+ ("policy_kwargs", "dict(net_arch=[64, 64])"),
39
+ ("replay_buffer_class", "HerReplayBuffer"),
40
+ (
41
+ "replay_buffer_kwargs",
42
+ "dict( goal_selection_strategy='future', n_sampled_goal=4 )",
43
+ ),
44
+ ("normalize_kwargs", {"norm_obs": False, "norm_reward": False}),
45
+ ]
46
+ ),
47
+ PPO: OrderedDict(
48
+ [
49
+ ("batch_size", 256),
50
+ ("ent_coef", 0.01),
51
+ ("gae_lambda", 0.9),
52
+ ("gamma", 0.99),
53
+ ("learning_rate", "lin_0.0001"),
54
+ ("max_grad_norm", 0.5),
55
+ ("n_envs", 8),
56
+ ("n_epochs", 20),
57
+ ("n_steps", 8),
58
+ ("n_timesteps", 25000.0),
59
+ ("normalize_advantage", False),
60
+ ("policy", "MultiInputPolicy"),
61
+ ("policy_kwargs", "dict(log_std_init=-2, ortho_init=False)"),
62
+ ("use_sde", True),
63
+ ("vf_coef", 0.4),
64
+ ("normalize", False),
65
+ ("normalize_kwargs", {"norm_obs": False, "norm_reward": False}),
66
+ ]
67
+ ),
45
68
  }
46
69
 
47
- class DeepRLAgent():
48
- def __init__(self, domain_name: str, problem_name: str, num_timesteps:float, algorithm=SAC, reward_threshold: float=450,
49
- exploration_rate=None):
50
- # Need to change reward threshold to change according to which task the agent is training on, becuase it changes from task to task.
51
- kwargs = {"id":problem_name, "render_mode":"rgb_array"}
52
-
53
- self.domain_name = domain_name
54
- self.problem_name = problem_name
55
-
56
- self._model_directory = get_agent_model_dir(domain_name=self.domain_name, model_name=problem_name, class_name=algorithm.__name__)
57
- if os.path.exists(os.path.join(self._model_directory, "saved_model.zip")):
58
- self.pre_trained_model = True
59
- self._model_file_path = os.path.join(self._model_directory, "saved_model.zip")
60
- else:
61
- self.pre_trained_model = False
62
- self.env = create_vec_env(kwargs)
63
- self._actions_space = self.env.action_space
64
- if exploration_rate != None: self._model = algorithm("MultiInputPolicy", self.env, ent_coef=exploration_rate, verbose=1)
65
- else: self._model = algorithm("MultiInputPolicy", self.env, verbose=1)
66
- self._model_file_path = os.path.join(self._model_directory, "saved_model.pth")
67
- self.algorithm = algorithm
68
- self.reward_threshold = reward_threshold
69
- self.num_timesteps = num_timesteps
70
-
71
- def save_model(self):
72
- self._model.save(self._model_file_path)
73
-
74
- def record_video(self, video_path, desired=None):
75
- global HACK_HAPPENED
76
- """Record a video of the agent's performance."""
77
- fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
78
- fps = 30.0
79
- # if is_gc:
80
- # assert goal_idx != None
81
- # self.reset_with_goal_idx(goal_idx)
82
- # else:
83
- # assert goal_idx == None
84
- self.env.reset()
85
- frame_size = (self.env.render(mode='rgb_array').shape[1], self.env.render(mode='rgb_array').shape[0])
86
- video_path = os.path.join(video_path, "plan_video.mp4")
87
- video_writer = cv2.VideoWriter(video_path, fourcc, fps, frame_size)
88
- general_done, success_done = False, False
89
- gc.collect()
90
- obs = self.env.reset()
91
- change_goal_to_specific_desired(obs, desired)
92
- counter = 0
93
- while not (general_done or success_done):
94
- counter += 1
95
- action, _states = self._model.predict(obs, deterministic=False)
96
- obs, rewards, general_done, info = self.env.step(action)
97
- if isinstance(general_done, np.ndarray): general_done = general_done[0]
98
- change_goal_to_specific_desired(obs, desired)
99
- if "success" in info[0].keys(): success_done = info[0]["success"] # make sure the agent actually reached the goal within the max time
100
- elif "is_success" in info[0].keys(): success_done = info[0]["is_success"] # make sure the agent actually reached the goal within the max time
101
- elif "step_task_completions" in info[0].keys(): success_done = (len(info[0]["step_task_completions"]) == 1) # bug of dummyVecEnv, it removes the episode_task_completions from the info dict.
102
- else: raise NotImplementedError("no other option for any of the environments.")
103
- frame = self.env.render()
104
- success_done = self.change_done_by_specific_desired(obs, desired, success_done)
105
- video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
106
- if general_done == False != success_done == True:
107
- assert HACK_HAPPENED
108
- elif general_done == True != success_done == False:
109
- raise Exception("general_done is true but success_done is false")
110
- self.env.close()
111
- video_writer.release()
112
-
113
- #def set_success_done(self, success_done, desired, )
114
-
115
- def change_done_by_specific_desired(self, obs, desired, old_success_done):
116
- global HACK_HAPPENED
117
- try:
118
- if desired!=None:
119
- HACK_HAPPENED = True
120
- if 'Panda' in self.problem_name:
121
- assert obs['achieved_goal'].shape == desired.shape
122
- d = np.linalg.norm(obs['achieved_goal'] - desired, axis=-1)
123
- # print(f"achieved_goal:{achieved_goal}, desired_goal:{desired_goal}, distance:{d}, is finished:{d < self.distance_threshold}")
124
- return (d < 0.04)[0]
125
- elif 'Parking' in self.problem_name: # shuoldn't be used for now
126
- # TODO
127
- return self.env.task.is_success()
128
- else:
129
- return old_success_done
130
- except Exception as e:
131
- try:
132
- if all(desired!=None):
133
- HACK_HAPPENED = True
134
- if 'Panda' in self.problem_name:
135
- assert obs['achieved_goal'].shape == desired.shape
136
- d = np.linalg.norm(obs['achieved_goal'] - desired, axis=-1)
137
- # print(f"achieved_goal:{achieved_goal}, desired_goal:{desired_goal}, distance:{d}, is finished:{d < self.distance_threshold}")
138
- return (d < 0.04)[0]
139
- elif 'Parking' in self.problem_name:
140
- # TODO add all of this to the environment property. recognizer shouldn't know anything about it.
141
- return self.env.task.is_success()
142
- else:
143
- return old_success_done
144
- except Exception as e:
145
- if all([desiredy!=None for desiredish in desired for desiredy in desiredish]):
146
- HACK_HAPPENED = True
147
- if 'Panda' in self.problem_name:
148
- assert obs['achieved_goal'].shape == desired.shape
149
- d = np.linalg.norm(obs['achieved_goal'] - desired, axis=-1)
150
- # print(f"achieved_goal:{achieved_goal}, desired_goal:{desired_goal}, distance:{d}, is finished:{d < self.distance_threshold}")
151
- return (d < 0.04)[0]
152
- elif 'Parking' in self.problem_name:
153
- # TODO
154
- return self.env.task.is_success()
155
- else:
156
- return old_success_done
157
-
158
- def load_model(self):
159
- self._model = self.algorithm.load(self._model_file_path, env=self.env, device=device)
160
-
161
- def learn(self):
162
- if os.path.exists(self._model_file_path):
163
- print(f"Loading pre-existing model in {self._model_file_path}")
164
- if self.pre_trained_model:
165
- def test(env):
166
- obs = env.reset()
167
- lstm_states = None
168
- episode_start = np.ones((1,), dtype=bool)
169
- deterministic = True
170
- episode_reward = 0.0
171
- ep_len = 0
172
- generator = range(5000)
173
- for i in generator:
174
- # print(f"iteration {i}:{obs=}")
175
- action, lstm_states = self._model.predict(
176
- obs, # type: ignore[arg-type]
177
- state=lstm_states,
178
- episode_start=episode_start,
179
- deterministic=deterministic,
180
- )
181
- obs, reward, done, infos = env.step(action)
182
-
183
- assert len(reward) == 1, f"length of rewards list is not 1, rewards:{reward}"
184
- if "success" in infos[0].keys(): is_success = infos[0]["success"] # make sure the agent actually reached the goal within the max time
185
- elif "is_success" in infos[0].keys(): is_success = infos[0]["is_success"] # make sure the agent actually reached the goal within the max time
186
- elif "step_task_completions" in infos[0].keys(): is_success = (len(infos[0]["step_task_completions"]) == 1) # bug of dummyVecEnv, it removes the episode_task_completions from the info dict.
187
- else: raise NotImplementedError("no other option for any of the environments.")
188
- # print(f"(action,is_done,info):({action},{done},{infos})")
189
- if is_success:
190
- #print(f"breaking due to GG, took {i} steps")
191
- break
192
- episode_start = done
193
-
194
- episode_reward += reward[0]
195
- ep_len += 1
196
- env.close()
197
- custom_objects = {
198
- "learning_rate": 0.0,
199
- "lr_schedule": lambda _: 0.0,
200
- "clip_range": lambda _: 0.0,
201
- }
202
- kwargs = {"id": self.problem_name, "render_mode": "rgb_array"}
203
- self.env = create_vec_env(kwargs)
204
- self._actions_space = self.env.action_space
205
- kwargs = {'seed': 0, 'buffer_size': 1}
206
-
207
- self._model = self.algorithm.load(self._model_file_path, env=self.env, custom_objects=custom_objects, device=device, **kwargs)
208
- test(self.env)
209
- else:
210
- self.load_model()
211
- else:
212
- # Stop training when the model reaches the reward threshold
213
- # callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=self.reward_threshold, verbose=1)
214
- # eval_callback = EvalCallback(self.env, best_model_save_path="./logs/",
215
- # log_path="./logs/", eval_freq=500, callback_on_new_best=callback_on_best, verbose=1, render=True)
216
- # self._model.learn(total_timesteps=self.num_timesteps, progress_bar=True, callback=eval_callback)
217
- print(f"No existing model in {self._model_file_path}, starting learning")
218
- self._model.learn(total_timesteps=self.num_timesteps, progress_bar=True) # comment this in a normal env
219
- self.save_model()
220
-
221
- def get_mean_and_std_dev(self, observation):
222
- if self.algorithm == SAC:
223
- tensor_observation, _ = self._model.actor.obs_to_tensor(observation)
224
-
225
- mean_actions, log_std_dev, kwargs = self._model.actor.get_action_dist_params(tensor_observation)
226
- probability_dist = self._model.actor.action_dist.proba_distribution(
227
- mean_actions=mean_actions,
228
- log_std=log_std_dev
229
- )
230
- actor_means = probability_dist.get_actions(True).cpu().detach().numpy()
231
- log_std_dev = log_std_dev.cpu().detach().numpy()
232
- elif self.algorithm == PPO:
233
- self._model.policy.set_training_mode(False)
234
- tensor_observation, _ = self._model.policy.obs_to_tensor(observation)
235
- distribution = self._model.policy.get_distribution(tensor_observation)
236
-
237
- actor_means = distribution.distribution.mean.cpu().detach().numpy()
238
- log_std_dev = distribution.distribution.stddev.cpu().detach().numpy()
239
- if isinstance(self._model.policy.action_space, gym.spaces.Box):
240
- actor_means = np.clip(
241
- actor_means,
242
- self._model.policy.action_space.low,
243
- self._model.policy.action_space.high
244
- )
245
- return actor_means, log_std_dev
246
- else:
247
- assert False
248
- return actor_means, log_std_dev
249
-
250
- # fits agents that generated observations in the form of: list of tuples, each tuple a single step\frame with size 2, comprised of obs and action.
251
- # the function squashes the 2d array of obs and action in a 1d array, concatenating their values together for training.
252
- def simplify_observation(self, observation):
253
- return [np.concatenate((np.array(obs).reshape(obs.shape[-1]),np.array(action[0]).reshape(action[0].shape[-1]))) for (obs,action) in observation]
254
-
255
- def generate_partial_observation(self, action_selection_method, percentage, is_consecutive, save_fig=False, fig_path=None, random_optimalism=True):
256
- steps = self.generate_observation(action_selection_method, save_fig=save_fig, random_optimalism=random_optimalism, fig_path=fig_path) # steps are a full observation
257
- return random_subset_with_order(steps, (int)(percentage * len(steps)), is_consecutive)
258
-
259
- def generate_observation(self, action_selection_method: MethodType, random_optimalism, save_fig=False, env_prop=None,
260
- fig_path=None, with_dict=False, desired=None) -> List[Tuple[np.ndarray, np.ndarray]]: # TODO make sure to add a linter to alert when a method doesn't accept or return the type it should
261
- if save_fig == False:
262
- assert fig_path == None, "You can't specify a vid path when you don't even save the figure."
263
- else:
264
- assert fig_path != None, "You need to specify a vid path when you save the figure."
265
- # The try-except is a bug fix for the env not being reset properly in panda. If someone wants to check why and provide a robust solution they're welcome.
266
- try:
267
- obs = self.env.reset()
268
- change_goal_to_specific_desired(obs, desired)
269
- except Exception as e:
270
- kwargs = {"id": self.problem_name, "render_mode": "rgb_array"}
271
- self.env = create_vec_env(kwargs)
272
- obs = self.env.reset()
273
- change_goal_to_specific_desired(obs, desired)
274
- observations = []
275
- is_successful_observation_made = False
276
- num_of_insuccessful_attempts = 0
277
- while not is_successful_observation_made:
278
- is_successful_observation_made = True # start as true, if this isn't the case (crash/death/truncation instead of success)
279
- if random_optimalism:
280
- constant_initial_action = self.env.action_space.sample()
281
- while True:
282
- from gr_libs.metrics.metrics import stochastic_amplified_selection
283
- deterministic = action_selection_method != stochastic_amplified_selection
284
- action, _states = self._model.predict(obs, deterministic=deterministic)
285
- if random_optimalism: # get the right direction and then start inserting noise to still get a relatively optimal plan
286
- if len(observations) > 3:
287
- for i in range(0, len(action[0])):
288
- action[0][i] += random.uniform(-0.01 * action[0][i], 0.01 * action[0][i])
289
- else: # just walk in a specific random direction to enable diverse plans
290
- action = np.array(np.array([constant_initial_action]), None)
291
- if with_dict: observations.append((obs, action))
292
- else: observations.append((obs['observation'], action))
293
- obs, reward, done, info = self.env.step(action)
294
- change_goal_to_specific_desired(obs, desired)
295
- if isinstance(done, np.ndarray): general_done = done[0]
296
- else: general_done = done
297
- if "success" in info[0].keys(): success_done = info[0]["success"]
298
- elif "is_success" in info[0].keys(): success_done = info[0]["is_success"]
299
- elif "step_task_completions" in info[0].keys(): success_done = info[0]["step_task_completions"]
300
- else: raise NotImplementedError("no other option for any of the environments.")
301
- success_done = self.change_done_by_specific_desired(obs, desired, success_done)
302
- if general_done == True and success_done == False:
303
- # it could be that the stochasticity inserted into the actions made the agent die/crash. we don't want this observation.
304
- num_of_insuccessful_attempts += 1
305
- # print(f"for agent for problem {self.problem_name}, its done {len(observations)} steps, and got to a situation where general_done != success_done, for the {num_of_insuccessful_attempts} time.")
306
- if num_of_insuccessful_attempts > 50:
307
- # print(f"got more then 10 insuccessful attempts. fuak!")
308
- assert general_done == success_done, f"failed on goal: {obs['desired']}" # we want to make sure the episode is done only when the agent has actually succeeded with the task.
309
- else:
310
- # try again by breaking inner loop. everything is set up to be like the beginning of the function.
311
- is_successful_observation_made = False
312
- try:
313
- obs = self.env.reset()
314
- change_goal_to_specific_desired(obs, desired)
315
- except Exception as e:
316
- kwargs = {"id": self.problem_name, "render_mode": "rgb_array"}
317
- self.env = create_vec_env(kwargs)
318
- obs = self.env.reset()
319
- change_goal_to_specific_desired(obs, desired)
320
- observations = [] # we want to re-accumulate the observations from scratch, have another try
321
- break
322
- elif general_done == False and success_done == False:
323
- continue
324
- elif general_done == True and success_done == True:
325
- if num_of_insuccessful_attempts > 0:
326
- pass # print(f"after {num_of_insuccessful_attempts}, finally I succeeded!")
327
- break
328
- elif general_done == False and success_done == True:
329
- assert HACK_HAPPENED == True # happens only if hack happened
330
- break
331
- # self.env.close()
332
- if save_fig:
333
- num_tries = 0
334
- while True:
335
- if num_tries >= 10:
336
- assert False, "agent keeps failing on recording an optimal obs."
337
- try:
338
- self.record_video(fig_path, desired)
339
- break
340
- except Exception as e:
341
- num_tries += 1
342
- #print(f"sequence to {self.problem_name} is:\n\t{steps}\ngenerating image at {img_path}.")
343
- print(f"generated sequence video at {fig_path}.")
344
- self.env.close()
345
- return observations
346
-
347
- # def reset_with_goal_idx(self, goal_idx):
348
- # self.env.set_options({"goal_idx": goal_idx})
349
- # return self.env.reset()
350
-
70
+
71
+ class DeepRLAgent:
72
+ """
73
+ Deep Reinforcement Learning Agent, wrapping a SB3 agent and adding functionality,
74
+ needed for GR framework executions such as observation generation and video recording.
75
+ Supports SAC, PPO and TD3 algorithms.
76
+ Can be loaded from rl_zoo or trained from scratch.
77
+
78
+ Args:
79
+ domain_name (str): The domain name.
80
+ problem_name (str): The problem name.
81
+ num_timesteps (float): The number of timesteps for training.
82
+ env_prop (EnvProperty): The environment property.
83
+ algorithm (BaseAlgorithm, optional): The algorithm to use. Defaults to SAC.
84
+ reward_threshold (float, optional): The reward threshold. Defaults to 450.
85
+ exploration_rate (float, optional): The exploration rate. Defaults to None.
86
+ """
87
+
88
+ def __init__(
89
+ self,
90
+ domain_name: str,
91
+ problem_name: str,
92
+ num_timesteps: float,
93
+ env_prop: EnvProperty,
94
+ algorithm: BaseAlgorithm = SAC,
95
+ reward_threshold: float = 450,
96
+ exploration_rate=None,
97
+ ):
98
+ """
99
+ Initialize the DeepRLAgent object.
100
+
101
+ Args:
102
+ domain_name (str): The name of the domain.
103
+ problem_name (str): The name of the problem.
104
+ num_timesteps (float): The number of timesteps.
105
+ env_prop (EnvProperty): The environment property.
106
+ algorithm (BaseAlgorithm, optional): The algorithm to use. Defaults to SAC.
107
+ reward_threshold (float, optional): The reward threshold. Defaults to 450.
108
+ exploration_rate (float, optional): The exploration rate. Defaults to None.
109
+ """
110
+ env_kwargs = {"id": problem_name, "render_mode": "rgb_array"}
111
+ assert algorithm in [SAC, PPO, TD3]
112
+
113
+ self.domain_name = domain_name
114
+ self.problem_name = problem_name
115
+ self.env_prop = env_prop
116
+ self.exploration_rate = exploration_rate
117
+
118
+ self._model_directory = get_agent_model_dir(
119
+ domain_name=self.domain_name,
120
+ model_name=problem_name,
121
+ class_name=algorithm.__name__,
122
+ )
123
+ self.env = self.env_prop.create_vec_env(env_kwargs)
124
+ self._actions_space = self.env.action_space
125
+
126
+ # first support: SB3 models from RL zoo, with the .zip format.
127
+ if os.path.exists(os.path.join(self._model_directory, "saved_model.zip")):
128
+ # TODO check if it's necessary to give these to the model.load if loading from rl zoo
129
+ self._model_file_path = os.path.join(
130
+ self._model_directory, "saved_model.zip"
131
+ )
132
+ self.model_kwargs = {
133
+ "custom_objects": {
134
+ "learning_rate": 0.0,
135
+ "lr_schedule": lambda _: 0.0,
136
+ "clip_range": lambda _: 0.0,
137
+ },
138
+ "seed": 0,
139
+ "buffer_size": 1,
140
+ }
141
+ # second support: models saved with SB3's model.save, which is saved as a
142
+ # formatted .pth file.
143
+ else:
144
+ self.model_kwargs = {}
145
+ self._model_file_path = os.path.join(
146
+ self._model_directory, "saved_model.pth"
147
+ )
148
+
149
+ self.algorithm = algorithm
150
+ self.reward_threshold = reward_threshold
151
+ self.num_timesteps = num_timesteps
152
+
153
+ def save_model(self):
154
+ """Save the model to a file."""
155
+ self._model.save(self._model_file_path)
156
+
157
+ def try_recording_video(self, video_path, desired=None):
158
+ """
159
+ Try recording a video of the agent's performance.
160
+
161
+ Args:
162
+ video_path (str): The path to save the video.
163
+ desired (optional): The desired goal. Defaults to None.
164
+ """
165
+ num_tries = 0
166
+ while True:
167
+ if num_tries >= 10:
168
+ assert False, "agent keeps failing on recording an optimal obs."
169
+ try:
170
+ self.record_video(video_path, desired)
171
+ break
172
+ except Exception:
173
+ num_tries += 1
174
+ # print(f"sequence to {self.problem_name} is:\n\t{steps}\ngenerating image at {img_path}.")
175
+ print(f"generated sequence video at {video_path}.")
176
+
177
+ def record_video(self, video_path, desired=None):
178
+ """
179
+ Record a video of the agent's performance.
180
+
181
+ Args:
182
+ video_path (str): The path to save the video.
183
+ desired (optional): The desired goal. Defaults to None.
184
+ """
185
+ fourcc = cv2.VideoWriter_fourcc("m", "p", "4", "v")
186
+ fps = 30.0
187
+ # if is_gc:
188
+ # assert goal_idx is not None
189
+ # self.reset_with_goal_idx(goal_idx)
190
+ # else:
191
+ # assert goal_idx is None
192
+ self.env.reset()
193
+ frame_size = (
194
+ self.env.render(mode="rgb_array").shape[1],
195
+ self.env.render(mode="rgb_array").shape[0],
196
+ )
197
+ video_path = os.path.join(video_path, "plan_video.mp4")
198
+ video_writer = cv2.VideoWriter(video_path, fourcc, fps, frame_size)
199
+ general_done, success_done = False, False
200
+ gc.collect()
201
+ obs = self.env.reset()
202
+ self.env_prop.change_goal_to_specific_desired(obs, desired)
203
+ counter = 0
204
+ while not (general_done or success_done):
205
+ counter += 1
206
+ action, _states = self._model.predict(obs, deterministic=False)
207
+ obs, rewards, general_done, info = self.env.step(action)
208
+ if isinstance(general_done, np.ndarray):
209
+ general_done = general_done[0]
210
+ self.env_prop.change_goal_to_specific_desired(obs, desired)
211
+ if "success" in info[0].keys():
212
+ success_done = info[0][
213
+ "success"
214
+ ] # make sure the agent actually reached the goal within the max time
215
+ elif "is_success" in info[0].keys():
216
+ success_done = info[0][
217
+ "is_success"
218
+ ] # make sure the agent actually reached the goal within the max time
219
+ elif "step_task_completions" in info[0].keys():
220
+ success_done = (
221
+ len(info[0]["step_task_completions"]) == 1
222
+ ) # bug of dummyVecEnv, it removes the episode_task_completions from the info dict.
223
+ else:
224
+ raise NotImplementedError(
225
+ "no other option for any of the environments."
226
+ )
227
+ frame = self.env.render()
228
+ success_done = self.env_prop.change_done_by_specific_desired(
229
+ obs, desired, success_done
230
+ )
231
+ video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
232
+ if general_done == False and success_done == True:
233
+ assert (
234
+ desired is not None
235
+ ), f"general_done is false but success_done is true, and desired is None. \
236
+ This should never happen, since the environment will say 'done' is false \
237
+ (general_done) while the observation will be close to the goal (success_done) \
238
+ only in case we incorporated a 'desired' when generating the observation."
239
+ elif general_done == True and success_done == False:
240
+ raise Exception("general_done is true but success_done is false")
241
+ self.env.close()
242
+ video_writer.release()
243
+
244
def load_model(self):
    """Restore ``self._model`` from the checkpoint at ``self._model_file_path``.

    The checkpoint is read through ``self.algorithm.load`` and bound to the
    agent's current environment. ``device`` is the module-level device
    setting and ``self.model_kwargs`` is forwarded unchanged.
    """
    checkpoint_path = self._model_file_path
    restored_model = self.algorithm.load(
        checkpoint_path,
        env=self.env,
        device=device,
        **self.model_kwargs,
    )
    self._model = restored_model
249
+
250
def learn(self):
    """Load a previously saved model if one exists, otherwise train and save one.

    When ``self._model_file_path`` exists the model is loaded and nothing is
    trained. Otherwise a fresh ``"MultiInputPolicy"`` model is built (with
    ``ent_coef=self.exploration_rate`` when an exploration rate is set),
    trained for ``self.num_timesteps`` steps and persisted via
    ``self.save_model``.
    """
    model_path = self._model_file_path

    # Guard clause: a saved model short-circuits training entirely.
    if os.path.exists(model_path):
        print(f"Loading pre-existing model in {self._model_file_path}")
        self.load_model()
        return

    print(f"No existing model in {self._model_file_path}, starting learning")

    # ent_coef is only passed when an exploration rate was configured.
    if self.exploration_rate is None:
        self._model = self.algorithm("MultiInputPolicy", self.env, verbose=1)
    else:
        self._model = self.algorithm(
            "MultiInputPolicy",
            self.env,
            ent_coef=self.exploration_rate,
            verbose=1,
        )

    # comment this in a normal env
    self._model.learn(total_timesteps=self.num_timesteps, progress_bar=True)
    self.save_model()
270
+
271
def safe_env_reset(self):
    """
    Reset the environment, rebuilding it once if the reset fails.

    If ``self.env.reset()`` raises, a new vectorized environment is created
    for ``self.problem_name`` (with ``render_mode="rgb_array"``), stored on
    ``self.env`` and reset instead.

    Returns:
        The initial observation produced by the reset.
    """
    try:
        return self.env.reset()
    except Exception:
        # The current env could not be reset: replace it with a fresh one.
        rebuild_kwargs = {"id": self.problem_name, "render_mode": "rgb_array"}
        self.env = self.env_prop.create_vec_env(rebuild_kwargs)
        return self.env.reset()
285
+
286
def get_mean_and_std_dev(self, observation):
    """
    Get the action distribution parameters the model produces for an observation.

    Args:
        observation: The observation to evaluate.

    Returns:
        A tuple ``(actor_means, spread)`` of numpy arrays. For SAC the second
        element is the log standard deviation returned by the actor. For PPO
        it is the distribution's ``stddev`` (NOTE(review): not a log value,
        although it is returned in the same position - confirm callers expect
        this). For PPO with a ``Box`` action space the means are clipped to
        the action-space bounds.

    Raises:
        AssertionError: If ``self.algorithm`` is neither SAC nor PPO.
    """
    if self.algorithm == SAC:
        actor = self._model.actor
        tensor_observation, _ = actor.obs_to_tensor(observation)

        mean_actions, log_std_dev, _ = actor.get_action_dist_params(
            tensor_observation
        )
        probability_dist = actor.action_dist.proba_distribution(
            mean_actions=mean_actions, log_std=log_std_dev
        )
        actor_means = probability_dist.get_actions(True).cpu().detach().numpy()
        log_std_dev = log_std_dev.cpu().detach().numpy()
    elif self.algorithm == PPO:
        policy = self._model.policy
        policy.set_training_mode(False)
        tensor_observation, _ = policy.obs_to_tensor(observation)
        distribution = policy.get_distribution(tensor_observation)

        actor_means = distribution.distribution.mean.cpu().detach().numpy()
        log_std_dev = distribution.distribution.stddev.cpu().detach().numpy()
        if isinstance(policy.action_space, gym.spaces.Box):
            actor_means = np.clip(
                actor_means,
                policy.action_space.low,
                policy.action_space.high,
            )
    else:
        # An explicit raise (instead of `assert False`) keeps the same
        # exception type but is not stripped when Python runs with -O, where
        # the old code fell through to an UnboundLocalError.
        raise AssertionError(
            f"get_mean_and_std_dev supports only SAC and PPO, got {self.algorithm!r}"
        )
    return actor_means, log_std_dev
324
+
325
def simplify_observation(self, observation):
    """
    Flatten each (obs, action) step into a single 1-D array.

    Fits agents whose observations are a list of tuples, one tuple per
    step/frame, each holding an obs and an action. For every step the obs is
    reshaped to a vector of length ``obs.shape[-1]`` and ``action[0]`` to a
    vector of length ``action[0].shape[-1]``; the two vectors are then
    concatenated so the step can be used for training.

    Args:
        observation (list): List of tuples containing observation and action.

    Returns:
        list: One concatenated 1-D array per step, in the original order.
    """
    flattened_steps = []
    for step_obs, step_action in observation:
        obs_vector = np.array(step_obs).reshape(step_obs.shape[-1])
        first_action = step_action[0]
        action_vector = np.array(first_action).reshape(first_action.shape[-1])
        flattened_steps.append(np.concatenate((obs_vector, action_vector)))
    return flattened_steps
348
+
349
def add_random_optimalism(self, observations, action, constant_initial_action):
    """
    Perturb an action, or replace it with a fixed one early in the episode.

    With at most 3 observations collected, the action is replaced by
    ``constant_initial_action`` wrapped in an outer array (a new object is
    returned; the input ``action`` is left untouched). With more than 3
    observations, every component of ``action[0]`` is modified in place by a
    uniform random amount of up to 1% of its own value.

    Parameters:
        observations (list): Observations collected so far; only its length is used.
        action (ndarray): Action to modify.
        constant_initial_action (float): Initial action value.

    Returns:
        ndarray: Modified action.
    """
    if len(observations) <= 3:
        # just walk in a specific random direction to enable diverse plans
        return np.array(np.array([constant_initial_action]), None)

    components = action[0]
    for idx in range(len(components)):
        current = components[idx]
        components[idx] += random.uniform(-0.01 * current, 0.01 * current)
    return action
369
+
370
def generate_partial_observation(
    self,
    action_selection_method,
    percentage,
    is_consecutive,
    save_fig=False,
    fig_path=None,
    random_optimalism=True,
):
    """
    Generate a full observation and return an ordered subset of its steps.

    Args:
        action_selection_method (str): The method used for selecting actions.
        percentage (float): Fraction of the full observation's steps to keep;
            the step count is truncated to an integer.
        is_consecutive (bool): Whether the selected steps should be consecutive or not.
        save_fig (bool, optional): Whether to save a figure of the observation. Defaults to False.
        fig_path (str, optional): The path to save the figure. Defaults to None.
        random_optimalism (bool, optional): Whether to apply random optimalism
            during observation generation. Defaults to True.

    Returns:
        list: A partial observation consisting of a subset of steps from the full observation.
    """
    # First produce the complete trace, then sample from it.
    full_trace = self.generate_observation(
        action_selection_method,
        random_optimalism=random_optimalism,
        save_fig=save_fig,
        fig_path=fig_path,
    )
    steps_to_keep = int(percentage * len(full_trace))
    return random_subset_with_order(full_trace, steps_to_keep, is_consecutive)
402
+
403
def generate_observation(
    self,
    action_selection_method: MethodType,
    random_optimalism,
    save_fig=False,
    fig_path=None,
    with_dict=False,
    desired=None,
) -> list[tuple[np.ndarray, np.ndarray]]:
    """
    Generates observations by interacting with the environment.

    The agent is rolled out until an episode ends successfully. An episode
    that terminates without success is discarded and retried from a fresh
    reset; after more than 50 such failures an AssertionError is raised.

    Args:
        action_selection_method (MethodType): The method used for action selection.
            Actions are predicted deterministically unless this is
            ``stochastic_amplified_selection``.
        random_optimalism (bool): Flag indicating whether to add random optimalism to the actions.
        save_fig (bool, optional): Flag indicating whether to save a figure. Defaults to False.
        fig_path (str, optional): The path to save the figure. Required if save_fig is True.
            Defaults to None.
        with_dict (bool, optional): If True, store the whole observation object per step;
            otherwise store only ``obs["observation"]``. Defaults to False.
        desired (Any, optional): The desired goal for the observation. Defaults to None.

    Returns:
        list[tuple[np.ndarray, np.ndarray]]: A list of tuples containing the observation
        and the corresponding action.

    Raises:
        AssertionError: If ``save_fig``/``fig_path`` are inconsistent, if more than 50
            attempts ended without success, or if success is reported before the
            environment is done while ``desired`` is None.
    """
    if save_fig is False:
        assert (
            fig_path is None
        ), "You can't specify a vid path when you don't even save the figure."
    else:
        assert (
            fig_path is not None
        ), "You need to specify a vid path when you save the figure."

    # Kept as a function-local import like the original (presumably to avoid
    # a circular import - confirm), but hoisted out of the stepping loop
    # together with the loop-invariant `deterministic` flag.
    from gr_libs.metrics.metrics import stochastic_amplified_selection

    deterministic = action_selection_method != stochastic_amplified_selection

    # safe_env_reset rebuilds the env if a plain reset fails (workaround for
    # the env not being reset properly in panda).
    obs = self.safe_env_reset()
    self.env_prop.change_goal_to_specific_desired(obs, desired)
    observations = []
    is_successful_observation_made = False
    num_of_unsuccessful_attempts = 0
    while not is_successful_observation_made:
        # Start as true; flipped back if the episode ends by
        # crash/death/truncation instead of success.
        is_successful_observation_made = True
        if random_optimalism:
            constant_initial_action = self.env.action_space.sample()
        while True:
            action, _ = self._model.predict(obs, deterministic=deterministic)
            if random_optimalism:
                # Get the right direction and then start inserting noise to
                # still get a relatively optimal plan.
                # Fix: decide based on the number of steps collected so far
                # (not on the current env observation) and use the returned
                # action - the early-steps branch returns a new array, so
                # discarding the result silently disabled it.
                action = self.add_random_optimalism(
                    observations, action, constant_initial_action
                )
            if with_dict:
                observations.append((obs, action))
            else:
                observations.append((obs["observation"], action))
            obs, _, done, info = self.env.step(action)
            self.env_prop.change_goal_to_specific_desired(obs, desired)
            general_done = bool(self.env_prop.is_done(done))
            success_done = self.env_prop.is_success(info)
            success_done = bool(
                self.env_prop.change_done_by_specific_desired(
                    obs, desired, success_done
                )
            )
            if general_done is True and success_done is False:
                # It could be that the stochasticity inserted into the actions
                # made the agent die/crash. We don't want this observation:
                # it's an unsuccessful attempt.
                num_of_unsuccessful_attempts += 1
                if num_of_unsuccessful_attempts > 50:
                    # The episode must end only when the agent has actually
                    # succeeded. Raised explicitly so the check also fires
                    # under `python -O`.
                    # NOTE(review): confirm the observation really has a
                    # 'desired' key, otherwise this raises KeyError instead.
                    raise AssertionError(f"failed on goal: {obs['desired']}")
                # Try again by breaking the inner loop, with everything set
                # up like at the beginning of the function.
                is_successful_observation_made = False
                obs = self.safe_env_reset()
                self.env_prop.change_goal_to_specific_desired(obs, desired)
                # Re-accumulate the observations from scratch.
                observations = []
                break
            elif general_done is False and success_done is False:
                continue
            elif general_done is True and success_done is True:
                break
            elif general_done is False and success_done is True:
                # The environment will say 'done' is false (general_done) while
                # the observation is close to the goal (success_done) only in
                # case we incorporated a 'desired' when generating the
                # observation.
                assert (
                    desired is not None
                ), f"general_done is false but success_done is true, and desired is None. This should never happen, since the \
                    environment will say 'done' is false (general_done) while the observation will be close to the goal (success_done) \
                    only in case we incorporated a 'desired' when generating the observation."
                break

    if save_fig:
        self.try_recording_video(fig_path, desired)

    self.env.close()
    return observations
516
+
517
+
351
518
  class GCDeepRLAgent(DeepRLAgent):
352
- def generate_partial_observation(self, action_selection_method, percentage, is_consecutive, goal_directed_problem=None, goal_directed_goal=None, save_fig=False, fig_path=None, random_optimalism=True):
353
- steps = self.generate_observation(action_selection_method, save_fig=save_fig, fig_path=fig_path, random_optimalism=random_optimalism, goal_directed_problem=goal_directed_problem, goal_directed_goal=goal_directed_goal) # steps are a full observation
354
- return random_subset_with_order(steps, (int)(percentage * len(steps)), is_consecutive)
355
-
356
- def generate_observation(self, action_selection_method: MethodType, random_optimalism, env_prop=None, goal_directed_problem=None, goal_directed_goal=None,
357
- save_fig = False, fig_path=None, with_dict=False):
358
- # print(f"hyperparams:{hyperparams}")
359
- if goal_directed_problem:
360
- if save_fig:
361
- assert fig_path != None, "You need to specify a vid path when you save the figure."
362
- else:
363
- assert fig_path == None
364
- assert goal_directed_goal == None, "can't give goal directed goal and also goal directed problem for the sake of sequence generation by a general agent"
365
- kwargs = {"id": goal_directed_problem, "render_mode": "rgb_array"}
366
- self.env = create_vec_env(kwargs)
367
- orig_env = self.env
368
- observations = super().generate_observation(action_selection_method=action_selection_method, random_optimalism=random_optimalism,
369
- save_fig=save_fig, fig_path=fig_path, with_dict=with_dict)
370
- self.env = orig_env
371
- else: #goal_directed_goal!=None
372
- if save_fig:
373
- assert fig_path != None, "You need to specify a vid path when you save the figure."
374
- else:
375
- assert fig_path == None
376
- assert goal_directed_problem == None, "can't give goal directed goal and also goal directed problem for the sake of sequence generation by a general agent"
377
- observations = super().generate_observation(action_selection_method=action_selection_method, random_optimalism=random_optimalism,
378
- save_fig=save_fig, fig_path=fig_path, with_dict=with_dict, desired=goal_directed_goal) # TODO tutorial on how to use the deepRLAgent for sequence generation and examination and plotting of the sequence
379
- return observations
380
-
381
-
382
- if __name__ == "__main__":
383
- package_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
384
- print("this is package root:" + package_root)
385
- if package_root not in sys.path:
386
- sys.path.insert(0, package_root)
387
-
388
- from gr_libs.ml.utils.storage import get_agent_model_dir, set_global_storage_configs
389
-
390
- set_global_storage_configs("graml", "fragmented_partial_obs", "inference_same_length", "learn_diff_length")
391
- agent = DeepRLAgent(domain_name="point_maze", problem_name="PointMaze-FourRoomsEnvDense-11x11-Goal-9x1", algorithm=SAC, num_timesteps=200000)
392
- agent.learn()
393
- agent.record_video("")
519
+ """
520
+ A class representing a Goal Conditioned Deep Reinforcement Learning Agent.
521
+
522
+ This agent extends the functionality of the base DeepRLAgent class by providing methods for generating partial observations and observations with goal-directed goals or problems.
523
+
524
+ Args:
525
+ DeepRLAgent (class): The base class for DeepRLAgent.
526
+
527
+ Attributes:
528
+ env (object): The environment in which the agent operates.
529
+ env_prop (object): The environment properties.
530
+
531
+ Methods:
532
+ generate_partial_observation: Generates a partial observation based on a given percentage of steps.
533
+ generate_observation: Generates an observation with optional goal-directed goals or problems.
534
+ """
535
+
536
def generate_partial_observation(
    self,
    action_selection_method,
    percentage,
    is_consecutive,
    goal_directed_problem=None,
    goal_directed_goal=None,
    save_fig=False,
    fig_path=None,
    random_optimalism=True,
):
    """
    Generate a full observation and return an ordered subset of its steps.

    Args:
        action_selection_method (MethodType): The method for selecting actions.
        percentage (float): Fraction of the full observation's steps to keep;
            the step count is truncated to an integer.
        is_consecutive (bool): Whether the steps should be consecutive or randomly selected.
        goal_directed_problem (str, optional): The goal-directed problem. Defaults to None.
        goal_directed_goal (object, optional): The goal-directed goal. Defaults to None.
        save_fig (bool, optional): Whether to save a figure. Defaults to False.
        fig_path (str, optional): The path to save the figure. Defaults to None.
        random_optimalism (bool, optional): Whether to use random optimalism. Defaults to True.

    Returns:
        list: A random subset of steps from the full observation.
    """
    # First produce the complete trace, then sample from it.
    full_trace = self.generate_observation(
        action_selection_method,
        random_optimalism=random_optimalism,
        goal_directed_problem=goal_directed_problem,
        goal_directed_goal=goal_directed_goal,
        save_fig=save_fig,
        fig_path=fig_path,
    )
    steps_to_keep = int(percentage * len(full_trace))
    return random_subset_with_order(full_trace, steps_to_keep, is_consecutive)
574
+
575
def generate_observation(
    self,
    action_selection_method: MethodType,
    random_optimalism,
    goal_directed_problem=None,
    goal_directed_goal=None,
    save_fig=False,
    fig_path=None,
    with_dict=False,
):
    """
    Generates an observation with optional goal-directed goals or problems.

    When ``goal_directed_problem`` is given, a temporary environment is built
    for that problem, the observation is generated in it, and the agent's
    original environment is put back afterwards (also if generation fails).
    Otherwise the observation is generated in the agent's own environment
    with ``goal_directed_goal`` passed on as the desired goal.

    Args:
        action_selection_method (MethodType): The method for selecting actions.
        random_optimalism (bool): Whether to use random optimalism.
        goal_directed_problem (str, optional): The goal-directed problem. Defaults to None.
        goal_directed_goal (object, optional): The goal-directed goal. Defaults to None.
        save_fig (bool, optional): Whether to save a figure. Defaults to False.
        fig_path (str, optional): The path to save the figure. Defaults to None.
        with_dict (bool, optional): Whether to include a dictionary in the observation.
            Defaults to False.

    Returns:
        list: The generated observation.

    Raises:
        AssertionError: If ``save_fig``/``fig_path`` are inconsistent, or if both
            a goal-directed problem and a goal-directed goal are supplied.
    """
    if save_fig:
        assert (
            fig_path is not None
        ), "You need to specify a vid path when you save the figure."
    else:
        assert fig_path is None

    if goal_directed_problem:
        assert (
            goal_directed_goal is None
        ), "can't give goal directed goal and also goal directed problem for the sake of sequence generation by a general agent"
        # Fix: remember the agent's env BEFORE swapping in the temporary one.
        # The old code saved `self.env` after overwriting it, so the restore
        # was a no-op and the original env was lost.
        orig_env = self.env
        kwargs = {"id": goal_directed_problem, "render_mode": "rgb_array"}
        self.env = self.env_prop.create_vec_env(kwargs)
        try:
            observations = super().generate_observation(
                action_selection_method=action_selection_method,
                random_optimalism=random_optimalism,
                save_fig=save_fig,
                fig_path=fig_path,
                with_dict=with_dict,
            )
        finally:
            # Restore even when generation raises, so the agent is never left
            # bound to the temporary env.
            self.env = orig_env
    else:
        assert (
            goal_directed_problem is None
        ), "can't give goal directed goal and also goal directed problem for the sake of sequence generation by a general agent"
        observations = super().generate_observation(
            action_selection_method=action_selection_method,
            random_optimalism=random_optimalism,
            save_fig=save_fig,
            fig_path=fig_path,
            with_dict=with_dict,
            desired=goal_directed_goal,
        )
    return observations