python-motion-planning 0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. python_motion_planning/__init__.py +4 -0
  2. python_motion_planning/curve_generation/__init__.py +9 -0
  3. python_motion_planning/curve_generation/bezier_curve.py +131 -0
  4. python_motion_planning/curve_generation/bspline_curve.py +271 -0
  5. python_motion_planning/curve_generation/cubic_spline.py +128 -0
  6. python_motion_planning/curve_generation/curve.py +64 -0
  7. python_motion_planning/curve_generation/dubins_curve.py +348 -0
  8. python_motion_planning/curve_generation/fem_pos_smooth.py +114 -0
  9. python_motion_planning/curve_generation/polynomial_curve.py +226 -0
  10. python_motion_planning/curve_generation/reeds_shepp.py +736 -0
  11. python_motion_planning/global_planner/__init__.py +3 -0
  12. python_motion_planning/global_planner/evolutionary_search/__init__.py +4 -0
  13. python_motion_planning/global_planner/evolutionary_search/aco.py +186 -0
  14. python_motion_planning/global_planner/evolutionary_search/evolutionary_search.py +87 -0
  15. python_motion_planning/global_planner/evolutionary_search/pso.py +356 -0
  16. python_motion_planning/global_planner/graph_search/__init__.py +28 -0
  17. python_motion_planning/global_planner/graph_search/a_star.py +124 -0
  18. python_motion_planning/global_planner/graph_search/d_star.py +291 -0
  19. python_motion_planning/global_planner/graph_search/d_star_lite.py +188 -0
  20. python_motion_planning/global_planner/graph_search/dijkstra.py +77 -0
  21. python_motion_planning/global_planner/graph_search/gbfs.py +78 -0
  22. python_motion_planning/global_planner/graph_search/graph_search.py +87 -0
  23. python_motion_planning/global_planner/graph_search/jps.py +165 -0
  24. python_motion_planning/global_planner/graph_search/lazy_theta_star.py +114 -0
  25. python_motion_planning/global_planner/graph_search/lpa_star.py +230 -0
  26. python_motion_planning/global_planner/graph_search/s_theta_star.py +133 -0
  27. python_motion_planning/global_planner/graph_search/theta_star.py +171 -0
  28. python_motion_planning/global_planner/graph_search/voronoi.py +200 -0
  29. python_motion_planning/global_planner/sample_search/__init__.py +6 -0
  30. python_motion_planning/global_planner/sample_search/informed_rrt.py +152 -0
  31. python_motion_planning/global_planner/sample_search/rrt.py +151 -0
  32. python_motion_planning/global_planner/sample_search/rrt_connect.py +147 -0
  33. python_motion_planning/global_planner/sample_search/rrt_star.py +77 -0
  34. python_motion_planning/global_planner/sample_search/sample_search.py +135 -0
  35. python_motion_planning/local_planner/__init__.py +19 -0
  36. python_motion_planning/local_planner/apf.py +144 -0
  37. python_motion_planning/local_planner/ddpg.py +630 -0
  38. python_motion_planning/local_planner/dqn.py +687 -0
  39. python_motion_planning/local_planner/dwa.py +212 -0
  40. python_motion_planning/local_planner/local_planner.py +262 -0
  41. python_motion_planning/local_planner/lqr.py +146 -0
  42. python_motion_planning/local_planner/mpc.py +214 -0
  43. python_motion_planning/local_planner/pid.py +158 -0
  44. python_motion_planning/local_planner/rpp.py +147 -0
  45. python_motion_planning/utils/__init__.py +19 -0
  46. python_motion_planning/utils/agent/__init__.py +0 -0
  47. python_motion_planning/utils/agent/agent.py +135 -0
  48. python_motion_planning/utils/environment/__init__.py +0 -0
  49. python_motion_planning/utils/environment/env.py +134 -0
  50. python_motion_planning/utils/environment/node.py +85 -0
  51. python_motion_planning/utils/environment/point2d.py +96 -0
  52. python_motion_planning/utils/environment/pose2d.py +91 -0
  53. python_motion_planning/utils/helper/__init__.py +3 -0
  54. python_motion_planning/utils/helper/math_helper.py +65 -0
  55. python_motion_planning/utils/planner/__init__.py +0 -0
  56. python_motion_planning/utils/planner/control_factory.py +31 -0
  57. python_motion_planning/utils/planner/curve_factory.py +29 -0
  58. python_motion_planning/utils/planner/planner.py +40 -0
  59. python_motion_planning/utils/planner/search_factory.py +51 -0
  60. python_motion_planning/utils/plot/__init__.py +0 -0
  61. python_motion_planning/utils/plot/plot.py +274 -0
  62. python_motion_planning-0.1.dist-info/LICENSE +674 -0
  63. python_motion_planning-0.1.dist-info/METADATA +873 -0
  64. python_motion_planning-0.1.dist-info/RECORD +66 -0
  65. python_motion_planning-0.1.dist-info/WHEEL +5 -0
  66. python_motion_planning-0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,630 @@
1
+ """
2
+ @file: ddpg.py
3
+ @brief: Deep Deterministic Policy Gradient (DDPG) motion planning.
4
+ @author: Wu Maojia
5
+ @update: 2024.5.24
6
+ """
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from torch.optim.lr_scheduler import ReduceLROnPlateau
12
+ from torch.utils.tensorboard import SummaryWriter
13
+ import random
14
+ from tqdm import tqdm
15
+ import math
16
+ import copy
17
+ import datetime
18
+ import os
19
+
20
+ from .local_planner import LocalPlanner
21
+ from python_motion_planning.utils import Env, MathHelper, Robot
22
+
23
+
24
class ReplayBuffer(object):
    """
    Fixed-capacity experience replay buffer backed by pre-allocated tensors.

    Transitions are written circularly: once `max_size` entries have been
    stored, the oldest entries are overwritten.

    Parameters:
        state_dim (int): state dimension
        action_dim (int): action dimension
        max_size (int): maximum replay buffer size
        device (torch.device): device on which the buffers live
    """
    def __init__(self, state_dim: int, action_dim: int, max_size: int, device: torch.device) -> None:
        self.max_size = max_size
        self.count = 0      # next write position (wraps around at max_size)
        self.size = 0       # number of valid transitions currently stored
        self.s = torch.zeros((max_size, state_dim), dtype=torch.float, device=device)
        self.a = torch.zeros((max_size, action_dim), dtype=torch.float, device=device)
        self.r = torch.zeros((max_size, 1), dtype=torch.float, device=device)
        self.s_ = torch.zeros((max_size, state_dim), dtype=torch.float, device=device)
        self.win = torch.zeros((max_size, 1), dtype=torch.bool, device=device)

    def store(self, s: torch.Tensor, a: torch.Tensor, r: torch.Tensor, s_: torch.Tensor, win: bool) -> None:
        """
        Insert one transition, overwriting the oldest entry when full.

        Parameters:
            s (torch.Tensor): state
            a (torch.Tensor): action
            r (torch.Tensor): reward
            s_ (torch.Tensor): next state
            win (bool): True if the goal was reached at this step, False otherwise
        """
        idx = self.count
        self.s[idx] = s
        self.a[idx] = a
        self.r[idx] = r
        self.s_[idx] = s_
        self.win[idx] = torch.tensor(win, dtype=torch.bool)
        # advance the write cursor circularly and track how many slots are filled
        self.count = (idx + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size: int) -> tuple:
        """
        Draw a random batch of stored transitions (with replacement).

        Parameters:
            batch_size (int): number of transitions to draw

        Returns:
            batch_s (torch.Tensor): batch of states
            batch_a (torch.Tensor): batch of actions
            batch_r (torch.Tensor): batch of rewards
            batch_s_ (torch.Tensor): batch of next states
            batch_win (torch.Tensor): batch of win flags (True: goal reached)
        """
        idx = torch.randint(self.size, size=(batch_size,))  # uniform random indices
        return self.s[idx], self.a[idx], self.r[idx], self.s_[idx], self.win[idx]
85
+
86
+
87
class Actor(nn.Module):
    """
    Deterministic policy network mapping a state to a bounded action.

    The state is min-max normalized, passed through an MLP with ReLU
    activations, and the output is squashed by a sigmoid then rescaled
    into [min_action, max_action].

    Parameters:
        state_dim (int): state dimension
        action_dim (int): action dimension
        hidden_depth (int): the number of hidden layers of the neural network
        hidden_width (int): the number of neurons in hidden layers of the neural network
        min_state (torch.Tensor): minimum of each value in the state
        max_state (torch.Tensor): maximum of each value in the state
        min_action (torch.Tensor): minimum of each value in the action
        max_action (torch.Tensor): maximum of each value in the action
    """
    def __init__(self, state_dim: int, action_dim: int, hidden_depth: int, hidden_width: int,
                 min_state: torch.Tensor, max_state: torch.Tensor, min_action: torch.Tensor, max_action: torch.Tensor) -> None:
        super(Actor, self).__init__()
        self.min_state = min_state
        self.max_state = max_state
        self.min_action = min_action
        self.max_action = max_action

        self.hidden_depth = hidden_depth
        self.input_layer = nn.Linear(state_dim, hidden_width)
        self.hidden_layers = nn.ModuleList([nn.Linear(hidden_width, hidden_width) for _ in range(self.hidden_depth)])
        self.output_layer = nn.Linear(hidden_width, action_dim)

    def forward(self, s: torch.Tensor) -> torch.Tensor:
        """
        Generate the action based on the state.

        Parameters:
            s (torch.Tensor): state

        Returns:
            a (torch.Tensor): action, element-wise within [min_action, max_action]
        """
        # min-max normalize the state into [0, 1]
        x = (s - self.min_state) / (self.max_state - self.min_state)

        x = F.relu(self.input_layer(x))
        for layer in self.hidden_layers:
            x = F.relu(layer(x))
        x = self.output_layer(x)
        # sigmoid squashes to (0, 1); rescale into the action bounds
        return self.min_action + (self.max_action - self.min_action) * torch.sigmoid(x)
133
+
134
+
135
class Critic(nn.Module):
    """
    Critic network to estimate the value function q(s,a).

    State and action are each min-max normalized into [0, 1], concatenated
    along the last dimension, and passed through an MLP with ReLU activations
    that outputs a scalar Q-value per sample.

    Parameters:
        state_dim (int): state dimension
        action_dim (int): action dimension
        hidden_depth (int): the number of hidden layers of the neural network
        hidden_width (int): the number of neurons in hidden layers of the neural network
        min_state (torch.Tensor): minimum of each value in the state
        max_state (torch.Tensor): maximum of each value in the state
        min_action (torch.Tensor): minimum of each value in the action
        max_action (torch.Tensor): maximum of each value in the action
    """
    def __init__(self, state_dim: int, action_dim: int, hidden_depth: int, hidden_width: int,
                 min_state: torch.Tensor, max_state: torch.Tensor, min_action: torch.Tensor, max_action: torch.Tensor) -> None:
        super(Critic, self).__init__()
        self.min_state = min_state
        self.max_state = max_state
        self.min_action = min_action
        self.max_action = max_action

        self.hidden_depth = hidden_depth
        # the input layer consumes the concatenated (state, action) vector
        self.input_layer = nn.Linear(state_dim + action_dim, hidden_width)
        self.hidden_layers = nn.ModuleList([nn.Linear(hidden_width, hidden_width) for _ in range(self.hidden_depth)])
        self.output_layer = nn.Linear(hidden_width, 1)

    def forward(self, s: torch.Tensor, a: torch.Tensor) -> torch.Tensor:
        """
        Calculate the Q-value of (s,a).

        Parameters:
            s (torch.Tensor): state
            a (torch.Tensor): action

        Returns:
            q (torch.Tensor): Q-value of (s,a), shape (..., 1)
        """
        # min-max normalize both inputs into [0, 1]
        s = (s - self.min_state) / (self.max_state - self.min_state)
        a = (a - self.min_action) / (self.max_action - self.min_action)

        # fixed: use torch's canonical `dim` keyword (was the NumPy alias
        # `axis`) and avoid shadowing the builtin name `input`
        sa = torch.cat([s, a], dim=-1)

        q = F.relu(self.input_layer(sa))
        for i in range(self.hidden_depth):
            q = F.relu(self.hidden_layers[i](q))
        q = self.output_layer(q)
        return q
184
+
185
+
186
class DDPG(LocalPlanner):
    """
    Class for Deep Deterministic Policy Gradient (DDPG) motion planning.

    Parameters:
        start (tuple): start point coordinate
        goal (tuple): goal point coordinate
        env (Env): environment
        heuristic_type (str): heuristic function type
        hidden_depth (int): the number of hidden layers of the neural network
        hidden_width (int): the number of neurons in hidden layers of the neural network
        batch_size (int): batch size to optimize the neural networks
        buffer_size (int): maximum replay buffer size (default 1e6 is a float; cast to int below)
        gamma (float): discount factor
        tau (float): Softly update the target network
        lr (float): learning rate
        train_noise (float): Action noise coefficient during training for exploration
        random_episodes (int): Take the random actions in the beginning for the better exploration
        max_episode_steps (int): Maximum steps for each episode
        update_freq (int): Frequency (times) of updating the network for each step
        update_steps (int): Update the network for every 'update_steps' steps
        evaluate_freq (int): Frequency (times) of evaluations and calculate the average
        evaluate_episodes (int): Evaluate the network every 'evaluate_episodes' episodes
        actor_save_path (str): Save path of the trained actor network
        critic_save_path (str): Save path of the trained critic network
        actor_load_path (str): Load path of the trained actor network
        critic_load_path (str): Load path of the trained critic network
        **params: other parameters can be found in the parent class LocalPlanner

    Examples:
        # Import the necessary dependencies
        >>> from python_motion_planning.utils import Grid
        >>> from python_motion_planning.local_planner import DDPG

        # Train the model and save the trained model
        Train the model, only for learning-based planners, such as DDPG.
        It costs a lot of time to train the model, please be patient.
        If you want a faster training, try reducing num_episodes and batch_size,
        or increasing update_steps and evaluate_episodes, or fine-tuning other hyperparameters
        if you are familiar with them, usually in a cost of performance, however.
        >>> plt = DDPG(start=(5, 5, 0), goal=(45, 25, 0), env=Grid(51, 31),
        >>>     actor_save_path="models/actor_best.pth", critic_save_path="models/critic_best.pth")
        >>> plt.train(num_episodes=10000)

        # load the trained model and run
        >>> plt = DDPG(start=(5, 5, 0), goal=(45, 25, 0), env=Grid(51, 31),
        >>>     actor_load_path="models/actor_best.pth", critic_load_path="models/critic_best.pth")
        >>> plt.run()

    References:
        [1] Continuous control with deep reinforcement learning
    """
    def __init__(self, start: tuple, goal: tuple, env: Env, heuristic_type: str = "euclidean",
                 hidden_depth: int = 3, hidden_width: int = 512, batch_size: int = 2000, buffer_size: int = 1e6,
                 gamma: float = 0.999, tau: float = 1e-3, lr: float = 1e-4, train_noise: float = 0.1,
                 random_episodes: int = 50, max_episode_steps: int = 200,
                 update_freq: int = 1, update_steps: int = 1, evaluate_freq: int = 50, evaluate_episodes: int = 50,
                 actor_save_path: str = "models/actor_best.pth",
                 critic_save_path: str = "models/critic_best.pth",
                 actor_load_path: str = None,
                 critic_load_path: str = None,
                 **params) -> None:
        super().__init__(start, goal, env, heuristic_type, **params)
        # DDPG parameters
        self.hidden_depth = hidden_depth        # The number of hidden layers of the neural network
        self.hidden_width = hidden_width        # The number of neurons in hidden layers of the neural network
        self.batch_size = int(batch_size)       # batch size to optimize the neural networks
        self.buffer_size = int(buffer_size)     # maximum replay buffer size (int() guards against the float default)
        self.gamma = gamma                      # discount factor
        self.tau = tau                          # Softly update the target network
        self.lr = lr                            # learning rate
        self.train_noise = train_noise          # Action noise coefficient during training for exploration
        self.random_episodes = random_episodes  # Take the random actions in the beginning for the better exploration
        self.max_episode_steps = max_episode_steps  # Maximum steps for each episode
        self.update_freq = update_freq          # Frequency (times) of updating the network for each step
        self.update_steps = update_steps        # Update the network for every 'update_steps' steps
        self.evaluate_freq = evaluate_freq      # Frequency (times) of evaluations and calculate the average
        self.evaluate_episodes = evaluate_episodes  # Evaluate the network every 'evaluate_episodes' episodes
        self.actor_save_path = actor_save_path  # Save path of the trained actor network
        self.critic_save_path = critic_save_path  # Save path of the trained critic network
        # prefer GPU when available; all tensors/networks below live on this device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        self.n_observations = 8  # x, y, theta, v, w, g_x, g_y, g_theta
        self.n_actions = 2       # v_inc, w_inc

        # element-wise bounds of the 8-dim state, used for min-max normalization in the networks
        self.min_state = torch.tensor([0, 0, -math.pi, self.params["MIN_V"], self.params["MIN_W"], 0, 0, -math.pi],
                                      device=self.device)
        self.max_state = torch.tensor([self.env.x_range, self.env.y_range, math.pi, self.params["MAX_V"],
                                       self.params["MAX_W"], self.env.x_range, self.env.y_range, math.pi,], device=self.device)
        # element-wise bounds of the 2-dim action (velocity/angular-velocity increments)
        self.min_action = torch.tensor([self.params["MIN_V_INC"], self.params["MIN_W_INC"]], device=self.device)
        self.max_action = torch.tensor([self.params["MAX_V_INC"], self.params["MAX_W_INC"]], device=self.device)

        # actor and its target network (target starts as an exact copy)
        self.actor = Actor(self.n_observations, self.n_actions, self.hidden_depth, self.hidden_width, self.min_state,
                           self.max_state, self.min_action, self.max_action).to(self.device)
        if actor_load_path:
            self.actor.load_state_dict(torch.load(actor_load_path))
        self.actor_target = copy.deepcopy(self.actor)

        # critic and its target network (target starts as an exact copy)
        self.critic = Critic(self.n_observations, self.n_actions, self.hidden_depth, self.hidden_width,
                             self.min_state, self.max_state, self.min_action, self.max_action).to(self.device)
        if critic_load_path:
            self.critic.load_state_dict(torch.load(critic_load_path))
        self.critic_target = copy.deepcopy(self.critic)

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.lr, weight_decay=1e-4)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.lr, weight_decay=1e-4)

        # mode='max' because the schedulers step on the evaluation reward (higher is better)
        self.actor_scheduler = ReduceLROnPlateau(self.actor_optimizer, mode='max', factor=0.2, patience=10)
        self.critic_scheduler = ReduceLROnPlateau(self.critic_optimizer, mode='max', factor=0.2, patience=10)

        self.criterion = nn.MSELoss()

        self.replay_buffer = ReplayBuffer(self.n_observations, self.n_actions, max_size=self.buffer_size, device=self.device)

        # Build a tensorboard
        self.writer = SummaryWriter(log_dir='runs/DDPG_{}'.format(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')))

        # global planner
        g_start = (start[0], start[1])
        g_goal = (goal[0], goal[1])
        # NOTE(review): assigning a dict here presumably triggers a planner-factory
        # setter in LocalPlanner that also populates self.g_path — confirm in parent class
        self.g_planner = {"planner_name": "a_star", "start": g_start, "goal": g_goal, "env": env}
        self.path = self.g_path[::-1]  # reversed global path, start -> goal

    def __del__(self) -> None:
        # flush and close the tensorboard writer on garbage collection
        self.writer.close()

    def __str__(self) -> str:
        return "Deep Deterministic Policy Gradient (DDPG)"

    def plan(self) -> tuple:
        """
        Deep Deterministic Policy Gradient (DDPG) motion plan function.

        Rolls the trained actor forward from the start pose, retargeting the
        goal part of the state to a lookahead point on the global path at
        every step, until the goal is reached or MAX_ITERATION steps elapse.

        Returns:
            flag (bool): planning successful if true else failed
            pose_list (list): history poses of robot
        """
        s = self.reset()
        for _ in range(self.params["MAX_ITERATION"]):
            # break until goal reached (state layout: [x, y, theta, v, w, g_x, g_y, g_theta])
            if self.reachGoal(tuple(s[0:3]), tuple(s[5:8])):
                return True, self.robot.history_pose

            # get the particular point on the path at the lookahead distance to track
            lookahead_pt, theta_trj, kappa = self.getLookaheadPoint()
            s[5:7] = torch.tensor(lookahead_pt, device=self.device)
            s[7] = torch.tensor(theta_trj, device=self.device)

            a = self.select_action(s)          # get the action from the actor network
            s_, r, done, win = self.step(s, a)  # take the action and get the next state and reward
            s = s_                              # Move to the next state
            # mirror the new state back onto the robot object so history_pose stays in sync
            self.robot.px, self.robot.py, self.robot.theta, self.robot.v, self.robot.w = tuple(s[0:5].cpu().numpy())

        # NOTE(review): the iteration-exhausted case also returns True here,
        # so callers cannot distinguish success from timeout
        return True, self.robot.history_pose
        # return False, None

    def run(self) -> None:
        """
        Running both planning and animation.

        Raises:
            ValueError: if the planner produced an empty pose history.
        """
        _, history_pose = self.plan()
        print(f"Number of iterations: {len(history_pose)}")
        if not history_pose:
            raise ValueError("Path not found and planning failed!")

        # accumulate the Euclidean length of the traversed (x, y) trajectory
        path = np.array(history_pose)[:, 0:2]
        cost = np.sum(np.sqrt(np.sum(np.diff(path, axis=0)**2, axis=1, keepdims=True)))
        self.plot.plotPath(self.path, path_color="r", path_style="--")
        self.plot.animation(path, str(self), cost, history_pose=history_pose)

    def select_action(self, s: torch.Tensor) -> torch.Tensor:
        """
        Select the action from the actor network (deterministic, no noise).

        Parameters:
            s (torch.Tensor): current state

        Returns:
            a (torch.Tensor): selected action, detached from the graph
        """
        s = torch.unsqueeze(s.clone().detach(), 0)  # add a batch dimension
        a = self.actor(s).detach().flatten()
        return a

    def optimize_model(self) -> tuple:
        """
        Optimize the neural networks when training.

        Performs one DDPG update: critic regression towards the bootstrapped
        target, deterministic policy gradient step on the actor, then a soft
        (Polyak) update of both target networks.

        Returns:
            actor_loss (float): actor loss
            critic_loss (float): critic loss
        """
        batch_s, batch_a, batch_r, batch_s_, batch_win = self.replay_buffer.sample(self.batch_size)  # Sample a batch

        # Compute the target q
        with torch.no_grad():  # target_q has no gradient
            q_ = self.critic_target(batch_s_, self.actor_target(batch_s_))
            # terminal (win) transitions get no bootstrapped future value
            target_q = batch_r + self.gamma * torch.logical_not(batch_win) * q_

        # Compute the current q and the critic loss
        current_q = self.critic(batch_s, batch_a)
        critic_loss = self.criterion(target_q, current_q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), max_norm=1.0, norm_type=2)  # clip the gradient
        self.critic_optimizer.step()

        # Freeze critic networks so you don't waste computational effort
        # computing critic gradients during the actor update
        for params in self.critic.parameters():
            params.requires_grad = False

        # Compute the actor loss (maximize Q => minimize -Q)
        actor_loss = -self.critic(batch_s, self.actor(batch_s)).mean()
        # Optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), max_norm=1.0, norm_type=2)  # clip the gradient
        self.actor_optimizer.step()

        # Unfreeze critic networks
        for params in self.critic.parameters():
            params.requires_grad = True

        # Softly update the target networks: target <- tau*online + (1-tau)*target
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

        return actor_loss.item(), critic_loss.item()

    def evaluate_policy(self) -> float:
        """
        Evaluate the policy and calculating the average reward.

        Runs `evaluate_freq` noise-free episodes from random start/goal pairs
        and averages the per-step episode reward. Note that evaluation
        transitions are also stored into the replay buffer (see store below).

        Returns:
            evaluate_reward (float): average reward of the policy
        """
        print(f"Evaluating: ")
        evaluate_reward = 0
        for _ in tqdm(range(self.evaluate_freq)):
            s = self.reset(random_sg=True)
            done = False
            episode_reward = 0
            step = 0
            while not done:
                a = self.select_action(s)  # We do not add noise when evaluating
                s_, r, done, win = self.step(s, a)
                self.replay_buffer.store(s, a, r, s_, win)  # Store the transition
                episode_reward += r
                s = s_
                step += 1
                if step >= self.max_episode_steps:
                    break
            evaluate_reward += episode_reward / step  # per-step average for this episode

        return evaluate_reward / self.evaluate_freq

    def train(self, num_episodes: int = 1000) -> None:
        """
        Train the model.

        Phase 1 (episodes <= random_episodes): purely random actions to fill
        the replay buffer. Phase 2: noisy actor actions with periodic network
        updates, tensorboard logging, LR scheduling on the evaluation reward,
        and best-model checkpointing.

        Parameters:
            num_episodes (int): number of episodes to train the model
        """
        noise_std = self.train_noise * torch.tensor([
            self.params["MAX_V_INC"] - self.params["MIN_V_INC"],
            self.params["MAX_W_INC"] - self.params["MIN_W_INC"]
        ], device=self.device)  # the std of Gaussian noise for exploration

        best_reward = -float('inf')

        # Train the model
        for episode in range(1, num_episodes+1):
            print(f"Episode: {episode}/{num_episodes}, Training: ")
            s = self.reset(random_sg=True)
            episode_actor_loss = 0
            episode_critic_loss = 0
            optimize_times = 0
            for episode_steps in tqdm(range(1, self.max_episode_steps+1)):
                if episode <= self.random_episodes:
                    # Take the random actions in the beginning for the better exploration
                    a = torch.tensor([
                        random.uniform(self.params["MIN_V_INC"], self.params["MAX_V_INC"]),
                        random.uniform(self.params["MIN_W_INC"], self.params["MAX_W_INC"])
                    ], device=self.device)
                else:
                    # Add Gaussian noise to actions for exploration, clamped to the legal range
                    a = self.select_action(s)
                    a[0] = ((a[0] + torch.normal(0., noise_std[0].item(), size=(1,), device=self.device)).
                            clamp(self.params["MIN_V_INC"], self.params["MAX_V_INC"]))
                    a[1] = ((a[1] + torch.normal(0., noise_std[1].item(), size=(1,), device=self.device)).
                            clamp(self.params["MIN_W_INC"], self.params["MAX_W_INC"]))
                s_, r, done, win = self.step(s, a)

                self.replay_buffer.store(s, a, r, s_, win)  # Store the transition

                # update the networks if enough samples are available
                if episode > self.random_episodes and (episode_steps % self.update_steps == 0 or done):
                    for _ in range(self.update_freq):
                        actor_loss, critic_loss = self.optimize_model()
                        episode_actor_loss += actor_loss
                        episode_critic_loss += critic_loss
                        optimize_times += 1

                if win:
                    print(f"Goal reached! State: {s}, Action: {a}, Reward: {r:.4f}, Next State: {s_}")
                    break
                elif done:  # lose (collide)
                    print(f"Collision! State: {s}, Action: {a}, Reward: {r:.4f}, Next State: {s_}")
                    break

                s = s_  # Move to the next state

            if episode > self.random_episodes:
                # NOTE(review): optimize_times is 0 if no update ran this episode,
                # which would raise ZeroDivisionError here — confirm update_steps <= max_episode_steps
                average_actor_loss = episode_actor_loss / optimize_times
                average_critic_loss = episode_critic_loss / optimize_times
                self.writer.add_scalar('Actor train loss', average_actor_loss, global_step=episode)
                self.writer.add_scalar('Critic train loss', average_critic_loss, global_step=episode)

            if episode % self.evaluate_episodes == 0 and episode > self.random_episodes - self.evaluate_episodes:
                print()
                evaluate_reward = self.evaluate_policy()
                print("Evaluate_reward:{}".format(evaluate_reward))
                print()
                self.writer.add_scalar('Evaluate reward', evaluate_reward, global_step=episode)
                self.writer.add_scalar('Learning rate', self.actor_scheduler.optimizer.param_groups[0]['lr'],
                                       global_step=episode)  # Learning rates of the actor and critic are the same

                self.actor_scheduler.step(evaluate_reward)
                self.critic_scheduler.step(evaluate_reward)

                # Save the model
                if evaluate_reward > best_reward:
                    best_reward = evaluate_reward

                    # Create the directory if it does not exist
                    if not os.path.exists(os.path.dirname(self.actor_save_path)):
                        os.makedirs(os.path.dirname(self.actor_save_path))
                    if not os.path.exists(os.path.dirname(self.critic_save_path)):
                        os.makedirs(os.path.dirname(self.critic_save_path))

                    torch.save(self.actor.state_dict(), self.actor_save_path)
                    torch.save(self.critic.state_dict(), self.critic_save_path)

    def reset(self, random_sg: bool = False) -> torch.Tensor:
        """
        Reset the environment and the robot.

        Parameters:
            random_sg (bool): whether to generate random start and goal or not

        Returns:
            state (torch.Tensor): initial 8-dim state [x, y, theta, v, w, g_x, g_y, g_theta]
        """
        if random_sg:  # random start and goal
            start = (random.uniform(0, self.env.x_range), random.uniform(0, self.env.y_range), random.uniform(-math.pi, math.pi))
            # generate random start and goal until they are not in collision
            while self.isCollision(start):
                start = (random.uniform(0, self.env.x_range), random.uniform(0, self.env.y_range), random.uniform(-math.pi, math.pi))

            # goal is on the circle with radius self.params["MAX_LOOKAHEAD_DIST"] centered at start
            goal_angle = random.uniform(-math.pi, math.pi)
            goal_dist = self.params["MAX_LOOKAHEAD_DIST"]
            goal_x = start[0] + goal_dist * math.cos(goal_angle)
            goal_y = start[1] + goal_dist * math.sin(goal_angle)
            goal = (goal_x, goal_y, goal_angle)

            # resample the goal until it is collision-free
            while self.isCollision(goal):
                goal_angle = random.uniform(-math.pi, math.pi)
                goal_dist = self.params["MAX_LOOKAHEAD_DIST"]
                goal_x = start[0] + goal_dist * math.cos(goal_angle)
                goal_y = start[1] + goal_dist * math.sin(goal_angle)
                goal = (goal_x, goal_y, goal_angle)

        else:
            start = self.start
            goal = self.goal

        self.robot = Robot(start[0], start[1], start[2], 0, 0)
        state = self.robot.state  # np.array([[self.px], [self.py], [self.theta], [self.v], [self.w]])
        # pad the 5x1 robot state with 3 extra rows to hold the goal pose
        state = np.pad(state, pad_width=((0, 3), (0, 0)), mode='constant')
        state[5:8, 0] = goal
        state = torch.tensor(state, device=self.device, dtype=torch.float).squeeze(dim=1)
        return state

    def step(self, state: torch.Tensor, action: torch.Tensor) -> tuple:
        """
        Take a step in the environment.

        Integrates the action (velocity increments) through the robot
        kinematic model for one TIME_STEP, clamps the resulting velocities,
        and computes reward and termination flags.

        Parameters:
            state (torch.Tensor): current state of the robot
            action (torch.Tensor): action to take (v_inc, w_inc)

        Returns:
            next_state (torch.Tensor): next state of the robot
            reward (float): reward for taking the action
            done (bool): whether the episode is done (win or collision)
            win (bool): whether the goal was reached at this step
        """
        dt = self.params["TIME_STEP"]
        # desired velocities after applying the increments over one time step
        v_d = (state[3] + action[0] * dt).item()
        w_d = (state[4] + action[1] * dt).item()
        self.robot.kinematic(np.array([[v_d], [w_d]]), dt)
        next_state = self.robot.state
        next_state = np.pad(next_state, pad_width=((0, 3), (0, 0)), mode='constant')
        next_state = torch.tensor(next_state, device=self.device, dtype=torch.float).squeeze(dim=1)
        next_state[5:8] = state[5:8]  # the goal pose carries over unchanged
        next_state[2] = self.regularizeAngle(next_state[2].item())
        next_state[3] = MathHelper.clamp(next_state[3].item(), self.params["MIN_V"], self.params["MAX_V"])
        next_state[4] = MathHelper.clamp(next_state[4].item(), self.params["MIN_W"], self.params["MAX_W"])
        win = self.reachGoal(tuple(next_state[0:3]), tuple(next_state[5:8]))
        lose = self.isCollision(tuple(next_state[0:2]))
        reward = self.reward(next_state, win, lose)
        done = win or lose
        return next_state, reward, done, win

    def reward(self, state: torch.Tensor, win: bool, lose: bool) -> float:
        """
        The state reward function.

        Dense penalty proportional to the distance to the goal (scaled by the
        lookahead distance), plus a large terminal bonus for reaching the goal
        and a smaller terminal penalty for colliding.

        Parameters:
            state (torch.Tensor): current state of the robot
            win (bool): whether the episode is won (reached the goal)
            lose (bool): whether the episode is lost (collided)

        Returns:
            reward (float): reward for the current state
        """
        reward = 0

        goal_dist = self.dist((state[0], state[1]), (state[5], state[6]))
        scaled_goal_dist = goal_dist / self.params["MAX_LOOKAHEAD_DIST"]

        reward -= scaled_goal_dist

        if win:
            reward += self.max_episode_steps
        if lose:
            reward -= self.max_episode_steps / 5.0

        return reward