python-motion-planning 1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. curve_generation/__init__.py +9 -0
  2. curve_generation/bezier_curve.py +131 -0
  3. curve_generation/bspline_curve.py +271 -0
  4. curve_generation/cubic_spline.py +128 -0
  5. curve_generation/curve.py +64 -0
  6. curve_generation/dubins_curve.py +348 -0
  7. curve_generation/fem_pos_smooth.py +114 -0
  8. curve_generation/polynomial_curve.py +226 -0
  9. curve_generation/reeds_shepp.py +736 -0
  10. global_planner/__init__.py +3 -0
  11. global_planner/evolutionary_search/__init__.py +4 -0
  12. global_planner/evolutionary_search/aco.py +186 -0
  13. global_planner/evolutionary_search/evolutionary_search.py +87 -0
  14. global_planner/evolutionary_search/pso.py +356 -0
  15. global_planner/graph_search/__init__.py +28 -0
  16. global_planner/graph_search/a_star.py +124 -0
  17. global_planner/graph_search/d_star.py +291 -0
  18. global_planner/graph_search/d_star_lite.py +188 -0
  19. global_planner/graph_search/dijkstra.py +77 -0
  20. global_planner/graph_search/gbfs.py +78 -0
  21. global_planner/graph_search/graph_search.py +87 -0
  22. global_planner/graph_search/jps.py +165 -0
  23. global_planner/graph_search/lazy_theta_star.py +114 -0
  24. global_planner/graph_search/lpa_star.py +230 -0
  25. global_planner/graph_search/s_theta_star.py +133 -0
  26. global_planner/graph_search/theta_star.py +171 -0
  27. global_planner/graph_search/voronoi.py +200 -0
  28. global_planner/sample_search/__init__.py +6 -0
  29. global_planner/sample_search/informed_rrt.py +152 -0
  30. global_planner/sample_search/rrt.py +151 -0
  31. global_planner/sample_search/rrt_connect.py +147 -0
  32. global_planner/sample_search/rrt_star.py +77 -0
  33. global_planner/sample_search/sample_search.py +135 -0
  34. local_planner/__init__.py +19 -0
  35. local_planner/apf.py +144 -0
  36. local_planner/ddpg.py +630 -0
  37. local_planner/dqn.py +687 -0
  38. local_planner/dwa.py +212 -0
  39. local_planner/local_planner.py +262 -0
  40. local_planner/lqr.py +146 -0
  41. local_planner/mpc.py +214 -0
  42. local_planner/pid.py +158 -0
  43. local_planner/rpp.py +147 -0
  44. python_motion_planning-1.0.dist-info/LICENSE +674 -0
  45. python_motion_planning-1.0.dist-info/METADATA +873 -0
  46. python_motion_planning-1.0.dist-info/RECORD +65 -0
  47. python_motion_planning-1.0.dist-info/WHEEL +5 -0
  48. python_motion_planning-1.0.dist-info/top_level.txt +4 -0
  49. utils/__init__.py +19 -0
  50. utils/agent/__init__.py +0 -0
  51. utils/agent/agent.py +135 -0
  52. utils/environment/__init__.py +0 -0
  53. utils/environment/env.py +134 -0
  54. utils/environment/node.py +85 -0
  55. utils/environment/point2d.py +96 -0
  56. utils/environment/pose2d.py +91 -0
  57. utils/helper/__init__.py +3 -0
  58. utils/helper/math_helper.py +65 -0
  59. utils/planner/__init__.py +0 -0
  60. utils/planner/control_factory.py +31 -0
  61. utils/planner/curve_factory.py +29 -0
  62. utils/planner/planner.py +40 -0
  63. utils/planner/search_factory.py +51 -0
  64. utils/plot/__init__.py +0 -0
  65. utils/plot/plot.py +274 -0
local_planner/dqn.py ADDED
@@ -0,0 +1,687 @@
1
+ """
2
+ @file: dqn.py
3
+ @brief: Deep Q-Network (DQN) motion planning.
4
+ @author: Wu Maojia
5
+ @update: 2024.5.24
6
+ """
7
+ import numpy as np
8
+ import itertools
9
+ import torch
10
+ import torch.nn as nn
11
+ from torch.utils.tensorboard import SummaryWriter
12
+ import random
13
+ from tqdm import tqdm
14
+ import math
15
+ import datetime
16
+ import os
17
+ from collections import namedtuple, deque
18
+
19
+ from .local_planner import LocalPlanner
20
+ from python_motion_planning.utils import Env, MathHelper, Robot
21
+
22
+ ActionRot = namedtuple("ActionRot", ["v", "w"])
23
+
24
class BasicBuffer:
    """
    Basic uniform replay buffer backed by a fixed-size deque.

    Parameters:
        max_size (int): buffer capacity; the oldest experiences are discarded
            automatically once the buffer is full
    """
    def __init__(self, max_size):
        self.max_size = max_size
        self.buffer = deque(maxlen=max_size)

    def push(self, *experience):
        """
        Injecting an experience into the replay buffer.

        Parameters:
            experience (tuple): five-element tuple including state, action, reward, next_state and done flag
        """
        state, action, reward, next_state, done = experience
        # reward is wrapped in a 1-element array so batches stack uniformly
        self.buffer.append((state, action, np.array([reward]), next_state, done))

    def sample(self, batch_size):
        """
        Sampling a batch of data uniformly at random (without replacement).

        Parameters:
            batch_size (int): the size of sampling batch

        Returns:
            tuple: five parallel lists (states, actions, rewards, next_states, dones)
        """
        batch = random.sample(self.buffer, batch_size)
        return self._unzip(batch)

    def sampleSequence(self, batch_size):
        """
        Sampling a contiguous batch of data starting at a random offset.

        Parameters:
            batch_size (int): the size of sampling batch

        Returns:
            tuple: five parallel lists (states, actions, rewards, next_states, dones)
        """
        # "+ 1" so a full-buffer sample (len == batch_size) is valid: the
        # original upper bound of len - batch_size made np.random.randint
        # raise ValueError when the buffer held exactly batch_size items.
        start = np.random.randint(0, len(self.buffer) - batch_size + 1)
        batch = [self.buffer[i] for i in range(start, start + batch_size)]
        return self._unzip(batch)

    @staticmethod
    def _unzip(batch):
        # Split a list of 5-tuples into five parallel lists.
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = [], [], [], [], []
        for state, action, reward, next_state, done in batch:
            state_batch.append(state)
            action_batch.append(action)
            reward_batch.append(reward)
            next_state_batch.append(next_state)
            done_batch.append(done)
        return (state_batch, action_batch, reward_batch, next_state_batch, done_batch)

    def __len__(self):
        return len(self.buffer)
87
+
88
class SumTree:
    """
    Binary sum tree: internal nodes hold partial sums of leaf priorities,
    while the experiences themselves live in the parallel ``data`` array.
    Sampling a position in [0, total()) descends to the leaf whose
    cumulative-priority interval contains it.

    Parameters:
        capacity (int): buffer capacity (must be even)
    """
    def __init__(self, capacity):
        self.capacity = capacity
        # Complete binary tree: leaf i is stored at index capacity - 1 + i,
        # the root (total priority) at index 0.
        self.tree = np.zeros(2 * capacity - 1)
        self.data = np.zeros(capacity, dtype=object)
        # pointer to the current leaf node (next slot to overwrite)
        self.write = 0
        # the amount of data cached in the sum tree
        self.size = 0

    def _propagate(self, idx, change):
        """
        Recursively updating the priority of the tree (propagates a priority
        change from a node up to the root).

        Parameters:
            idx (int): tree node index
            change (float): priority incrementation

        Example: index of the six-node sum tree
                    0
                   / \
                  1   2
                 / \ / \
                3  4 5  6
               / \ / \
              7  8 9 10
        """
        parent = (idx - 1) // 2
        self.tree[parent] += change
        if parent != 0:
            self._propagate(parent, change)

    def _retrieve(self, idx, s):
        """
        Recursively finding leaf nodes (where s falls within a node interval).

        Parameters:
            idx (int): index of the subtree root node
            s (float): sampling priority (position within the cumulative sum)
        """
        left = 2 * idx + 1
        right = left + 1
        # idx is a leaf when it has no children
        if left >= len(self.tree):
            return idx
        if s <= self.tree[left]:
            return self._retrieve(left, s)
        else:
            # descend right, shifted past the left subtree's total mass
            return self._retrieve(right, s - self.tree[left])

    def total(self):
        """
        Returning the root node, i.e., the total priority weight.
        """
        return self.tree[0]

    def add(self, p, data):
        """
        Adding data with priorities to the sum tree, overwriting the oldest
        slot once the leaves are full (ring-buffer behaviour).

        Parameters:
            p (float): priority
            data (tuple): data
        """
        idx = self.write + self.capacity - 1  # leaf index of the write slot
        self.data[self.write] = data
        self.update(idx, p)
        self.write += 1
        self.size = min(self.capacity, self.size + 1)
        # wrap around to the first leaf
        if self.write >= self.capacity:
            self.write = 0

    def update(self, idx, p):
        """
        Update data of the sum tree: set the priority of node idx and
        re-propagate the partial sums up to the root.

        Parameters:
            idx (int): tree node index
            p (float): priority
        """
        change = p - self.tree[idx]
        self.tree[idx] = p
        self._propagate(idx, change)
        # NOTE(review): renormalizing the entire tree by its max rescales all
        # stored priorities after every update; since parents are sums of
        # (non-negative) children, the root is the max entry, so total()
        # becomes 1.0 after each update. Proportional sampling is unaffected,
        # but absolute priorities are not preserved — confirm this is intended.
        self.tree = self.tree / self.tree.max()

    def get(self, s):
        """
        Obtaining leaf node data based on the sampled value.

        Parameters:
            s (float): sampling priority in [0, total())

        Returns:
            tuple: (tree index of the leaf, its priority, the stored data)
        """
        idx = self._retrieve(0, s)
        dataIdx = idx - self.capacity + 1  # map leaf index back to data slot
        return (idx, self.tree[idx], self.data[dataIdx])
188
+
189
+
190
class PrioritizedBuffer:
    """
    Priority replay buffer backed by a SumTree.

    New experiences enter with the current maximum priority so they are
    sampled at least once; sampling is proportional to stored priority and
    importance-sampling weights correct the induced bias.

    Parameters:
        max_size (int): buffer capacity
        alpha (float): priority exponent (0 = uniform, 1 = fully prioritized)
        beta (float): importance-sampling exponent for the correction weights
    """
    def __init__(self, max_size, alpha=0.6, beta=0.4):
        self.sum_tree = SumTree(max_size)
        self.alpha = alpha
        self.beta = beta
        self.max_size = max_size
        self.cur_size = 0

    def push(self, *experience):
        """
        Injecting an experience into the replay buffer.

        Parameters:
            experience (tuple): five-element tuple including state, action, reward, next_state and done flag
        """
        # First experience gets priority 1.0; later ones enter at the current
        # max priority so they are guaranteed to be sampled at least once.
        priority = 1.0 if self.cur_size == 0 else self.sum_tree.tree.max()
        # Clamp so __len__ never reports more items than the tree can hold
        # (previously cur_size grew without bound after wrap-around).
        self.cur_size = min(self.cur_size + 1, self.max_size)
        state, action, reward, next_state, done = experience
        self.sum_tree.add(priority, (state, action, np.array([reward]), next_state, done))

    def sample(self, batch_size):
        """
        Sampling a batch of data proportionally to priority, stratified over
        equal segments of the cumulative priority mass.

        Parameters:
            batch_size (int): the size of sampling batch

        Returns:
            tuple: ((states, actions, rewards, next_states, dones),
                    tree indices of the sampled leaves,
                    normalized importance-sampling weights)
        """
        batch_idx, batch, probs = [], [], []
        # One draw per equal segment of [0, total) covers the priority range evenly.
        segment = self.sum_tree.total() / batch_size

        for i in range(batch_size):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            idx, p, data = self.sum_tree.get(s)
            batch_idx.append(idx)
            batch.append(data)
            probs.append(p)

        # Importance-sampling weights w_i = (N * P(i))^-beta, normalized by the max.
        weights = np.power(self.sum_tree.size * np.array(probs) / self.sum_tree.total(), -self.beta)
        weights = (weights / weights.max()).tolist()

        state_batch, action_batch, reward_batch, next_state_batch, done_batch = [], [], [], [], []
        for state, action, reward, next_state, done in batch:
            state_batch.append(state)
            action_batch.append(action)
            reward_batch.append(reward)
            next_state_batch.append(next_state)
            done_batch.append(done)

        return (state_batch, action_batch, reward_batch, next_state_batch, done_batch), batch_idx, weights

    def updatePriority(self, idx, td_error):
        """
        Updating priorities based on temporal-difference error.

        Parameters:
            idx (int): tree index of the sampled leaf
            td_error (float): absolute TD error of the corresponding transition
        """
        priority = td_error ** self.alpha
        self.sum_tree.update(idx, priority)

    def __len__(self):
        return self.cur_size
254
+
255
+
256
class DQN(nn.Module):
    """Fully connected Q-network mapping a state vector to per-action Q-values."""

    def __init__(self, state_dim, action_dim):
        super(DQN, self).__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim

        # Two hidden layers (128 -> 256) with ReLU activations; the submodule
        # is named `fc` so saved state-dicts stay compatible.
        layers = [
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, action_dim),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, state):
        """Return the Q-value of every action for the given state batch."""
        return self.fc(state)
273
+
274
class DQNPlanner(LocalPlanner):
    """
    Class for Fully Connected Deep Q-Value Network (DQN) motion planning.

    Parameters:
        start (tuple): start pose (x, y, theta)
        goal (tuple): goal pose (x, y, theta)
        env (Env): environment
        heuristic_type (str): heuristic function type
        batch_size (int): batch size to optimize the neural networks
        buffer_size (int): maximum replay buffer size
        gamma (float): discount factor
        tau (float): softly update the target network
        lr (float): learning rate
        train_noise (float): action noise coefficient during training for exploration
        random_episodes (int): take the random actions in the beginning for the better exploration
        max_episode_steps (int): maximum steps for each episode
        update_freq (int): frequency (times) of updating the network for each step
        update_steps (int): update the network for every 'update_steps' steps
        evaluate_freq (int): number of evaluation episodes to average over
        evaluate_episodes (int): evaluate the network every 'evaluate_episodes' episodes
        model_save_path (str): save path of the trained Q-network
        model_load_path (str): load path of a previously trained Q-network (None = train from scratch)
        **params: other parameters can be found in the parent class LocalPlanner

    Examples:
        >>> from python_motion_planning.utils import Grid
        >>> from python_motion_planning.local_planner import DQNPlanner
        >>> plt = DQNPlanner(start=(5, 5, 0), goal=(45, 25, 0), env=Grid(51, 31),
                             model_save_path="models/dqn_best.pth")
        >>> plt.train(num_episodes=10000)

        # load the trained model and run
        >>> plt = DQNPlanner(start=(5, 5, 0), goal=(45, 25, 0), env=Grid(51, 31),
                             model_load_path="models/dqn_best.pth")
        >>> plt.run()
    """
    def __init__(self, start: tuple, goal: tuple, env: Env, heuristic_type: str = "euclidean",
                 batch_size: int = 2000, buffer_size: int = 1e6,
                 gamma: float = 0.999, tau: float = 1e-3, lr: float = 1e-4, train_noise: float = 0.1,
                 random_episodes: int = 50, max_episode_steps: int = 200,
                 update_freq: int = 1, update_steps: int = 1, evaluate_freq: int = 50, evaluate_episodes: int = 50,
                 model_save_path: str = "models/dqn_best.pth",
                 model_load_path: str = None,
                 **params) -> None:
        super().__init__(start, goal, env, heuristic_type, **params)
        # DQN hyper-parameters
        self.batch_size = int(batch_size)    # batch size to optimize the neural networks
        self.buffer_size = int(buffer_size)  # maximum replay buffer size (default is a float literal, hence int())
        self.gamma = gamma                   # discount factor
        self.tau = tau                       # Softly update the target network
        self.lr = lr                         # learning rate
        self.train_noise = train_noise       # Action noise coefficient during training for exploration
        self.random_episodes = random_episodes  # Take the random actions in the beginning for the better exploration
        self.max_episode_steps = max_episode_steps  # Maximum steps for each episode
        self.update_freq = update_freq       # Frequency (times) of updating the network for each step
        self.update_steps = update_steps     # Update the network for every 'update_steps' steps
        self.evaluate_freq = evaluate_freq   # Number of evaluation episodes to average over
        self.evaluate_episodes = evaluate_episodes  # Evaluate the network every 'evaluate_episodes' episodes
        self.model_save_path = model_save_path      # Save path of the trained network
        # epsilon-greedy schedule: epsilon ramps from 0 towards epsilon_max
        # by epsilon_delta per optimization step (see optimize_model)
        self.epsilon, self.epsilon_max, self.epsilon_delta = 0.0, 0.95, 5e-4
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        self.action_space = self.buildActionSpace()
        self.n_observations = 8  # x, y, theta, v, w, g_x, g_y, g_theta
        self.n_actions = len(self.action_space)

        self.model = DQN(self.n_observations, self.n_actions).to(self.device)
        if model_load_path:
            self.model.load_state_dict(torch.load(model_load_path))
        # target network starts as an exact copy of the online network
        self.target_model = DQN(self.n_observations, self.n_actions).to(self.device)
        for target_param, param in zip(self.target_model.parameters(), self.model.parameters()):
            target_param.data.copy_(param)

        self.optimizer = torch.optim.Adam(self.model.parameters(), self.lr)

        self.criterion = nn.MSELoss()

        # NOTE(review): a PrioritizedBuffer is installed here, but the active
        # code path in optimize_model() unpacks samples with the BasicBuffer
        # signature — one of the two must be switched (see the commented
        # alternatives there). Confirm which buffer is intended.
        # self.replay_buffer = BasicBuffer(max_size=self.buffer_size)
        self.replay_buffer = PrioritizedBuffer(max_size=self.buffer_size)

        # Build a tensorboard
        self.writer = SummaryWriter(log_dir='runs/DQN_{}'.format(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')))

        # global planner: an A* reference path for the lookahead tracker;
        # self.g_path is presumably computed by the parent from g_planner — verify
        g_start = (start[0], start[1])
        g_goal = (goal[0], goal[1])
        self.g_planner = {"planner_name": "a_star", "start": g_start, "goal": g_goal, "env": env}
        self.path = self.g_path[::-1]  # reversed so it runs start -> goal

    def __del__(self) -> None:
        # flush and close the tensorboard writer on garbage collection
        self.writer.close()

    def __str__(self) -> str:
        return "Fully Connected Deep Q-Value Network (DQN)"

    def plan(self) -> tuple:
        """
        Deep Q-Network (DQN) motion plan function.

        Returns:
            flag (bool): planning successful if true else failed
            pose_list (list): history poses of robot
        """
        s = self.reset()
        for _ in range(self.params["MAX_ITERATION"]):
            # break until goal reached
            if self.reach_goal(tuple(s[0:3]), tuple(s[5:8])):
                return True, self.robot.history_pose

            # get the particular point on the path at the lookahead distance to track
            lookahead_pt, theta_trj, kappa = self.getLookaheadPoint()
            s[5:7] = torch.tensor(lookahead_pt, device=self.device)
            s[7] = torch.tensor(theta_trj, device=self.device)

            a = self.policy(s)  # get the action from the Q-network
            s_, r, done, win = self.step(s, a)  # take the action and get the next state and reward
            s = s_  # Move to the next state
            # mirror the new state back onto the robot so the parent's
            # lookahead/plotting utilities stay in sync
            self.robot.px, self.robot.py, self.robot.theta, self.robot.v, self.robot.w = tuple(s[0:5].cpu().numpy())

        # NOTE(review): reports success even when MAX_ITERATION is exhausted
        # without reaching the goal — consider the commented failure return.
        return True, self.robot.history_pose
        # return False, None

    def run(self) -> None:
        """
        Running both planning and animation.
        """
        _, history_pose = self.plan()
        print(f"Number of iterations: {len(history_pose)}")
        if not history_pose:
            raise ValueError("Path not found and planning failed!")

        path = np.array(history_pose)[:, 0:2]
        # path cost = sum of euclidean distances between consecutive poses
        cost = np.sum(np.sqrt(np.sum(np.diff(path, axis=0)**2, axis=1, keepdims=True)))
        self.plot.plotPath(self.path, path_color="r", path_style="--")
        self.plot.animation(path, str(self), cost, history_pose=history_pose)

    def buildActionSpace(self):
        '''
        Build the discrete action space: the null action (0, 0) plus the cross
        product of `speed_samples` exponentially spaced speeds in (0, MAX_V]
        and `rotation_samples` evenly spaced rotation rates in [MIN_W, MAX_W].
        '''
        speed_samples, rotation_samples = 5, 16
        # exponential spacing concentrates samples at low speeds
        speeds = [
            (np.exp((i + 1) / speed_samples) - 1) / (np.e - 1) * self.params["MAX_V"]
            for i in range(speed_samples)
        ]
        rotations = np.linspace(self.params["MIN_W"], self.params["MAX_W"], rotation_samples)

        action_space = [ActionRot(0, 0)]
        for rotation, speed in itertools.product(rotations, speeds):
            action_space.append(ActionRot(speed, rotation))

        return action_space

    def optimize_model(self) -> float:
        """
        Optimize the neural networks when training (one TD-learning step on a
        replay batch, followed by a soft target-network update).

        Returns:
            loss (float): TD loss of the sampled batch
        """
        # basic buffer
        # NOTE(review): self.replay_buffer is a PrioritizedBuffer, whose
        # sample() returns (transitions, idxs, weights) — this 5-way unpack
        # matches BasicBuffer instead and will fail at runtime. Switch the
        # buffer in __init__ or enable the priority-buffer path below.
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(self.batch_size)

        # # priority buffer
        # transitions, idxs, weights = self.replay_buffer.sample(self.batch_size)
        # states, actions, rewards, next_states, dones = transitions

        states = torch.stack(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.stack(next_states).to(self.device)
        dones = (1 - torch.FloatTensor(dones)).to(self.device)  # 1 for non-terminal, 0 for terminal
        # NOTE(review): `weights` is undefined on the active basic-buffer path
        # above — it only exists if the priority-buffer sampling is enabled.
        weights = torch.FloatTensor(weights).to(self.device)

        # basic buffer: standard DQN target using the target network
        curr_Q = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        next_Q = self.target_model(next_states)
        max_next_Q = torch.max(next_Q, 1)[0]
        expected_Q = rewards.squeeze(1) + self.gamma * max_next_Q * dones
        loss = self.criterion(curr_Q, expected_Q.detach())

        # # priority buffer: double-DQN style target with importance weights
        # curr_Q = self.model.forward(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        # next_a = torch.argmax(self.model.forward(next_states), dim=1)
        # next_Q = self.target_model.forward(next_states).gather(1, next_a.unsqueeze(1)).squeeze(1)
        # expected_Q = rewards.squeeze(1) + self.gamma * next_Q * dones

        # td_errors = torch.abs(curr_Q - expected_Q)
        # loss = self.criterion(torch.sqrt(weights) * curr_Q, torch.sqrt(weights) * expected_Q.detach())

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # soft-update target network towards the online network
        for target_param, param in zip(self.target_model.parameters(), self.model.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        # anneal epsilon towards epsilon_max (greediness increases over time)
        self.epsilon = self.epsilon + self.epsilon_delta \
            if self.epsilon < self.epsilon_max else self.epsilon_max

        # # updating priorities based on temporal-difference
        # for idx, td_error in zip(idxs, td_errors.cpu().detach().numpy()):
        #     self.replay_buffer.updatePriority(idx, td_error)

        return loss.item()

    def evaluate_policy(self) -> float:
        """
        Evaluate the policy and calculating the average per-step reward over
        self.evaluate_freq episodes with random start/goal.

        Returns:
            evaluate_reward (float): average reward of the policy
        """
        print(f"Evaluating: ")
        evaluate_reward = 0
        for _ in tqdm(range(self.evaluate_freq)):
            s = self.reset(random_sg=True)
            done = False
            episode_reward = 0
            step = 0
            while not done:
                # NOTE(review): policy() defaults to mode='random', so this
                # still explores during evaluation despite the original claim
                # that no noise is added — confirm a greedy mode was intended.
                a = self.policy(s)
                s_, r, done, win = self.step(s, a)
                self.replay_buffer.push(s, a, r, s_, win)  # Store the transition
                episode_reward += r
                s = s_
                step += 1
                if step >= self.max_episode_steps:
                    break
            evaluate_reward += episode_reward / step

        return evaluate_reward / self.evaluate_freq

    def policy(self, state, mode='random'):
        """
        Epsilon-greedy action selection.

        Parameters:
            state (torch.Tensor): current 8-dim state
            mode (str): 'random' enables epsilon-greedy exploration; any other
                value makes the choice purely greedy

        Returns:
            action (int): index into self.action_space
        """
        # state = torch.FloatTensor(state).unsqueeze(0).to(self.device)  # convert to a batch of size 1
        qvals = self.model.forward(state)
        action = np.argmax(qvals.cpu().detach().numpy())
        # NOTE(review): np.random.randn() draws from a standard normal, not
        # uniform [0, 1), so the exploration probability is not 1 - epsilon;
        # confirm np.random.rand() was intended.
        if mode=='random' and np.random.randn() > self.epsilon:
            return np.random.randint(0, len(self.action_space))
        return action

    def train(self, num_episodes: int = 10000) -> None:
        """
        Train the model.

        Parameters:
            num_episodes (int): number of episodes to train the model
        """
        # NOTE(review): noise_std is computed but never used — DQN acts on a
        # discrete action space, so the Gaussian action noise of the DDPG
        # variant this file derives from does not apply here.
        noise_std = self.train_noise * torch.tensor([
            self.params["MAX_V_INC"] - self.params["MIN_V_INC"],
            self.params["MAX_W_INC"] - self.params["MIN_W_INC"]
        ], device=self.device)  # the std of Gaussian noise for exploration

        best_reward = -float('inf')

        # Train the model
        for episode in range(1, num_episodes+1):
            print(f"Episode: {episode}/{num_episodes}, Training: ")
            s = self.reset(random_sg=True)
            episode_loss = 0
            optimize_times = 0
            for episode_steps in tqdm(range(1, self.max_episode_steps+1)):
                a = self.policy(s)
                s_, r, done, win = self.step(s, a)
                self.replay_buffer.push(s, a, r, s_, win)  # Store the transition

                # update the networks if enough samples are available
                if episode > self.random_episodes and (episode_steps % self.update_steps == 0 or done):
                    for _ in range(self.update_freq):
                        loss = self.optimize_model()
                        episode_loss += loss
                        optimize_times += 1

                if win:
                    print(f"Goal reached! State: {s}, Action: {a}, Reward: {r:.4f}, Next State: {s_}")
                    break
                elif done:  # lose (collide)
                    print(f"Collision! State: {s}, Action: {a}, Reward: {r:.4f}, Next State: {s_}")
                    break

                s = s_  # Move to the next state

            if episode > self.random_episodes:
                average_loss = episode_loss / optimize_times
                self.writer.add_scalar('train loss', average_loss, global_step=episode)

            if episode % self.evaluate_episodes == 0 and episode > self.random_episodes - self.evaluate_episodes:
                print()
                evaluate_reward = self.evaluate_policy()
                print("Evaluate_reward:{}".format(evaluate_reward))
                print()
                self.writer.add_scalar('Evaluate reward', evaluate_reward, global_step=episode)

                # Save the model whenever evaluation improves on the best so far
                if evaluate_reward > best_reward:
                    best_reward = evaluate_reward

                    # Create the directory if it does not exist
                    if not os.path.exists(os.path.dirname(self.model_save_path)):
                        os.makedirs(os.path.dirname(self.model_save_path))

                    torch.save(self.model.state_dict(), self.model_save_path)

    def reset(self, random_sg: bool = False) -> torch.Tensor:
        """
        Reset the environment and the robot.

        Parameters:
            random_sg (bool): whether to generate random start and goal or not

        Returns:
            state (torch.Tensor): initial 8-dim state (x, y, theta, v, w, g_x, g_y, g_theta)
        """
        if random_sg:  # random start and goal
            start = (random.uniform(0, self.env.x_range), random.uniform(0, self.env.y_range), random.uniform(-math.pi, math.pi))
            # generate random start and goal until they are not in collision
            while self.in_collision(start):
                start = (random.uniform(0, self.env.x_range), random.uniform(0, self.env.y_range), random.uniform(-math.pi, math.pi))

            # goal is on the circle with radius self.params["MAX_LOOKAHEAD_DIST"] centered at start
            goal_angle = random.uniform(-math.pi, math.pi)
            goal_dist = self.params["MAX_LOOKAHEAD_DIST"]
            goal_x = start[0] + goal_dist * math.cos(goal_angle)
            goal_y = start[1] + goal_dist * math.sin(goal_angle)
            goal = (goal_x, goal_y, goal_angle)

            # resample the goal until it is collision-free
            while self.in_collision(goal):
                goal_angle = random.uniform(-math.pi, math.pi)
                goal_dist = self.params["MAX_LOOKAHEAD_DIST"]
                goal_x = start[0] + goal_dist * math.cos(goal_angle)
                goal_y = start[1] + goal_dist * math.sin(goal_angle)
                goal = (goal_x, goal_y, goal_angle)

        else:
            start = self.start
            goal = self.goal

        self.robot = Robot(start[0], start[1], start[2], 0, 0)
        state = self.robot.state  # np.array([[self.px], [self.py], [self.theta], [self.v], [self.w]])
        # pad the 5x1 robot state with three extra rows to hold the goal pose
        state = np.pad(state, pad_width=((0, 3), (0, 0)), mode='constant')
        state[5:8, 0] = goal
        state = torch.tensor(state, device=self.device, dtype=torch.float).squeeze(dim=1)
        return state

    def step(self, state: torch.Tensor, action: int) -> tuple:
        """
        Take a step in the environment by applying the indexed action.

        Parameters:
            state (torch.Tensor): current 8-dim state of the robot
            action (int): index into self.action_space

        Returns:
            next_state (torch.Tensor): next state of the robot
            reward (float): reward for taking the action
            done (bool): whether the episode is done (goal reached or collision)
            win (bool): whether the goal was reached
        """
        dt = self.params["TIME_STEP"]
        # action entries act as accelerations integrated over dt to get the
        # desired velocities
        v_d = state[3].item() + self.action_space[action].v * dt
        w_d = state[4].item() + self.action_space[action].w * dt
        self.robot.kinematic(np.array([[v_d], [w_d]]), dt)
        next_state = self.robot.state
        next_state = np.pad(next_state, pad_width=((0, 3), (0, 0)), mode='constant')
        next_state = torch.tensor(next_state, device=self.device, dtype=torch.float).squeeze(dim=1)
        next_state[5:8] = state[5:8]  # goal pose is carried over unchanged
        next_state[2] = self.regularizeAngle(next_state[2].item())
        # clamp velocities to the robot's permitted range
        next_state[3] = MathHelper.clamp(next_state[3].item(), self.params["MIN_V"], self.params["MAX_V"])
        next_state[4] = MathHelper.clamp(next_state[4].item(), self.params["MIN_W"], self.params["MAX_W"])
        win = self.reach_goal(tuple(next_state[0:3]), tuple(next_state[5:8]))
        lose = self.in_collision(tuple(next_state[0:2]))
        reward = self.reward(state, next_state, win, lose)
        done = win or lose
        return next_state, reward, done, win

    def reward(self, prev_state: torch.Tensor, state: torch.Tensor, win: bool, lose: bool) -> float:
        """
        The state reward function.

        Parameters:
            prev_state (torch.Tensor): previous state of the robot
            state (torch.Tensor): current state of the robot
            win (bool): whether the episode is won (reached the goal)
            lose (bool): whether the episode is lost (collided)

        Returns:
            reward (float): reward for the current state
        """
        reward = 0

        # penalize distance to the goal, scaled to roughly [0, 1]
        goal_dist = self.dist((state[0], state[1]), (state[5], state[6]))
        scaled_goal_dist = goal_dist / self.params["MAX_LOOKAHEAD_DIST"]

        reward -= scaled_goal_dist

        # small shaping bonus for moving closer to the goal than before
        prev_goal_dist = self.dist((prev_state[0], prev_state[1]), (state[5], state[6]))
        if goal_dist < prev_goal_dist:
            reward += 0.01
        else:
            reward -= 0.01

        # large terminal bonus / penalty
        if win:
            reward += self.max_episode_steps
        if lose:
            reward -= self.max_episode_steps / 5.0

        return reward