gr-libs 0.1.7.post0__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evaluation/analyze_results_cross_alg_cross_domain.py +236 -246
- evaluation/create_minigrid_map_image.py +10 -6
- evaluation/file_system.py +16 -5
- evaluation/generate_experiments_results.py +123 -74
- evaluation/generate_experiments_results_new_ver1.py +227 -243
- evaluation/generate_experiments_results_new_ver2.py +317 -317
- evaluation/generate_task_specific_statistics_plots.py +481 -253
- evaluation/get_plans_images.py +41 -26
- evaluation/increasing_and_decreasing_.py +97 -56
- gr_libs/__init__.py +2 -1
- gr_libs/_version.py +2 -2
- gr_libs/environment/__init__.py +16 -8
- gr_libs/environment/environment.py +167 -39
- gr_libs/environment/utils/utils.py +22 -12
- gr_libs/metrics/__init__.py +5 -0
- gr_libs/metrics/metrics.py +76 -34
- gr_libs/ml/__init__.py +2 -0
- gr_libs/ml/agent.py +21 -6
- gr_libs/ml/base/__init__.py +1 -1
- gr_libs/ml/base/rl_agent.py +13 -10
- gr_libs/ml/consts.py +1 -1
- gr_libs/ml/neural/deep_rl_learner.py +433 -352
- gr_libs/ml/neural/utils/__init__.py +1 -1
- gr_libs/ml/neural/utils/dictlist.py +3 -3
- gr_libs/ml/neural/utils/penv.py +5 -2
- gr_libs/ml/planner/mcts/mcts_model.py +524 -302
- gr_libs/ml/planner/mcts/utils/__init__.py +1 -1
- gr_libs/ml/planner/mcts/utils/node.py +11 -7
- gr_libs/ml/planner/mcts/utils/tree.py +14 -10
- gr_libs/ml/sequential/__init__.py +1 -1
- gr_libs/ml/sequential/lstm_model.py +256 -175
- gr_libs/ml/tabular/state.py +7 -7
- gr_libs/ml/tabular/tabular_q_learner.py +123 -73
- gr_libs/ml/tabular/tabular_rl_agent.py +20 -19
- gr_libs/ml/utils/__init__.py +8 -2
- gr_libs/ml/utils/format.py +78 -70
- gr_libs/ml/utils/math.py +2 -1
- gr_libs/ml/utils/other.py +1 -1
- gr_libs/ml/utils/storage.py +88 -28
- gr_libs/problems/consts.py +1549 -1227
- gr_libs/recognizer/gr_as_rl/gr_as_rl_recognizer.py +145 -80
- gr_libs/recognizer/graml/gr_dataset.py +209 -110
- gr_libs/recognizer/graml/graml_recognizer.py +431 -240
- gr_libs/recognizer/recognizer.py +38 -27
- gr_libs/recognizer/utils/__init__.py +1 -1
- gr_libs/recognizer/utils/format.py +8 -3
- {gr_libs-0.1.7.post0.dist-info → gr_libs-0.1.8.dist-info}/METADATA +1 -1
- gr_libs-0.1.8.dist-info/RECORD +70 -0
- {gr_libs-0.1.7.post0.dist-info → gr_libs-0.1.8.dist-info}/WHEEL +1 -1
- tests/test_gcdraco.py +10 -0
- tests/test_graml.py +8 -4
- tests/test_graql.py +2 -1
- tutorials/gcdraco_panda_tutorial.py +66 -0
- tutorials/gcdraco_parking_tutorial.py +61 -0
- tutorials/graml_minigrid_tutorial.py +42 -12
- tutorials/graml_panda_tutorial.py +35 -14
- tutorials/graml_parking_tutorial.py +37 -20
- tutorials/graml_point_maze_tutorial.py +33 -13
- tutorials/graql_minigrid_tutorial.py +31 -15
- gr_libs-0.1.7.post0.dist-info/RECORD +0 -67
- {gr_libs-0.1.7.post0.dist-info → gr_libs-0.1.8.dist-info}/top_level.txt +0 -0
@@ -13,318 +13,540 @@ import gymnasium as gym
 PROB = 0.8
 UNIFORM_PROB = 0.1
 newely_expanded = 0
-dict_dir_id_to_str = {0:
-dict_action_id_to_str = {0:
+dict_dir_id_to_str = {0: "right", 1: "down", 2: "left", 3: "up"}
+dict_action_id_to_str = {0: "turn left", 1: "turn right", 2: "go straight"}
+

 def save_figure(steps, env_name, problem_name, img_path, env_prop):
-... (old lines 20-23 not shown)
+    sequence = [pos for ((state, pos), action) in steps]
+    # print(f"sequence to {self.problem_name} is:\n\t{steps}\ngenerating image at {img_path}.")
+    print(f"generating sequence image at {img_path}.")
+    env_prop.create_sequence_image(sequence, img_path, problem_name)
+

 # TODO add number of expanded nodes and debug by putting breakpoint on the creation of nodes representing (8,4) and checking if they're invalid or something

+
 # Explanation on hashing and uncertainty in the acto outcome:
 # We want to detect circles, while not preventing expected behavior. To achieve it, hasing must include previous state, action, and resulting state.
 # Hashing the direction means coming to the same position from different positions gets different id's.
-# Example: the agent might have stood at (2,2), picked action 2 (forward), and accidently turned right, resulting at state ((2,2), right).
-#
-#
+# Example: the agent might have stood at (2,2), picked action 2 (forward), and accidently turned right, resulting at state ((2,2), right).
+# later, when the agent stood at (2,1), looked right and walked forward, it got to the same state. We would want to enable that, because
+# this is the expected behavior, so these nodes must have unique id's.
 # The situations where circles will indeed be detected, are only if the outcome was the same for the previous state, consistent with the action - whether it was or wasn't expected.
-class MonteCarloTreeSearch
-... (old lines 35-248, previous MonteCarloTreeSearch implementation, not shown)
+class MonteCarloTreeSearch:
+
+    def __init__(self, env, tree, goal, use_heuristic=True):
+        self.env = env
+        self.tree = tree
+        self.action_space = self.env.action_space.n
+        self.action_space = 3  # currently
+        state, _ = self.env.reset()
+        self.use_heuristic = use_heuristic
+        self.goal = goal
+        self.tree.add_node(
+            Node(
+                identifier=hash(
+                    (
+                        None,
+                        None,
+                        tuple(self.env.unwrapped.agent_pos),
+                        state["direction"],
+                    )
+                ),
+                state=state,
+                action=None,
+                action_space=self.action_space,
+                reward=0,
+                terminal=False,
+                pos=env.unwrapped.agent_pos,
+                depth=0,
+            )
+        )
+        self.plan = []
+
+    # def mark_invalid_children(self, children_identifiers, action):
+    #     for child_id in children_identifiers:
+    #         child = self.tree.nodes[child_id]
+    #         if child.action == action:
+    #             child.invalid = True
+
+    def decide_invalid_path(
+        self, new_node_father, old_node, new_node
+    ):  # new_node created the circle, old_node got to the configuration first.
+        new_visits, old_visits = [1, 1], [
+            0,
+            0,
+        ]  # stochasticity couldn't result a cycle directly, because it involves a different action. we can get it only by making the same stochastic action mistake or just an actual cycle.
+        new_node_ptr = new_node_father
+        old_node_ptr = old_node
+
+        while new_node_ptr != None:
+            new_visits[0] += new_node_ptr.num_visits
+            new_visits[1] += 1
+            new_node_ptr = self.tree.parent(new_node_ptr)
+
+        while (
+            old_node_ptr != None
+        ):  # getting to the old node wasn't necessarily through the current root. check all the way until None, the original root's parent.
+            old_visits[0] += old_node_ptr.num_visits
+            old_visits[1] += 1
+            old_node_ptr = self.tree.parent(old_node_ptr)
+
+        if (
+            new_visits[0] / new_visits[1] > old_visits[0] / old_visits[1]
+        ):  # newer node is the more probable one. make the 1st path the invalid one: its the one that created the circle!
+            old_node.invalid = True
+            # self.tree.update_id(old_id=old_node.identifier, new_id=new_node.identifier)
+        else:
+            new_node.invalid = True
+
+    def is_parent_child_same(self, new_node, node):
+        return (
+            new_node.pos[0] == node.pos[0]
+            and new_node.pos[1] == node.pos[1]
+            and new_node.state["direction"] == node.state["direction"]
+        )
+
+    def expand(self, node, depth):
+        global newely_expanded
+        action = node.untried_action()
+        state, reward, terminated, truncated, _ = self.env.step(
+            self.stochastic_action(action)
+        )
+        done = terminated | truncated
+        new_identifier = hash(
+            (
+                tuple(node.pos),
+                node.state["direction"],
+                action,
+                tuple(self.env.unwrapped.agent_pos),
+                state["direction"],
+            )
+        )
+        valid_id = new_identifier
+        while (
+            new_identifier in self.tree.nodes.keys()
+        ):  # iterate over all circle nodes. important not to hash the parent node id to get the next id, because it will not be the same for all circle nodes.
+            if self.tree.nodes[new_identifier].invalid == False:
+                valid_id = new_identifier
+            new_identifier = hash((666, new_identifier))
+        # after this while, the id is for sure unused.
+        new_node = Node(
+            identifier=new_identifier,
+            state=state,
+            action=action,
+            action_space=self.action_space,
+            reward=reward,
+            terminal=done,
+            pos=self.env.unwrapped.agent_pos,
+            depth=depth,
+        )
+        if self.is_parent_child_same(
+            new_node, node
+        ):  # this is not a regular circle but it indicates that the action - regardless if happened with or without intention, led to staying put. note this could happen even if the first if is true - twice in history someone tried to go against the wall from 2 different paths. both should be tagged invalid.
+            new_node.invalid = True
+            new_node.got_invalid = True
+        # if this is a legit (s,a,s'), find the valid one and check whether this one might be more valid.
+        elif (
+            valid_id in self.tree.nodes.keys()
+        ):  # who can tell which node is invalid? might be that this is the more probable way to get here, it just happened later. maybe think of summing back up the num of visits to decide which one to make invalid.
+            # print("CIRCLE DETECTED!") # circle can be detected by 2 nodes making the wrong stochastic action one after another, in different times!
+
+            self.decide_invalid_path(
+                new_node_father=node,
+                old_node=self.tree.nodes[valid_id],
+                new_node=new_node,
+            )
+            # self.mark_invalid_children(node.children_identifiers, action)
+
+        self.tree.add_node(new_node, node)
+        # if action == 2 and tuple(self.env.unwrapped.agent_pos) == tuple(node.pos): # if the new node is actually invalid, mark it along with the other nodes of the same action as invalid, meaning reward will be 0 for them.
+        #     self.mark_invalid_children(node.children_identifiers)
+        newely_expanded += 1
+        return new_node
+
+    def stochastic_action(self, choice):
+        prob_distribution = []
+        actions = range(self.action_space)
+        for action in actions:
+            if action == choice:
+                prob_distribution.append(PROB)
+            else:
+                prob_distribution.append(UNIFORM_PROB)
+        return random.choices(actions, weights=prob_distribution, k=1)[0]
+
+    def expand_selection_stochastic_node(
+        self,
+        node,
+        resulting_identifier,
+        terminated,
+        truncated,
+        reward,
+        action,
+        state,
+        depth,
+    ):
+        global newely_expanded
+        # the new node could result in a terminating state.
+        done = terminated | truncated
+        valid_id = resulting_identifier
+        while (
+            resulting_identifier in self.tree.nodes.keys()
+        ):  # iterate over all circle nodes. important not to hash the parent node id to get the next id, because it will not be the same for all circle nodes.
+            if self.tree.nodes[resulting_identifier].invalid == False:
+                valid_id = resulting_identifier
+            resulting_identifier = hash((666, resulting_identifier))
+        # after this while, the id is for sure unused.
+        new_node = Node(
+            identifier=resulting_identifier,
+            state=state,
+            action=action,
+            action_space=self.action_space,
+            reward=reward,
+            terminal=done,
+            pos=self.env.unwrapped.agent_pos,
+            depth=depth,
+        )
+        if self.is_parent_child_same(
+            new_node, node
+        ):  # this is not a regular circle but it indicates that the action - regardless if happened with or without intention, led to staying put. note this could happen even if the first if is true - twice in history someone tried to go against the wall from 2 different paths. both should be tagged invalid.
+            new_node.invalid = True
+            new_node.got_invalid = True
+        # if this is a legit (s,a,s'), find the valid one and check whether this one might be more valid.
+        elif (
+            valid_id in self.tree.nodes.keys()
+        ):  # who can tell which node is invalid? might be that this is the more probable way to get here, it just happened later. maybe think of summing back up the num of visits to decide which one to make invalid.
+            # print("CIRCLE DETECTED!") # circle can be detected by 2 nodes making the wrong stochastic action one after another, in different times!
+            self.decide_invalid_path(
+                new_node_father=node,
+                old_node=self.tree.nodes[valid_id],
+                new_node=new_node,
+            )
+            # self.mark_invalid_children(node.children_identifiers, action)
+        self.tree.add_node(new_node, node)
+        newely_expanded += 1
+        return new_node
+
+    def simulation(self, node):
+        if node.terminal:
+            return node.reward
+        if self.use_heuristic:
+            # taken from Monte-Carlo Planning for Pathfinding in Real-Time Strategy Games , 2010.
+            # need to handle the case of walking into a wall here: the resulting node will be considered invalid and it's reward and performance needs to be 0, but must handle stochasticity
+            # suggestion to handle stochasticity - consider *all* the children associated with taking action 2 towards a wall as performance 0, even if they accidently led in walking to another direction.
+            # which suggests the invalidity needs to be checked not according to the resulting state, rather according to the intended action itself and the environment! remember, you cannot access the "stochastic_action", it is meant to be hidden from you.
+            if node.pos[0] == self.goal[0] and node.pos[1] == self.goal[1]:
+                return 2
+            if node.invalid:
+                return -0.5
+            else:
+                return 0.8 * (
+                    1
+                    / (
+                        abs(node.pos[0] - self.goal[0])
+                        + abs(node.pos[1] - self.goal[1])
+                    )
+                ) + 0.2 * (
+                    1 / node.depth
+                )  # large depth = less probability of obstacles -> larger nominator higher performance. further from goal -> larger denominator, lower performance.
+        while True:
+            action = random.randint(0, self.action_space - 1)
+            state, reward, terminated, truncated, _ = self.env.step(
+                self.stochastic_action(action)
+            )
+            done = (
+                terminated | truncated
+            )  # this time there could be truncation unlike in the tree policy.
+            if done:
+                return reward
+
+    def compute_value(self, parent, child, exploration_constant):
+        exploration_term = exploration_constant * sqrt(
+            2 * log(parent.num_visits) / child.num_visits
+        )
+        return child.performance + exploration_term
+
+    # return the best action from a node. the value of an action is the weighted sum of performance of all children that are associated with this action.
+    def best_action(self, node, exploration_constant):
+        tried_actions_values = (
+            {}
+        )  # dictionary mapping actions to tuples of (cumulative number of visits of children, sum of (child performance * num of visits for child)) to compute the mean later
+        if tuple(node.pos) == (1, 2) and node.depth == 3 and node.action == 0:
+            pass
+        children = [child for child in self.tree.children(node) if not child.invalid]
+        if not children:  # all children are invalid. this node is invalid aswell.
+            return 2
+        for child in children:
+            value = self.compute_value(node, child, exploration_constant)
+            tried_actions_values.setdefault(
+                child.action, [0, 0]
+            )  # create if it doesn't exist
+            tried_actions_values[child.action][
+                0
+            ] += child.num_visits  # add the number of child visits
+            tried_actions_values[child.action][1] += (
+                value * child.num_visits
+            )  # add the relative performance of this child
+        return max(
+            tried_actions_values,
+            key=lambda k: tried_actions_values[k][1] / tried_actions_values[k][0],
+        )  # return the key (action) with the highest average value
+
+    # only changes the environment to make sure the actions which are already a part of the plan have been executed.
+    def execute_partial_plan(self, plan):
+        node = self.tree.root
+        depth = 0
+        for action in plan:
+            depth += 1
+            # important to simulate the env to get to some state, as the nodes don't hold this information.
+            state, reward, terminated, truncated, _ = self.env.step(action)
+            done = terminated
+            if done:
+                return None, False
+            resulting_identifier = hash(
+                (
+                    tuple(node.pos),
+                    node.state["direction"],
+                    action,
+                    tuple(self.env.unwrapped.agent_pos),
+                    state["direction"],
+                )
+            )
+            node = self.tree.nodes[resulting_identifier]
+        return node, True
+
+    # finds the ultimate path from the root node to a terminal state (the one that maximized rewards)
+    def tree_policy(self, root_depth):
+        node = self.tree.root
+        depth = root_depth
+        while not (node.terminal or node.invalid):
+            depth += 1
+            if self.tree.is_expandable(node):
+                # expansion - in case there's an action that never been tried, its value is infinity to encourage exploration of all children of a node.
+                return self.expand(node, depth), depth
+            else:
+                # selection - balance exploration and exploitation, coming down the tree - but note the selection might lead to new nodes because of stochaticity.
+                best_action = self.best_action(node, exploration_constant=1 / sqrt(2.0))
+                if best_action == -1:
+                    break
+                # important to simulate the env to get to some state, as the nodes don't hold this information.
+                state, reward, terminated, truncated, _ = self.env.step(
+                    self.stochastic_action(best_action)
+                )
+                # due to stochasticity, nodes could sometimes be terminal and sometimes they aren't. important to update it. also, the resulting state
+                # could be a state we've never been at due to uncertainty of actions' outcomes.
+                # if the resulting state creates a parent-action-child triplet that hasn't been seen before, add to the tree and return it, similar result to 'expand'.
+                # the hashing must include the action, because we want to enable getting to the same state stochastically from 2 different states: walking forward from (1,2) looking right and getting to (2,2) - the expected behavior, should be allowed even if the agent once stood at (2,1), looked down, turned right and accidently proceeded forward.
+                resulting_identifier = [
+                    child_id
+                    for child_id in node.children_identifiers
+                    if all(
+                        a == b
+                        for a, b in zip(
+                            self.tree.nodes[child_id].pos, self.env.unwrapped.agent_pos
+                        )
+                    )
+                    and self.tree.nodes[child_id].action == best_action
+                ]
+                if (
+                    len(resulting_identifier) == 0
+                ):  # took an action done before, but it lead to a new state.
+                    resulting_identifier = hash(
+                        (
+                            tuple(node.pos),
+                            node.state["direction"],
+                            best_action,
+                            tuple(self.env.unwrapped.agent_pos),
+                            state["direction"],
+                        )
+                    )
+                    return (
+                        self.expand_selection_stochastic_node(
+                            node,
+                            resulting_identifier,
+                            terminated,
+                            truncated,
+                            reward,
+                            best_action,
+                            state,
+                            depth,
+                        ),
+                        depth,
+                    )
+                assert len(resulting_identifier) == 1
+                node = self.tree.nodes[resulting_identifier[0]]
+        return node, depth
+
+    # receives a final state node and updates the rewards of all the nodes on the path to the root
+    def backpropagation(self, node, value):
+        while node != self.tree.parent(self.tree.root):
+            assert (
+                node != None
+            )  # if we got to None it means we got to the actual root with the backpropogation instead of to the current root, which means in this path, someone had a differrent parent than it should, probably a double id.
+            node.num_visits += 1
+            node.total_simulation_reward += value
+            node.performance = node.total_simulation_reward / node.num_visits
+            node = self.tree.parent(node)
+
+    def generate_full_policy_sequence(
+        self, env_name, problem_name, save_fig=False, fig_path=None, env_prop=None
+    ):
+        trace = []
+        node, prev_node = self.tree.root, self.tree.root
+        print("generating policy sequence.")
+        for action in self.plan:
+            print(
+                f"position {tuple(node.pos)} direction {dict_dir_id_to_str[node.state['direction']]}, action {dict_action_id_to_str[action]}"
+            )
+            candidate_children = [
+                child for child in self.tree.children(node) if child.action == action
+            ]  # there could be some children associated with the best action, representing different outcomes.
+            assert len(candidate_children) > 0
+            node = max(
+                candidate_children, key=lambda node: node.num_visits
+            )  # pick the child that was visited most, meaning it represents the desired action and not the undesired outcomes.
+            trace.append(
+                ((prev_node.state, tuple(prev_node.pos)), node.action)
+            )  # need to add the previous node with the action leading to the next node which is a property of the next node
+            prev_node = node
+        if save_fig:
+            assert fig_path != None
+            save_figure(trace, env_name, problem_name, fig_path, env_prop)
+        else:
+            assert fig_path == None
+        return trace
+
+
+def save_model_and_generate_policy(
+    tree, original_root, model_file_path, monteCarloTreeSearch
+):
+    tree.root = original_root
+    with open(model_file_path, "wb") as file:  # Serialize the model
+        monteCarloTreeSearch.env = (
+            None  # pickle cannot serialize lambdas which exist in the env
+        )
+        pickle.dump(monteCarloTreeSearch, file)


 def plan(env_name, problem_name, goal, save_fig=False, fig_path=None, env_prop=None):
-... (old lines 252-322, previous plan() implementation, not shown)
+    global newely_expanded
+    model_dir = get_agent_model_dir(
+        env_name=env_name, model_name=problem_name, class_name="MCTS"
+    )
+    model_file_path = os.path.join(model_dir, "mcts_model.pth")
+    if os.path.exists(model_file_path):
+        print(f"Loading pre-existing mcts planner in {model_file_path}")
+        with open(model_file_path, "rb") as file:  # Load the pre-existing model
+            try:
+                monteCarloTreeSearch = pickle.load(file)
+            except Exception as e:
+
+                class RenameUnpickler(pickle.Unpickler):
+                    def find_class(self, module, name):
+                        renamed_module = module
+                        if module.startswith("ml"):
+                            renamed_module = "gr_libs." + renamed_module
+                        return super(RenameUnpickler, self).find_class(
+                            renamed_module, name
+                        )
+
+                def renamed_load(file_obj):
+                    return RenameUnpickler(file_obj).load()
+
+                file.seek(0)
+                monteCarloTreeSearch = renamed_load(file)
+
+                with open(model_file_path, "wb") as file:
+                    pickle.dump(monteCarloTreeSearch, file)
+
+        return monteCarloTreeSearch.generate_full_policy_sequence(
+            env_name, problem_name, save_fig, fig_path
+        )
+    if not os.path.exists(
+        model_dir
+    ):  # if we reached here, the model doesn't exist. make sure its folder exists.
+        os.makedirs(model_dir)
+    steps = 10000
+    print(
+        f"No tree found. Executing MCTS, starting with {steps} rollouts for each action."
+    )
+    env = gym.make(id=problem_name)
+    random.seed(2)
+    tree = Tree()
+    mcts = MonteCarloTreeSearch(env=env, tree=tree, goal=goal)
+    original_root = tree.root
+    depth = 0
+    while (
+        not tree.root.terminal
+    ):  # we iterate until the root is a terminal state, meaning the game is over.
+        max_reward = 0
+        iteration = 0
+        steps = max(2000, int(steps * 0.9))
+        print(f"Executing {steps} rollouts for each action now.")
+        tq = tqdm(
+            range(steps),
+            postfix=f"Iteration: {iteration}, Num of steps: {len(mcts.plan)}. depth: {depth}. Max reward: {max_reward}. plan to {tuple(env.unwrapped.agent_pos)}, newely expanded: {0}",
+        )
+        for n in tq:
+            iteration = n
+            mcts.env.reset()
+            # when executing the partial plan, it's possible the environment finished due to the stochasticity. the execution would return false if that happend.
+            depth = len(mcts.plan)
+            mcts.tree.root = original_root  # need to return it to the original root before executing the partial plan as it can lead to a different path and the root can change between iterations.
+            node, result = mcts.execute_partial_plan(mcts.plan)
+            if not result:
+                # false return value from partial plan execution means the plan is finished. we can mark our root as terminal and exit, happy with our plan.
+                tree.root.terminal = True
+                save_model_and_generate_policy(
+                    tree=tree,
+                    original_root=original_root,
+                    model_file_path=model_file_path,
+                    monteCarloTreeSearch=mcts,
+                )
+                return mcts.generate_full_policy_sequence(
+                    env_name, problem_name, save_fig, fig_path, env_prop
+                )
+            plan_pos, plan_dir = node.pos, dict_dir_id_to_str[node.state["direction"]]
+            tree.root = node  # determine the root to be the node executed after the plan for this iteration.
+            node, depth = mcts.tree_policy(
+                root_depth=depth
+            )  # find a path to a new unvisited node (unique sequence of actions) by utilizing explorative policy or choosing unvisited children recursively
+            # if the node that returned from tree policy is terminal, the reward will be returned from "simulation" function immediately.
+            reward = mcts.simulation(
+                node
+            )  # proceed from that node randomly and collect the final reward expected from it (heuristic)
+            if reward > max_reward:
+                max_reward = reward
+            mcts.backpropagation(
+                node, reward
+            )  # update the performances of nodes along the way until the root
+            tq.set_postfix_str(
+                f"Iteration: {iteration}, Num of steps: {len(mcts.plan)}. depth: {depth}. Max reward: {max_reward}. plan to {tuple(plan_pos)}, looking {plan_dir}. newely expanded: {newely_expanded}"
+            )
+        # update the root and start from it next time.
+        newely_expanded = 0
+        action = mcts.best_action(node=tree.root, exploration_constant=0)
+        if action == -1:
+            pass
+        mcts.plan.append(action)
+        print(f"Executed action {action}")
+    save_model_and_generate_policy(
+        tree=tree,
+        original_root=original_root,
+        model_file_path=model_file_path,
+        monteCarloTreeSearch=monteCarloTreeSearch,
+    )
+    return mcts.generate_full_policy_sequence(
+        env_name, problem_name, save_fig, fig_path
+    )
+
+
 if __name__ == "__main__":
-... (old lines 324-330 not shown)
+    # register(
+    #     id="MiniGrid-DynamicGoalEmpty-8x8-3x6-v0",
+    #     entry_point="minigrid.envs:DynamicGoalEmpty",
+    #     kwargs={"size": 8, "agent_start_pos" : (1, 1), "goal_pos": (3,6) },
+    #     )
+    # plan("MiniGrid-DynamicGoalEmpty-8x8-3x6-v0")
+    pass
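
For orientation, below is a minimal, self-contained sketch (not part of gr_libs; the standalone function names are illustrative only) of the two mechanisms the rewritten planner above relies on: the noisy action model controlled by PROB / UNIFORM_PROB, and the UCT-style value that compute_value() adds to a child's mean performance during selection.

# Hedged illustration only: mirrors the stochastic_action() and compute_value()
# logic visible in the diff above, outside the MonteCarloTreeSearch class.
import random
from math import log, sqrt

PROB = 0.8          # probability that the intended action is the one executed
UNIFORM_PROB = 0.1  # probability assigned to each of the other actions


def stochastic_action(choice, action_space=3):
    # Sample the executed action given the intended one.
    weights = [PROB if a == choice else UNIFORM_PROB for a in range(action_space)]
    return random.choices(range(action_space), weights=weights, k=1)[0]


def uct_value(parent_visits, child_visits, child_performance,
              exploration_constant=1 / sqrt(2.0)):
    # Mean performance (exploitation) plus the exploration bonus used during selection.
    return child_performance + exploration_constant * sqrt(
        2 * log(parent_visits) / child_visits
    )


if __name__ == "__main__":
    random.seed(0)
    print(stochastic_action(2))               # usually 2, occasionally 0 or 1
    print(round(uct_value(100, 10, 0.5), 3))  # roughly 1.179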