gr_libs-0.1.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evaluation/analyze_results_cross_alg_cross_domain.py +277 -0
- evaluation/create_minigrid_map_image.py +34 -0
- evaluation/file_system.py +42 -0
- evaluation/generate_experiments_results.py +92 -0
- evaluation/generate_experiments_results_new_ver1.py +254 -0
- evaluation/generate_experiments_results_new_ver2.py +331 -0
- evaluation/generate_task_specific_statistics_plots.py +272 -0
- evaluation/get_plans_images.py +47 -0
- evaluation/increasing_and_decreasing_.py +63 -0
- gr_libs/__init__.py +2 -0
- gr_libs/environment/__init__.py +0 -0
- gr_libs/environment/environment.py +227 -0
- gr_libs/environment/utils/__init__.py +0 -0
- gr_libs/environment/utils/utils.py +17 -0
- gr_libs/metrics/__init__.py +0 -0
- gr_libs/metrics/metrics.py +224 -0
- gr_libs/ml/__init__.py +6 -0
- gr_libs/ml/agent.py +56 -0
- gr_libs/ml/base/__init__.py +1 -0
- gr_libs/ml/base/rl_agent.py +54 -0
- gr_libs/ml/consts.py +22 -0
- gr_libs/ml/neural/__init__.py +3 -0
- gr_libs/ml/neural/deep_rl_learner.py +395 -0
- gr_libs/ml/neural/utils/__init__.py +2 -0
- gr_libs/ml/neural/utils/dictlist.py +33 -0
- gr_libs/ml/neural/utils/penv.py +57 -0
- gr_libs/ml/planner/__init__.py +0 -0
- gr_libs/ml/planner/mcts/__init__.py +0 -0
- gr_libs/ml/planner/mcts/mcts_model.py +330 -0
- gr_libs/ml/planner/mcts/utils/__init__.py +2 -0
- gr_libs/ml/planner/mcts/utils/node.py +33 -0
- gr_libs/ml/planner/mcts/utils/tree.py +102 -0
- gr_libs/ml/sequential/__init__.py +1 -0
- gr_libs/ml/sequential/lstm_model.py +192 -0
- gr_libs/ml/tabular/__init__.py +3 -0
- gr_libs/ml/tabular/state.py +21 -0
- gr_libs/ml/tabular/tabular_q_learner.py +453 -0
- gr_libs/ml/tabular/tabular_rl_agent.py +126 -0
- gr_libs/ml/utils/__init__.py +6 -0
- gr_libs/ml/utils/env.py +7 -0
- gr_libs/ml/utils/format.py +100 -0
- gr_libs/ml/utils/math.py +13 -0
- gr_libs/ml/utils/other.py +24 -0
- gr_libs/ml/utils/storage.py +127 -0
- gr_libs/recognizer/__init__.py +0 -0
- gr_libs/recognizer/gr_as_rl/__init__.py +0 -0
- gr_libs/recognizer/gr_as_rl/gr_as_rl_recognizer.py +102 -0
- gr_libs/recognizer/graml/__init__.py +0 -0
- gr_libs/recognizer/graml/gr_dataset.py +134 -0
- gr_libs/recognizer/graml/graml_recognizer.py +266 -0
- gr_libs/recognizer/recognizer.py +46 -0
- gr_libs/recognizer/utils/__init__.py +1 -0
- gr_libs/recognizer/utils/format.py +13 -0
- gr_libs-0.1.3.dist-info/METADATA +197 -0
- gr_libs-0.1.3.dist-info/RECORD +62 -0
- gr_libs-0.1.3.dist-info/WHEEL +5 -0
- gr_libs-0.1.3.dist-info/top_level.txt +3 -0
- tutorials/graml_minigrid_tutorial.py +30 -0
- tutorials/graml_panda_tutorial.py +32 -0
- tutorials/graml_parking_tutorial.py +38 -0
- tutorials/graml_point_maze_tutorial.py +43 -0
- tutorials/graql_minigrid_tutorial.py +29 -0
--- /dev/null
+++ gr_libs/ml/planner/mcts/mcts_model.py
@@ -0,0 +1,330 @@
+import os
+import random
+from math import sqrt, log
+
+from tqdm import tqdm
+import pickle
+
+from gr_libs.ml.utils.storage import get_agent_model_dir
+from .utils import Node
+from .utils import Tree
+import gymnasium as gym
+
+PROB = 0.8
+UNIFORM_PROB = 0.1
+newely_expanded = 0
+dict_dir_id_to_str = {0: 'right', 1: 'down', 2: 'left', 3: 'up'}
+dict_action_id_to_str = {0: 'turn left', 1: 'turn right', 2: 'go straight'}
+
+def save_figure(steps, env_name, problem_name, img_path, env_prop):
+    sequence = [pos for ((state, pos), action) in steps]
+    # print(f"sequence to {self.problem_name} is:\n\t{steps}\ngenerating image at {img_path}.")
+    print(f"generating sequence image at {img_path}.")
+    env_prop.create_sequence_image(sequence, img_path, problem_name)
+
+# TODO add the number of expanded nodes, and debug by putting a breakpoint on the creation of nodes representing (8,4) and checking whether they're invalid.
+
+# Explanation of hashing and uncertainty in the action outcome:
+# We want to detect circles while not preventing expected behavior. To achieve that, hashing must include the previous state, the action, and the resulting state.
+# Hashing the direction means coming to the same position from different positions gets different id's.
+# Example: the agent might have stood at (2,2), picked action 2 (forward), and accidentally turned right, resulting in state ((2,2), right).
+# Later, when the agent stood at (2,1), looked right and walked forward, it got to the same state. We would want to enable that, because
+# this is the expected behavior, so these nodes must have unique id's.
+# Circles are only detected when the outcome was the same for the previous state and consistent with the action - whether or not it was expected.
+class MonteCarloTreeSearch():
+
+    def __init__(self, env, tree, goal, use_heuristic=True):
+        self.env = env
+        self.tree = tree
+        self.action_space = self.env.action_space.n
+        self.action_space = 3 # currently
+        state, _ = self.env.reset()
+        self.use_heuristic = use_heuristic
+        self.goal = goal
+        self.tree.add_node(Node(identifier=hash((None, None, tuple(self.env.unwrapped.agent_pos), state['direction'])), state=state, action=None, action_space=self.action_space, reward=0, terminal=False, pos=env.unwrapped.agent_pos, depth=0))
+        self.plan = []
+
+    # def mark_invalid_children(self, children_identifiers, action):
+    #     for child_id in children_identifiers:
+    #         child = self.tree.nodes[child_id]
+    #         if child.action == action:
+    #             child.invalid = True
+
+    def decide_invalid_path(self, new_node_father, old_node, new_node): # new_node created the circle, old_node got to the configuration first.
+        new_visits, old_visits = [1, 1], [0, 0] # stochasticity couldn't cause a cycle directly, because it involves a different action. we can only get one by making the same stochastic action mistake or by an actual cycle.
+        new_node_ptr = new_node_father
+        old_node_ptr = old_node
+
+        while new_node_ptr != None:
+            new_visits[0] += new_node_ptr.num_visits
+            new_visits[1] += 1
+            new_node_ptr = self.tree.parent(new_node_ptr)
+
+        while old_node_ptr != None: # getting to the old node wasn't necessarily through the current root. check all the way up to None, the original root's parent.
+            old_visits[0] += old_node_ptr.num_visits
+            old_visits[1] += 1
+            old_node_ptr = self.tree.parent(old_node_ptr)
+
+        if new_visits[0] / new_visits[1] > old_visits[0] / old_visits[1]: # the newer path is the more probable one, so invalidate the older node, even though the newer one is the one that created the circle.
+            old_node.invalid = True
+            # self.tree.update_id(old_id=old_node.identifier, new_id=new_node.identifier)
+        else:
+            new_node.invalid = True
+
+    def is_parent_child_same(self, new_node, node):
+        return new_node.pos[0] == node.pos[0] and new_node.pos[1] == node.pos[1] and new_node.state['direction'] == node.state['direction']
+
+    def expand(self, node, depth):
+        global newely_expanded
+        action = node.untried_action()
+        state, reward, terminated, truncated, _ = self.env.step(self.stochastic_action(action))
+        done = terminated | truncated
+        new_identifier = hash((tuple(node.pos), node.state['direction'], action, tuple(self.env.unwrapped.agent_pos), state['direction']))
+        valid_id = new_identifier
+        while new_identifier in self.tree.nodes.keys(): # iterate over all circle nodes. important not to hash the parent node id to get the next id, because it will not be the same for all circle nodes.
+            if self.tree.nodes[new_identifier].invalid == False:
+                valid_id = new_identifier
+            new_identifier = hash((666, new_identifier))
+        # after this while, the id is guaranteed to be unused.
+        new_node = Node(identifier=new_identifier, state=state, action=action, action_space=self.action_space, reward=reward, terminal=done, pos=self.env.unwrapped.agent_pos, depth=depth)
+        if self.is_parent_child_same(new_node, node): # this is not a regular circle, but it indicates that the action - whether intended or not - led to staying put. note this could happen even if the first condition is true: twice in history someone tried to walk into the wall from 2 different paths. both should be tagged invalid.
+            new_node.invalid = True
+            new_node.got_invalid = True
+        # if this is a legit (s,a,s'), find the valid one and check whether this one might be more valid.
+        elif valid_id in self.tree.nodes.keys(): # who can tell which node is invalid? this might be the more probable way to get here, it just happened later. summing the num of visits back up the tree decides which one to make invalid.
+            # print("CIRCLE DETECTED!") # a circle can be detected by 2 nodes making the wrong stochastic action one after another, at different times!
+
+            self.decide_invalid_path(new_node_father=node, old_node=self.tree.nodes[valid_id], new_node=new_node)
+            # self.mark_invalid_children(node.children_identifiers, action)
+
+        self.tree.add_node(new_node, node)
+        # if action == 2 and tuple(self.env.unwrapped.agent_pos) == tuple(node.pos): # if the new node is actually invalid, mark it along with the other nodes of the same action as invalid, meaning the reward will be 0 for them.
+        #     self.mark_invalid_children(node.children_identifiers)
+        newely_expanded += 1
+        return new_node
+
+    def stochastic_action(self, choice):
+        prob_distribution = []
+        actions = range(self.action_space)
+        for action in actions:
+            if action == choice: prob_distribution.append(PROB)
+            else: prob_distribution.append(UNIFORM_PROB)
+        return random.choices(actions, weights=prob_distribution, k=1)[0]
+
+    def expand_selection_stochastic_node(self, node, resulting_identifier, terminated, truncated, reward, action, state, depth):
+        global newely_expanded
+        # the new node could result in a terminating state.
+        done = terminated | truncated
+        valid_id = resulting_identifier
+        while resulting_identifier in self.tree.nodes.keys(): # iterate over all circle nodes. important not to hash the parent node id to get the next id, because it will not be the same for all circle nodes.
+            if self.tree.nodes[resulting_identifier].invalid == False:
+                valid_id = resulting_identifier
+            resulting_identifier = hash((666, resulting_identifier))
+        # after this while, the id is guaranteed to be unused.
+        new_node = Node(identifier=resulting_identifier, state=state, action=action, action_space=self.action_space, reward=reward, terminal=done, pos=self.env.unwrapped.agent_pos, depth=depth)
+        if self.is_parent_child_same(new_node, node): # this is not a regular circle, but it indicates that the action - whether intended or not - led to staying put. note this could happen even if the first condition is true: twice in history someone tried to walk into the wall from 2 different paths. both should be tagged invalid.
+            new_node.invalid = True
+            new_node.got_invalid = True
+        # if this is a legit (s,a,s'), find the valid one and check whether this one might be more valid.
+        elif valid_id in self.tree.nodes.keys(): # who can tell which node is invalid? this might be the more probable way to get here, it just happened later. summing the num of visits back up the tree decides which one to make invalid.
+            # print("CIRCLE DETECTED!") # a circle can be detected by 2 nodes making the wrong stochastic action one after another, at different times!
+            self.decide_invalid_path(new_node_father=node, old_node=self.tree.nodes[valid_id], new_node=new_node)
+            # self.mark_invalid_children(node.children_identifiers, action)
+        self.tree.add_node(new_node, node)
+        newely_expanded += 1
+        return new_node
+
+    def simulation(self, node):
+        if node.terminal:
+            return node.reward
+        if self.use_heuristic:
+            # taken from "Monte-Carlo Planning for Pathfinding in Real-Time Strategy Games", 2010.
+            # need to handle the case of walking into a wall here: the resulting node will be considered invalid and its reward and performance need to be 0, but stochasticity must be handled.
+            # suggestion for handling stochasticity - consider *all* the children associated with taking action 2 towards a wall as performance 0, even if they accidentally led to walking in another direction.
+            # which suggests the invalidity needs to be checked not according to the resulting state, but according to the intended action itself and the environment! remember, you cannot access "stochastic_action", it is meant to be hidden from you.
+            if node.pos[0] == self.goal[0] and node.pos[1] == self.goal[1]: return 2
+            if node.invalid: return -0.5
+            else: return 0.8*(1 / (abs(node.pos[0] - self.goal[0]) + abs(node.pos[1] - self.goal[1]))) + 0.2*(1/node.depth) # larger depth = lower probability of obstacles -> larger numerator, higher performance. further from the goal -> larger denominator, lower performance.
+        while True:
+            action = random.randint(0, self.action_space-1)
+            state, reward, terminated, truncated, _ = self.env.step(self.stochastic_action(action))
+            done = terminated | truncated # this time there could be truncation, unlike in the tree policy.
+            if done:
+                return reward
+
+    def compute_value(self, parent, child, exploration_constant):
+        exploration_term = exploration_constant * sqrt(2*log(parent.num_visits) / child.num_visits)
+        return child.performance + exploration_term
+
+    # return the best action from a node. the value of an action is the weighted sum of the performance of all children associated with this action.
+    def best_action(self, node, exploration_constant):
+        tried_actions_values = {} # dictionary mapping actions to tuples of (cumulative number of visits of children, sum of (child performance * num of visits for child)) to compute the mean later
+        if tuple(node.pos) == (1, 2) and node.depth == 3 and node.action == 0:
+            pass
+        children = [child for child in self.tree.children(node) if not child.invalid]
+        if not children: # all children are invalid. this node is invalid as well.
+            return 2
+        for child in children:
+            value = self.compute_value(node, child, exploration_constant)
+            tried_actions_values.setdefault(child.action, [0, 0]) # create if it doesn't exist
+            tried_actions_values[child.action][0] += child.num_visits # add the number of child visits
+            tried_actions_values[child.action][1] += value * child.num_visits # add the relative performance of this child
+        return max(tried_actions_values, key=lambda k: tried_actions_values[k][1] / tried_actions_values[k][0]) # return the key (action) with the highest average value
+
+    # only advances the environment to make sure the actions that are already part of the plan have been executed.
+    def execute_partial_plan(self, plan):
+        node = self.tree.root
+        depth = 0
+        for action in plan:
+            depth += 1
+            # important to simulate the env to get to some state, as the nodes don't hold this information.
+            state, reward, terminated, truncated, _ = self.env.step(action)
+            done = terminated
+            if done: return None, False
+            resulting_identifier = hash((tuple(node.pos), node.state['direction'], action, tuple(self.env.unwrapped.agent_pos), state['direction']))
+            node = self.tree.nodes[resulting_identifier]
+        return node, True
+
+    # finds the ultimate path from the root node to a terminal state (the one that maximized rewards)
+    def tree_policy(self, root_depth):
+        node = self.tree.root
+        depth = root_depth
+        while not (node.terminal or node.invalid):
+            depth += 1
+            if self.tree.is_expandable(node):
+                # expansion - if there's an action that has never been tried, its value is infinity to encourage exploration of all children of a node.
+                return self.expand(node, depth), depth
+            else:
+                # selection - balance exploration and exploitation coming down the tree - but note the selection might lead to new nodes because of stochasticity.
+                best_action = self.best_action(node, exploration_constant=1/sqrt(2.0))
+                if best_action == -1: break
+                # important to simulate the env to get to some state, as the nodes don't hold this information.
+                state, reward, terminated, truncated, _ = self.env.step(self.stochastic_action(best_action))
+                # due to stochasticity, nodes could sometimes be terminal and sometimes not. important to update it. also, the resulting state
+                # could be a state we've never been in due to the uncertainty of the actions' outcomes.
+                # if the resulting state creates a parent-action-child triplet that hasn't been seen before, add it to the tree and return it, similar to 'expand'.
+                # the hashing must include the action, because we want to enable getting to the same state stochastically from 2 different states: walking forward from (1,2) looking right and getting to (2,2) - the expected behavior - should be allowed even if the agent once stood at (2,1), looked down, turned right and accidentally proceeded forward.
+                resulting_identifier = [child_id for child_id in node.children_identifiers if all(a == b for a, b in zip(self.tree.nodes[child_id].pos, self.env.unwrapped.agent_pos)) and self.tree.nodes[child_id].action == best_action]
+                if len(resulting_identifier) == 0: # took an action done before, but it led to a new state.
+                    resulting_identifier = hash((tuple(node.pos), node.state['direction'], best_action, tuple(self.env.unwrapped.agent_pos), state['direction']))
+                    return self.expand_selection_stochastic_node(node, resulting_identifier, terminated, truncated, reward, best_action, state, depth), depth
+                assert len(resulting_identifier) == 1
+                node = self.tree.nodes[resulting_identifier[0]]
+        return node, depth
+
+    # receives a final state node and updates the rewards of all the nodes on the path to the root
+    def backpropagation(self, node, value):
+        while node != self.tree.parent(self.tree.root):
+            assert node != None # if we got to None it means the backpropagation reached the actual root instead of the current root, which means that on this path someone had a different parent than it should, probably a duplicate id.
+            node.num_visits += 1
+            node.total_simulation_reward += value
+            node.performance = node.total_simulation_reward/node.num_visits
+            node = self.tree.parent(node)
+
+
+    def generate_full_policy_sequence(self, env_name, problem_name, save_fig=False, fig_path=None, env_prop=None):
+        trace = []
+        node, prev_node = self.tree.root, self.tree.root
+        print("generating policy sequence.")
+        for action in self.plan:
+            print(f"position {tuple(node.pos)} direction {dict_dir_id_to_str[node.state['direction']]}, action {dict_action_id_to_str[action]}")
+            candidate_children = [child for child in self.tree.children(node) if child.action == action] # there could be several children associated with the best action, representing different outcomes.
+            assert len(candidate_children) > 0
+            node = max(candidate_children, key=lambda node: node.num_visits) # pick the child that was visited most, meaning it represents the desired action and not the undesired outcomes.
+            trace.append(((prev_node.state, tuple(prev_node.pos)), node.action)) # need to add the previous node with the action leading to the next node, which is a property of the next node
+            prev_node = node
+        if save_fig:
+            assert fig_path != None
+            save_figure(trace, env_name, problem_name, fig_path, env_prop)
+        else:
+            assert fig_path == None
+        return trace
+
+
+def save_model_and_generate_policy(tree, original_root, model_file_path, monteCarloTreeSearch):
+    tree.root = original_root
+    with open(model_file_path, 'wb') as file: # Serialize the model
+        monteCarloTreeSearch.env = None # pickle cannot serialize the lambdas which exist in the env
+        pickle.dump(monteCarloTreeSearch, file)
+
+
+def plan(env_name, problem_name, goal, save_fig=False, fig_path=None, env_prop=None):
+    global newely_expanded
+    model_dir = get_agent_model_dir(env_name=env_name, model_name=problem_name, class_name="MCTS")
+    model_file_path = os.path.join(model_dir, "mcts_model.pth")
+    if os.path.exists(model_file_path):
+        print(f"Loading pre-existing mcts planner from {model_file_path}")
+        with open(model_file_path, 'rb') as file: # Load the pre-existing model
+            try:
+                monteCarloTreeSearch = pickle.load(file)
+            except Exception as e:
+                class RenameUnpickler(pickle.Unpickler):
+                    def find_class(self, module, name):
+                        renamed_module = module
+                        if module.startswith("ml"):
+                            renamed_module = "gr_libs." + renamed_module
+                        return super(RenameUnpickler, self).find_class(renamed_module, name)
+                def renamed_load(file_obj):
+                    return RenameUnpickler(file_obj).load()
+                file.seek(0)
+                monteCarloTreeSearch = renamed_load(file)
+
+                with open(model_file_path, 'wb') as file:
+                    pickle.dump(monteCarloTreeSearch, file)
+
+        return monteCarloTreeSearch.generate_full_policy_sequence(env_name, problem_name, save_fig, fig_path)
+    if not os.path.exists(model_dir): # if we reached here, the model doesn't exist. make sure its folder exists.
+        os.makedirs(model_dir)
+    steps = 10000
+    print(f"No tree found. Executing MCTS, starting with {steps} rollouts for each action.")
+    env = gym.make(id=problem_name)
+    random.seed(2)
+    tree = Tree()
+    mcts = MonteCarloTreeSearch(env=env, tree=tree, goal=goal)
+    original_root = tree.root
+    depth = 0
+    while not tree.root.terminal: # iterate until the root is a terminal state, meaning the game is over.
+        max_reward = 0
+        iteration = 0
+        steps = max(2000, int(steps*0.9))
+        print(f"Executing {steps} rollouts for each action now.")
+        tq = tqdm(range(steps), postfix=f"Iteration: {iteration}, Num of steps: {len(mcts.plan)}. depth: {depth}. Max reward: {max_reward}. plan to {tuple(env.unwrapped.agent_pos)}, newely expanded: {0}")
+        for n in tq:
+            iteration = n
+            mcts.env.reset()
+            # when executing the partial plan, it's possible the environment finished due to the stochasticity. the execution returns False if that happened.
+            depth = len(mcts.plan)
+            mcts.tree.root = original_root # need to return it to the original root before executing the partial plan, as it can lead to a different path and the root can change between iterations.
+            node, result = mcts.execute_partial_plan(mcts.plan)
+            if not result:
+                # a False return value from the partial plan execution means the plan is finished. we can mark our root as terminal and exit, happy with our plan.
+                tree.root.terminal = True
+                save_model_and_generate_policy(tree=tree, original_root=original_root, model_file_path=model_file_path, monteCarloTreeSearch=mcts)
+                return mcts.generate_full_policy_sequence(env_name, problem_name, save_fig, fig_path, env_prop)
+            plan_pos, plan_dir = node.pos, dict_dir_id_to_str[node.state['direction']]
+            tree.root = node # make the root the node reached after executing the plan for this iteration.
+            node, depth = mcts.tree_policy(root_depth=depth) # find a path to a new unvisited node (unique sequence of actions) by using the explorative policy or choosing unvisited children recursively
+            # if the node returned from the tree policy is terminal, the reward will be returned from the "simulation" function immediately.
+            reward = mcts.simulation(node) # proceed from that node randomly and collect the final reward expected from it (heuristic)
+            if reward > max_reward:
+                max_reward = reward
+            mcts.backpropagation(node, reward) # update the performances of the nodes along the way up to the root
+            tq.set_postfix_str(f"Iteration: {iteration}, Num of steps: {len(mcts.plan)}. depth: {depth}. Max reward: {max_reward}. plan to {tuple(plan_pos)}, looking {plan_dir}. newely expanded: {newely_expanded}")
+        # update the root and start from it next time.
+        newely_expanded = 0
+        action = mcts.best_action(node=tree.root, exploration_constant=0)
+        if action == -1:
+            pass
+        mcts.plan.append(action)
+        print(f"Executed action {action}")
+        save_model_and_generate_policy(tree=tree, original_root=original_root, model_file_path=model_file_path, monteCarloTreeSearch=mcts)
+    return mcts.generate_full_policy_sequence(env_name, problem_name, save_fig, fig_path)
+
+if __name__ == "__main__":
+    # register(
+    #     id="MiniGrid-DynamicGoalEmpty-8x8-3x6-v0",
+    #     entry_point="minigrid.envs:DynamicGoalEmpty",
+    #     kwargs={"size": 8, "agent_start_pos" : (1, 1), "goal_pos": (3,6) },
+    # )
+    # plan("MiniGrid-DynamicGoalEmpty-8x8-3x6-v0")
+    pass
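For orientation, the module above is driven through its module-level plan() entry point: it either unpickles a cached planner from the MCTS model directory or runs the rollout loop and caches the result, returning a trace of ((state, position), action) steps. The sketch below is editorial and not part of the wheel; it reuses the registration parameters from the commented-out __main__ block, and env_name="minigrid" is an assumed value for gr_libs' storage layout rather than something this file defines.

# Editorial sketch (not shipped in the package); the entry point and goal values are illustrative.
from gymnasium.envs.registration import register
from gr_libs.ml.planner.mcts.mcts_model import plan

register(
    id="MiniGrid-DynamicGoalEmpty-8x8-3x6-v0",
    entry_point="minigrid.envs:DynamicGoalEmpty",   # as in the commented __main__ block; assumes this env class is importable
    kwargs={"size": 8, "agent_start_pos": (1, 1), "goal_pos": (3, 6)},
)

# The first call trains the planner (slow); later calls load the pickled model.
trace = plan(env_name="minigrid",                   # assumption: the env family name used by get_agent_model_dir
             problem_name="MiniGrid-DynamicGoalEmpty-8x8-3x6-v0",
             goal=(3, 6))
for (state, pos), action in trace:                  # entries are ((state, position), action)
    print(pos, action)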
--- /dev/null
+++ gr_libs/ml/planner/mcts/utils/node.py
@@ -0,0 +1,33 @@
+import random
+
+class Node:
+
+    def __init__(self, identifier, state, action, action_space, reward, terminal, pos, depth):
+        self.identifier = identifier
+        self.parent_identifier = None
+        self.children_identifiers = []
+        self.untried_actions = list(range(action_space))
+        self.state = state
+        self.pos = pos
+        self.total_simulation_reward = 0
+        self.num_visits = 0
+        self.performance = 0
+        self.action = action
+        self.reward = reward
+        self.terminal = terminal
+        self.invalid = False
+        self.got_invalid = False
+        self.depth = depth
+
+    def __str__(self):
+        return "{}: (action={}, visits={}, reward={:d}, ratio={:0.4f})".format(
+            self.state,
+            self.action,
+            self.num_visits,
+            int(self.total_simulation_reward),
+            self.performance)
+
+    def untried_action(self):
+        action = random.choice(self.untried_actions)
+        self.untried_actions.remove(action)
+        return action
--- /dev/null
+++ gr_libs/ml/planner/mcts/utils/tree.py
@@ -0,0 +1,102 @@
+def vertical_lines(last_node_flags):
+    vertical_lines = []
+    vertical_line = '\u2502'
+    for last_node_flag in last_node_flags[0:-1]:
+        if last_node_flag == False:
+            vertical_lines.append(vertical_line + ' ' * 3)
+        else:
+            # space between vertical lines
+            vertical_lines.append(' ' * 4)
+    return ''.join(vertical_lines)
+
+def horizontal_line(last_node_flags):
+    horizontal_line = '\u251c\u2500\u2500 '
+    horizontal_line_end = '\u2514\u2500\u2500 '
+    if last_node_flags[-1]:
+        return horizontal_line_end
+    else:
+        return horizontal_line
+
+class Tree:
+
+    def __init__(self):
+        self.nodes = {}
+        self.root = None
+
+    def is_expandable(self, node):
+        if node.terminal or node.invalid:
+            return False
+        if len(node.untried_actions) > 0:
+            return True
+        return False
+
+    def iter(self, identifier, depth, last_node_flags):
+        if identifier is None:
+            node = self.root
+        else:
+            node = self.nodes[identifier]
+
+        if depth == 0:
+            yield "", node
+        else:
+            yield vertical_lines(last_node_flags) + horizontal_line(last_node_flags), node
+
+        children = [self.nodes[identifier] for identifier in node.children_identifiers]
+        last_index = len(children) - 1
+
+        depth += 1
+        for index, child in enumerate(children):
+            last_node_flags.append(index == last_index)
+            for edge, node in self.iter(child.identifier, depth, last_node_flags):
+                yield edge, node
+            last_node_flags.pop()
+
+    def add_node(self, node, parent=None):
+        assert node.identifier not in self.nodes.keys()
+        self.nodes.update({node.identifier: node})
+
+        if parent is None:
+            self.root = node
+            self.nodes[node.identifier].parent = None
+        else:
+            self.nodes[parent.identifier].children_identifiers.append(node.identifier)
+            self.nodes[node.identifier].parent_identifier = parent.identifier
+
+    def update_id(self, old_id, new_id):
+        assert new_id not in self.nodes.keys()
+        # prepare needed objects
+        node = self.nodes[old_id]
+        parent = self.parent(node)
+
+        # update the node's parent
+        self.nodes[parent.identifier].children_identifiers.remove(old_id)
+        self.nodes[parent.identifier].children_identifiers.append(new_id)
+
+        # update the node itself
+        node.identifier = new_id
+
+        # update the node's children (if there are any?...)
+        for child_id in node.children_identifiers:
+            self.nodes[child_id].parent_identifier = new_id
+
+        self.nodes.pop(old_id)
+        self.nodes.update({node.identifier: node})
+
+    def children(self, node):
+        children = []
+        for identifier in self.nodes[node.identifier].children_identifiers:
+            children.append(self.nodes[identifier])
+        return children
+
+    def parent(self, node):
+        parent_identifier = self.nodes[node.identifier].parent_identifier
+        if parent_identifier is None:
+            return None
+        else:
+            return self.nodes[parent_identifier]
+
+    def show(self):
+        lines = ""
+        for edge, node in self.iter(identifier=None, depth=0, last_node_flags=[]):
+            lines += "{}{}\n".format(edge, node)
+        print(lines)
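To make the Node/Tree API above concrete, here is a small editorial sketch (not part of the wheel) that builds a two-node tree by hand and prints it. It assumes the package's mcts utils __init__ re-exports Node and Tree, which the relative imports in mcts_model.py suggest; the identifiers, states and positions are arbitrary illustrative values.

# Editorial sketch (not shipped in the package).
from gr_libs.ml.planner.mcts.utils import Node, Tree

tree = Tree()
root = Node(identifier=0, state={'direction': 0}, action=None, action_space=3,
            reward=0, terminal=False, pos=(1, 1), depth=0)
tree.add_node(root)                      # no parent -> becomes the root

child = Node(identifier=1, state={'direction': 0}, action=2, action_space=3,
             reward=0, terminal=False, pos=(2, 1), depth=1)
tree.add_node(child, root)               # registered under root's children_identifiers

assert tree.parent(child) is root
assert tree.children(root) == [child]
assert tree.is_expandable(root)          # the root still has untried actions
tree.show()                              # renders each node via Node.__str__ with box-drawing edges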
--- /dev/null
+++ gr_libs/ml/sequential/__init__.py
@@ -0,0 +1 @@
+from gr_libs.ml.sequential.lstm_model import LstmObservations
--- /dev/null
+++ gr_libs/ml/sequential/lstm_model.py
@@ -0,0 +1,192 @@
+import os
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from types import MethodType
+import numpy as np
+from gr_libs.ml.utils import device
+from torch.nn.utils.rnn import pack_padded_sequence
+
+
+def accuracy_per_epoch(model, data_loader):
+    model.eval()
+    correct = total = 0.0
+    sum_loss = 0.0
+    with torch.no_grad():
+        for (first_traces, second_traces, is_same_goals, first_traces_lengths, second_traces_lengths) in data_loader:
+            y_pred = model.forward_tab(first_traces, second_traces, first_traces_lengths, second_traces_lengths)
+            loss = F.binary_cross_entropy(y_pred, is_same_goals)
+            sum_loss += loss.item()
+            y_pred = (y_pred >= 0.5)
+            correct += torch.sum(y_pred == is_same_goals)
+            total += len(is_same_goals)
+    return correct / total, sum_loss / 32
+
+def accuracy_per_epoch_cont(model, data_loader):
+    model.eval()
+    correct = total = 0.0
+    sum_loss = 0.0
+    with torch.no_grad():
+        for (first_traces_images, first_traces_texts, second_traces_images, second_traces_texts, is_same_goals, first_traces_lengths, second_traces_lengths) in data_loader:
+            y_pred = model.forward_cont(first_traces_images, first_traces_texts, second_traces_images, second_traces_texts, first_traces_lengths, second_traces_lengths)
+            loss = F.binary_cross_entropy(y_pred, is_same_goals)
+            sum_loss += loss.item()
+            y_pred = (y_pred >= 0.5)
+            correct += torch.sum(y_pred == is_same_goals)
+            total += len(is_same_goals)
+    return correct / total, sum_loss / 32
+
+# class CNNImageEmbeddor(nn.Module):
+#     def __init__(self, obs_space, action_space, use_text=False):
+#         super().__init__()
+#         self.use_text = use_text
+#         self.image_conv = nn.Sequential(
+#             nn.Conv2d(3, 4, kernel_size=(3, 3), padding=1), # Reduced filters, added padding
+#             nn.ReLU(),
+#             nn.MaxPool2d((2, 2)),
+#             nn.Conv2d(4, 4, (3, 3), padding=1), # Reduced filters, added padding
+#             nn.ReLU(),
+#             nn.MaxPool2d((2, 2)), # Added additional pooling to reduce size
+#             nn.Conv2d(4, 8, (3, 3), padding=1), # Reduced filters, added padding
+#             nn.ReLU(),
+#             nn.BatchNorm2d(8)
+#         )
+#         n = obs_space["image"][0]
+#         m = obs_space["image"][1]
+#         self.image_embedding_size = ((n - 4) // 4 - 3) * ((m - 4) // 4 - 3) * 8
+#         if self.use_text:
+#             self.word_embedding_size = 32
+#             self.word_embedding = nn.Embedding(obs_space["text"], self.word_embedding_size)
+#             self.text_embedding_size = 128
+#             self.text_rnn = nn.GRU(self.word_embedding_size, self.text_embedding_size, batch_first=True)
+
+def forward(self, images, texts):
+    # images shape: batch_size X max_sequence_len X sample_size. same for text.
+    # need to reshape the image to num_channels X height X width, as nn.Conv expects it.
+    x = images.transpose(2, 4).transpose(3, 4)
+    orig_shape = x.shape
+    # combine batch and sequence into 1 dimension so conv can handle it
+    x = x.view(orig_shape[0]*orig_shape[1], orig_shape[2], orig_shape[3], orig_shape[4]) # x shape: batch_size * max_sequence_len X sample_size
+    x = self.image_conv(x) # x shape: batch_size * max_sequence_len X last_conv_size X 1 X 1
+    # reshape x back to separate batches from sequences
+    x = x.view(orig_shape[0], orig_shape[1], x.shape[1]) # x shape: batch_size X max_sequence_len X last_conv_size. the last 2 dimensions (1,1) are collapsed into the last conv.
+    embedding = x
+
+    if self.use_text:
+        embed_text = self._get_embed_text(texts)
+        embedding = torch.cat((embedding, embed_text), dim=1)
+
+    return embedding
+
+def _get_embed_text(self, text):
+    _, hidden = self.text_rnn(self.word_embedding(text))
+    return hidden[-1]
+
+class LstmObservations(nn.Module):
+
+    def __init__(self, input_size, hidden_size): # TODO make sure the right cuda device is used!
+        super(LstmObservations, self).__init__()
+        # self.embeddor = CNNImageEmbeddor(obs_space, action_space)
+        # check if the traces are a bunch of images
+        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
+        self.dropout = nn.Dropout(0.5) # Added dropout layer
+        # Initialize weights
+        for name, param in self.lstm.named_parameters():
+            if 'weight' in name:
+                nn.init.xavier_uniform_(param)
+            elif 'bias' in name:
+                nn.init.zeros_(param)
+
+
+    # tabular
+    def forward_tab(self, traces1, traces2, lengths1, lengths2):
+        out1, (ht1, ct1) = self.lstm(pack_padded_sequence(traces1, lengths1, batch_first=True, enforce_sorted=False), None) # traces1 & traces2 shapes: batch_size X max_sequence_length X embedding_size
+        out2, (ht2, ct2) = self.lstm(pack_padded_sequence(traces2, lengths2, batch_first=True, enforce_sorted=False), None)
+        # out1, _ = pad_packed_sequence(out1, batch_first=True, total_length=max(lengths1))
+        # out2, _ = pad_packed_sequence(out2, batch_first=True, total_length=max(lengths2))
+        manhattan_dis = torch.exp(-torch.sum(torch.abs(ht1[-1]-ht2[-1]), dim=1, keepdim=True))
+        return manhattan_dis.squeeze()
+
+    # continuous
+    # def forward_cont(self, traces1_images, traces1_texts, traces2_images, traces2_texts, lengths1, lengths2):
+    #     # we also embed '0' images, but we take them out of the equation in the lstm (it knows not to treat them when batching)
+    #     traces1 = self.embeddor(traces1_images, traces1_texts)
+    #     traces2 = self.embeddor(traces2_images, traces2_texts) # traces1 & traces2 shapes: batch_size X max_sequence_length X embedding_size
+    #     out1, (ht1, ct1) = self.lstm(pack_padded_sequence(traces1, lengths1, batch_first=True, enforce_sorted=False), None)
+    #     out2, (ht2, ct2) = self.lstm(pack_padded_sequence(traces2, lengths2, batch_first=True, enforce_sorted=False), None)
+    #     manhattan_dis = torch.exp(-torch.sum(torch.abs(ht1[-1]-ht2[-1]),dim=1,keepdim=True))
+    #     return manhattan_dis.squeeze()
+
+    def embed_sequence(self, trace):
+        trace = torch.stack([torch.tensor(observation, dtype=torch.float32) for observation in trace]).to(device)
+        out, (ht, ct) = self.lstm(trace, None)
+        return ht[-1]
+
+    # def embed_sequence_cont(self, sequence, preprocess_obss):
+    #     sequence = [preprocess_obss([obs])[0] for ((obs, (_, _)), _) in sequence]
+    #     trace_images = torch.tensor(np.expand_dims(torch.stack([step.image for step in sequence]), axis=0)).to(device)
+    #     trace_texts = torch.tensor(np.expand_dims(torch.stack([step.text for step in sequence]), axis=0)).to(device)
+    #     embedded_trace = self.embeddor(trace_images, trace_texts)
+    #     out, (ht, ct) = self.lstm(embedded_trace)
+    #     return ht[-1]
+
+def train_metric_model(model, train_loader, dev_loader, nepochs=5, patience=2):
+    devAccuracy = []
+    best_dev_accuracy = 0.0
+    no_improvement_count = 0
+    optimizer = torch.optim.Adadelta(model.parameters(), weight_decay=0.1)
+    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)
+    for epoch in range(nepochs):
+        sum_loss, denominator = 0.0, 0.0
+        model.train()
+        for (first_traces, second_traces, is_same_goals, first_traces_lengths, second_traces_lengths) in train_loader:
+            model.zero_grad()
+            y_pred = model.forward_tab(first_traces, second_traces, first_traces_lengths, second_traces_lengths)
+            if len(is_same_goals) == 1: is_same_goals = torch.squeeze(is_same_goals) # for the case of batches of size 1...
+            loss = F.binary_cross_entropy(y_pred, is_same_goals)
+            sum_loss += loss.item()
+            denominator += 1
+            loss.backward()
+            optimizer.step()
+
+        dev_accuracy, dev_loss = accuracy_per_epoch(model, dev_loader)
+        devAccuracy.append(dev_accuracy)
+        if dev_accuracy > best_dev_accuracy:
+            best_dev_accuracy = dev_accuracy
+            no_improvement_count = 0
+        else:
+            no_improvement_count += 1
+
+        print("epoch - {}/{}...".format(epoch + 1, nepochs),
+              "train loss - {:.6f}...".format(sum_loss / denominator),
+              "dev loss - {:.6f}...".format(dev_loss),
+              "dev accuracy - {:.6f}".format(dev_accuracy))
+
+        if no_improvement_count >= patience:
+            print(f"Early stopping after {epoch + 1} epochs with no improvement.")
+            break
+
+def train_metric_model_cont(model, train_loader, dev_loader, nepochs=5):
+    devAccuracy = []
+    optimizer = torch.optim.Adadelta(model.parameters(), weight_decay=1.25)
+    for epoch in range(nepochs):
+        sum_loss, denominator = 0.0, 0.0
+        model.train()
+        for (first_traces_images, first_traces_texts, second_traces_images, second_traces_texts, is_same_goals, first_traces_lengths, second_traces_lengths) in train_loader:
+            model.zero_grad()
+            y_pred = model.forward_cont(first_traces_images, first_traces_texts, second_traces_images, second_traces_texts, first_traces_lengths, second_traces_lengths)
+            loss = F.binary_cross_entropy(y_pred, is_same_goals)
+            sum_loss += loss.item()
+            denominator += 1
+            loss.backward()
+            optimizer.step()
+
+        dev_accuracy, dev_loss = accuracy_per_epoch_cont(model, dev_loader)
+        devAccuracy.append(dev_accuracy)
+
+        print("epoch - {}/{}...".format(epoch + 1, nepochs),
+              "train loss - {:.6f}...".format(sum_loss / denominator),
+              "dev loss - {:.6f}...".format(dev_loss),
+              "dev accuracy - {:.6f}".format(dev_accuracy))