gr-libs 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. evaluation/analyze_results_cross_alg_cross_domain.py +277 -0
  2. evaluation/create_minigrid_map_image.py +34 -0
  3. evaluation/file_system.py +42 -0
  4. evaluation/generate_experiments_results.py +92 -0
  5. evaluation/generate_experiments_results_new_ver1.py +254 -0
  6. evaluation/generate_experiments_results_new_ver2.py +331 -0
  7. evaluation/generate_task_specific_statistics_plots.py +272 -0
  8. evaluation/get_plans_images.py +47 -0
  9. evaluation/increasing_and_decreasing_.py +63 -0
  10. gr_libs/__init__.py +2 -0
  11. gr_libs/environment/__init__.py +0 -0
  12. gr_libs/environment/environment.py +227 -0
  13. gr_libs/environment/utils/__init__.py +0 -0
  14. gr_libs/environment/utils/utils.py +17 -0
  15. gr_libs/metrics/__init__.py +0 -0
  16. gr_libs/metrics/metrics.py +224 -0
  17. gr_libs/ml/__init__.py +6 -0
  18. gr_libs/ml/agent.py +56 -0
  19. gr_libs/ml/base/__init__.py +1 -0
  20. gr_libs/ml/base/rl_agent.py +54 -0
  21. gr_libs/ml/consts.py +22 -0
  22. gr_libs/ml/neural/__init__.py +3 -0
  23. gr_libs/ml/neural/deep_rl_learner.py +395 -0
  24. gr_libs/ml/neural/utils/__init__.py +2 -0
  25. gr_libs/ml/neural/utils/dictlist.py +33 -0
  26. gr_libs/ml/neural/utils/penv.py +57 -0
  27. gr_libs/ml/planner/__init__.py +0 -0
  28. gr_libs/ml/planner/mcts/__init__.py +0 -0
  29. gr_libs/ml/planner/mcts/mcts_model.py +330 -0
  30. gr_libs/ml/planner/mcts/utils/__init__.py +2 -0
  31. gr_libs/ml/planner/mcts/utils/node.py +33 -0
  32. gr_libs/ml/planner/mcts/utils/tree.py +102 -0
  33. gr_libs/ml/sequential/__init__.py +1 -0
  34. gr_libs/ml/sequential/lstm_model.py +192 -0
  35. gr_libs/ml/tabular/__init__.py +3 -0
  36. gr_libs/ml/tabular/state.py +21 -0
  37. gr_libs/ml/tabular/tabular_q_learner.py +453 -0
  38. gr_libs/ml/tabular/tabular_rl_agent.py +126 -0
  39. gr_libs/ml/utils/__init__.py +6 -0
  40. gr_libs/ml/utils/env.py +7 -0
  41. gr_libs/ml/utils/format.py +100 -0
  42. gr_libs/ml/utils/math.py +13 -0
  43. gr_libs/ml/utils/other.py +24 -0
  44. gr_libs/ml/utils/storage.py +127 -0
  45. gr_libs/recognizer/__init__.py +0 -0
  46. gr_libs/recognizer/gr_as_rl/__init__.py +0 -0
  47. gr_libs/recognizer/gr_as_rl/gr_as_rl_recognizer.py +102 -0
  48. gr_libs/recognizer/graml/__init__.py +0 -0
  49. gr_libs/recognizer/graml/gr_dataset.py +134 -0
  50. gr_libs/recognizer/graml/graml_recognizer.py +266 -0
  51. gr_libs/recognizer/recognizer.py +46 -0
  52. gr_libs/recognizer/utils/__init__.py +1 -0
  53. gr_libs/recognizer/utils/format.py +13 -0
  54. gr_libs-0.1.3.dist-info/METADATA +197 -0
  55. gr_libs-0.1.3.dist-info/RECORD +62 -0
  56. gr_libs-0.1.3.dist-info/WHEEL +5 -0
  57. gr_libs-0.1.3.dist-info/top_level.txt +3 -0
  58. tutorials/graml_minigrid_tutorial.py +30 -0
  59. tutorials/graml_panda_tutorial.py +32 -0
  60. tutorials/graml_parking_tutorial.py +38 -0
  61. tutorials/graml_point_maze_tutorial.py +43 -0
  62. tutorials/graql_minigrid_tutorial.py +29 -0
@@ -0,0 +1,330 @@
1
+ import os
2
+ import random
3
+ from math import sqrt, log
4
+
5
+ from tqdm import tqdm
6
+ import pickle
7
+
8
+ from gr_libs.ml.utils.storage import get_agent_model_dir
9
+ from .utils import Node
10
+ from .utils import Tree
11
+ import gymnasium as gym
12
+
13
+ PROB = 0.8
14
+ UNIFORM_PROB = 0.1
15
+ newely_expanded = 0
16
+ dict_dir_id_to_str = {0:'right', 1:'down', 2:'left', 3:'up'}
17
+ dict_action_id_to_str = {0:'turn left', 1:'turn right', 2:'go straight'}
18
+
19
+ def save_figure(steps, env_name, problem_name, img_path, env_prop):
20
+ sequence = [pos for ((state, pos), action) in steps]
21
+ #print(f"sequence to {self.problem_name} is:\n\t{steps}\ngenerating image at {img_path}.")
22
+ print(f"generating sequence image at {img_path}.")
23
+ env_prop.create_sequence_image(sequence, img_path, problem_name)
24
+
25
+ # TODO add number of expanded nodes and debug by putting breakpoint on the creation of nodes representing (8,4) and checking if they're invalid or something
26
+
27
+ # Explanation of hashing and uncertainty in the action outcome:
28
+ # We want to detect circles while not preventing expected behavior. To achieve this, hashing must include the previous state, the action, and the resulting state.
29
+ # Hashing the direction means that coming to the same position from different positions gets different ids.
30
+ # Example: the agent might have stood at (2,2), picked action 2 (forward), and accidentally turned right, resulting in state ((2,2), right).
31
+ # Later, when the agent stood at (2,1), looked right and walked forward, it got to the same state. We want to enable that, because
32
+ # this is the expected behavior, so these nodes must have unique ids.
33
+ # Circles are detected only when, from the same previous state and action, the outcome was the same as before - whether or not that outcome was the expected one.
34
+ class MonteCarloTreeSearch():
35
+
36
+ def __init__(self, env, tree, goal, use_heuristic=True):
37
+ self.env = env
38
+ self.tree = tree
39
+ self.action_space = self.env.action_space.n
40
+ self.action_space = 3 # currently restricted to the three movement actions: turn left, turn right, go straight
41
+ state, _ = self.env.reset()
42
+ self.use_heuristic = use_heuristic
43
+ self.goal = goal
44
+ self.tree.add_node(Node(identifier=hash((None, None, tuple(self.env.unwrapped.agent_pos), state['direction'])), state=state, action=None, action_space=self.action_space, reward=0, terminal=False, pos=env.unwrapped.agent_pos, depth=0))
45
+ self.plan = []
46
+
47
+ # def mark_invalid_children(self, children_identifiers, action):
48
+ # for child_id in children_identifiers:
49
+ # child = self.tree.nodes[child_id]
50
+ # if child.action == action:
51
+ # child.invalid = True
52
+
53
+ def decide_invalid_path(self, new_node_father, old_node, new_node): # new_node created the circle, old_node got to the configuration first.
54
+ new_visits, old_visits = [1,1], [0,0] # stochasticity couldn't result in a cycle directly, because it involves a different action. we can get one only by making the same stochastic-action mistake or by an actual cycle.
55
+ new_node_ptr = new_node_father
56
+ old_node_ptr = old_node
57
+
58
+ while new_node_ptr != None:
59
+ new_visits[0] += new_node_ptr.num_visits
60
+ new_visits[1] += 1
61
+ new_node_ptr = self.tree.parent(new_node_ptr)
62
+
63
+ while old_node_ptr != None: # getting to the old node wasn't necessarily through the current root. check all the way until None, the original root's parent.
64
+ old_visits[0] += old_node_ptr.num_visits
65
+ old_visits[1] += 1
66
+ old_node_ptr = self.tree.parent(old_node_ptr)
67
+
68
+ if new_visits[0] / new_visits[1] > old_visits[0] / old_visits[1]: # the newer path is visited more on average, so treat it as the more probable one and invalidate the older node.
69
+ old_node.invalid = True
70
+ # self.tree.update_id(old_id=old_node.identifier, new_id=new_node.identifier)
71
+ else:
72
+ new_node.invalid = True
73
+
74
+ def is_parent_child_same(self, new_node, node):
75
+ return new_node.pos[0] == node.pos[0] and new_node.pos[1] == node.pos[1] and new_node.state['direction'] == node.state['direction']
76
+
77
+ def expand(self, node, depth):
78
+ global newely_expanded
79
+ action = node.untried_action()
80
+ state, reward, terminated, truncated, _ = self.env.step(self.stochastic_action(action))
81
+ done = terminated | truncated
82
+ new_identifier = hash((tuple(node.pos), node.state['direction'], action, tuple(self.env.unwrapped.agent_pos), state['direction']))
83
+ valid_id = new_identifier
84
+ while new_identifier in self.tree.nodes.keys(): # iterate over all circle nodes. important not to hash the parent node id to get the next id, because it will not be the same for all circle nodes.
85
+ if self.tree.nodes[new_identifier].invalid == False:
86
+ valid_id = new_identifier
87
+ new_identifier = hash((666, new_identifier))
88
+ # after this while, the id is for sure unused.
89
+ new_node = Node(identifier=new_identifier, state=state, action=action, action_space=self.action_space, reward=reward, terminal=done, pos=self.env.unwrapped.agent_pos, depth=depth)
90
+ if self.is_parent_child_same(new_node, node): # this is not a regular circle, but it indicates that the action - whether or not its outcome was intended - led to staying put. note this could happen more than once: twice in history someone tried to go against the wall from 2 different paths, and both should be tagged invalid.
91
+ new_node.invalid = True
92
+ new_node.got_invalid = True
93
+ # if this is a legit (s,a,s'), find the valid one and check whether this one might be more valid.
94
+ elif valid_id in self.tree.nodes.keys(): # who can tell which node is invalid? might be that this is the more probable way to get here, it just happened later. maybe think of summing back up the num of visits to decide which one to make invalid.
95
+ # print("CIRCLE DETECTED!") # circle can be detected by 2 nodes making the wrong stochastic action one after another, in different times!
96
+
97
+ self.decide_invalid_path(new_node_father=node, old_node=self.tree.nodes[valid_id], new_node=new_node)
98
+ # self.mark_invalid_children(node.children_identifiers, action)
99
+
100
+ self.tree.add_node(new_node, node)
101
+ # if action == 2 and tuple(self.env.unwrapped.agent_pos) == tuple(node.pos): # if the new node is actually invalid, mark it along with the other nodes of the same action as invalid, meaning reward will be 0 for them.
102
+ # self.mark_invalid_children(node.children_identifiers)
103
+ newely_expanded += 1
104
+ return new_node
105
+
106
+ def stochastic_action(self, choice):
107
+ prob_distribution = []
108
+ actions = range(self.action_space)
109
+ for action in actions:
110
+ if action == choice: prob_distribution.append(PROB)
111
+ else: prob_distribution.append(UNIFORM_PROB)
112
+ return random.choices(actions, weights=prob_distribution, k=1)[0]
113
+
114
+ def expand_selection_stochastic_node(self, node, resulting_identifier, terminated, truncated, reward, action, state, depth):
115
+ global newely_expanded
116
+ # the new node could result in a terminating state.
117
+ done = terminated | truncated
118
+ valid_id = resulting_identifier
119
+ while resulting_identifier in self.tree.nodes.keys(): # iterate over all circle nodes. important not to hash the parent node id to get the next id, because it will not be the same for all circle nodes.
120
+ if self.tree.nodes[resulting_identifier].invalid == False:
121
+ valid_id = resulting_identifier
122
+ resulting_identifier = hash((666, resulting_identifier))
123
+ # after this while, the id is for sure unused.
124
+ new_node = Node(identifier=resulting_identifier, state=state, action=action, action_space=self.action_space, reward=reward, terminal=done, pos=self.env.unwrapped.agent_pos, depth=depth)
125
+ if self.is_parent_child_same(new_node, node): # this is not a regular circle, but it indicates that the action - whether or not its outcome was intended - led to staying put. note this could happen more than once: twice in history someone tried to go against the wall from 2 different paths, and both should be tagged invalid.
126
+ new_node.invalid = True
127
+ new_node.got_invalid = True
128
+ # if this is a legit (s,a,s'), find the valid one and check whether this one might be more valid.
129
+ elif valid_id in self.tree.nodes.keys(): # who can tell which node is invalid? might be that this is the more probable way to get here, it just happened later. maybe think of summing back up the num of visits to decide which one to make invalid.
130
+ # print("CIRCLE DETECTED!") # circle can be detected by 2 nodes making the wrong stochastic action one after another, in different times!
131
+ self.decide_invalid_path(new_node_father=node, old_node=self.tree.nodes[valid_id], new_node=new_node)
132
+ # self.mark_invalid_children(node.children_identifiers, action)
133
+ self.tree.add_node(new_node, node)
134
+ newely_expanded += 1
135
+ return new_node
136
+
137
+ def simulation(self, node):
138
+ if node.terminal:
139
+ return node.reward
140
+ if self.use_heuristic:
141
+ # taken from Monte-Carlo Planning for Pathfinding in Real-Time Strategy Games, 2010.
142
+ # need to handle the case of walking into a wall here: the resulting node will be considered invalid and its reward and performance need to be 0, but stochasticity must be handled.
143
+ # suggestion to handle stochasticity - consider *all* the children associated with taking action 2 towards a wall as performance 0, even if they accidentally led to walking in another direction.
144
+ # which suggests the invalidity needs to be checked not according to the resulting state but according to the intended action itself and the environment! remember, you cannot access "stochastic_action", it is meant to be hidden from you.
145
+ if node.pos[0] == self.goal[0] and node.pos[1] == self.goal[1] : return 2
146
+ if node.invalid: return -0.5
147
+ else: return 0.8*(1 / (abs(node.pos[0] - self.goal[0]) + abs(node.pos[1] - self.goal[1]))) + 0.2*(1/node.depth) # closer to the goal -> smaller Manhattan distance -> larger first term; shallower nodes get a larger 1/depth bonus.
148
+ while True:
149
+ action = random.randint(0, self.action_space-1)
150
+ state, reward, terminated, truncated, _ = self.env.step(self.stochastic_action(action))
151
+ done = terminated | truncated # this time there could be truncation unlike in the tree policy.
152
+ if done:
153
+ return reward
154
+
155
+ def compute_value(self, parent, child, exploration_constant):
156
+ exploration_term = exploration_constant * sqrt(2*log(parent.num_visits) / child.num_visits)
157
+ return child.performance + exploration_term
158
+
159
+ # return the best action from a node. the value of an action is the visit-weighted average of the values of all children associated with that action.
160
+ def best_action(self, node, exploration_constant):
161
+ tried_actions_values = {} # dictionary mapping actions to tuples of (cumulative number of visits of children, sum of (child performance * num of visits for child)) to compute the mean later
162
+ if tuple(node.pos) == (1,2) and node.depth == 3 and node.action == 0: # leftover debugging hook; the body below is a no-op
163
+ pass
164
+ children = [child for child in self.tree.children(node) if not child.invalid]
165
+ if not children: # all children are invalid, so this node is invalid as well.
166
+ return 2
167
+ for child in children:
168
+ value = self.compute_value(node, child, exploration_constant)
169
+ tried_actions_values.setdefault(child.action, [0, 0]) # create if it doesn't exist
170
+ tried_actions_values[child.action][0] += child.num_visits # add the number of child visits
171
+ tried_actions_values[child.action][1] += value * child.num_visits # add the relative performance of this child
172
+ return max(tried_actions_values, key=lambda k: tried_actions_values[k][1] / tried_actions_values[k][0]) # return the key (action) with the highest average value
173
+
174
+ # only changes the environment to make sure the actions which are already a part of the plan have been executed.
175
+ def execute_partial_plan(self, plan):
176
+ node = self.tree.root
177
+ depth = 0
178
+ for action in plan:
179
+ depth += 1
180
+ # important to simulate the env to get to some state, as the nodes don't hold this information.
181
+ state, reward, terminated, truncated, _ = self.env.step(action)
182
+ done = terminated
183
+ if done: return None, False
184
+ resulting_identifier = hash((tuple(node.pos), node.state['direction'], action, tuple(self.env.unwrapped.agent_pos), state['direction']))
185
+ node = self.tree.nodes[resulting_identifier]
186
+ return node, True
187
+
188
+ # tree policy: descend from the root, balancing exploration and exploitation, expanding where possible, until reaching a new or terminal node.
189
+ def tree_policy(self, root_depth):
190
+ node = self.tree.root
191
+ depth = root_depth
192
+ while not (node.terminal or node.invalid):
193
+ depth += 1
194
+ if self.tree.is_expandable(node):
195
+ # expansion - in case there's an action that has never been tried, its value is infinity to encourage exploration of all children of a node.
196
+ return self.expand(node, depth), depth
197
+ else:
198
+ # selection - balance exploration and exploitation coming down the tree - but note the selection might lead to new nodes because of stochasticity.
199
+ best_action = self.best_action(node, exploration_constant=1/sqrt(2.0))
200
+ if best_action == -1: break
201
+ # important to simulate the env to get to some state, as the nodes don't hold this information.
202
+ state, reward, terminated, truncated, _ = self.env.step(self.stochastic_action(best_action))
203
+ # due to stochasticity, nodes could sometimes be terminal and sometimes they aren't. important to update it. also, the resulting state
204
+ # could be a state we've never been at due to uncertainty of actions' outcomes.
205
+ # if the resulting state creates a parent-action-child triplet that hasn't been seen before, add to the tree and return it, similar result to 'expand'.
206
+ # the hashing must include the action, because we want to enable getting to the same state stochastically from 2 different states: walking forward from (1,2) looking right and getting to (2,2) - the expected behavior, should be allowed even if the agent once stood at (2,1), looked down, turned right and accidentally proceeded forward.
207
+ resulting_identifier = [child_id for child_id in node.children_identifiers if all(a == b for a, b in zip(self.tree.nodes[child_id].pos, self.env.unwrapped.agent_pos)) and self.tree.nodes[child_id].action == best_action]
208
+ if len(resulting_identifier) == 0: # took an action done before, but it led to a new state.
209
+ resulting_identifier = hash((tuple(node.pos), node.state['direction'], best_action, tuple(self.env.unwrapped.agent_pos), state['direction']))
210
+ return self.expand_selection_stochastic_node(node, resulting_identifier, terminated, truncated, reward, best_action, state, depth), depth
211
+ assert len(resulting_identifier) == 1
212
+ node = self.tree.nodes[resulting_identifier[0]]
213
+ return node, depth
214
+
215
+ # receives a final state node and updates the rewards of all the nodes on the path to the root
216
+ def backpropagation(self, node, value):
217
+ while node != self.tree.parent(self.tree.root):
218
+ assert node != None # if we got to None it means we got to the actual root with the backpropagation instead of to the current root, which means that in this path someone had a different parent than it should, probably a double id.
219
+ node.num_visits += 1
220
+ node.total_simulation_reward += value
221
+ node.performance = node.total_simulation_reward/node.num_visits
222
+ node = self.tree.parent(node)
223
+
224
+
225
+ def generate_full_policy_sequence(self, env_name, problem_name, save_fig=False, fig_path=None, env_prop=None):
226
+ trace = []
227
+ node, prev_node = self.tree.root, self.tree.root
228
+ print("generating policy sequence.")
229
+ for action in self.plan:
230
+ print(f"position {tuple(node.pos)} direction {dict_dir_id_to_str[node.state['direction']]}, action {dict_action_id_to_str[action]}")
231
+ candidate_children = [child for child in self.tree.children(node) if child.action == action] # there could be some children associated with the best action, representing different outcomes.
232
+ assert len(candidate_children) > 0
233
+ node = max(candidate_children, key=lambda node: node.num_visits) # pick the child that was visited most, meaning it represents the desired action and not the undesired outcomes.
234
+ trace.append(((prev_node.state, tuple(prev_node.pos)), node.action)) # need to add the previous node with the action leading to the next node which is a property of the next node
235
+ prev_node = node
236
+ if save_fig:
237
+ assert fig_path!=None
238
+ save_figure(trace, env_name, problem_name, fig_path, env_prop)
239
+ else:
240
+ assert fig_path==None
241
+ return trace
242
+
243
+
244
+ def save_model_and_generate_policy(tree, original_root, model_file_path, monteCarloTreeSearch):
245
+ tree.root = original_root
246
+ with open(model_file_path, 'wb') as file: # Serialize the model
247
+ monteCarloTreeSearch.env = None # pickle cannot serialize lambdas which exist in the env
248
+ pickle.dump(monteCarloTreeSearch, file)
249
+
250
+
251
+ def plan(env_name, problem_name, goal, save_fig=False, fig_path=None, env_prop=None):
252
+ global newely_expanded
253
+ model_dir = get_agent_model_dir(env_name=env_name, model_name=problem_name, class_name="MCTS")
254
+ model_file_path = os.path.join(model_dir, "mcts_model.pth")
255
+ if os.path.exists(model_file_path):
256
+ print(f"Loading pre-existing mcts planner in {model_file_path}")
257
+ with open(model_file_path, 'rb') as file: # Load the pre-existing model
258
+ try:
259
+ monteCarloTreeSearch = pickle.load(file)
260
+ except Exception as e:
261
+ class RenameUnpickler(pickle.Unpickler):
262
+ def find_class(self, module, name):
263
+ renamed_module = module
264
+ if module.startswith("ml"):
265
+ renamed_module = "gr_libs." + renamed_module
266
+ return super(RenameUnpickler, self).find_class(renamed_module, name)
267
+ def renamed_load(file_obj):
268
+ return RenameUnpickler(file_obj).load()
269
+ file.seek(0)
270
+ monteCarloTreeSearch = renamed_load(file)
271
+
272
+ with open(model_file_path, 'wb') as file:
273
+ pickle.dump(monteCarloTreeSearch, file)
274
+
275
+ return monteCarloTreeSearch.generate_full_policy_sequence(env_name, problem_name, save_fig, fig_path, env_prop)
276
+ if not os.path.exists(model_dir): # if we reached here, the model doesn't exist. make sure its folder exists.
277
+ os.makedirs(model_dir)
278
+ steps = 10000
279
+ print(f"No tree found. Executing MCTS, starting with {steps} rollouts for each action.")
280
+ env = gym.make(id=problem_name)
281
+ random.seed(2)
282
+ tree = Tree()
283
+ mcts = MonteCarloTreeSearch(env=env, tree=tree, goal=goal)
284
+ original_root = tree.root
285
+ depth = 0
286
+ while not tree.root.terminal: # we iterate until the root is a terminal state, meaning the game is over.
287
+ max_reward = 0
288
+ iteration = 0
289
+ steps = max(2000,int(steps*0.9))
290
+ print(f"Executing {steps} rollouts for each action now.")
291
+ tq = tqdm(range(steps), postfix=f"Iteration: {iteration}, Num of steps: {len(mcts.plan)}. depth: {depth}. Max reward: {max_reward}. plan to {tuple(env.unwrapped.agent_pos)}, newely expanded: {0}")
292
+ for n in tq:
293
+ iteration = n
294
+ mcts.env.reset()
295
+ # when executing the partial plan, it's possible the environment finished due to the stochasticity. the execution would return false if that happened.
296
+ depth = len(mcts.plan)
297
+ mcts.tree.root = original_root # need to return it to the original root before executing the partial plan as it can lead to a different path and the root can change between iterations.
298
+ node, result = mcts.execute_partial_plan(mcts.plan)
299
+ if not result:
300
+ # false return value from partial plan execution means the plan is finished. we can mark our root as terminal and exit, happy with our plan.
301
+ tree.root.terminal = True
302
+ save_model_and_generate_policy(tree=tree, original_root=original_root, model_file_path=model_file_path, monteCarloTreeSearch=mcts)
303
+ return mcts.generate_full_policy_sequence(env_name, problem_name, save_fig, fig_path, env_prop)
304
+ plan_pos, plan_dir = node.pos, dict_dir_id_to_str[node.state['direction']]
305
+ tree.root = node # determine the root to be the node executed after the plan for this iteration.
306
+ node, depth = mcts.tree_policy(root_depth=depth) # find a path to a new unvisited node (unique sequence of actions) by utilizing explorative policy or choosing unvisited children recursively
307
+ # if the node that returned from tree policy is terminal, the reward will be returned from "simulation" function immediately.
308
+ reward = mcts.simulation(node) # proceed from that node randomly and collect the final reward expected from it (heuristic)
309
+ if reward > max_reward:
310
+ max_reward = reward
311
+ mcts.backpropagation(node, reward) # update the performances of nodes along the way until the root
312
+ tq.set_postfix_str(f"Iteration: {iteration}, Num of steps: {len(mcts.plan)}. depth: {depth}. Max reward: {max_reward}. plan to {tuple(plan_pos)}, looking {plan_dir}. newely expanded: {newely_expanded}")
313
+ # update the root and start from it next time.
314
+ newely_expanded = 0
315
+ action = mcts.best_action(node=tree.root, exploration_constant=0)
316
+ if action == -1:
317
+ pass
318
+ mcts.plan.append(action)
319
+ print(f"Executed action {action}")
320
+ save_model_and_generate_policy(tree=tree, original_root=original_root, model_file_path=model_file_path, monteCarloTreeSearch=mcts)
321
+ return mcts.generate_full_policy_sequence(env_name, problem_name, save_fig, fig_path, env_prop)
322
+
323
+ if __name__ == "__main__":
324
+ # register(
325
+ # id="MiniGrid-DynamicGoalEmpty-8x8-3x6-v0",
326
+ # entry_point="minigrid.envs:DynamicGoalEmpty",
327
+ # kwargs={"size": 8, "agent_start_pos" : (1, 1), "goal_pos": (3,6) },
328
+ # )
329
+ # plan("MiniGrid-DynamicGoalEmpty-8x8-3x6-v0")
330
+ pass
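For orientation, a minimal usage sketch of the plan() entry point defined above. It assumes the MiniGrid problem id from the commented-out register() call at the bottom of the file is actually available (i.e. a DynamicGoalEmpty environment class exists in the caller's setup); "MINIGRID" and the goal cell (3, 6) are placeholders, and a fresh run trains an MCTS tree from scratch, which is expensive.

    from gymnasium.envs.registration import register
    from gr_libs.ml.planner.mcts.mcts_model import plan

    # mirrors the commented-out register() call above; the DynamicGoalEmpty entry point is an assumption
    register(
        id="MiniGrid-DynamicGoalEmpty-8x8-3x6-v0",
        entry_point="minigrid.envs:DynamicGoalEmpty",
        kwargs={"size": 8, "agent_start_pos": (1, 1), "goal_pos": (3, 6)},
    )

    # returns a list of ((state, position), action) steps from start to goal
    trace = plan(env_name="MINIGRID",
                 problem_name="MiniGrid-DynamicGoalEmpty-8x8-3x6-v0",
                 goal=(3, 6))
    for (state, pos), action in trace:
        print(pos, action)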
@@ -0,0 +1,2 @@
1
+ from .node import Node
2
+ from .tree import Tree
@@ -0,0 +1,33 @@
1
+ import random
2
+
3
+ class Node:
4
+
5
+ def __init__(self, identifier, state, action, action_space, reward, terminal, pos, depth):
6
+ self.identifier = identifier
7
+ self.parent_identifier = None
8
+ self.children_identifiers = []
9
+ self.untried_actions = list(range(action_space))
10
+ self.state = state
11
+ self.pos = pos
12
+ self.total_simulation_reward = 0
13
+ self.num_visits = 0
14
+ self.performance = 0
15
+ self.action = action
16
+ self.reward = reward
17
+ self.terminal = terminal
18
+ self.invalid = False
19
+ self.got_invalid = False
20
+ self.depth = depth
21
+
22
+ def __str__(self):
23
+ return "{}: (action={}, visits={}, reward={:d}, ratio={:0.4f})".format(
24
+ self.state,
25
+ self.action,
26
+ self.num_visits,
27
+ int(self.total_simulation_reward),
28
+ self.performance)
29
+
30
+ def untried_action(self):
31
+ action = random.choice(self.untried_actions)
32
+ self.untried_actions.remove(action)
33
+ return action
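A small sketch of how Node's untried-action bookkeeping drives expansion (constructor arguments here are arbitrary placeholders; Tree.is_expandable in the next file returns True only while untried_actions is non-empty and the node is neither terminal nor invalid):

    from gr_libs.ml.planner.mcts.utils import Node

    n = Node(identifier=hash(("root",)), state={"direction": 0}, action=None,
             action_space=3, reward=0, terminal=False, pos=(1, 1), depth=0)
    while n.untried_actions:      # starts as [0, 1, 2]
        a = n.untried_action()    # removes and returns a random untried action
        print("trying action", a)
    # once the list is empty the node is no longer expandable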
@@ -0,0 +1,102 @@
1
+ def vertical_lines(last_node_flags):
2
+ vertical_lines = []
3
+ vertical_line = '\u2502'
4
+ for last_node_flag in last_node_flags[0:-1]:
5
+ if last_node_flag == False:
6
+ vertical_lines.append(vertical_line + ' ' * 3)
7
+ else:
8
+ # space between vertical lines
9
+ vertical_lines.append(' ' * 4)
10
+ return ''.join(vertical_lines)
11
+
12
+ def horizontal_line(last_node_flags):
13
+ horizontal_line = '\u251c\u2500\u2500 '
14
+ horizontal_line_end = '\u2514\u2500\u2500 '
15
+ if last_node_flags[-1]:
16
+ return horizontal_line_end
17
+ else:
18
+ return horizontal_line
19
+
20
+ class Tree:
21
+
22
+ def __init__(self):
23
+ self.nodes = {}
24
+ self.root = None
25
+
26
+ def is_expandable(self, node):
27
+ if node.terminal or node.invalid:
28
+ return False
29
+ if len(node.untried_actions) > 0:
30
+ return True
31
+ return False
32
+
33
+ def iter(self, identifier, depth, last_node_flags):
34
+ if identifier is None:
35
+ node = self.root
36
+ else:
37
+ node = self.nodes[identifier]
38
+
39
+ if depth == 0:
40
+ yield "", node
41
+ else:
42
+ yield vertical_lines(last_node_flags) + horizontal_line(last_node_flags), node
43
+
44
+ children = [self.nodes[identifier] for identifier in node.children_identifiers]
45
+ last_index = len(children) - 1
46
+
47
+ depth += 1
48
+ for index, child in enumerate(children):
49
+ last_node_flags.append(index == last_index)
50
+ for edge, node in self.iter(child.identifier, depth, last_node_flags):
51
+ yield edge, node
52
+ last_node_flags.pop()
53
+
54
+ def add_node(self, node, parent=None):
55
+ assert node.identifier not in self.nodes.keys()
56
+ self.nodes.update({node.identifier: node})
57
+
58
+ if parent is None:
59
+ self.root = node
60
+ self.nodes[node.identifier].parent_identifier = None
61
+ else:
62
+ self.nodes[parent.identifier].children_identifiers.append(node.identifier)
63
+ self.nodes[node.identifier].parent_identifier=parent.identifier
64
+
65
+ def update_id(self, old_id, new_id):
66
+ assert new_id not in self.nodes.keys()
67
+ # prepare needed objects
68
+ node = self.nodes[old_id]
69
+ parent = self.parent(node)
70
+
71
+ # update the node's parent
72
+ self.nodes[parent.identifier].children_identifiers.remove(old_id)
73
+ self.nodes[parent.identifier].children_identifiers.append(new_id)
74
+
75
+ # update the node itself
76
+ node.identifier = new_id
77
+
78
+ # update the node's children (if there are any)
79
+ for child_id in node.children_identifiers:
80
+ self.nodes[child_id].parent_identifier = new_id
81
+
82
+ self.nodes.pop(old_id)
83
+ self.nodes.update({node.identifier: node})
84
+
85
+ def children(self, node):
86
+ children = []
87
+ for identifier in self.nodes[node.identifier].children_identifiers:
88
+ children.append(self.nodes[identifier])
89
+ return children
90
+
91
+ def parent(self, node):
92
+ parent_identifier = self.nodes[node.identifier].parent_identifier
93
+ if parent_identifier is None:
94
+ return None
95
+ else:
96
+ return self.nodes[parent_identifier]
97
+
98
+ def show(self):
99
+ lines = ""
100
+ for edge, node in self.iter(identifier=None, depth=0, last_node_flags=[]):
101
+ lines += "{}{}\n".format(edge, node)
102
+ print(lines)
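A quick sketch of the Tree API above, using two hand-built nodes with arbitrary identifiers to show the parent/child bookkeeping and the ASCII rendering of show():

    from gr_libs.ml.planner.mcts.utils import Node, Tree

    tree = Tree()
    root = Node(identifier=0, state={"direction": 0}, action=None, action_space=3,
                reward=0, terminal=False, pos=(1, 1), depth=0)
    child = Node(identifier=1, state={"direction": 1}, action=2, action_space=3,
                 reward=0, terminal=False, pos=(1, 2), depth=1)
    tree.add_node(root)            # no parent -> becomes the root
    tree.add_node(child, root)     # registered as a child of the root

    assert tree.parent(child) is root
    assert tree.children(root) == [child]
    tree.show()                    # prints the tree with box-drawing edges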
@@ -0,0 +1 @@
1
+ from gr_libs.ml.sequential.lstm_model import LstmObservations
@@ -0,0 +1,192 @@
1
+ import os
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import torch.optim as optim
6
+ from types import MethodType
7
+ import numpy as np
8
+ from gr_libs.ml.utils import device
9
+ from torch.nn.utils.rnn import pack_padded_sequence
10
+
11
+
12
+ def accuracy_per_epoch(model, data_loader):
13
+ model.eval()
14
+ correct = total = 0.0
15
+ sum_loss = 0.0
16
+ with torch.no_grad():
17
+ for (first_traces, second_traces, is_same_goals, first_traces_lengths, second_traces_lengths) in data_loader:
18
+ y_pred = model.forward_tab(first_traces, second_traces, first_traces_lengths, second_traces_lengths)
19
+ loss = F.binary_cross_entropy(y_pred, is_same_goals)
20
+ sum_loss += loss.item()
21
+ y_pred = (y_pred >= 0.5)
22
+ correct += torch.sum(y_pred == is_same_goals)
23
+ total += len(is_same_goals)
24
+ return correct / total, sum_loss / 32
25
+
26
+ def accuracy_per_epoch_cont(model, data_loader):
27
+ model.eval()
28
+ correct = total = 0.0
29
+ sum_loss = 0.0
30
+ with torch.no_grad():
31
+ for (first_traces_images, first_traces_texts, second_traces_images, second_traces_texts, is_same_goals, first_traces_lengths, second_traces_lengths) in data_loader:
32
+ y_pred = model.forward_cont(first_traces_images, first_traces_texts, second_traces_images, second_traces_texts, first_traces_lengths, second_traces_lengths)
33
+ loss = F.binary_cross_entropy(y_pred, is_same_goals)
34
+ sum_loss += loss.item()
35
+ y_pred = (y_pred >= 0.5)
36
+ correct += torch.sum(y_pred == is_same_goals)
37
+ total += len(is_same_goals)
38
+ return correct / total, sum_loss / 32
39
+
40
+ # class CNNImageEmbeddor(nn.Module):
41
+ # def __init__(self, obs_space, action_space, use_text=False):
42
+ # super().__init__()
43
+ # self.use_text = use_text
44
+ # self.image_conv = nn.Sequential(
45
+ # nn.Conv2d(3, 4, kernel_size=(3, 3), padding=1), # Reduced filters, added padding
46
+ # nn.ReLU(),
47
+ # nn.MaxPool2d((2, 2)),
48
+ # nn.Conv2d(4, 4, (3, 3), padding=1), # Reduced filters, added padding
49
+ # nn.ReLU(),
50
+ # nn.MaxPool2d((2, 2)), # Added additional pooling to reduce size
51
+ # nn.Conv2d(4, 8, (3, 3), padding=1), # Reduced filters, added padding
52
+ # nn.ReLU(),
53
+ # nn.BatchNorm2d(8)
54
+ # )
55
+ # n = obs_space["image"][0]
56
+ # m = obs_space["image"][1]
57
+ # self.image_embedding_size = ((n - 4) // 4 - 3) * ((m - 4) // 4 - 3) * 8
58
+ # if self.use_text:
59
+ # self.word_embedding_size = 32
60
+ # self.word_embedding = nn.Embedding(obs_space["text"], self.word_embedding_size)
61
+ # self.text_embedding_size = 128
62
+ # self.text_rnn = nn.GRU(self.word_embedding_size, self.text_embedding_size, batch_first=True)
63
+
64
+ def forward(self, images, texts): # note: this and _get_embed_text below appear to belong to the commented-out CNNImageEmbeddor above and are unused while that class is disabled
65
+ # images shape: batch_size X max_sequence_len X sample_size. same for text.
66
+ # need to reshape image to num_channels X height X width, like nn.Conv expects it to be.
67
+ x = images.transpose(2, 4).transpose(3, 4)
68
+ orig_shape = x.shape
69
+ # combine batch and sequence to 1 dimension so conv could handle it
70
+ x = x.view(orig_shape[0]*orig_shape[1], orig_shape[2], orig_shape[3], orig_shape[4]) # x shape: batch_size * max_sequence_len X sample_size
71
+ x = self.image_conv(x) # x shape: batch_size * max_sequence_len X last_conv_size X 1 X 1
72
+ # reshape x back to divide batches from sequences
73
+ x = x.view(orig_shape[0], orig_shape[1], x.shape[1]) # x shape: batch_size X max_sequence_len X last_conv_size. last 2 dimensions (1,1) are collapsed to last conv.
74
+ embedding = x
75
+
76
+ if self.use_text:
77
+ embed_text = self._get_embed_text(texts)
78
+ embedding = torch.cat((embedding, embed_text), dim=1)
79
+
80
+ return embedding
81
+
82
+ def _get_embed_text(self, text):
83
+ _, hidden = self.text_rnn(self.word_embedding(text))
84
+ return hidden[-1]
85
+
86
+ class LstmObservations(nn.Module):
87
+
88
+ def __init__(self, input_size, hidden_size): # TODO make sure the right cuda is used!
89
+ super(LstmObservations,self).__init__()
90
+ #self.embeddor = CNNImageEmbeddor(obs_space, action_space)
91
+ # check if the traces are a bunch of images
92
+ self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
93
+ self.dropout = nn.Dropout(0.5) # Added dropout layer
94
+ # Initialize weights
95
+ for name, param in self.lstm.named_parameters():
96
+ if 'weight' in name:
97
+ nn.init.xavier_uniform_(param)
98
+ elif 'bias' in name:
99
+ nn.init.zeros_(param)
100
+
101
+
102
+ # tabular
103
+ def forward_tab(self, traces1, traces2, lengths1, lengths2):
104
+ out1, (ht1, ct1) = self.lstm(pack_padded_sequence(traces1, lengths1, batch_first=True, enforce_sorted=False), None) # traces1 & traces 2 shapes: batch_size X max sequence_length X embedding_size
105
+ out2, (ht2, ct2) = self.lstm(pack_padded_sequence(traces2, lengths2, batch_first=True, enforce_sorted=False), None)
106
+ # out1, _ = pad_packed_sequence(out1, batch_first=True, total_length=max(lengths1))
107
+ # out2, _ = pad_packed_sequence(out2, batch_first=True, total_length=max(lengths2))
108
+ manhattan_dis = torch.exp(-torch.sum(torch.abs(ht1[-1]-ht2[-1]),dim=1,keepdim=True))
109
+ return manhattan_dis.squeeze()
110
+
111
+ # continuous
112
+ # def forward_cont(self, traces1_images, traces1_texts, traces2_images, traces2_texts, lengths1, lengths2):
113
+ # # we also embed '0' images, but we take them out of the equation in the lstm (it knows to not treat them when batching)
114
+ # traces1 = self.embeddor(traces1_images, traces1_texts)
115
+ # traces2 = self.embeddor(traces2_images, traces2_texts) # traces1 & traces 2 shapes: batch_size X max_sequence_length X embedding_size
116
+ # out1, (ht1, ct1) = self.lstm(pack_padded_sequence(traces1, lengths1, batch_first=True, enforce_sorted=False), None)
117
+ # out2, (ht2, ct2) = self.lstm(pack_padded_sequence(traces2, lengths2, batch_first=True, enforce_sorted=False), None)
118
+ # manhattan_dis = torch.exp(-torch.sum(torch.abs(ht1[-1]-ht2[-1]),dim=1,keepdim=True))
119
+ # return manhattan_dis.squeeze()
120
+
121
+ def embed_sequence(self, trace):
122
+ trace = torch.stack([torch.tensor(observation, dtype=torch.float32) for observation in trace]).to(device)
123
+ out, (ht, ct) = self.lstm(trace, None)
124
+ return ht[-1]
125
+
126
+ # def embed_sequence_cont(self, sequence, preprocess_obss):
127
+ # sequence = [preprocess_obss([obs])[0] for ((obs, (_, _)), _) in sequence]
128
+ # trace_images = torch.tensor(np.expand_dims(torch.stack([step.image for step in sequence]), axis=0)).to(device)
129
+ # trace_texts = torch.tensor(np.expand_dims(torch.stack([step.text for step in sequence]), axis=0)).to(device)
130
+ # embedded_trace = self.embeddor(trace_images, trace_texts)
131
+ # out, (ht, ct) = self.lstm(embedded_trace)
132
+ # return ht[-1]
133
+
134
+ def train_metric_model(model, train_loader, dev_loader, nepochs=5, patience = 2):
135
+ devAccuracy = []
136
+ best_dev_accuracy = 0.0
137
+ no_improvement_count = 0
138
+ optimizer = torch.optim.Adadelta(model.parameters(), weight_decay=0.1)
139
+ scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)
140
+ for epoch in range(nepochs):
141
+ sum_loss, denominator = 0.0, 0.0
142
+ model.train()
143
+ for (first_traces, second_traces, is_same_goals, first_traces_lengths, second_traces_lengths) in train_loader:
144
+ model.zero_grad()
145
+ y_pred = model.forward_tab(first_traces, second_traces, first_traces_lengths, second_traces_lengths)
146
+ if len(is_same_goals) == 1: is_same_goals = torch.squeeze(is_same_goals) # for the case of batches in size 1...
147
+ loss = F.binary_cross_entropy(y_pred, is_same_goals)
148
+ sum_loss += loss.item()
149
+ denominator += 1
150
+ loss.backward()
151
+ optimizer.step()
152
+
153
+ dev_accuracy, dev_loss = accuracy_per_epoch(model, dev_loader)
154
+ devAccuracy.append(dev_accuracy)
155
+ if dev_accuracy > best_dev_accuracy:
156
+ best_dev_accuracy = dev_accuracy
157
+ no_improvement_count = 0
158
+ else:
159
+ no_improvement_count += 1
160
+
161
+ print("epoch - {}/{}...".format(epoch + 1, nepochs),
162
+ "train loss - {:.6f}...".format(sum_loss / denominator),
163
+ "dev loss - {:.6f}...".format(dev_loss),
164
+ "dev accuracy - {:.6f}".format(dev_accuracy))
165
+
166
+ if no_improvement_count >= patience:
167
+ print(f"Early stopping after {epoch + 1} epochs with no improvement.")
168
+ break
169
+
170
+ def train_metric_model_cont(model, train_loader, dev_loader, nepochs=5):
171
+ devAccuracy = []
172
+ optimizer = torch.optim.Adadelta(model.parameters(),weight_decay=1.25)
173
+ for epoch in range(nepochs):
174
+ sum_loss, denominator = 0.0, 0.0
175
+ model.train()
176
+ for (first_traces_images, first_traces_texts, second_traces_images, second_traces_texts, is_same_goals, first_traces_lengths, second_traces_lengths) in train_loader:
177
+ model.zero_grad()
178
+ y_pred = model.forward_cont(first_traces_images, first_traces_texts, second_traces_images, second_traces_texts, first_traces_lengths, second_traces_lengths)
179
+ loss = F.binary_cross_entropy(y_pred, is_same_goals)
180
+ sum_loss += loss.item()
181
+ denominator += 1
182
+ loss.backward()
183
+ optimizer.step()
184
+
185
+ dev_accuracy, dev_loss = accuracy_per_epoch_cont(model, dev_loader)
186
+ devAccuracy.append(dev_accuracy)
187
+
188
+ print("epoch - {}/{}...".format(epoch + 1, nepochs),
189
+ "train loss - {:.6f}...".format(sum_loss / denominator),
190
+ "dev loss - {:.6f}...".format(dev_loss),
191
+ "dev accuracy - {:.6f}".format(dev_accuracy))
192
+
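To make the expected tensor shapes concrete, here is a small sketch of the Siamese similarity computed by forward_tab above. The batch size, sequence lengths and embedding size are arbitrary choices for illustration, not values used by the package:

    import torch
    from gr_libs.ml.sequential.lstm_model import LstmObservations

    model = LstmObservations(input_size=8, hidden_size=16)
    model.eval()

    batch, max_len, emb = 4, 10, 8
    traces1 = torch.randn(batch, max_len, emb)   # padded traces, batch_first
    traces2 = torch.randn(batch, max_len, emb)
    lengths1 = torch.tensor([10, 7, 5, 3])       # true lengths before padding
    lengths2 = torch.tensor([9, 10, 4, 6])

    with torch.no_grad():
        similarity = model.forward_tab(traces1, traces2, lengths1, lengths2)
    print(similarity.shape)   # torch.Size([4]); values in (0, 1], higher = more similar traces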
@@ -0,0 +1,3 @@
1
+ from gr_libs.ml.tabular.state import TabularState
2
+ from gr_libs.ml.tabular.tabular_q_learner import TabularQLearner
3
+ from gr_libs.ml.sequential.lstm_model import LstmObservations