gr-libs 0.1.7.post0__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. evaluation/analyze_results_cross_alg_cross_domain.py +236 -246
  2. evaluation/create_minigrid_map_image.py +10 -6
  3. evaluation/file_system.py +16 -5
  4. evaluation/generate_experiments_results.py +123 -74
  5. evaluation/generate_experiments_results_new_ver1.py +227 -243
  6. evaluation/generate_experiments_results_new_ver2.py +317 -317
  7. evaluation/generate_task_specific_statistics_plots.py +481 -253
  8. evaluation/get_plans_images.py +41 -26
  9. evaluation/increasing_and_decreasing_.py +97 -56
  10. gr_libs/__init__.py +2 -1
  11. gr_libs/_version.py +2 -2
  12. gr_libs/environment/__init__.py +16 -8
  13. gr_libs/environment/environment.py +167 -39
  14. gr_libs/environment/utils/utils.py +22 -12
  15. gr_libs/metrics/__init__.py +5 -0
  16. gr_libs/metrics/metrics.py +76 -34
  17. gr_libs/ml/__init__.py +2 -0
  18. gr_libs/ml/agent.py +21 -6
  19. gr_libs/ml/base/__init__.py +1 -1
  20. gr_libs/ml/base/rl_agent.py +13 -10
  21. gr_libs/ml/consts.py +1 -1
  22. gr_libs/ml/neural/deep_rl_learner.py +433 -352
  23. gr_libs/ml/neural/utils/__init__.py +1 -1
  24. gr_libs/ml/neural/utils/dictlist.py +3 -3
  25. gr_libs/ml/neural/utils/penv.py +5 -2
  26. gr_libs/ml/planner/mcts/mcts_model.py +524 -302
  27. gr_libs/ml/planner/mcts/utils/__init__.py +1 -1
  28. gr_libs/ml/planner/mcts/utils/node.py +11 -7
  29. gr_libs/ml/planner/mcts/utils/tree.py +14 -10
  30. gr_libs/ml/sequential/__init__.py +1 -1
  31. gr_libs/ml/sequential/lstm_model.py +256 -175
  32. gr_libs/ml/tabular/state.py +7 -7
  33. gr_libs/ml/tabular/tabular_q_learner.py +123 -73
  34. gr_libs/ml/tabular/tabular_rl_agent.py +20 -19
  35. gr_libs/ml/utils/__init__.py +8 -2
  36. gr_libs/ml/utils/format.py +78 -70
  37. gr_libs/ml/utils/math.py +2 -1
  38. gr_libs/ml/utils/other.py +1 -1
  39. gr_libs/ml/utils/storage.py +88 -28
  40. gr_libs/problems/consts.py +1549 -1227
  41. gr_libs/recognizer/gr_as_rl/gr_as_rl_recognizer.py +145 -80
  42. gr_libs/recognizer/graml/gr_dataset.py +209 -110
  43. gr_libs/recognizer/graml/graml_recognizer.py +431 -240
  44. gr_libs/recognizer/recognizer.py +38 -27
  45. gr_libs/recognizer/utils/__init__.py +1 -1
  46. gr_libs/recognizer/utils/format.py +8 -3
  47. {gr_libs-0.1.7.post0.dist-info → gr_libs-0.1.8.dist-info}/METADATA +1 -1
  48. gr_libs-0.1.8.dist-info/RECORD +70 -0
  49. {gr_libs-0.1.7.post0.dist-info → gr_libs-0.1.8.dist-info}/WHEEL +1 -1
  50. tests/test_gcdraco.py +10 -0
  51. tests/test_graml.py +8 -4
  52. tests/test_graql.py +2 -1
  53. tutorials/gcdraco_panda_tutorial.py +66 -0
  54. tutorials/gcdraco_parking_tutorial.py +61 -0
  55. tutorials/graml_minigrid_tutorial.py +42 -12
  56. tutorials/graml_panda_tutorial.py +35 -14
  57. tutorials/graml_parking_tutorial.py +37 -20
  58. tutorials/graml_point_maze_tutorial.py +33 -13
  59. tutorials/graql_minigrid_tutorial.py +31 -15
  60. gr_libs-0.1.7.post0.dist-info/RECORD +0 -67
  61. {gr_libs-0.1.7.post0.dist-info → gr_libs-0.1.8.dist-info}/top_level.txt +0 -0
@@ -13,318 +13,540 @@ import gymnasium as gym
13
13
  PROB = 0.8
14
14
  UNIFORM_PROB = 0.1
15
15
  newely_expanded = 0
16
- dict_dir_id_to_str = {0:'right', 1:'down', 2:'left', 3:'up'}
17
- dict_action_id_to_str = {0:'turn left', 1:'turn right', 2:'go straight'}
16
+ dict_dir_id_to_str = {0: "right", 1: "down", 2: "left", 3: "up"}
17
+ dict_action_id_to_str = {0: "turn left", 1: "turn right", 2: "go straight"}
18
+
18
19
 
19
20
  def save_figure(steps, env_name, problem_name, img_path, env_prop):
20
- sequence = [pos for ((state, pos), action) in steps]
21
- #print(f"sequence to {self.problem_name} is:\n\t{steps}\ngenerating image at {img_path}.")
22
- print(f"generating sequence image at {img_path}.")
23
- env_prop.create_sequence_image(sequence, img_path, problem_name)
21
+ sequence = [pos for ((state, pos), action) in steps]
22
+ # print(f"sequence to {self.problem_name} is:\n\t{steps}\ngenerating image at {img_path}.")
23
+ print(f"generating sequence image at {img_path}.")
24
+ env_prop.create_sequence_image(sequence, img_path, problem_name)
25
+
24
26
 
25
27
  # TODO add number of expanded nodes and debug by putting breakpoint on the creation of nodes representing (8,4) and checking if they're invalid or something
26
28
 
29
+
27
30
  # Explanation on hashing and uncertainty in the acto outcome:
28
31
  # We want to detect circles, while not preventing expected behavior. To achieve it, hasing must include previous state, action, and resulting state.
29
32
  # Hashing the direction means coming to the same position from different positions gets different id's.
30
- # Example: the agent might have stood at (2,2), picked action 2 (forward), and accidently turned right, resulting at state ((2,2), right).
31
- # later, when the agent stood at (2,1), looked right and walked forward, it got to the same state. We would want to enable that, because
32
- # this is the expected behavior, so these nodes must have unique id's.
33
+ # Example: the agent might have stood at (2,2), picked action 2 (forward), and accidently turned right, resulting at state ((2,2), right).
34
+ # later, when the agent stood at (2,1), looked right and walked forward, it got to the same state. We would want to enable that, because
35
+ # this is the expected behavior, so these nodes must have unique id's.
33
36
  # The situations where circles will indeed be detected, are only if the outcome was the same for the previous state, consistent with the action - whether it was or wasn't expected.
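The comment block above describes the node-identifier scheme: each id hashes the whole transition (previous position, previous direction, action, resulting position, resulting direction), so reaching the same cell and facing from two different predecessors yields two distinct nodes, and only an exact repeat of a (state, action, outcome) triplet collides and is treated as a circle. A minimal illustrative sketch of that idea, using hypothetical positions and the direction/action encodings from dict_dir_id_to_str and dict_action_id_to_str above (not code from the package):

    # Illustrative only: directions use 0 = right, 3 = up; action 2 = "go straight".

    # At (2,2) facing up, the agent chose "go straight" but slipped and ended up
    # at (2,2) facing right.
    id_slip = hash(((2, 2), 3, 2, (2, 2), 0))

    # Later, at (2,1) facing right, it walked forward as intended and reached the
    # same resulting configuration, (2,2) facing right.
    id_intended = hash(((2, 1), 0, 2, (2, 2), 0))

    # Different predecessor states give different identifiers, so the second
    # visit is not mistaken for a circle.
    print(id_slip != id_intended)  # True

    # Only repeating the exact same transition hashes to the same id, which is
    # the case the tree treats as a detected circle.
    print(hash(((2, 1), 0, 2, (2, 2), 0)) == id_intended)  # True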
34
- class MonteCarloTreeSearch():
35
-
36
- def __init__(self, env, tree, goal, use_heuristic=True):
37
- self.env = env
38
- self.tree = tree
39
- self.action_space = self.env.action_space.n
40
- self.action_space = 3 # currently
41
- state, _ = self.env.reset()
42
- self.use_heuristic = use_heuristic
43
- self.goal = goal
44
- self.tree.add_node(Node(identifier=hash((None, None, tuple(self.env.unwrapped.agent_pos), state['direction'])), state=state, action=None, action_space=self.action_space, reward=0, terminal=False, pos=env.unwrapped.agent_pos, depth=0))
45
- self.plan = []
46
-
47
- # def mark_invalid_children(self, children_identifiers, action):
48
- # for child_id in children_identifiers:
49
- # child = self.tree.nodes[child_id]
50
- # if child.action == action:
51
- # child.invalid = True
52
-
53
- def decide_invalid_path(self, new_node_father, old_node, new_node): # new_node created the circle, old_node got to the configuration first.
54
- new_visits, old_visits = [1,1], [0,0] # stochasticity couldn't result a cycle directly, because it involves a different action. we can get it only by making the same stochastic action mistake or just an actual cycle.
55
- new_node_ptr = new_node_father
56
- old_node_ptr = old_node
57
-
58
- while new_node_ptr != None:
59
- new_visits[0] += new_node_ptr.num_visits
60
- new_visits[1] += 1
61
- new_node_ptr = self.tree.parent(new_node_ptr)
62
-
63
- while old_node_ptr != None: # getting to the old node wasn't necessarily through the current root. check all the way until None, the original root's parent.
64
- old_visits[0] += old_node_ptr.num_visits
65
- old_visits[1] += 1
66
- old_node_ptr = self.tree.parent(old_node_ptr)
67
-
68
- if new_visits[0] / new_visits[1] > old_visits[0] / old_visits[1]: # newer node is the more probable one. make the 1st path the invalid one: its the one that created the circle!
69
- old_node.invalid = True
70
- # self.tree.update_id(old_id=old_node.identifier, new_id=new_node.identifier)
71
- else:
72
- new_node.invalid = True
73
-
74
- def is_parent_child_same(self, new_node, node):
75
- return new_node.pos[0] == node.pos[0] and new_node.pos[1] == node.pos[1] and new_node.state['direction'] == node.state['direction']
76
-
77
- def expand(self, node, depth):
78
- global newely_expanded
79
- action = node.untried_action()
80
- state, reward, terminated, truncated, _ = self.env.step(self.stochastic_action(action))
81
- done = terminated | truncated
82
- new_identifier = hash((tuple(node.pos), node.state['direction'], action, tuple(self.env.unwrapped.agent_pos), state['direction']))
83
- valid_id = new_identifier
84
- while new_identifier in self.tree.nodes.keys(): # iterate over all circle nodes. important not to hash the parent node id to get the next id, because it will not be the same for all circle nodes.
85
- if self.tree.nodes[new_identifier].invalid == False:
86
- valid_id = new_identifier
87
- new_identifier = hash((666, new_identifier))
88
- # after this while, the id is for sure unused.
89
- new_node = Node(identifier=new_identifier, state=state, action=action, action_space=self.action_space, reward=reward, terminal=done, pos=self.env.unwrapped.agent_pos, depth=depth)
90
- if self.is_parent_child_same(new_node, node): # this is not a regular circle but it indicates that the action - regardless if happened with or without intention, led to staying put. note this could happen even if the first if is true - twice in history someone tried to go against the wall from 2 different paths. both should be tagged invalid.
91
- new_node.invalid = True
92
- new_node.got_invalid = True
93
- # if this is a legit (s,a,s'), find the valid one and check whether this one might be more valid.
94
- elif valid_id in self.tree.nodes.keys(): # who can tell which node is invalid? might be that this is the more probable way to get here, it just happened later. maybe think of summing back up the num of visits to decide which one to make invalid.
95
- # print("CIRCLE DETECTED!") # circle can be detected by 2 nodes making the wrong stochastic action one after another, in different times!
96
-
97
- self.decide_invalid_path(new_node_father=node, old_node=self.tree.nodes[valid_id], new_node=new_node)
98
- # self.mark_invalid_children(node.children_identifiers, action)
99
-
100
- self.tree.add_node(new_node, node)
101
- # if action == 2 and tuple(self.env.unwrapped.agent_pos) == tuple(node.pos): # if the new node is actually invalid, mark it along with the other nodes of the same action as invalid, meaning reward will be 0 for them.
102
- # self.mark_invalid_children(node.children_identifiers)
103
- newely_expanded += 1
104
- return new_node
105
-
106
- def stochastic_action(self, choice):
107
- prob_distribution = []
108
- actions = range(self.action_space)
109
- for action in actions:
110
- if action == choice: prob_distribution.append(PROB)
111
- else: prob_distribution.append(UNIFORM_PROB)
112
- return random.choices(actions, weights=prob_distribution, k=1)[0]
113
-
114
- def expand_selection_stochastic_node(self, node, resulting_identifier, terminated, truncated, reward, action, state, depth):
115
- global newely_expanded
116
- # the new node could result in a terminating state.
117
- done = terminated | truncated
118
- valid_id = resulting_identifier
119
- while resulting_identifier in self.tree.nodes.keys(): # iterate over all circle nodes. important not to hash the parent node id to get the next id, because it will not be the same for all circle nodes.
120
- if self.tree.nodes[resulting_identifier].invalid == False:
121
- valid_id = resulting_identifier
122
- resulting_identifier = hash((666, resulting_identifier))
123
- # after this while, the id is for sure unused.
124
- new_node = Node(identifier=resulting_identifier, state=state, action=action, action_space=self.action_space, reward=reward, terminal=done, pos=self.env.unwrapped.agent_pos, depth=depth)
125
- if self.is_parent_child_same(new_node, node): # this is not a regular circle but it indicates that the action - regardless if happened with or without intention, led to staying put. note this could happen even if the first if is true - twice in history someone tried to go against the wall from 2 different paths. both should be tagged invalid.
126
- new_node.invalid = True
127
- new_node.got_invalid = True
128
- # if this is a legit (s,a,s'), find the valid one and check whether this one might be more valid.
129
- elif valid_id in self.tree.nodes.keys(): # who can tell which node is invalid? might be that this is the more probable way to get here, it just happened later. maybe think of summing back up the num of visits to decide which one to make invalid.
130
- # print("CIRCLE DETECTED!") # circle can be detected by 2 nodes making the wrong stochastic action one after another, in different times!
131
- self.decide_invalid_path(new_node_father=node, old_node=self.tree.nodes[valid_id], new_node=new_node)
132
- # self.mark_invalid_children(node.children_identifiers, action)
133
- self.tree.add_node(new_node, node)
134
- newely_expanded += 1
135
- return new_node
136
-
137
- def simulation(self, node):
138
- if node.terminal:
139
- return node.reward
140
- if self.use_heuristic:
141
- # taken from Monte-Carlo Planning for Pathfinding in Real-Time Strategy Games , 2010.
142
- # need to handle the case of walking into a wall here: the resulting node will be considered invalid and it's reward and performance needs to be 0, but must handle stochasticity
143
- # suggestion to handle stochasticity - consider *all* the children associated with taking action 2 towards a wall as performance 0, even if they accidently led in walking to another direction.
144
- # which suggests the invalidity needs to be checked not according to the resulting state, rather according to the intended action itself and the environment! remember, you cannot access the "stochastic_action", it is meant to be hidden from you.
145
- if node.pos[0] == self.goal[0] and node.pos[1] == self.goal[1] : return 2
146
- if node.invalid: return -0.5
147
- else: return 0.8*(1 / (abs(node.pos[0] - self.goal[0]) + abs(node.pos[1] - self.goal[1]))) + 0.2*(1/node.depth) # large depth = less probability of obstacles -> larger nominator higher performance. further from goal -> larger denominator, lower performance.
148
- while True:
149
- action = random.randint(0, self.action_space-1)
150
- state, reward, terminated, truncated, _ = self.env.step(self.stochastic_action(action))
151
- done = terminated | truncated # this time there could be truncation unlike in the tree policy.
152
- if done:
153
- return reward
154
-
155
- def compute_value(self, parent, child, exploration_constant):
156
- exploration_term = exploration_constant * sqrt(2*log(parent.num_visits) / child.num_visits)
157
- return child.performance + exploration_term
158
-
159
- # return the best action from a node. the value of an action is the weighted sum of performance of all children that are associated with this action.
160
- def best_action(self, node, exploration_constant):
161
- tried_actions_values = {} # dictionary mapping actions to tuples of (cumulative number of visits of children, sum of (child performance * num of visits for child)) to compute the mean later
162
- if tuple(node.pos) == (1,2) and node.depth == 3 and node.action == 0:
163
- pass
164
- children = [child for child in self.tree.children(node) if not child.invalid]
165
- if not children: # all children are invalid. this node is invalid aswell.
166
- return 2
167
- for child in children:
168
- value = self.compute_value(node, child, exploration_constant)
169
- tried_actions_values.setdefault(child.action, [0, 0]) # create if it doesn't exist
170
- tried_actions_values[child.action][0] += child.num_visits # add the number of child visits
171
- tried_actions_values[child.action][1] += value * child.num_visits # add the relative performance of this child
172
- return max(tried_actions_values, key=lambda k: tried_actions_values[k][1] / tried_actions_values[k][0]) # return the key (action) with the highest average value
173
-
174
- # only changes the environment to make sure the actions which are already a part of the plan have been executed.
175
- def execute_partial_plan(self, plan):
176
- node = self.tree.root
177
- depth = 0
178
- for action in plan:
179
- depth += 1
180
- # important to simulate the env to get to some state, as the nodes don't hold this information.
181
- state, reward, terminated, truncated, _ = self.env.step(action)
182
- done = terminated
183
- if done: return None, False
184
- resulting_identifier = hash((tuple(node.pos), node.state['direction'], action, tuple(self.env.unwrapped.agent_pos), state['direction']))
185
- node = self.tree.nodes[resulting_identifier]
186
- return node, True
187
-
188
- # finds the ultimate path from the root node to a terminal state (the one that maximized rewards)
189
- def tree_policy(self, root_depth):
190
- node = self.tree.root
191
- depth = root_depth
192
- while not (node.terminal or node.invalid):
193
- depth += 1
194
- if self.tree.is_expandable(node):
195
- # expansion - in case there's an action that never been tried, its value is infinity to encourage exploration of all children of a node.
196
- return self.expand(node, depth), depth
197
- else:
198
- # selection - balance exploration and exploitation, coming down the tree - but note the selection might lead to new nodes because of stochaticity.
199
- best_action = self.best_action(node, exploration_constant=1/sqrt(2.0))
200
- if best_action == -1: break
201
- # important to simulate the env to get to some state, as the nodes don't hold this information.
202
- state, reward, terminated, truncated, _ = self.env.step(self.stochastic_action(best_action))
203
- # due to stochasticity, nodes could sometimes be terminal and sometimes they aren't. important to update it. also, the resulting state
204
- # could be a state we've never been at due to uncertainty of actions' outcomes.
205
- # if the resulting state creates a parent-action-child triplet that hasn't been seen before, add to the tree and return it, similar result to 'expand'.
206
- # the hashing must include the action, because we want to enable getting to the same state stochastically from 2 different states: walking forward from (1,2) looking right and getting to (2,2) - the expected behavior, should be allowed even if the agent once stood at (2,1), looked down, turned right and accidently proceeded forward.
207
- resulting_identifier = [child_id for child_id in node.children_identifiers if all(a == b for a, b in zip(self.tree.nodes[child_id].pos, self.env.unwrapped.agent_pos)) and self.tree.nodes[child_id].action == best_action]
208
- if len(resulting_identifier) == 0: # took an action done before, but it lead to a new state.
209
- resulting_identifier = hash((tuple(node.pos), node.state['direction'], best_action, tuple(self.env.unwrapped.agent_pos), state['direction']))
210
- return self.expand_selection_stochastic_node(node, resulting_identifier, terminated, truncated, reward, best_action, state, depth), depth
211
- assert len(resulting_identifier) == 1
212
- node = self.tree.nodes[resulting_identifier[0]]
213
- return node, depth
214
-
215
- # receives a final state node and updates the rewards of all the nodes on the path to the root
216
- def backpropagation(self, node, value):
217
- while node != self.tree.parent(self.tree.root):
218
- assert node != None # if we got to None it means we got to the actual root with the backpropogation instead of to the current root, which means in this path, someone had a differrent parent than it should, probably a double id.
219
- node.num_visits += 1
220
- node.total_simulation_reward += value
221
- node.performance = node.total_simulation_reward/node.num_visits
222
- node = self.tree.parent(node)
223
-
224
-
225
- def generate_full_policy_sequence(self, env_name, problem_name, save_fig=False, fig_path=None, env_prop=None):
226
- trace = []
227
- node, prev_node = self.tree.root, self.tree.root
228
- print("generating policy sequence.")
229
- for action in self.plan:
230
- print(f"position {tuple(node.pos)} direction {dict_dir_id_to_str[node.state['direction']]}, action {dict_action_id_to_str[action]}")
231
- candidate_children = [child for child in self.tree.children(node) if child.action == action] # there could be some children associated with the best action, representing different outcomes.
232
- assert len(candidate_children) > 0
233
- node = max(candidate_children, key=lambda node: node.num_visits) # pick the child that was visited most, meaning it represents the desired action and not the undesired outcomes.
234
- trace.append(((prev_node.state, tuple(prev_node.pos)), node.action)) # need to add the previous node with the action leading to the next node which is a property of the next node
235
- prev_node = node
236
- if save_fig:
237
- assert fig_path!=None
238
- save_figure(trace, env_name, problem_name, fig_path, env_prop)
239
- else:
240
- assert fig_path==None
241
- return trace
242
-
243
-
244
- def save_model_and_generate_policy(tree, original_root, model_file_path, monteCarloTreeSearch):
245
- tree.root = original_root
246
- with open(model_file_path, 'wb') as file: # Serialize the model
247
- monteCarloTreeSearch.env = None # pickle cannot serialize lambdas which exist in the env
248
- pickle.dump(monteCarloTreeSearch, file)
37
+ class MonteCarloTreeSearch:
38
+
39
+ def __init__(self, env, tree, goal, use_heuristic=True):
40
+ self.env = env
41
+ self.tree = tree
42
+ self.action_space = self.env.action_space.n
43
+ self.action_space = 3 # currently
44
+ state, _ = self.env.reset()
45
+ self.use_heuristic = use_heuristic
46
+ self.goal = goal
47
+ self.tree.add_node(
48
+ Node(
49
+ identifier=hash(
50
+ (
51
+ None,
52
+ None,
53
+ tuple(self.env.unwrapped.agent_pos),
54
+ state["direction"],
55
+ )
56
+ ),
57
+ state=state,
58
+ action=None,
59
+ action_space=self.action_space,
60
+ reward=0,
61
+ terminal=False,
62
+ pos=env.unwrapped.agent_pos,
63
+ depth=0,
64
+ )
65
+ )
66
+ self.plan = []
67
+
68
+ # def mark_invalid_children(self, children_identifiers, action):
69
+ # for child_id in children_identifiers:
70
+ # child = self.tree.nodes[child_id]
71
+ # if child.action == action:
72
+ # child.invalid = True
73
+
74
+ def decide_invalid_path(
75
+ self, new_node_father, old_node, new_node
76
+ ): # new_node created the circle, old_node got to the configuration first.
77
+ new_visits, old_visits = [1, 1], [
78
+ 0,
79
+ 0,
80
+ ] # stochasticity couldn't result a cycle directly, because it involves a different action. we can get it only by making the same stochastic action mistake or just an actual cycle.
81
+ new_node_ptr = new_node_father
82
+ old_node_ptr = old_node
83
+
84
+ while new_node_ptr != None:
85
+ new_visits[0] += new_node_ptr.num_visits
86
+ new_visits[1] += 1
87
+ new_node_ptr = self.tree.parent(new_node_ptr)
88
+
89
+ while (
90
+ old_node_ptr != None
91
+ ): # getting to the old node wasn't necessarily through the current root. check all the way until None, the original root's parent.
92
+ old_visits[0] += old_node_ptr.num_visits
93
+ old_visits[1] += 1
94
+ old_node_ptr = self.tree.parent(old_node_ptr)
95
+
96
+ if (
97
+ new_visits[0] / new_visits[1] > old_visits[0] / old_visits[1]
98
+ ): # newer node is the more probable one. make the 1st path the invalid one: its the one that created the circle!
99
+ old_node.invalid = True
100
+ # self.tree.update_id(old_id=old_node.identifier, new_id=new_node.identifier)
101
+ else:
102
+ new_node.invalid = True
103
+
104
+ def is_parent_child_same(self, new_node, node):
105
+ return (
106
+ new_node.pos[0] == node.pos[0]
107
+ and new_node.pos[1] == node.pos[1]
108
+ and new_node.state["direction"] == node.state["direction"]
109
+ )
110
+
111
+ def expand(self, node, depth):
112
+ global newely_expanded
113
+ action = node.untried_action()
114
+ state, reward, terminated, truncated, _ = self.env.step(
115
+ self.stochastic_action(action)
116
+ )
117
+ done = terminated | truncated
118
+ new_identifier = hash(
119
+ (
120
+ tuple(node.pos),
121
+ node.state["direction"],
122
+ action,
123
+ tuple(self.env.unwrapped.agent_pos),
124
+ state["direction"],
125
+ )
126
+ )
127
+ valid_id = new_identifier
128
+ while (
129
+ new_identifier in self.tree.nodes.keys()
130
+ ): # iterate over all circle nodes. important not to hash the parent node id to get the next id, because it will not be the same for all circle nodes.
131
+ if self.tree.nodes[new_identifier].invalid == False:
132
+ valid_id = new_identifier
133
+ new_identifier = hash((666, new_identifier))
134
+ # after this while, the id is for sure unused.
135
+ new_node = Node(
136
+ identifier=new_identifier,
137
+ state=state,
138
+ action=action,
139
+ action_space=self.action_space,
140
+ reward=reward,
141
+ terminal=done,
142
+ pos=self.env.unwrapped.agent_pos,
143
+ depth=depth,
144
+ )
145
+ if self.is_parent_child_same(
146
+ new_node, node
147
+ ): # this is not a regular circle but it indicates that the action - regardless if happened with or without intention, led to staying put. note this could happen even if the first if is true - twice in history someone tried to go against the wall from 2 different paths. both should be tagged invalid.
148
+ new_node.invalid = True
149
+ new_node.got_invalid = True
150
+ # if this is a legit (s,a,s'), find the valid one and check whether this one might be more valid.
151
+ elif (
152
+ valid_id in self.tree.nodes.keys()
153
+ ): # who can tell which node is invalid? might be that this is the more probable way to get here, it just happened later. maybe think of summing back up the num of visits to decide which one to make invalid.
154
+ # print("CIRCLE DETECTED!") # circle can be detected by 2 nodes making the wrong stochastic action one after another, in different times!
155
+
156
+ self.decide_invalid_path(
157
+ new_node_father=node,
158
+ old_node=self.tree.nodes[valid_id],
159
+ new_node=new_node,
160
+ )
161
+ # self.mark_invalid_children(node.children_identifiers, action)
162
+
163
+ self.tree.add_node(new_node, node)
164
+ # if action == 2 and tuple(self.env.unwrapped.agent_pos) == tuple(node.pos): # if the new node is actually invalid, mark it along with the other nodes of the same action as invalid, meaning reward will be 0 for them.
165
+ # self.mark_invalid_children(node.children_identifiers)
166
+ newely_expanded += 1
167
+ return new_node
168
+
169
+ def stochastic_action(self, choice):
170
+ prob_distribution = []
171
+ actions = range(self.action_space)
172
+ for action in actions:
173
+ if action == choice:
174
+ prob_distribution.append(PROB)
175
+ else:
176
+ prob_distribution.append(UNIFORM_PROB)
177
+ return random.choices(actions, weights=prob_distribution, k=1)[0]
178
+
179
+ def expand_selection_stochastic_node(
180
+ self,
181
+ node,
182
+ resulting_identifier,
183
+ terminated,
184
+ truncated,
185
+ reward,
186
+ action,
187
+ state,
188
+ depth,
189
+ ):
190
+ global newely_expanded
191
+ # the new node could result in a terminating state.
192
+ done = terminated | truncated
193
+ valid_id = resulting_identifier
194
+ while (
195
+ resulting_identifier in self.tree.nodes.keys()
196
+ ): # iterate over all circle nodes. important not to hash the parent node id to get the next id, because it will not be the same for all circle nodes.
197
+ if self.tree.nodes[resulting_identifier].invalid == False:
198
+ valid_id = resulting_identifier
199
+ resulting_identifier = hash((666, resulting_identifier))
200
+ # after this while, the id is for sure unused.
201
+ new_node = Node(
202
+ identifier=resulting_identifier,
203
+ state=state,
204
+ action=action,
205
+ action_space=self.action_space,
206
+ reward=reward,
207
+ terminal=done,
208
+ pos=self.env.unwrapped.agent_pos,
209
+ depth=depth,
210
+ )
211
+ if self.is_parent_child_same(
212
+ new_node, node
213
+ ): # this is not a regular circle but it indicates that the action - regardless if happened with or without intention, led to staying put. note this could happen even if the first if is true - twice in history someone tried to go against the wall from 2 different paths. both should be tagged invalid.
214
+ new_node.invalid = True
215
+ new_node.got_invalid = True
216
+ # if this is a legit (s,a,s'), find the valid one and check whether this one might be more valid.
217
+ elif (
218
+ valid_id in self.tree.nodes.keys()
219
+ ): # who can tell which node is invalid? might be that this is the more probable way to get here, it just happened later. maybe think of summing back up the num of visits to decide which one to make invalid.
220
+ # print("CIRCLE DETECTED!") # circle can be detected by 2 nodes making the wrong stochastic action one after another, in different times!
221
+ self.decide_invalid_path(
222
+ new_node_father=node,
223
+ old_node=self.tree.nodes[valid_id],
224
+ new_node=new_node,
225
+ )
226
+ # self.mark_invalid_children(node.children_identifiers, action)
227
+ self.tree.add_node(new_node, node)
228
+ newely_expanded += 1
229
+ return new_node
230
+
231
+ def simulation(self, node):
232
+ if node.terminal:
233
+ return node.reward
234
+ if self.use_heuristic:
235
+ # taken from Monte-Carlo Planning for Pathfinding in Real-Time Strategy Games , 2010.
236
+ # need to handle the case of walking into a wall here: the resulting node will be considered invalid and it's reward and performance needs to be 0, but must handle stochasticity
237
+ # suggestion to handle stochasticity - consider *all* the children associated with taking action 2 towards a wall as performance 0, even if they accidently led in walking to another direction.
238
+ # which suggests the invalidity needs to be checked not according to the resulting state, rather according to the intended action itself and the environment! remember, you cannot access the "stochastic_action", it is meant to be hidden from you.
239
+ if node.pos[0] == self.goal[0] and node.pos[1] == self.goal[1]:
240
+ return 2
241
+ if node.invalid:
242
+ return -0.5
243
+ else:
244
+ return 0.8 * (
245
+ 1
246
+ / (
247
+ abs(node.pos[0] - self.goal[0])
248
+ + abs(node.pos[1] - self.goal[1])
249
+ )
250
+ ) + 0.2 * (
251
+ 1 / node.depth
252
+ ) # large depth = less probability of obstacles -> larger nominator higher performance. further from goal -> larger denominator, lower performance.
253
+ while True:
254
+ action = random.randint(0, self.action_space - 1)
255
+ state, reward, terminated, truncated, _ = self.env.step(
256
+ self.stochastic_action(action)
257
+ )
258
+ done = (
259
+ terminated | truncated
260
+ ) # this time there could be truncation unlike in the tree policy.
261
+ if done:
262
+ return reward
263
+
264
+ def compute_value(self, parent, child, exploration_constant):
265
+ exploration_term = exploration_constant * sqrt(
266
+ 2 * log(parent.num_visits) / child.num_visits
267
+ )
268
+ return child.performance + exploration_term
269
+
270
+ # return the best action from a node. the value of an action is the weighted sum of performance of all children that are associated with this action.
271
+ def best_action(self, node, exploration_constant):
272
+ tried_actions_values = (
273
+ {}
274
+ ) # dictionary mapping actions to tuples of (cumulative number of visits of children, sum of (child performance * num of visits for child)) to compute the mean later
275
+ if tuple(node.pos) == (1, 2) and node.depth == 3 and node.action == 0:
276
+ pass
277
+ children = [child for child in self.tree.children(node) if not child.invalid]
278
+ if not children: # all children are invalid. this node is invalid aswell.
279
+ return 2
280
+ for child in children:
281
+ value = self.compute_value(node, child, exploration_constant)
282
+ tried_actions_values.setdefault(
283
+ child.action, [0, 0]
284
+ ) # create if it doesn't exist
285
+ tried_actions_values[child.action][
286
+ 0
287
+ ] += child.num_visits # add the number of child visits
288
+ tried_actions_values[child.action][1] += (
289
+ value * child.num_visits
290
+ ) # add the relative performance of this child
291
+ return max(
292
+ tried_actions_values,
293
+ key=lambda k: tried_actions_values[k][1] / tried_actions_values[k][0],
294
+ ) # return the key (action) with the highest average value
295
+
296
+ # only changes the environment to make sure the actions which are already a part of the plan have been executed.
297
+ def execute_partial_plan(self, plan):
298
+ node = self.tree.root
299
+ depth = 0
300
+ for action in plan:
301
+ depth += 1
302
+ # important to simulate the env to get to some state, as the nodes don't hold this information.
303
+ state, reward, terminated, truncated, _ = self.env.step(action)
304
+ done = terminated
305
+ if done:
306
+ return None, False
307
+ resulting_identifier = hash(
308
+ (
309
+ tuple(node.pos),
310
+ node.state["direction"],
311
+ action,
312
+ tuple(self.env.unwrapped.agent_pos),
313
+ state["direction"],
314
+ )
315
+ )
316
+ node = self.tree.nodes[resulting_identifier]
317
+ return node, True
318
+
319
+ # finds the ultimate path from the root node to a terminal state (the one that maximized rewards)
320
+ def tree_policy(self, root_depth):
321
+ node = self.tree.root
322
+ depth = root_depth
323
+ while not (node.terminal or node.invalid):
324
+ depth += 1
325
+ if self.tree.is_expandable(node):
326
+ # expansion - in case there's an action that never been tried, its value is infinity to encourage exploration of all children of a node.
327
+ return self.expand(node, depth), depth
328
+ else:
329
+ # selection - balance exploration and exploitation, coming down the tree - but note the selection might lead to new nodes because of stochaticity.
330
+ best_action = self.best_action(node, exploration_constant=1 / sqrt(2.0))
331
+ if best_action == -1:
332
+ break
333
+ # important to simulate the env to get to some state, as the nodes don't hold this information.
334
+ state, reward, terminated, truncated, _ = self.env.step(
335
+ self.stochastic_action(best_action)
336
+ )
337
+ # due to stochasticity, nodes could sometimes be terminal and sometimes they aren't. important to update it. also, the resulting state
338
+ # could be a state we've never been at due to uncertainty of actions' outcomes.
339
+ # if the resulting state creates a parent-action-child triplet that hasn't been seen before, add to the tree and return it, similar result to 'expand'.
340
+ # the hashing must include the action, because we want to enable getting to the same state stochastically from 2 different states: walking forward from (1,2) looking right and getting to (2,2) - the expected behavior, should be allowed even if the agent once stood at (2,1), looked down, turned right and accidently proceeded forward.
341
+ resulting_identifier = [
342
+ child_id
343
+ for child_id in node.children_identifiers
344
+ if all(
345
+ a == b
346
+ for a, b in zip(
347
+ self.tree.nodes[child_id].pos, self.env.unwrapped.agent_pos
348
+ )
349
+ )
350
+ and self.tree.nodes[child_id].action == best_action
351
+ ]
352
+ if (
353
+ len(resulting_identifier) == 0
354
+ ): # took an action done before, but it lead to a new state.
355
+ resulting_identifier = hash(
356
+ (
357
+ tuple(node.pos),
358
+ node.state["direction"],
359
+ best_action,
360
+ tuple(self.env.unwrapped.agent_pos),
361
+ state["direction"],
362
+ )
363
+ )
364
+ return (
365
+ self.expand_selection_stochastic_node(
366
+ node,
367
+ resulting_identifier,
368
+ terminated,
369
+ truncated,
370
+ reward,
371
+ best_action,
372
+ state,
373
+ depth,
374
+ ),
375
+ depth,
376
+ )
377
+ assert len(resulting_identifier) == 1
378
+ node = self.tree.nodes[resulting_identifier[0]]
379
+ return node, depth
380
+
381
+ # receives a final state node and updates the rewards of all the nodes on the path to the root
382
+ def backpropagation(self, node, value):
383
+ while node != self.tree.parent(self.tree.root):
384
+ assert (
385
+ node != None
386
+ ) # if we got to None it means we got to the actual root with the backpropogation instead of to the current root, which means in this path, someone had a differrent parent than it should, probably a double id.
387
+ node.num_visits += 1
388
+ node.total_simulation_reward += value
389
+ node.performance = node.total_simulation_reward / node.num_visits
390
+ node = self.tree.parent(node)
391
+
392
+ def generate_full_policy_sequence(
393
+ self, env_name, problem_name, save_fig=False, fig_path=None, env_prop=None
394
+ ):
395
+ trace = []
396
+ node, prev_node = self.tree.root, self.tree.root
397
+ print("generating policy sequence.")
398
+ for action in self.plan:
399
+ print(
400
+ f"position {tuple(node.pos)} direction {dict_dir_id_to_str[node.state['direction']]}, action {dict_action_id_to_str[action]}"
401
+ )
402
+ candidate_children = [
403
+ child for child in self.tree.children(node) if child.action == action
404
+ ] # there could be some children associated with the best action, representing different outcomes.
405
+ assert len(candidate_children) > 0
406
+ node = max(
407
+ candidate_children, key=lambda node: node.num_visits
408
+ ) # pick the child that was visited most, meaning it represents the desired action and not the undesired outcomes.
409
+ trace.append(
410
+ ((prev_node.state, tuple(prev_node.pos)), node.action)
411
+ ) # need to add the previous node with the action leading to the next node which is a property of the next node
412
+ prev_node = node
413
+ if save_fig:
414
+ assert fig_path != None
415
+ save_figure(trace, env_name, problem_name, fig_path, env_prop)
416
+ else:
417
+ assert fig_path == None
418
+ return trace
419
+
420
+
421
+ def save_model_and_generate_policy(
422
+ tree, original_root, model_file_path, monteCarloTreeSearch
423
+ ):
424
+ tree.root = original_root
425
+ with open(model_file_path, "wb") as file: # Serialize the model
426
+ monteCarloTreeSearch.env = (
427
+ None # pickle cannot serialize lambdas which exist in the env
428
+ )
429
+ pickle.dump(monteCarloTreeSearch, file)
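For orientation: compute_value above is the standard UCT/UCB1 selection rule (child performance plus an exploration bonus of c*sqrt(2*ln(parent visits) / child visits), with c = 1/sqrt(2) during tree_policy), and simulation scores non-terminal valid nodes with 0.8*(1 / Manhattan distance to goal) + 0.2*(1 / depth), returning 2 at the goal and -0.5 for invalid nodes. A small worked sketch with made-up numbers (not taken from the package):

    from math import log, sqrt

    # Hypothetical visit counts and performance, for illustration only.
    parent_visits, child_visits, child_performance = 50, 10, 0.35
    c = 1 / sqrt(2.0)  # exploration constant used by tree_policy above

    # Selection value as computed by compute_value():
    uct = child_performance + c * sqrt(2 * log(parent_visits) / child_visits)
    print(round(uct, 3))  # ~0.975 with these numbers

    # Rollout heuristic from simulation() for a non-terminal, valid node:
    pos, goal, depth = (3, 4), (6, 1), 5
    manhattan = abs(pos[0] - goal[0]) + abs(pos[1] - goal[1])  # 6
    heuristic = 0.8 * (1 / manhattan) + 0.2 * (1 / depth)      # ~0.173
    print(round(heuristic, 3))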
249
430
 
250
431
 
251
432
  def plan(env_name, problem_name, goal, save_fig=False, fig_path=None, env_prop=None):
252
- global newely_expanded
253
- model_dir = get_agent_model_dir(env_name=env_name, model_name=problem_name, class_name="MCTS")
254
- model_file_path = os.path.join(model_dir, "mcts_model.pth")
255
- if os.path.exists(model_file_path):
256
- print(f"Loading pre-existing mcts planner in {model_file_path}")
257
- with open(model_file_path, 'rb') as file: # Load the pre-existing model
258
- try:
259
- monteCarloTreeSearch = pickle.load(file)
260
- except Exception as e:
261
- class RenameUnpickler(pickle.Unpickler):
262
- def find_class(self, module, name):
263
- renamed_module = module
264
- if module.startswith("ml"):
265
- renamed_module = "gr_libs." + renamed_module
266
- return super(RenameUnpickler, self).find_class(renamed_module, name)
267
- def renamed_load(file_obj):
268
- return RenameUnpickler(file_obj).load()
269
- file.seek(0)
270
- monteCarloTreeSearch = renamed_load(file)
271
-
272
- with open(model_file_path, 'wb') as file:
273
- pickle.dump(monteCarloTreeSearch, file)
274
-
275
- return monteCarloTreeSearch.generate_full_policy_sequence(env_name, problem_name, save_fig, fig_path)
276
- if not os.path.exists(model_dir): # if we reached here, the model doesn't exist. make sure its folder exists.
277
- os.makedirs(model_dir)
278
- steps = 10000
279
- print(f"No tree found. Executing MCTS, starting with {steps} rollouts for each action.")
280
- env = gym.make(id=problem_name)
281
- random.seed(2)
282
- tree = Tree()
283
- mcts = MonteCarloTreeSearch(env=env, tree=tree, goal=goal)
284
- original_root = tree.root
285
- depth = 0
286
- while not tree.root.terminal: # we iterate until the root is a terminal state, meaning the game is over.
287
- max_reward = 0
288
- iteration = 0
289
- steps = max(2000,int(steps*0.9))
290
- print(f"Executing {steps} rollouts for each action now.")
291
- tq = tqdm(range(steps), postfix=f"Iteration: {iteration}, Num of steps: {len(mcts.plan)}. depth: {depth}. Max reward: {max_reward}. plan to {tuple(env.unwrapped.agent_pos)}, newely expanded: {0}")
292
- for n in tq:
293
- iteration = n
294
- mcts.env.reset()
295
- # when executing the partial plan, it's possible the environment finished due to the stochasticity. the execution would return false if that happend.
296
- depth = len(mcts.plan)
297
- mcts.tree.root = original_root # need to return it to the original root before executing the partial plan as it can lead to a different path and the root can change between iterations.
298
- node, result = mcts.execute_partial_plan(mcts.plan)
299
- if not result:
300
- # false return value from partial plan execution means the plan is finished. we can mark our root as terminal and exit, happy with our plan.
301
- tree.root.terminal = True
302
- save_model_and_generate_policy(tree=tree, original_root=original_root, model_file_path=model_file_path, monteCarloTreeSearch=mcts)
303
- return mcts.generate_full_policy_sequence(env_name, problem_name, save_fig, fig_path, env_prop)
304
- plan_pos, plan_dir = node.pos, dict_dir_id_to_str[node.state['direction']]
305
- tree.root = node # determine the root to be the node executed after the plan for this iteration.
306
- node, depth = mcts.tree_policy(root_depth=depth) # find a path to a new unvisited node (unique sequence of actions) by utilizing explorative policy or choosing unvisited children recursively
307
- # if the node that returned from tree policy is terminal, the reward will be returned from "simulation" function immediately.
308
- reward = mcts.simulation(node) # proceed from that node randomly and collect the final reward expected from it (heuristic)
309
- if reward > max_reward:
310
- max_reward = reward
311
- mcts.backpropagation(node, reward) # update the performances of nodes along the way until the root
312
- tq.set_postfix_str(f"Iteration: {iteration}, Num of steps: {len(mcts.plan)}. depth: {depth}. Max reward: {max_reward}. plan to {tuple(plan_pos)}, looking {plan_dir}. newely expanded: {newely_expanded}")
313
- # update the root and start from it next time.
314
- newely_expanded = 0
315
- action = mcts.best_action(node=tree.root, exploration_constant=0)
316
- if action == -1:
317
- pass
318
- mcts.plan.append(action)
319
- print(f"Executed action {action}")
320
- save_model_and_generate_policy(tree=tree, original_root=original_root, model_file_path=model_file_path, monteCarloTreeSearch=monteCarloTreeSearch)
321
- return mcts.generate_full_policy_sequence(env_name, problem_name, save_fig, fig_path)
322
-
433
+ global newely_expanded
434
+ model_dir = get_agent_model_dir(
435
+ env_name=env_name, model_name=problem_name, class_name="MCTS"
436
+ )
437
+ model_file_path = os.path.join(model_dir, "mcts_model.pth")
438
+ if os.path.exists(model_file_path):
439
+ print(f"Loading pre-existing mcts planner in {model_file_path}")
440
+ with open(model_file_path, "rb") as file: # Load the pre-existing model
441
+ try:
442
+ monteCarloTreeSearch = pickle.load(file)
443
+ except Exception as e:
444
+
445
+ class RenameUnpickler(pickle.Unpickler):
446
+ def find_class(self, module, name):
447
+ renamed_module = module
448
+ if module.startswith("ml"):
449
+ renamed_module = "gr_libs." + renamed_module
450
+ return super(RenameUnpickler, self).find_class(
451
+ renamed_module, name
452
+ )
453
+
454
+ def renamed_load(file_obj):
455
+ return RenameUnpickler(file_obj).load()
456
+
457
+ file.seek(0)
458
+ monteCarloTreeSearch = renamed_load(file)
459
+
460
+ with open(model_file_path, "wb") as file:
461
+ pickle.dump(monteCarloTreeSearch, file)
462
+
463
+ return monteCarloTreeSearch.generate_full_policy_sequence(
464
+ env_name, problem_name, save_fig, fig_path
465
+ )
466
+ if not os.path.exists(
467
+ model_dir
468
+ ): # if we reached here, the model doesn't exist. make sure its folder exists.
469
+ os.makedirs(model_dir)
470
+ steps = 10000
471
+ print(
472
+ f"No tree found. Executing MCTS, starting with {steps} rollouts for each action."
473
+ )
474
+ env = gym.make(id=problem_name)
475
+ random.seed(2)
476
+ tree = Tree()
477
+ mcts = MonteCarloTreeSearch(env=env, tree=tree, goal=goal)
478
+ original_root = tree.root
479
+ depth = 0
480
+ while (
481
+ not tree.root.terminal
482
+ ): # we iterate until the root is a terminal state, meaning the game is over.
483
+ max_reward = 0
484
+ iteration = 0
485
+ steps = max(2000, int(steps * 0.9))
486
+ print(f"Executing {steps} rollouts for each action now.")
487
+ tq = tqdm(
488
+ range(steps),
489
+ postfix=f"Iteration: {iteration}, Num of steps: {len(mcts.plan)}. depth: {depth}. Max reward: {max_reward}. plan to {tuple(env.unwrapped.agent_pos)}, newely expanded: {0}",
490
+ )
491
+ for n in tq:
492
+ iteration = n
493
+ mcts.env.reset()
494
+ # when executing the partial plan, it's possible the environment finished due to the stochasticity. the execution would return false if that happend.
495
+ depth = len(mcts.plan)
496
+ mcts.tree.root = original_root # need to return it to the original root before executing the partial plan as it can lead to a different path and the root can change between iterations.
497
+ node, result = mcts.execute_partial_plan(mcts.plan)
498
+ if not result:
499
+ # false return value from partial plan execution means the plan is finished. we can mark our root as terminal and exit, happy with our plan.
500
+ tree.root.terminal = True
501
+ save_model_and_generate_policy(
502
+ tree=tree,
503
+ original_root=original_root,
504
+ model_file_path=model_file_path,
505
+ monteCarloTreeSearch=mcts,
506
+ )
507
+ return mcts.generate_full_policy_sequence(
508
+ env_name, problem_name, save_fig, fig_path, env_prop
509
+ )
510
+ plan_pos, plan_dir = node.pos, dict_dir_id_to_str[node.state["direction"]]
511
+ tree.root = node # determine the root to be the node executed after the plan for this iteration.
512
+ node, depth = mcts.tree_policy(
513
+ root_depth=depth
514
+ ) # find a path to a new unvisited node (unique sequence of actions) by utilizing explorative policy or choosing unvisited children recursively
515
+ # if the node that returned from tree policy is terminal, the reward will be returned from "simulation" function immediately.
516
+ reward = mcts.simulation(
517
+ node
518
+ ) # proceed from that node randomly and collect the final reward expected from it (heuristic)
519
+ if reward > max_reward:
520
+ max_reward = reward
521
+ mcts.backpropagation(
522
+ node, reward
523
+ ) # update the performances of nodes along the way until the root
524
+ tq.set_postfix_str(
525
+ f"Iteration: {iteration}, Num of steps: {len(mcts.plan)}. depth: {depth}. Max reward: {max_reward}. plan to {tuple(plan_pos)}, looking {plan_dir}. newely expanded: {newely_expanded}"
526
+ )
527
+ # update the root and start from it next time.
528
+ newely_expanded = 0
529
+ action = mcts.best_action(node=tree.root, exploration_constant=0)
530
+ if action == -1:
531
+ pass
532
+ mcts.plan.append(action)
533
+ print(f"Executed action {action}")
534
+ save_model_and_generate_policy(
535
+ tree=tree,
536
+ original_root=original_root,
537
+ model_file_path=model_file_path,
538
+ monteCarloTreeSearch=monteCarloTreeSearch,
539
+ )
540
+ return mcts.generate_full_policy_sequence(
541
+ env_name, problem_name, save_fig, fig_path
542
+ )
543
+
544
+
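Note on the planning loop above: the rollout budget printed as 10000 is reduced to 9000 before the first batch and then shrinks by 10% for each planned action, with a floor of 2000 rollouts per action. A tiny sketch of that schedule (illustrative only):

    # Budget schedule implied by steps = max(2000, int(steps * 0.9)) in plan().
    steps, schedule = 10000, []
    for _ in range(20):
        steps = max(2000, int(steps * 0.9))
        schedule.append(steps)
    print(schedule[:5])  # [9000, 8100, 7290, 6561, 5904]
    print(schedule[-1])  # 2000 once the floor is reached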
323
545
  if __name__ == "__main__":
324
- # register(
325
- # id="MiniGrid-DynamicGoalEmpty-8x8-3x6-v0",
326
- # entry_point="minigrid.envs:DynamicGoalEmpty",
327
- # kwargs={"size": 8, "agent_start_pos" : (1, 1), "goal_pos": (3,6) },
328
- # )
329
- # plan("MiniGrid-DynamicGoalEmpty-8x8-3x6-v0")
330
- pass
546
+ # register(
547
+ # id="MiniGrid-DynamicGoalEmpty-8x8-3x6-v0",
548
+ # entry_point="minigrid.envs:DynamicGoalEmpty",
549
+ # kwargs={"size": 8, "agent_start_pos" : (1, 1), "goal_pos": (3,6) },
550
+ # )
551
+ # plan("MiniGrid-DynamicGoalEmpty-8x8-3x6-v0")
552
+ pass
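For reference, a minimal usage sketch of the module-level plan() entry point, assuming the MiniGrid environment id from the commented-out register() call above has been registered with gymnasium; the env_name label below is a guess, and env_prop is only needed when save_fig=True (it must expose create_sequence_image):

    # Minimal usage sketch, not from the package docs.
    from gr_libs.ml.planner.mcts.mcts_model import plan

    trace = plan(
        env_name="minigrid",  # hypothetical label used for the model storage directory
        problem_name="MiniGrid-DynamicGoalEmpty-8x8-3x6-v0",
        goal=(3, 6),          # goal_pos from the commented-out example
        save_fig=False,       # with save_fig=True, fig_path and env_prop must also be passed
    )
    # trace is a list of ((state, position), action) pairs along the chosen plan.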