browsergym-workarena 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. browsergym/workarena/__init__.py +13 -1
  2. browsergym/workarena/api/category.py +74 -0
  3. browsergym/workarena/api/change_request.py +87 -0
  4. browsergym/workarena/api/computer_asset.py +90 -0
  5. browsergym/workarena/api/cost_center.py +19 -0
  6. browsergym/workarena/api/expense_line.py +89 -0
  7. browsergym/workarena/api/incident.py +45 -0
  8. browsergym/workarena/api/knowledge.py +29 -0
  9. browsergym/workarena/api/problem.py +90 -0
  10. browsergym/workarena/api/report.py +183 -0
  11. browsergym/workarena/api/requested_items.py +63 -0
  12. browsergym/workarena/api/user.py +11 -8
  13. browsergym/workarena/api/utils.py +47 -3
  14. browsergym/workarena/config.py +21 -1
  15. browsergym/workarena/data_files/setup_files/forms/expected_incident_form_fields.json +1 -1
  16. browsergym/workarena/data_files/setup_files/forms/expected_request_item_form_fields.json +1 -0
  17. browsergym/workarena/data_files/setup_files/knowledge/protocols.json +46 -0
  18. browsergym/workarena/data_files/setup_files/knowledge/test.html +1 -0
  19. browsergym/workarena/data_files/setup_files/lists/expected_asset_list_columns.json +2 -24
  20. browsergym/workarena/data_files/setup_files/lists/expected_change_request_list_columns.json +4 -40
  21. browsergym/workarena/data_files/setup_files/lists/expected_expense_line_list_columns.json +12 -0
  22. browsergym/workarena/data_files/setup_files/lists/expected_hardware_list_columns.json +1 -42
  23. browsergym/workarena/data_files/setup_files/lists/expected_incident_list_columns.json +2 -18
  24. browsergym/workarena/data_files/setup_files/lists/expected_problem_list_columns.json +12 -0
  25. browsergym/workarena/data_files/setup_files/lists/expected_requested_items_list_columns.json +12 -0
  26. browsergym/workarena/data_files/setup_files/lists/expected_service_catalog_list_columns.json +2 -19
  27. browsergym/workarena/data_files/setup_files/lists/expected_user_list_columns.json +3 -50
  28. browsergym/workarena/data_files/task_configs/all_menu.json +1 -1
  29. browsergym/workarena/data_files/task_configs/dashboard_retrieval_minmax_task.json +1 -1
  30. browsergym/workarena/data_files/task_configs/dashboard_retrieval_value_task.json +1 -1
  31. browsergym/workarena/data_files/task_configs/filter_service_catalog_item_list_task.json +1 -1
  32. browsergym/workarena/data_files/task_configs/impersonation_users.json +1 -1
  33. browsergym/workarena/data_files/task_configs/report_retrieval_minmax_task.json +1 -1
  34. browsergym/workarena/data_files/task_configs/report_retrieval_value_task.json +1 -1
  35. browsergym/workarena/human_eval/console.js +176 -0
  36. browsergym/workarena/human_eval/tool.py +366 -0
  37. browsergym/workarena/install.py +81 -20
  38. browsergym/workarena/tasks/base.py +55 -20
  39. browsergym/workarena/tasks/comp_building_block.py +4 -0
  40. browsergym/workarena/tasks/compositional/__init__.py +76 -0
  41. browsergym/workarena/tasks/compositional/base.py +364 -0
  42. browsergym/workarena/tasks/compositional/dash_do_base.py +1366 -0
  43. browsergym/workarena/tasks/compositional/dash_do_catalog.py +1127 -0
  44. browsergym/workarena/tasks/compositional/dash_do_catalog_infeasible.py +2047 -0
  45. browsergym/workarena/tasks/compositional/dash_do_create_incident.py +403 -0
  46. browsergym/workarena/tasks/compositional/dash_do_create_incident_infeasible.py +278 -0
  47. browsergym/workarena/tasks/compositional/dash_do_create_problem.py +336 -0
  48. browsergym/workarena/tasks/compositional/dash_do_create_problem_infeasible.py +235 -0
  49. browsergym/workarena/tasks/compositional/dash_do_filter.py +1600 -0
  50. browsergym/workarena/tasks/compositional/dash_do_request_item.py +1315 -0
  51. browsergym/workarena/tasks/compositional/dash_do_request_item_infeasible.py +693 -0
  52. browsergym/workarena/tasks/compositional/delete_record.py +341 -0
  53. browsergym/workarena/tasks/compositional/edit_knowledge_base.py +457 -0
  54. browsergym/workarena/tasks/compositional/expense_management.py +598 -0
  55. browsergym/workarena/tasks/compositional/filter_and_do.py +139 -0
  56. browsergym/workarena/tasks/compositional/find_and_order_item.py +345 -0
  57. browsergym/workarena/tasks/compositional/manage_change_request_schedule.py +1417 -0
  58. browsergym/workarena/tasks/compositional/mark_duplicate_problems.py +499 -0
  59. browsergym/workarena/tasks/compositional/maximize_investment_return.py +1763 -0
  60. browsergym/workarena/tasks/compositional/navigate_and_do.py +1151 -0
  61. browsergym/workarena/tasks/compositional/navigate_and_do_infeasible.py +2100 -0
  62. browsergym/workarena/tasks/compositional/offboard_user.py +207 -0
  63. browsergym/workarena/tasks/compositional/onboard_user.py +226 -0
  64. browsergym/workarena/tasks/compositional/update_task.py +145 -0
  65. browsergym/workarena/tasks/compositional/utils/curriculum.py +215 -0
  66. browsergym/workarena/tasks/compositional/utils/infeasible_configs.py +151 -0
  67. browsergym/workarena/tasks/compositional/utils/knapsack.py +192 -0
  68. browsergym/workarena/tasks/compositional/warranty_check.py +227 -0
  69. browsergym/workarena/tasks/compositional/work_assignment.py +804 -0
  70. browsergym/workarena/tasks/compositional/workload_balancing.py +396 -0
  71. browsergym/workarena/tasks/dashboard.py +188 -8
  72. browsergym/workarena/tasks/form.py +1024 -232
  73. browsergym/workarena/tasks/knowledge.py +216 -25
  74. browsergym/workarena/tasks/list.py +519 -102
  75. browsergym/workarena/tasks/mark_duplicate_problem.py +171 -0
  76. browsergym/workarena/tasks/navigation.py +55 -13
  77. browsergym/workarena/tasks/scripts/extract_all_menu_items.py +9 -2
  78. browsergym/workarena/tasks/scripts/generate_dashboard_configs.py +6 -5
  79. browsergym/workarena/tasks/scripts/service_catalog.py +2 -1
  80. browsergym/workarena/tasks/scripts/validate.py +8 -2
  81. browsergym/workarena/tasks/send_chat_message.py +90 -0
  82. browsergym/workarena/tasks/service_catalog.py +94 -26
  83. browsergym/workarena/tasks/utils/form.py +1 -4
  84. browsergym/workarena/tasks/utils/private_tasks.py +63 -0
  85. browsergym/workarena/tasks/utils/utils.py +13 -0
  86. {browsergym_workarena-0.2.1.dist-info → browsergym_workarena-0.3.0.dist-info}/METADATA +19 -18
  87. browsergym_workarena-0.3.0.dist-info/RECORD +138 -0
  88. {browsergym_workarena-0.2.1.dist-info → browsergym_workarena-0.3.0.dist-info}/entry_points.txt +1 -0
  89. browsergym_workarena-0.2.1.dist-info/RECORD +0 -85
  90. {browsergym_workarena-0.2.1.dist-info → browsergym_workarena-0.3.0.dist-info}/WHEEL +0 -0
  91. {browsergym_workarena-0.2.1.dist-info → browsergym_workarena-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,176 @@
1
// Human evaluation console.
//
// Runs in every frame once its DOM is loaded. All frames get the click guards
// below; only the top-level window gets the floating console with the
// "New tab" / "Validate" / "Give up" / "Infeasible" buttons. The buttons
// communicate through globals on `window` (NEED_VALIDATION, HUMAN_ABANDON,
// HUMAN_INFEASIBLE) and through the `reasonTextBox`, `taskStatusDiv` and
// `progressDiv` element ids, so those names must not change.
document.addEventListener('DOMContentLoaded', function() {

    // Disable right-click in all frames to prevent people from opening new tabs that don't have the main header
    document.addEventListener('contextmenu', function(event) {
        event.preventDefault();
    });

    // Disable Command/Ctrl + Click for the same reasons
    document.addEventListener('click', function(event) {
        if (event.metaKey || event.ctrlKey) {
            event.preventDefault();
        }
    });

    // Disable middle-click for the same reasons
    document.addEventListener('auxclick', function(event) {
        if (event.button === 1) {
            event.preventDefault();
        }
    });

    if (window !== window.top) {
        return; // Do nothing if not in top window
    }

    // Write a message in the status line below the buttons
    function setTaskStatus(text) {
        document.getElementById("taskStatusDiv").innerText = text;
    }

    // Create a button with the given label, inline style properties and click handler
    function createButton(label, style, onClick) {
        const button = document.createElement("button");
        button.innerText = label;
        Object.assign(button.style, style);
        button.onclick = onClick;
        return button;
    }

    // Check if the div already exists
    let newDiv = document.getElementById("humanEvalConsole");
    if (!newDiv) {
        // Create a new div element if it doesn't exist
        newDiv = document.createElement("div");
        newDiv.id = "humanEvalConsole";

        // Create a title for the div
        const title = document.createElement("h3");
        title.innerText = "Human Evaluation Console";
        title.style.textAlign = "center";
        newDiv.appendChild(title);

        // Progress status indicator
        const progressDiv = document.createElement("div");
        progressDiv.id = "progressDiv";
        progressDiv.style.marginTop = "-5px";
        progressDiv.style.marginBottom = "5px";
        newDiv.appendChild(progressDiv);

        // Create the 'New tab' button
        const newTabButton = createButton(
            "+",
            {
                backgroundColor: "yellow",
                color: "black",
                border: "none",
                padding: "5px 5px",
                marginRight: "10px",
            },
            function() {
                window.open(window.location.href, '_blank');
            }
        );
        newTabButton.setAttribute("title", "New Tab");

        // Create the 'Validate' button
        const validateButton = createButton(
            "Validate",
            {
                backgroundColor: "green",
                color: "white",
                border: "none",
                padding: "10px 20px",
                marginRight: "10px",
            },
            function() {
                window.NEED_VALIDATION = 1;
                console.log("Validation flag set:", window.NEED_VALIDATION);
                setTaskStatus("Validation in progress...");
            }
        );

        // Create the 'Give up' button
        const giveUpButton = createButton(
            "Give up",
            {
                backgroundColor: "red",
                color: "white",
                border: "none",
                padding: "10px 20px",
                marginRight: "10px",
            },
            function() {
                window.HUMAN_ABANDON = 1;
                console.log("Give up flag set:", window.HUMAN_ABANDON);
                setTaskStatus("Human abandoned task.");
            }
        );

        // Create the 'Infeasible' button
        const infeasibleButton = createButton(
            "Infeasible",
            {
                backgroundColor: "blue",
                color: "white",
                border: "none",
                padding: "10px 20px",
            },
            function() {
                // The reason prompt is only created once; further clicks are no-ops
                if (document.getElementById("reasonTextBox")) {
                    return;
                }

                // Show a text box to get the reason
                const reasonTextBox = document.createElement("input");
                reasonTextBox.id = "reasonTextBox";
                reasonTextBox.type = "text";
                reasonTextBox.setAttribute("placeholder", "Reason: e.g., Field 'Bob' does not exist.");
                reasonTextBox.style.width = "300px";
                reasonTextBox.style.marginRight = "10px";
                newDiv.appendChild(reasonTextBox);
                reasonTextBox.focus();

                const reasonButton = createButton(
                    "Submit",
                    {
                        backgroundColor: "black",
                        color: "white",
                        border: "none",
                    },
                    function() {
                        window.HUMAN_INFEASIBLE = 1;
                        // Log the flag that was actually set (previously logged HUMAN_ABANDON)
                        console.log("Infeasible flag set:", window.HUMAN_INFEASIBLE);
                        setTaskStatus("Human marked task as infeasible.");
                    }
                );
                newDiv.appendChild(reasonButton);
            }
        );

        // Append buttons to the div
        newDiv.appendChild(newTabButton);
        newDiv.appendChild(validateButton);
        newDiv.appendChild(giveUpButton);
        newDiv.appendChild(infeasibleButton);

        // Create a status div below the buttons
        const taskStatusDiv = document.createElement("div");
        taskStatusDiv.id = "taskStatusDiv";
        taskStatusDiv.innerText = "Waiting for action...";
        taskStatusDiv.style.marginTop = "10px";
        newDiv.appendChild(taskStatusDiv); // Append the status div to the main div

        // Append the div to the body of the document
        document.body.appendChild(newDiv);
    }

    // Ensure the div is draggable vertically
    newDiv.style.position = "fixed";
    newDiv.style.right = "10px";
    newDiv.style.bottom = "10px";
    newDiv.style.zIndex = "1000";
    newDiv.style.backgroundColor = "#f0f0f0";
    newDiv.style.border = "1px solid black";
    newDiv.style.padding = "10px";
    newDiv.style.borderRadius = "8px"; // Rounded corners
    newDiv.style.cursor = "ns-resize"; // Cursor indicates vertical movement

    let isDragging = false;

    newDiv.onmousedown = function(event) {
        // Cancelling mousedown on a text input prevents it from receiving focus,
        // so the reason text box could not be clicked back into. Leave inputs alone.
        if (event.target && event.target.tagName === "INPUT") {
            return;
        }

        event.preventDefault(); // Prevent default text selection
        isDragging = true;
        const startY = event.clientY;
        const startBottom = parseInt(window.getComputedStyle(newDiv).bottom, 10);

        function onMouseMove(moveEvent) {
            if (isDragging) {
                // Moving the mouse down (larger clientY) lowers the console, i.e. decreases `bottom`
                const newBottom = startBottom - (moveEvent.clientY - startY);
                newDiv.style.bottom = newBottom + 'px'; // Update bottom position only
            }
        }

        document.addEventListener('mousemove', onMouseMove);

        document.onmouseup = function() {
            document.removeEventListener('mousemove', onMouseMove);
            // Uninstall this handler from where it was installed (document, not newDiv)
            document.onmouseup = null;
            isDragging = false; // Stop dragging
        };
    };

    newDiv.ondragstart = function() {
        return false; // Prevent default dragging behavior
    };
});
@@ -0,0 +1,366 @@
1
+ """
2
+ WorkArena Human Evaluation Tool
3
+
4
+ Known issues:
5
+ * Blocking page interaction: We can't block loading until validation is done because some validation
6
+ functions require the page to be loaded. This means the user might act
7
+ while validation is ongoing. However, they would need to be very quick to
8
+ cause issues.
9
+
10
+ """
11
+
12
+ import argparse
13
+ import json
14
+ import logging
15
+ import os
16
+ import random
17
+ import tenacity
18
+
19
+ from time import sleep, time
20
+
21
+ from browsergym.core.env import BrowserEnv
22
+ from browsergym.workarena import ALL_WORKARENA_TASKS, get_all_tasks_humans
23
+ from browsergym.workarena.tasks.compositional.base import CompositionalTask
24
+
25
+
26
logging.basicConfig(level=logging.INFO)


# Lookup table from task class name to task class, covering every entry of ALL_WORKARENA_TASKS
TASKS = {task_cls.__name__: task_cls for task_cls in ALL_WORKARENA_TASKS}
31
+
32
+
33
def get_servicenow_pages(context):
    """
    Return the pages of a browser context whose URL contains "service-now".

    Parameters:
    -----------
    context:
        Object exposing a `pages` iterable whose items have a `url` string attribute.

    Returns:
    --------
    pages: list
        The matching pages, in the order they appear in `context.pages`.

    """
    servicenow_pages = []
    for page in context.pages:
        if "service-now" in page.url:
            servicenow_pages.append(page)
    return servicenow_pages
35
+
36
+
37
@tenacity.retry(wait=tenacity.wait_fixed(1), stop=tenacity.stop_after_attempt(5), reraise=True)
def validation_flag_activated(context):
    """
    Tell whether any frame of any ServiceNow page has a truthy `window.NEED_VALIDATION`.

    The call is retried (up to 5 attempts, 1 second apart) if evaluating a frame raises;
    the last exception is re-raised once the attempts are exhausted.

    Parameters:
    -----------
    context:
        Browser context whose ServiceNow pages are inspected.

    Returns:
    --------
    activated: bool

    """
    flag_probe = "typeof window.NEED_VALIDATION !== 'undefined' && window.NEED_VALIDATION"
    for page in get_servicenow_pages(context):
        for frame in page.frames:
            if frame.evaluate(flag_probe):
                return True
    return False
44
+
45
+
46
@tenacity.retry(wait=tenacity.wait_fixed(1), stop=tenacity.stop_after_attempt(5), reraise=True)
def reset_validation_flag(context):
    """
    Set `window.NEED_VALIDATION` to 0 in every frame of every ServiceNow page.

    This is best effort: any exception is logged and swallowed, so the function never
    raises. Note that, because the exception is swallowed here, the retry decorator
    never actually triggers a retry.

    Parameters:
    -----------
    context:
        Browser context whose ServiceNow pages are updated.

    """
    try:
        for page in get_servicenow_pages(context):
            for frame in page.frames:
                frame.evaluate("window.NEED_VALIDATION = 0;")
    except Exception as e:
        # Worst case we'll just keep validating
        logging.warning("Failed to reset validation flag: %s", e)
54
+
55
+
56
@tenacity.retry(wait=tenacity.wait_fixed(1), stop=tenacity.stop_after_attempt(5), reraise=True)
def abandon_flag_activated(context):
    """
    Tell whether any frame of any ServiceNow page has a truthy `window.HUMAN_ABANDON`.

    The call is retried (up to 5 attempts, 1 second apart) if evaluating a frame raises;
    the last exception is re-raised once the attempts are exhausted.

    Parameters:
    -----------
    context:
        Browser context whose ServiceNow pages are inspected.

    Returns:
    --------
    activated: bool

    """
    flag_probe = "typeof window.HUMAN_ABANDON !== 'undefined' && window.HUMAN_ABANDON"
    for page in get_servicenow_pages(context):
        for frame in page.frames:
            if frame.evaluate(flag_probe):
                return True
    return False
63
+
64
+
65
@tenacity.retry(wait=tenacity.wait_fixed(1), stop=tenacity.stop_after_attempt(5), reraise=True)
def infeasible_flag_activated(context):
    """
    Tell whether the human marked the task as infeasible and, if so, fetch the reason.

    The flag is considered set when any frame of any ServiceNow page has a truthy
    `window.HUMAN_INFEASIBLE` (same truthiness test as the validation and abandon flags).
    The reason is the value of the `reasonTextBox` element of the first page on which it
    can be read.

    Parameters:
    -----------
    context:
        Browser context whose ServiceNow pages are inspected.

    Returns:
    --------
    infeasible: bool
        Whether the infeasible flag is set.
    reason: str or None
        Content of the reason text box; None if the flag is not set or the text box
        could not be read on any page.

    """
    flag_probe = "typeof window.HUMAN_INFEASIBLE !== 'undefined' && window.HUMAN_INFEASIBLE"
    infeasible = any(
        frame.evaluate(flag_probe)
        for page in get_servicenow_pages(context)
        for frame in page.frames
    )

    reason = None
    if infeasible:
        for page in get_servicenow_pages(context):
            try:
                reason = page.evaluate("document.getElementById('reasonTextBox').value")
                break
            except Exception:
                # This page has no readable reason text box; try the next one.
                # (Exception rather than a bare except, so Ctrl+C still interrupts the tool.)
                continue

    return infeasible, reason
83
+
84
+
85
@tenacity.retry(wait=tenacity.wait_fixed(1), stop=tenacity.stop_after_attempt(5), reraise=True)
def human_console_set_status(msg, context):
    """
    Display a message in the `taskStatusDiv` element of every ServiceNow page.

    The call is retried (up to 5 attempts, 1 second apart) if a page evaluation raises.

    Parameters:
    -----------
    msg: str
        Text to display.
    context:
        Browser context whose ServiceNow pages are updated.

    """
    # The message is passed as an argument rather than interpolated into the script,
    # so quotes, backslashes or newlines in it cannot break (or inject into) the JavaScript.
    # NOTE(review): relies on a Playwright-style `page.evaluate(expression, arg)` signature.
    set_status_js = "(msg) => { document.getElementById('taskStatusDiv').innerText = msg; }"
    for page in get_servicenow_pages(context):
        page.evaluate(set_status_js, msg)
89
+
90
+
91
@tenacity.retry(wait=tenacity.wait_fixed(1), stop=tenacity.stop_after_attempt(5), reraise=True)
def human_console_set_progress_status(msg, context):
    """
    Display a message in the `progressDiv` element of every ServiceNow page.

    The call is retried (up to 5 attempts, 1 second apart) if a page evaluation raises.

    Parameters:
    -----------
    msg: str
        Text to display.
    context:
        Browser context whose ServiceNow pages are updated.

    """
    # The message is passed as an argument rather than interpolated into the script,
    # so quotes, backslashes or newlines in it cannot break (or inject into) the JavaScript.
    # NOTE(review): relies on a Playwright-style `page.evaluate(expression, arg)` signature.
    set_progress_js = "(msg) => { document.getElementById('progressDiv').innerText = msg; }"
    for page in get_servicenow_pages(context):
        page.evaluate(set_progress_js, msg)
95
+
96
+
97
def log_result(annotator_info: dict, task_info: dict, metrics: dict, path: str):
    """
    Append one evaluation result to the JSON log file.

    The log file holds a single JSON list. It is read in full (an absent file counts as
    an empty list), the new entry is appended, and the whole list is written back.

    Parameters:
    -----------
    annotator_info: dict
        Information identifying the annotator.
    task_info: dict
        Information identifying the task.
    metrics: dict
        Outcome of the evaluation. Must be JSON serializable.
    path: str
        Path to the log file.

    """
    # Read existing log
    if os.path.exists(path):
        with open(path, "r") as f:
            log = json.load(f)
    else:
        log = []

    # Append log; the context manager guarantees the data is flushed and the file closed
    log.append({"annotator_info": annotator_info, "task_info": task_info, "metrics": metrics})
    with open(path, "w") as f:
        json.dump(log, f)

    logging.info(f"Logged result: {task_info} -- {metrics}")
109
+
110
+
111
def task_already_evaluated(path: str, annotator_info: dict, task_info: dict) -> bool:
    """
    Tell whether the log file already holds a result for this annotator and task.

    Parameters:
    -----------
    path: str
        Path to the JSON log file. If it does not exist, nothing was evaluated.
    annotator_info: dict
        Information identifying the annotator; compared for equality with logged entries.
    task_info: dict
        Information identifying the task; compared for equality with logged entries.

    Returns:
    --------
    evaluated: bool

    """
    if not os.path.exists(path):
        return False

    with open(path, "r") as f:
        log = json.load(f)

    return any(
        entry["annotator_info"] == annotator_info and entry["task_info"] == task_info
        for entry in log
    )
121
+
122
+
123
def setup_environment(task_info: dict):
    """
    Create and prepare a browser environment for one human-evaluated task.

    The environment is created in non-headless mode and reset with the task seed. The
    human evaluation scripts are then registered as init scripts and the task page is
    reloaded so that they apply. Finally, the chat page is patched so that the "user"
    and "assistant" roles are swapped when messages are displayed.

    Parameters:
    -----------
    task_info: dict
        Must contain "task_name" (a key of TASKS) and "task_seed".

    Returns:
    --------
    env: BrowserEnv
        The environment, ready for the human to interact with.

    """
    env = BrowserEnv(
        task_entrypoint=TASKS[task_info["task_name"]],
        headless=False,
    )
    _first, _second = env.reset(seed=task_info["task_seed"])  # return values are not used

    # Inject human-eval helper scripts (reload to apply)
    console_script_path = os.path.join(os.path.dirname(__file__), "console.js")
    env.task.page.context.add_init_script("window.NEED_VALIDATION = 1;")
    env.task.page.context.add_init_script(path=console_script_path)
    env.task.page.reload()

    # Patch the chat messages so that the human posts as the bot
    swap_chat_roles_js = """
        (function() {
            let old;

            // Function to wait for addChatMessage to be defined
            function waitForAddChatMessage() {
                if (typeof addChatMessage !== 'undefined') {
                    // Save the original 'addChatMessage' function to 'old'
                    if (typeof old === 'undefined') {
                        old = new Function('return ' + addChatMessage.toString())();
                    }

                    // Redefine 'addChatMessage' to wrap the original function
                    addChatMessage = function(role, timeString, msg) {
                        if (role === 'user') {
                            role = 'assistant'; // Swap role from 'user' to 'assistant'
                        }
                        else if (role === 'assistant') {
                            role = 'user'; // Swap role from 'assistant' to 'user'
                        }
                        old(role, timeString, msg); // Call the original function
                    };
                } else {
                    // Retry after a short delay
                    setTimeout(waitForAddChatMessage, 100);
                }
            }

            // Start waiting for addChatMessage to be defined
            waitForAddChatMessage();
        })();
        """
    env.chat.page.evaluate(swap_chat_roles_js)

    # Mark all chat messages as patched so that we don't patch them again
    for message in env.chat.messages:
        message["patched"] = True

    return env
179
+
180
+
181
def load_curriculum(path):
    """
    Load curriculum from a file or generate a random one.

    A curriculum file has one task per line, formatted as `<task name>,<task seed>`.
    Whitespace around both fields is ignored and blank lines are skipped.

    Parameters:
    -----------
    path: str
        Path to the curriculum file. If set to "random", a random curriculum will be generated.

    Returns:
    --------
    curriculum: list
        List of dicts with keys "task_name" (str) and "task_seed" (int for curricula
        loaded from a file).

    Raises:
    -------
    IndexError
        If a non-blank line of the file contains no comma.
    ValueError
        If the seed of a line is not an integer.

    """
    if path == "random":
        logging.info("Generating random curriculum")
        all_tasks = get_all_tasks_humans(filter="l2") + get_all_tasks_humans(filter="l3")
        random.shuffle(all_tasks)
        curriculum = [{"task_name": x[0].__name__, "task_seed": x[1]} for x in all_tasks]
    else:
        logging.info(f"Loading curriculum from {path}")
        curriculum = []
        with open(path, "r") as f:
            for line in f:
                if not line.strip():
                    continue  # skip blank lines
                fields = line.split(",")
                curriculum.append(
                    {"task_name": fields[0].strip(), "task_seed": int(fields[1].strip())}
                )

    return curriculum
210
+
211
+
212
@tenacity.retry(wait=tenacity.wait_fixed(1), stop=tenacity.stop_after_attempt(5), reraise=True)
def validate_solution(env):
    """
    Validate the task on every page of the environment's browser context.

    Pages are validated in order. As soon as one of them yields a terminal outcome
    (reward of 1, or reward of 0 with a stop request), that outcome is returned as is.
    Otherwise the reward and stop flag of the last page are returned, together with the
    messages of all pages joined by ", ".

    The call is retried (up to 5 attempts, 1 second apart) if validation raises.

    Parameters:
    -----------
    env: BrowserEnv
        Environment whose task is validated.

    Returns:
    --------
    reward:
        Reward returned by the task validation (0 if the context has no page).
    stop:
        Whether the task requires stopping (False if the context has no page).
    message: str
        Validation message(s).
    info: dict
        Validation info; when no terminal outcome was found, a dict with a single
        "message" key.

    """
    # Defaults for a context without any page; the loop below would otherwise leave
    # these names unbound and the final return would raise UnboundLocalError.
    reward, stop = 0, False
    infos = []
    messages = []
    for page in env.context.pages:
        reward, stop, message, info = env.task.validate(page, env.chat.messages)

        # If a terminal condition is encountered, return it.
        if reward == 1 or (reward == 0 and stop):
            return reward, stop, message, info

        infos.append(info)
        messages.append(message)

    return reward, stop, ", ".join(messages), {"message": ", ".join([i["message"] for i in infos])}
227
+
228
+
229
def main():
    """
    Command line entry point of the human evaluation tool.

    Parses the command line (`--email`, `--curriculum`, `--log`, `--reset-log`), then
    goes through the curriculum. Tasks already logged for this annotator are skipped.
    For each remaining task, an environment is set up and polled every 0.1 second until
    the task ends, i.e. until validation succeeds, validation requests a stop, or the
    human gives up. The result is then appended to the log file and the environment is
    closed (also when an error or an interruption occurs).

    """

    # Initialize the argument parser
    parser = argparse.ArgumentParser(
        description="Get annotator info and log path from command line arguments."
    )

    # Define the command line arguments
    parser.add_argument("--email", type=str, required=True, help="Email of the annotator")
    parser.add_argument(
        "--curriculum",
        type=str,
        required=True,
        help='Path to the curriculum file (optional: use "random" for a random one)',
    )
    parser.add_argument(
        "--log",
        type=str,
        required=False,
        default="human_eval_log.json",
        help="Path to the log file",
    )
    parser.add_argument("--reset-log", action="store_true", help="Reset the log file")

    # Parse the arguments
    args = parser.parse_args()

    annotator_info = {"email": args.email}
    logging.info(f"Annotator info: {annotator_info}")

    # Reset the log file if requested
    logging.info(f"Log file: {args.log}")
    if args.reset_log:
        logging.info("Resetting log file")
        with open(args.log, "w") as f:
            json.dump([], f)

    # Loop over the curriculum
    curriculum = load_curriculum(args.curriculum)
    logging.info(f"Starting evaluation for {len(curriculum)} tasks")
    for i, task_info in enumerate(curriculum):

        if task_already_evaluated(args.log, annotator_info, task_info):
            logging.info(f"Task {task_info} already evaluated. Skipping.")
            continue

        # Setup the environment
        logging.info(f"Setting up environment for task {task_info}")
        env = setup_environment(task_info)

        # The try/finally makes sure the environment is closed even if the loop fails
        try:
            # Game loop
            logging.info(f"Starting evaluation for task {task_info}")
            start_time = time()
            end = False
            success = False
            prev_chat_len = len(env.chat.messages)
            while True:
                human_console_set_progress_status(
                    f"Task {i + 1} / {len(curriculum)} --- Elapsed: {round(time() - start_time, 2)} sec.",
                    env.context,
                )

                # Event: Human marked task as infeasible
                infeasible, infeasible_reason = infeasible_flag_activated(env.context)
                if infeasible and not any(m["role"] == "infeasible" for m in env.chat.messages):
                    logging.info(f"Human marked task as infeasible. Reason: {infeasible_reason}")
                    human_console_set_status("Task marked as infeasible.", env.context)
                    env.chat.messages.append({"role": "infeasible", "message": infeasible_reason})
                    # TODO: There is a small glitch where if the user changes their message after,
                    #       the new infeasible message will be saved instead of the initial one that
                    #       was added to the chat messages. We can't stop after infeasible has been
                    #       declared.

                # Event: Validation is required
                # (requested through the console, or the chat received new messages)
                if validation_flag_activated(env.context) or len(env.chat.messages) != prev_chat_len:
                    human_console_set_status("Validation in progress...", env.context)

                    # Patch all chat messages: the human posts as the bot, so swap the roles
                    for m in env.chat.messages:
                        if not m.get("patched", False):
                            if m["role"] == "user":
                                m["role"] = "assistant"
                            elif m["role"] == "assistant":
                                m["role"] = "user"
                            m["patched"] = True

                    reward, stop, message, info = validate_solution(env)
                    logging.info(f"Validation: {info} -- reward: {reward} -- stop: {stop}")

                    if reward == 1:
                        human_console_set_status("Success!", env.context)
                        end = True
                        success = True
                    else:
                        if not end:  # If we're not already stopping for another reason
                            if stop:
                                human_console_set_status(
                                    "Task not completed. Stop required.", env.context
                                )
                                end = True
                                success = False
                            else:
                                human_console_set_status(
                                    "Task not completed. Keep going.", env.context
                                )

                    prev_chat_len = len(env.chat.messages)
                    reset_validation_flag(env.context)

                # Event: Human abandoned task
                if abandon_flag_activated(env.context):
                    end = True
                    success = False
                    human_console_set_status("Task abandoned by human.", env.context)

                # Event: Task is finished
                if end:
                    log_result(
                        path=args.log,
                        annotator_info=annotator_info,
                        task_info=task_info,
                        metrics={
                            "duration": time() - start_time,
                            "success": success,
                            "infeasible": infeasible_reason if infeasible else None,
                            "abandoned": abandon_flag_activated(env.context),
                            "chat_messages": env.chat.messages,
                        },
                    )
                    sleep(3)  # Sleep so human has time to read status before it closes
                    break

                sleep(0.1)

            human_console_set_status(
                "Cleaning environment. This may take a while...", env.context
            )
        finally:
            env.close()
        logging.info(f"Finished evaluation for task {task_info}")


if __name__ == "__main__":
    main()