browsergym-workarena 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- browsergym/workarena/__init__.py +13 -1
- browsergym/workarena/api/category.py +74 -0
- browsergym/workarena/api/change_request.py +87 -0
- browsergym/workarena/api/computer_asset.py +90 -0
- browsergym/workarena/api/cost_center.py +19 -0
- browsergym/workarena/api/expense_line.py +89 -0
- browsergym/workarena/api/incident.py +45 -0
- browsergym/workarena/api/knowledge.py +29 -0
- browsergym/workarena/api/problem.py +90 -0
- browsergym/workarena/api/report.py +183 -0
- browsergym/workarena/api/requested_items.py +63 -0
- browsergym/workarena/api/user.py +11 -8
- browsergym/workarena/api/utils.py +47 -3
- browsergym/workarena/config.py +21 -1
- browsergym/workarena/data_files/setup_files/forms/expected_incident_form_fields.json +1 -1
- browsergym/workarena/data_files/setup_files/forms/expected_request_item_form_fields.json +1 -0
- browsergym/workarena/data_files/setup_files/knowledge/protocols.json +46 -0
- browsergym/workarena/data_files/setup_files/knowledge/test.html +1 -0
- browsergym/workarena/data_files/setup_files/lists/expected_asset_list_columns.json +2 -24
- browsergym/workarena/data_files/setup_files/lists/expected_change_request_list_columns.json +4 -40
- browsergym/workarena/data_files/setup_files/lists/expected_expense_line_list_columns.json +12 -0
- browsergym/workarena/data_files/setup_files/lists/expected_hardware_list_columns.json +1 -42
- browsergym/workarena/data_files/setup_files/lists/expected_incident_list_columns.json +2 -18
- browsergym/workarena/data_files/setup_files/lists/expected_problem_list_columns.json +12 -0
- browsergym/workarena/data_files/setup_files/lists/expected_requested_items_list_columns.json +12 -0
- browsergym/workarena/data_files/setup_files/lists/expected_service_catalog_list_columns.json +2 -19
- browsergym/workarena/data_files/setup_files/lists/expected_user_list_columns.json +3 -50
- browsergym/workarena/data_files/task_configs/all_menu.json +1 -1
- browsergym/workarena/data_files/task_configs/dashboard_retrieval_minmax_task.json +1 -1
- browsergym/workarena/data_files/task_configs/dashboard_retrieval_value_task.json +1 -1
- browsergym/workarena/data_files/task_configs/filter_service_catalog_item_list_task.json +1 -1
- browsergym/workarena/data_files/task_configs/impersonation_users.json +1 -1
- browsergym/workarena/data_files/task_configs/report_retrieval_minmax_task.json +1 -1
- browsergym/workarena/data_files/task_configs/report_retrieval_value_task.json +1 -1
- browsergym/workarena/human_eval/console.js +176 -0
- browsergym/workarena/human_eval/tool.py +366 -0
- browsergym/workarena/install.py +81 -20
- browsergym/workarena/tasks/base.py +55 -20
- browsergym/workarena/tasks/comp_building_block.py +4 -0
- browsergym/workarena/tasks/compositional/__init__.py +76 -0
- browsergym/workarena/tasks/compositional/base.py +364 -0
- browsergym/workarena/tasks/compositional/dash_do_base.py +1366 -0
- browsergym/workarena/tasks/compositional/dash_do_catalog.py +1127 -0
- browsergym/workarena/tasks/compositional/dash_do_catalog_infeasible.py +2047 -0
- browsergym/workarena/tasks/compositional/dash_do_create_incident.py +403 -0
- browsergym/workarena/tasks/compositional/dash_do_create_incident_infeasible.py +278 -0
- browsergym/workarena/tasks/compositional/dash_do_create_problem.py +336 -0
- browsergym/workarena/tasks/compositional/dash_do_create_problem_infeasible.py +235 -0
- browsergym/workarena/tasks/compositional/dash_do_filter.py +1600 -0
- browsergym/workarena/tasks/compositional/dash_do_request_item.py +1315 -0
- browsergym/workarena/tasks/compositional/dash_do_request_item_infeasible.py +693 -0
- browsergym/workarena/tasks/compositional/delete_record.py +341 -0
- browsergym/workarena/tasks/compositional/edit_knowledge_base.py +457 -0
- browsergym/workarena/tasks/compositional/expense_management.py +598 -0
- browsergym/workarena/tasks/compositional/filter_and_do.py +139 -0
- browsergym/workarena/tasks/compositional/find_and_order_item.py +345 -0
- browsergym/workarena/tasks/compositional/manage_change_request_schedule.py +1417 -0
- browsergym/workarena/tasks/compositional/mark_duplicate_problems.py +499 -0
- browsergym/workarena/tasks/compositional/maximize_investment_return.py +1763 -0
- browsergym/workarena/tasks/compositional/navigate_and_do.py +1151 -0
- browsergym/workarena/tasks/compositional/navigate_and_do_infeasible.py +2100 -0
- browsergym/workarena/tasks/compositional/offboard_user.py +207 -0
- browsergym/workarena/tasks/compositional/onboard_user.py +226 -0
- browsergym/workarena/tasks/compositional/update_task.py +145 -0
- browsergym/workarena/tasks/compositional/utils/curriculum.py +215 -0
- browsergym/workarena/tasks/compositional/utils/infeasible_configs.py +151 -0
- browsergym/workarena/tasks/compositional/utils/knapsack.py +192 -0
- browsergym/workarena/tasks/compositional/warranty_check.py +227 -0
- browsergym/workarena/tasks/compositional/work_assignment.py +804 -0
- browsergym/workarena/tasks/compositional/workload_balancing.py +396 -0
- browsergym/workarena/tasks/dashboard.py +194 -12
- browsergym/workarena/tasks/form.py +1024 -232
- browsergym/workarena/tasks/knowledge.py +216 -25
- browsergym/workarena/tasks/list.py +519 -102
- browsergym/workarena/tasks/mark_duplicate_problem.py +171 -0
- browsergym/workarena/tasks/navigation.py +55 -13
- browsergym/workarena/tasks/scripts/extract_all_menu_items.py +9 -2
- browsergym/workarena/tasks/scripts/generate_dashboard_configs.py +6 -5
- browsergym/workarena/tasks/scripts/service_catalog.py +2 -1
- browsergym/workarena/tasks/scripts/validate.py +8 -2
- browsergym/workarena/tasks/send_chat_message.py +90 -0
- browsergym/workarena/tasks/service_catalog.py +94 -26
- browsergym/workarena/tasks/utils/form.py +1 -4
- browsergym/workarena/tasks/utils/private_tasks.py +63 -0
- browsergym/workarena/tasks/utils/utils.py +13 -0
- {browsergym_workarena-0.2.1.dist-info → browsergym_workarena-0.3.1.dist-info}/METADATA +19 -18
- browsergym_workarena-0.3.1.dist-info/RECORD +138 -0
- {browsergym_workarena-0.2.1.dist-info → browsergym_workarena-0.3.1.dist-info}/entry_points.txt +1 -0
- browsergym_workarena-0.2.1.dist-info/RECORD +0 -85
- {browsergym_workarena-0.2.1.dist-info → browsergym_workarena-0.3.1.dist-info}/WHEEL +0 -0
- {browsergym_workarena-0.2.1.dist-info → browsergym_workarena-0.3.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
// Human-evaluation console.
// On DOMContentLoaded this script installs click guards on the document and, in the top window
// only, builds a floating panel with "New tab", "Validate", "Give up" and "Infeasible" controls.
// The buttons only set window-level flags (NEED_VALIDATION, HUMAN_ABANDON, HUMAN_INFEASIBLE) and
// update the status text inside the panel.
document.addEventListener('DOMContentLoaded', function() {

    // Disable right-click in all frames to prevent people from opening new tabs that don't have the main header
    document.addEventListener('contextmenu', function(event) {
        event.preventDefault();
    });

    // Disable Command/Ctrl + Click for the same reasons
    document.addEventListener('click', function(event) {
        if (event.metaKey || event.ctrlKey) {
            event.preventDefault();
        }
    });

    // Disable middle-click for the same reasons
    document.addEventListener('auxclick', function(event) {
        if (event.button === 1) {
            event.preventDefault();
        }
    });

    if (window != top) {
        return; // Do nothing if not in top window
    }

    // Check if the div already exists
    let newDiv = document.getElementById("humanEvalConsole");
    if (!newDiv) {
        // Create a new div element if it doesn't exist
        newDiv = document.createElement("div");
        newDiv.id = "humanEvalConsole";

        // Create a title for the div
        const title = document.createElement("h3");
        title.innerText = "Human Evaluation Console";
        title.style.textAlign = "center";
        newDiv.appendChild(title);

        // Progress status indicator
        const progressDiv = document.createElement("div");
        progressDiv.id = "progressDiv";
        progressDiv.style.marginTop = "-5px";
        progressDiv.style.marginBottom = "5px";
        newDiv.appendChild(progressDiv);

        // Create the 'New tab' button (opens the current URL in a new tab)
        const newTabButton = document.createElement("button");
        newTabButton.innerText = "+";
        newTabButton.style.backgroundColor = "yellow";
        newTabButton.style.color = "black";
        newTabButton.style.border = "none";
        newTabButton.style.padding = "5px 5px";
        newTabButton.style.marginRight = "10px";
        newTabButton.setAttribute("title", "New Tab");
        newTabButton.onclick = function() {
            window.open(window.location.href, '_blank');
        };

        // Create the 'Validate' button (raises the NEED_VALIDATION flag)
        const validateButton = document.createElement("button");
        validateButton.innerText = "Validate";
        validateButton.style.backgroundColor = "green";
        validateButton.style.color = "white";
        validateButton.style.border = "none";
        validateButton.style.padding = "10px 20px";
        validateButton.style.marginRight = "10px";
        validateButton.onclick = function() {
            window.NEED_VALIDATION = 1;
            console.log("Validation flag set:", window.NEED_VALIDATION);
            document.getElementById("taskStatusDiv").innerText = "Validation in progress...";
        };

        // Create the 'Give up' button (raises the HUMAN_ABANDON flag)
        const giveUpButton = document.createElement("button");
        giveUpButton.innerText = "Give up";
        giveUpButton.style.backgroundColor = "red";
        giveUpButton.style.color = "white";
        giveUpButton.style.border = "none";
        giveUpButton.style.padding = "10px 20px";
        giveUpButton.style.marginRight = "10px";
        giveUpButton.onclick = function() {
            window.HUMAN_ABANDON = 1;
            console.log("Give up flag set:", window.HUMAN_ABANDON);
            document.getElementById("taskStatusDiv").innerText = "Human abandoned task.";
        };

        // Create the 'Infeasible' button
        const infeasibleButton = document.createElement("button");
        infeasibleButton.innerText = "Infeasible";
        infeasibleButton.style.backgroundColor = "blue";
        infeasibleButton.style.color = "white";
        infeasibleButton.style.border = "none";
        infeasibleButton.style.padding = "10px 20px";
        infeasibleButton.onclick = function() {
            // Build the reason form only once, even if the button is clicked repeatedly
            if (document.getElementById("reasonTextBox")) {
                return;
            }

            // Text box in which the human types why the task is infeasible
            const reasonTextBox = document.createElement("input");
            reasonTextBox.id = "reasonTextBox";
            reasonTextBox.type = "text";
            reasonTextBox.setAttribute("placeholder", "Reason: e.g., Field 'Bob' does not exist.");
            reasonTextBox.style.width = "300px";
            reasonTextBox.style.marginRight = "10px";
            newDiv.appendChild(reasonTextBox);
            reasonTextBox.focus();

            // The HUMAN_INFEASIBLE flag is only raised once the reason is submitted
            const reasonButton = document.createElement("button");
            reasonButton.innerText = "Submit";
            reasonButton.style.backgroundColor = "black";
            reasonButton.style.color = "white";
            reasonButton.style.border = "none";
            reasonButton.onclick = function() {
                window.HUMAN_INFEASIBLE = 1;
                // Log the flag that was actually set (previously logged HUMAN_ABANDON by mistake)
                console.log("Infeasible flag set:", window.HUMAN_INFEASIBLE);
                document.getElementById("taskStatusDiv").innerText = "Human marked task as infeasible.";
            };
            newDiv.appendChild(reasonButton);
        };

        // Append buttons to the div
        newDiv.appendChild(newTabButton);
        newDiv.appendChild(validateButton);
        newDiv.appendChild(giveUpButton);
        newDiv.appendChild(infeasibleButton);

        // Create a status div below the buttons
        const taskStatusDiv = document.createElement("div");
        taskStatusDiv.id = "taskStatusDiv";
        taskStatusDiv.innerText = "Waiting for action...";
        taskStatusDiv.style.marginTop = "10px";
        newDiv.appendChild(taskStatusDiv); // Append the status div to the main div

        // Append the div to the body of the document
        document.body.appendChild(newDiv);
    }

    // Pin the div to the bottom-right corner and make it draggable vertically
    newDiv.style.position = "fixed";
    newDiv.style.right = "10px";
    newDiv.style.bottom = "10px";
    newDiv.style.zIndex = "1000";
    newDiv.style.backgroundColor = "#f0f0f0";
    newDiv.style.border = "1px solid black";
    newDiv.style.padding = "10px";
    newDiv.style.borderRadius = "8px"; // Rounded corners
    newDiv.style.cursor = "ns-resize"; // Cursor indicates vertical movement

    let isDragging = false;

    newDiv.onmousedown = function(event) {
        // Do not start a drag from the reason text box: calling preventDefault() on a
        // mousedown that targets an <input> would stop it from receiving focus on click.
        if (event.target instanceof HTMLInputElement) {
            return;
        }

        event.preventDefault(); // Prevent default text selection
        isDragging = true;
        const startY = event.clientY;
        const startBottom = parseInt(window.getComputedStyle(newDiv).bottom, 10);

        function onMouseMove(event) {
            if (isDragging) {
                // Moving the mouse down (larger clientY) reduces the distance to the bottom edge
                const newBottom = startBottom - (event.clientY - startY);
                newDiv.style.bottom = newBottom + 'px'; // Update bottom position only
            }
        }

        function onMouseUp() {
            document.removeEventListener('mousemove', onMouseMove);
            isDragging = false; // Stop dragging
        }

        document.addEventListener('mousemove', onMouseMove);

        // One-shot listener: it removes itself after the first mouseup and does not
        // overwrite any `document.onmouseup` handler owned by the page.
        document.addEventListener('mouseup', onMouseUp, { once: true });
    };

    newDiv.ondragstart = function() {
        return false; // Prevent default dragging behavior
    };
});
|
|
@@ -0,0 +1,366 @@
|
|
|
1
|
+
"""
|
|
2
|
+
WorkArena Human Evaluation Tool
|
|
3
|
+
|
|
4
|
+
Known issues:
|
|
5
|
+
* Blocking page interaction: We can't block loading until validation is done because some validation
|
|
6
|
+
functions require the page to be loaded. This means the user might act
|
|
7
|
+
while validation is ongoing. However, they would need to be very quick to
|
|
8
|
+
cause issues.
|
|
9
|
+
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import json
|
|
14
|
+
import logging
|
|
15
|
+
import os
|
|
16
|
+
import random
|
|
17
|
+
import tenacity
|
|
18
|
+
|
|
19
|
+
from time import sleep, time
|
|
20
|
+
|
|
21
|
+
from browsergym.core.env import BrowserEnv
|
|
22
|
+
from browsergym.workarena import ALL_WORKARENA_TASKS, get_all_tasks_humans
|
|
23
|
+
from browsergym.workarena.tasks.compositional.base import CompositionalTask
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Configure the root logger so that INFO-level messages are emitted
logging.basicConfig(level=logging.INFO)


# Lookup table: task class name -> task class, for every entry of ALL_WORKARENA_TASKS
TASKS = {task_cls.__name__: task_cls for task_cls in ALL_WORKARENA_TASKS}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def get_servicenow_pages(context):
    """
    Return the pages of a browser context whose URL contains "service-now".

    Parameters:
    -----------
    context:
        Object exposing a `pages` iterable; every page must have a `url` string attribute.

    Returns:
    --------
    pages: list
        The matching pages, in the order in which they appear in `context.pages`.

    """
    servicenow_pages = []
    for page in context.pages:
        if "service-now" in page.url:
            servicenow_pages.append(page)
    return servicenow_pages
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@tenacity.retry(wait=tenacity.wait_fixed(1), stop=tenacity.stop_after_attempt(5), reraise=True)
def validation_flag_activated(context):
    """
    Tell whether a validation was requested from any ServiceNow page.

    Every frame of every page returned by `get_servicenow_pages` is probed for a defined and
    truthy `window.NEED_VALIDATION`; probing stops at the first frame that reports one.
    On error the call is retried (up to 5 attempts, 1 second apart) and the last exception
    is re-raised.

    Parameters:
    -----------
    context:
        Browser context whose pages are inspected.

    Returns:
    --------
    activated: bool
        True if at least one frame has the flag set, False otherwise.

    """
    probe = "typeof window.NEED_VALIDATION !== 'undefined' && window.NEED_VALIDATION"
    for page in get_servicenow_pages(context):
        for frame in page.frames:
            if frame.evaluate(probe):
                return True
    return False
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@tenacity.retry(wait=tenacity.wait_fixed(1), stop=tenacity.stop_after_attempt(5), reraise=True)
def reset_validation_flag(context):
    """
    Set `window.NEED_VALIDATION` back to 0 in every frame of every ServiceNow page.

    This is best effort: failures are reported and swallowed (so the retry decorator never
    sees them). Each frame is handled independently so that one frame that cannot be
    evaluated does not leave the flag raised in all the remaining frames.

    Parameters:
    -----------
    context:
        Browser context whose pages are updated.

    """
    try:
        frames = [f for page in get_servicenow_pages(context) for f in page.frames]
    except Exception as e:
        print(e, "Failed to reset validation flag")  # Worst case we'll just keep validating
        return

    for f in frames:
        try:
            f.evaluate("window.NEED_VALIDATION = 0;")
        except Exception as e:
            print(e, "Failed to reset validation flag")  # Worst case we'll just keep validating
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@tenacity.retry(wait=tenacity.wait_fixed(1), stop=tenacity.stop_after_attempt(5), reraise=True)
def abandon_flag_activated(context):
    """
    Tell whether the human gave up on the task from any ServiceNow page.

    Every frame of every page returned by `get_servicenow_pages` is probed for a defined and
    truthy `window.HUMAN_ABANDON`; probing stops at the first frame that reports one.
    On error the call is retried (up to 5 attempts, 1 second apart) and the last exception
    is re-raised.

    Parameters:
    -----------
    context:
        Browser context whose pages are inspected.

    Returns:
    --------
    activated: bool
        True if at least one frame has the flag set, False otherwise.

    """
    probe = "typeof window.HUMAN_ABANDON !== 'undefined' && window.HUMAN_ABANDON"
    for page in get_servicenow_pages(context):
        for frame in page.frames:
            if frame.evaluate(probe):
                return True
    return False
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@tenacity.retry(wait=tenacity.wait_fixed(1), stop=tenacity.stop_after_attempt(5), reraise=True)
def infeasible_flag_activated(context):
    """
    Tell whether the human marked the task as infeasible and fetch the reason they typed.

    The flag is considered raised as soon as `window.HUMAN_INFEASIBLE` is defined in any frame
    of any ServiceNow page (its value is not inspected). The reason is read from the
    `reasonTextBox` element of the first page on which that lookup succeeds.

    Parameters:
    -----------
    context:
        Browser context whose pages are inspected.

    Returns:
    --------
    infeasible: bool
        True if the flag is defined in at least one frame.
    reason: str or None
        Content of the reason text box, or None when the flag is not raised or the text box
        could not be read on any page.

    """
    infeasible = any(
        f.evaluate("typeof window.HUMAN_INFEASIBLE !== 'undefined'")
        for p in get_servicenow_pages(context)
        for f in p.frames
    )

    reason = None
    if infeasible:
        for p in get_servicenow_pages(context):
            try:
                reason = p.evaluate("document.getElementById('reasonTextBox').value")
                break
            except Exception:
                # Pages without the text box raise here; try the next one. Only `Exception`
                # is caught so that KeyboardInterrupt / SystemExit still propagate.
                continue

    return infeasible, reason
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@tenacity.retry(wait=tenacity.wait_fixed(1), stop=tenacity.stop_after_attempt(5), reraise=True)
def human_console_set_status(msg, context):
    """
    Display a status message in the `taskStatusDiv` element of every ServiceNow page.

    On error the call is retried (up to 5 attempts, 1 second apart) and the last exception
    is re-raised.

    Parameters:
    -----------
    msg: str
        Text to display.
    context:
        Browser context whose pages are updated.

    """
    # The message is passed as an argument instead of being spliced into the script, so
    # quotes, backslashes or newlines in `msg` cannot break (or inject into) the JavaScript.
    script = "(msg) => { document.getElementById('taskStatusDiv').innerText = msg; }"
    for p in get_servicenow_pages(context):
        p.evaluate(script, msg)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@tenacity.retry(wait=tenacity.wait_fixed(1), stop=tenacity.stop_after_attempt(5), reraise=True)
def human_console_set_progress_status(msg, context):
    """
    Display a progress message in the `progressDiv` element of every ServiceNow page.

    On error the call is retried (up to 5 attempts, 1 second apart) and the last exception
    is re-raised.

    Parameters:
    -----------
    msg: str
        Text to display.
    context:
        Browser context whose pages are updated.

    """
    # The message is passed as an argument instead of being spliced into the script, so
    # quotes, backslashes or newlines in `msg` cannot break (or inject into) the JavaScript.
    script = "(msg) => { document.getElementById('progressDiv').innerText = msg; }"
    for p in get_servicenow_pages(context):
        p.evaluate(script, msg)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def log_result(annotator_info: dict, task_info: dict, metrics: dict, path: str):
    """
    Append one evaluation result to the JSON log file.

    The log is a JSON list; it is read in full (or started empty if the file does not
    exist), extended with the new entry and written back.

    Parameters:
    -----------
    annotator_info: dict
        Information identifying the annotator.
    task_info: dict
        Information identifying the task that was evaluated.
    metrics: dict
        Outcome of the evaluation.
    path: str
        Path to the JSON log file.

    """
    # Read existing log
    if os.path.exists(path):
        with open(path, "r") as log_file:
            log = json.load(log_file)
    else:
        log = []

    # Append log (context managers make sure the file is flushed and closed)
    log.append({"annotator_info": annotator_info, "task_info": task_info, "metrics": metrics})
    with open(path, "w") as log_file:
        json.dump(log, log_file)

    logging.info(f"Logged result: {task_info} -- {metrics}")
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def task_already_evaluated(path: str, annotator_info: dict, task_info: dict):
    """
    Check whether the log already contains a result for this annotator and task.

    Parameters:
    -----------
    path: str
        Path to the JSON log file.
    annotator_info: dict
        Information identifying the annotator.
    task_info: dict
        Information identifying the task.

    Returns:
    --------
    evaluated: bool
        True if an entry with the same `annotator_info` and `task_info` exists in the log,
        False otherwise (including when the log file does not exist).

    """
    if not os.path.exists(path):
        return False

    # Context manager makes sure the file handle is closed
    with open(path, "r") as log_file:
        log = json.load(log_file)

    return any(
        entry["annotator_info"] == annotator_info and entry["task_info"] == task_info
        for entry in log
    )
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def setup_environment(task_info: dict):
    """
    Create a non-headless browser environment for a task and prepare it for human evaluation.

    Parameters:
    -----------
    task_info: dict
        Must contain "task_name" (a key of `TASKS`) and "task_seed" (seed passed to
        `env.reset`).

    Returns:
    --------
    env: BrowserEnv
        The environment, reset with the requested seed, with the human-evaluation scripts
        registered and the chat patched.

    """
    env = BrowserEnv(
        task_entrypoint=TASKS[task_info["task_name"]],
        headless=False,
    )
    env.reset(seed=task_info["task_seed"])  # The values returned by reset are not needed

    # Inject human-eval helper scripts (reload to apply)
    task_page = env.task.page
    task_page.context.add_init_script("window.NEED_VALIDATION = 1;")
    console_script_path = os.path.join(os.path.dirname(__file__), "console.js")
    task_page.context.add_init_script(path=console_script_path)
    task_page.reload()

    # Patch the chat messages so that the human posts as the bot
    env.chat.page.evaluate(
        """
        (function() {
            let old;

            // Function to wait for addChatMessage to be defined
            function waitForAddChatMessage() {
                if (typeof addChatMessage !== 'undefined') {
                    // Save the original 'addChatMessage' function to 'old'
                    if (typeof old === 'undefined') {
                        old = new Function('return ' + addChatMessage.toString())();
                    }

                    // Redefine 'addChatMessage' to wrap the original function
                    addChatMessage = function(role, timeString, msg) {
                        if (role === 'user') {
                            role = 'assistant';  // Swap role from 'user' to 'assistant'
                        }
                        else if (role === 'assistant') {
                            role = 'user';  // Swap role from 'assistant' to 'user'
                        }
                        old(role, timeString, msg);  // Call the original function
                    };
                } else {
                    // Retry after a short delay
                    setTimeout(waitForAddChatMessage, 100);
                }
            }

            // Start waiting for addChatMessage to be defined
            waitForAddChatMessage();
        })();
        """
    )

    # Flag the messages that are already in the chat so that their roles are not swapped later
    for message in env.chat.messages:
        message["patched"] = True

    return env
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def load_curriculum(path):
    """
    Build the list of tasks to evaluate, either from a file or at random.

    When read from a file, every non-blank line is split on commas: the first field is the
    task name and the second one the integer seed (surrounding whitespace is ignored, as are
    any further fields).

    Parameters:
    -----------
    path: str
        Path to the curriculum file. If set to "random", the tasks returned by
        `get_all_tasks_humans` for the "l2" and "l3" filters are shuffled instead.

    Returns:
    --------
    curriculum: list
        One dict per task, with keys "task_name" (str) and "task_seed".

    """
    if path != "random":
        logging.info(f"Loading curriculum from {path}")
        curriculum = []
        with open(path, "r") as curriculum_file:
            for line in curriculum_file:
                if len(line.strip()) == 0:
                    continue  # Skip blank lines
                fields = line.split(",")
                curriculum.append(
                    {"task_name": fields[0].strip(), "task_seed": int(fields[1].strip())}
                )
        return curriculum

    logging.info("Generating random curriculum")
    all_tasks = get_all_tasks_humans(filter="l2") + get_all_tasks_humans(filter="l3")
    random.shuffle(all_tasks)
    return [{"task_name": task[0].__name__, "task_seed": task[1]} for task in all_tasks]
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
@tenacity.retry(wait=tenacity.wait_fixed(1), stop=tenacity.stop_after_attempt(5), reraise=True)
def validate_solution(env):
    """
    Run the task's validation against every page of the environment's browser context.

    The first page that yields a terminal outcome (reward of 1, or reward of 0 with a stop
    request) determines the result. Otherwise the reward and stop values of the last page are
    returned together with the messages of all pages joined by ", ".
    On error the call is retried (up to 5 attempts, 1 second apart) and the last exception
    is re-raised.

    Parameters:
    -----------
    env:
        Environment exposing `context.pages`, `task.validate(page, chat_messages)` and
        `chat.messages`.

    Returns:
    --------
    reward:
        Reward returned by the task validation (0 if there was no page to validate).
    stop:
        Whether the task validation requested to stop (False if there was no page).
    message: str
        Validation message(s).
    info: dict
        Validation info; when no page was terminal, a dict whose "message" entry joins the
        "message" entries of all pages.

    """
    # Defaults used when the context has no page, so that the final return cannot hit
    # unbound variables.
    reward, stop = 0, False
    infos = []
    messages = []
    for p in env.context.pages:
        reward, stop, message, info = env.task.validate(p, env.chat.messages)

        # If a terminal condition is encountered, return it.
        if reward == 1 or (reward == 0 and stop):
            return reward, stop, message, info

        infos.append(info)
        messages.append(message)

    return reward, stop, ", ".join(messages), {"message": ", ".join([i["message"] for i in infos])}
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def main():
    """
    Command-line entry point of the human evaluation tool.

    Parses the annotator email, curriculum and log path, then goes through the curriculum.
    For every task that this annotator has not evaluated yet, an environment is opened and
    polled until the task is solved, the validation requests a stop, or the human gives up.
    The outcome is then appended to the log and the environment is closed.

    """
    # Initialize the argument parser
    parser = argparse.ArgumentParser(
        description="Get annotator info and log path from command line arguments."
    )

    # Define the command line arguments
    parser.add_argument("--email", type=str, required=True, help="Email of the annotator")
    parser.add_argument(
        "--curriculum",
        type=str,
        required=True,
        help='Path to the curriculum file (optional: use "random" for a random one)',
    )
    parser.add_argument(
        "--log",
        type=str,
        required=False,
        default="human_eval_log.json",
        help="Path to the log file",
    )
    parser.add_argument("--reset-log", action="store_true", help="Reset the log file")

    # Parse the arguments
    args = parser.parse_args()

    annotator_info = {"email": args.email}
    logging.info(f"Annotator info: {annotator_info}")

    # Reset the log file if requested
    logging.info(f"Log file: {args.log}")
    if args.reset_log:
        logging.info("Resetting log file")
        with open(args.log, "w") as log_file:
            json.dump([], log_file)

    # Loop over the curriculum
    curriculum = load_curriculum(args.curriculum)
    logging.info(f"Starting evaluation for {len(curriculum)} tasks")
    for i, task_info in enumerate(curriculum):

        if task_already_evaluated(args.log, annotator_info, task_info):
            logging.info(f"Task {task_info} already evaluated. Skipping.")
            continue

        # Setup the environment
        logging.info(f"Setting up environment for task {task_info}")
        env = setup_environment(task_info)

        # The environment is closed even if the game loop raises
        try:
            # Game loop
            logging.info(f"Starting evaluation for task {task_info}")
            start_time = time()
            end = False
            success = False
            prev_chat_len = len(env.chat.messages)
            while True:
                human_console_set_progress_status(
                    f"Task {i + 1} / {len(curriculum)} --- Elapsed: {round(time() - start_time, 2)} sec.",
                    env.context,
                )

                # Event: Human marked task as infeasible
                infeasible, infeasible_reason = infeasible_flag_activated(env.context)
                if infeasible and not any(m["role"] == "infeasible" for m in env.chat.messages):
                    logging.info(f"Human marked task as infeasible. Reason: {infeasible_reason}")
                    human_console_set_status("Task marked as infeasible.", env.context)
                    env.chat.messages.append({"role": "infeasible", "message": infeasible_reason})
                    # TODO: There is a small glitch where if the user changes their message after,
                    #       the new infeasible message will be saved instead of the initial one that
                    #       was added to the chat messages. We can't stop after infeasible has been
                    #       declared.

                # Event: Validation is required (explicit request or new chat message)
                if validation_flag_activated(env.context) or len(env.chat.messages) != prev_chat_len:
                    human_console_set_status("Validation in progress...", env.context)

                    # Patch all chat messages: swap the user/assistant roles once per message
                    for m in env.chat.messages:
                        if not m.get("patched", False):
                            if m["role"] == "user":
                                m["role"] = "assistant"
                            elif m["role"] == "assistant":
                                m["role"] = "user"
                            m["patched"] = True

                    reward, stop, message, info = validate_solution(env)
                    logging.info(f"Validation: {info} -- reward: {reward} -- stop: {stop}")

                    if reward == 1:
                        human_console_set_status("Success!", env.context)
                        end = True
                        success = True
                    else:
                        if not end:  # If we're not already stopping for another reason
                            if stop:
                                human_console_set_status(
                                    "Task not completed. Stop required.", env.context
                                )
                                end = True
                                success = False
                            else:
                                human_console_set_status(
                                    "Task not completed. Keep going.", env.context
                                )

                    prev_chat_len = len(env.chat.messages)
                    reset_validation_flag(env.context)

                # Event: Human abandoned task
                abandoned = abandon_flag_activated(env.context)
                if abandoned:
                    end = True
                    success = False
                    human_console_set_status("Task abandoned by human.", env.context)

                # Event: Task is finished
                if end:
                    log_result(
                        path=args.log,
                        annotator_info=annotator_info,
                        task_info=task_info,
                        metrics={
                            "duration": time() - start_time,
                            "success": success,
                            "infeasible": infeasible_reason if infeasible else None,
                            "abandoned": abandoned,
                            "chat_messages": env.chat.messages,
                        },
                    )
                    sleep(3)  # Sleep so human has time to read status before it closes
                    break

                sleep(0.1)

            human_console_set_status("Cleaning environment. This may take a while...", env.context)
        finally:
            env.close()
        logging.info(f"Finished evaluation for task {task_info}")
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
if __name__ == "__main__":
|
|
366
|
+
main()
|