browsergym-workarena 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- browsergym/workarena/__init__.py +13 -1
- browsergym/workarena/api/category.py +74 -0
- browsergym/workarena/api/change_request.py +87 -0
- browsergym/workarena/api/computer_asset.py +90 -0
- browsergym/workarena/api/cost_center.py +19 -0
- browsergym/workarena/api/expense_line.py +89 -0
- browsergym/workarena/api/incident.py +45 -0
- browsergym/workarena/api/knowledge.py +29 -0
- browsergym/workarena/api/problem.py +90 -0
- browsergym/workarena/api/report.py +183 -0
- browsergym/workarena/api/requested_items.py +63 -0
- browsergym/workarena/api/user.py +11 -8
- browsergym/workarena/api/utils.py +47 -3
- browsergym/workarena/config.py +21 -1
- browsergym/workarena/data_files/setup_files/forms/expected_incident_form_fields.json +1 -1
- browsergym/workarena/data_files/setup_files/forms/expected_request_item_form_fields.json +1 -0
- browsergym/workarena/data_files/setup_files/knowledge/protocols.json +46 -0
- browsergym/workarena/data_files/setup_files/knowledge/test.html +1 -0
- browsergym/workarena/data_files/setup_files/lists/expected_asset_list_columns.json +2 -24
- browsergym/workarena/data_files/setup_files/lists/expected_change_request_list_columns.json +4 -40
- browsergym/workarena/data_files/setup_files/lists/expected_expense_line_list_columns.json +12 -0
- browsergym/workarena/data_files/setup_files/lists/expected_hardware_list_columns.json +1 -42
- browsergym/workarena/data_files/setup_files/lists/expected_incident_list_columns.json +2 -18
- browsergym/workarena/data_files/setup_files/lists/expected_problem_list_columns.json +12 -0
- browsergym/workarena/data_files/setup_files/lists/expected_requested_items_list_columns.json +12 -0
- browsergym/workarena/data_files/setup_files/lists/expected_service_catalog_list_columns.json +2 -19
- browsergym/workarena/data_files/setup_files/lists/expected_user_list_columns.json +3 -50
- browsergym/workarena/data_files/task_configs/all_menu.json +1 -1
- browsergym/workarena/data_files/task_configs/dashboard_retrieval_minmax_task.json +1 -1
- browsergym/workarena/data_files/task_configs/dashboard_retrieval_value_task.json +1 -1
- browsergym/workarena/data_files/task_configs/filter_service_catalog_item_list_task.json +1 -1
- browsergym/workarena/data_files/task_configs/impersonation_users.json +1 -1
- browsergym/workarena/data_files/task_configs/report_retrieval_minmax_task.json +1 -1
- browsergym/workarena/data_files/task_configs/report_retrieval_value_task.json +1 -1
- browsergym/workarena/human_eval/console.js +176 -0
- browsergym/workarena/human_eval/tool.py +366 -0
- browsergym/workarena/install.py +81 -20
- browsergym/workarena/tasks/base.py +55 -20
- browsergym/workarena/tasks/comp_building_block.py +4 -0
- browsergym/workarena/tasks/compositional/__init__.py +76 -0
- browsergym/workarena/tasks/compositional/base.py +364 -0
- browsergym/workarena/tasks/compositional/dash_do_base.py +1366 -0
- browsergym/workarena/tasks/compositional/dash_do_catalog.py +1127 -0
- browsergym/workarena/tasks/compositional/dash_do_catalog_infeasible.py +2047 -0
- browsergym/workarena/tasks/compositional/dash_do_create_incident.py +403 -0
- browsergym/workarena/tasks/compositional/dash_do_create_incident_infeasible.py +278 -0
- browsergym/workarena/tasks/compositional/dash_do_create_problem.py +336 -0
- browsergym/workarena/tasks/compositional/dash_do_create_problem_infeasible.py +235 -0
- browsergym/workarena/tasks/compositional/dash_do_filter.py +1600 -0
- browsergym/workarena/tasks/compositional/dash_do_request_item.py +1315 -0
- browsergym/workarena/tasks/compositional/dash_do_request_item_infeasible.py +693 -0
- browsergym/workarena/tasks/compositional/delete_record.py +341 -0
- browsergym/workarena/tasks/compositional/edit_knowledge_base.py +457 -0
- browsergym/workarena/tasks/compositional/expense_management.py +598 -0
- browsergym/workarena/tasks/compositional/filter_and_do.py +139 -0
- browsergym/workarena/tasks/compositional/find_and_order_item.py +345 -0
- browsergym/workarena/tasks/compositional/manage_change_request_schedule.py +1417 -0
- browsergym/workarena/tasks/compositional/mark_duplicate_problems.py +499 -0
- browsergym/workarena/tasks/compositional/maximize_investment_return.py +1763 -0
- browsergym/workarena/tasks/compositional/navigate_and_do.py +1151 -0
- browsergym/workarena/tasks/compositional/navigate_and_do_infeasible.py +2100 -0
- browsergym/workarena/tasks/compositional/offboard_user.py +207 -0
- browsergym/workarena/tasks/compositional/onboard_user.py +226 -0
- browsergym/workarena/tasks/compositional/update_task.py +145 -0
- browsergym/workarena/tasks/compositional/utils/curriculum.py +215 -0
- browsergym/workarena/tasks/compositional/utils/infeasible_configs.py +151 -0
- browsergym/workarena/tasks/compositional/utils/knapsack.py +192 -0
- browsergym/workarena/tasks/compositional/warranty_check.py +227 -0
- browsergym/workarena/tasks/compositional/work_assignment.py +804 -0
- browsergym/workarena/tasks/compositional/workload_balancing.py +396 -0
- browsergym/workarena/tasks/dashboard.py +194 -12
- browsergym/workarena/tasks/form.py +1024 -232
- browsergym/workarena/tasks/knowledge.py +216 -25
- browsergym/workarena/tasks/list.py +519 -102
- browsergym/workarena/tasks/mark_duplicate_problem.py +171 -0
- browsergym/workarena/tasks/navigation.py +55 -13
- browsergym/workarena/tasks/scripts/extract_all_menu_items.py +9 -2
- browsergym/workarena/tasks/scripts/generate_dashboard_configs.py +6 -5
- browsergym/workarena/tasks/scripts/service_catalog.py +2 -1
- browsergym/workarena/tasks/scripts/validate.py +8 -2
- browsergym/workarena/tasks/send_chat_message.py +90 -0
- browsergym/workarena/tasks/service_catalog.py +94 -26
- browsergym/workarena/tasks/utils/form.py +1 -4
- browsergym/workarena/tasks/utils/private_tasks.py +63 -0
- browsergym/workarena/tasks/utils/utils.py +13 -0
- {browsergym_workarena-0.2.1.dist-info → browsergym_workarena-0.3.1.dist-info}/METADATA +19 -18
- browsergym_workarena-0.3.1.dist-info/RECORD +138 -0
- {browsergym_workarena-0.2.1.dist-info → browsergym_workarena-0.3.1.dist-info}/entry_points.txt +1 -0
- browsergym_workarena-0.2.1.dist-info/RECORD +0 -85
- {browsergym_workarena-0.2.1.dist-info → browsergym_workarena-0.3.1.dist-info}/WHEEL +0 -0
- {browsergym_workarena-0.2.1.dist-info → browsergym_workarena-0.3.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,364 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import time
|
|
3
|
+
import warnings
|
|
4
|
+
|
|
5
|
+
from typing import List, Tuple
|
|
6
|
+
from playwright.sync_api._generated import Page
|
|
7
|
+
|
|
8
|
+
from browsergym.workarena.config import PROTOCOL_KB_FILEPATH
|
|
9
|
+
|
|
10
|
+
from .update_task import UpdatePrivateTask
|
|
11
|
+
|
|
12
|
+
from ..base import AbstractServiceNowTask
|
|
13
|
+
from ..navigation import AllMenuTask
|
|
14
|
+
|
|
15
|
+
from ...instance import SNowInstance
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class CompositionalTask(AbstractServiceNowTask):
    """
    Task made of an ordered list of subtasks that are set up, cheated and validated one by one.

    Level 2 tasks expose every subgoal in the goal text. Level 3 tasks append two extra
    subtasks (navigate to "My Work", then update a private task) and start on the page of
    that private task.
    """

    # Final private task instructions
    final_private_task_instructions = 'Don\'t forget to mark this task as "Closed - complete" once successfully completed. If the task appears infeasible, mark the task as "Closed - skipped" .'

    def __init__(
        self,
        seed: int = None,
        instance: SNowInstance = None,
        start_rel_url: str = "/now/nav/ui/home",
        fixed_config: list[AbstractServiceNowTask] = None,
        level: int = 2,
        protocol_name: str = "",
        user_roles: List[str] = ["admin"],
    ) -> None:
        """
        Create a compositional task with specific subtasks

        Parameters:
        -----------
        seed: int
            Seed forwarded to the parent class and stored on the instance.
        instance: SNowInstance
            The ServiceNow instance to run the task on.
        start_rel_url: str
            The relative URL to start the task from.
        fixed_config: list[AbstractServiceNowTask]
            A list of subtasks.
        level: int
            The level of the task; choice between 2 and 3. L2 will have all the info in the goal and start in the SNOW home page.
            L3 will start in a private task page describing the information needed to complete the task and the related company protocol
            to complete it.
        protocol_name: str
            The name of the protocol to follow to complete the task; only used for level 3 tasks.
        user_roles: list[str]
            The roles to assign to the user (default: ["admin"])
        """
        # Forward a copy so that the shared default list object can never be mutated
        # through the parent class or through this instance.
        if user_roles is not None:
            user_roles = list(user_roles)
        super().__init__(
            seed=seed, instance=instance, start_rel_url=start_rel_url, user_roles=user_roles
        )
        # Set the task as completed in L3
        self.set_private_task_as_completed = True
        self.seed = seed

        self.fixed_config = fixed_config
        self.protocol_name = protocol_name
        self.task_description = ""
        self.short_description = ""

        assert level in [2, 3], "Level must be either 2 or 3"
        self.level = level
        if self.level == 3:
            # Identifier derived from the object id, zero-padded to 8 digits
            self.private_task_id = "PTSK" + str(id(self) % (10**8)).zfill(8)
            # Filled in by setup_goal, which also sets the level 3 start URL from it
            self.sys_id = None

    def __len__(self) -> int:
        """Return the number of subtasks currently registered by setup_goal."""
        return len(self.subtasks)

    def setup_goal(
        self,
        page: Page,
        config: list[AbstractServiceNowTask],
        build_pretty_print_description: bool = True,
    ) -> tuple[str, dict]:
        """
        Set up every subtask of the configuration and build the goal of the task.

        Parameters:
        -----------
        page: Page
            The page passed to the setup of each subtask.
        config: list[AbstractServiceNowTask]
            The list of subtasks; for level 2, subtasks with used_in_level_2 set to False are skipped.
        build_pretty_print_description: bool
            For level 3 only: whether to build the private task description from the subtasks.

        Returns:
        --------
        tuple[str, dict]
            The goal text and an empty info dictionary.
        """
        super().setup_goal(page=page)
        # Index to keep track of the task we are currently validating
        self.valid_index = 0

        # Setup all the subtasks
        self.subtasks = []
        self.subgoals = []
        for task in config:
            # Skip tasks that are not used in level 2; e.g. navigate to the company protocol
            if self.level == 2 and not task.used_in_level_2:
                continue
            self.subtasks.append(task)
            # The first element returned by the subtask setup is kept as its subgoal
            self.subgoals.append(task.setup(page=page, do_start=False)[0])

        if self.level == 3:
            if build_pretty_print_description:
                self._build_pretty_printed_description(config)
            level_3_final_tasks = [
                # Navigate to the My Work task list
                AllMenuTask(
                    instance=self.instance,
                    fixed_config={
                        "application": "Service Desk",
                        "module": "My Work",
                        "url": "/now/nav/ui/classic/params/target/task_list.do%3Fsysparm_userpref_module%3D1523b8d4c611227b00be8216ec331b9a%26sysparm_query%3Dactive%253Dtrue%255Eassigned_to%253Djavascript%253AgetMyAssignments%2528%2529%255Estate%2521%253D-5%255EEQ",
                    },
                    is_validated=False,
                    used_in_level_2=False,
                ),
                # Close the private task
                UpdatePrivateTask(
                    instance=self.instance,
                    fixed_config={
                        "task_description": self.task_description,
                        "short_description": self.short_description,
                    },
                    set_as_completed=self.set_private_task_as_completed,
                    is_validated=True,
                    used_in_level_2=False,
                ),
            ]
            self.subtasks.extend(level_3_final_tasks)
            # Set identical user credentials for all subtasks
            for task in self.subtasks:
                task._base_initial_instance = self.instance
                task._base_user_name, task._base_user_password, task._base_user_sysid = (
                    self._base_user_name,
                    self._base_user_password,
                    self._base_user_sysid,
                )
                task.instance = self.instance
                task.instance.snow_credentials = (self._base_user_name, self._base_user_password)

            # Finish the setup with the L3-specific tasks
            for task in self.subtasks[-2:]:
                task.setup(page=page, do_start=False)
            # The sys ID of the private task is the sys ID of the last task in the list
            self.sys_id = level_3_final_tasks[-1].sys_id

            self.start_url = (
                self.instance.snow_url
                + f"/now/nav/ui/classic/params/target/vtb_task.do%3Fsys_id%3D{self.sys_id}"
            )

        # For level 2, include all substeps in the goal
        # For level 3, the goal is already set in the private task
        if self.level == 2:
            # In some cases, more than one subtasks with identical subgoals are present and the duplicated tasks have empty goals
            # These multiple tasks are used to provide a complete cheat for the tasks like ManageChangeRequestScheduleTask subclasses
            # To avoid having empty steps in the enumeration, empty subgoals are dropped before numbering
            steps = [subgoal for subgoal in self.subgoals if subgoal]
            goal = self.short_description + "\n"
            goal += " \n Concretely, you need to complete the following steps:"
            goal += "".join(f"\n{i}. {subgoal}" for i, subgoal in enumerate(steps, start=1))

        elif self.level == 3:
            goal = "Please complete the following task."

        return goal, {}

    def _get_config(self) -> list[AbstractServiceNowTask]:
        """
        Get a configuration for a given compositional task, in the form of a list of subtasks.
        """
        raise NotImplementedError("This method should be implemented in a subclass")

    def cheat(self, page: Page, chat_messages: list[str], subtask_idx: int) -> None:
        """
        Solve a single subtask of the task

        Parameters:
        ----------
        page: Page
            The page to solve the task on
        chat_messages: list[str]
            The list of messages in the chat
        subtask_idx: int
            The index of the subtask to solve.

        Note:
        -----
        * We proceed separately for each subtask since this enables validation of each subtask separately.
          This is useful for certifying the feasibility of tasks in the benchmark. Otherwise, cheat would
          bring us to the final state of the task, which would make it impossible to validate subtasks.
        * Use len(self) to get the number of subtasks in the task.

        """
        super().cheat(page, chat_messages)
        self.subtasks[subtask_idx].cheat(page, chat_messages)

    def _build_pretty_printed_description(self, config: list[AbstractServiceNowTask]) -> str:
        """
        Get the task information for the private task description; used for level 3 tasks.

        The description of every subtask that is validated or has a description is appended
        to self.task_description, followed by the final private task instructions.

        Args:
            config: list[AbstractServiceNowTask]
                The list of subtasks in the task

        Returns:
            The updated self.task_description.
        """
        for subtask in config:
            if subtask.is_validated or subtask.has_description:
                self.task_description += subtask.get_pretty_printed_description()
                self.task_description += "\n"
        self.task_description += self.final_private_task_instructions

        return self.task_description

    def validate(self, page: Page, chat_messages: list[str]) -> Tuple[float, bool, str, dict]:
        """
        Validate the first subtask that still requires validation.

        Each call validates at most one subtask. A reward of 1 is only returned once
        self.valid_index has moved past the last subtask.

        Returns:
        --------
        Tuple[float, bool, str, dict]
            The reward, the stop flag, a text message and a dictionary holding a "message" key.
        """
        super().validate(page, chat_messages)

        # Initialize the index of the first subtask that requires validation
        while (
            self.valid_index < len(self.subtasks)
            and not self.subtasks[self.valid_index].is_validated
        ):
            self.valid_index += 1

        if self.valid_index == len(self.subtasks):
            return (
                1,
                True,
                "Nice work, thank you!",
                {"message": "Task completed successfully."},
            )
        # Validate the current subtask
        subtask = self.subtasks[self.valid_index]
        # Here `info` holds the text and `message` holds the dictionary
        reward, stop, info, message = subtask.validate(page, chat_messages)

        # If the subtask is valid
        if reward >= 1.0:
            # ... override the info and message to avoid success messages from the subtask
            info = message["message"] = (
                f"Step {self.valid_index + 1} has been completed successfully."
            )
            # ... this is a subtask, so we don't want to stop
            stop = False
            # ... increment index to flag this one as solved
            self.valid_index += 1

        # If the subtask is not valid
        else:
            # ... contextualize the info and message per subtask
            info = f"Step {self.valid_index + 1}: " + info
            message["message"] = f"Step {self.valid_index + 1}: " + message.get("message", "")

        # Check if all subtasks are solved
        if self.valid_index == len(self.subtasks):
            return (
                1,
                True,
                "Nice work, thank you!",
                {"message": "Task completed successfully."},
            )

        return 0, stop, info, message

    def teardown(self) -> None:
        """Tear down every subtask, then the task itself."""
        # XXX: In base.py we define the teardown method as being independent of the
        #      current state of the page. This means that we can just call all the
        #      subtasks' teardown methods.
        # The list only exists once setup_goal has created it; fall back to no subtasks
        # so that a teardown following a failed setup does not raise AttributeError.
        for task in getattr(self, "subtasks", []):
            task.teardown()
        super().teardown()
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
class InfeasibleCompositionalTask(CompositionalTask):
    """
    Base class for infeasible tasks.

    Args:
    --------
    infeasible_reasons (List[str]):
        The reasons why the task is infeasible. The validation looks for one of the reasons
        (case-insensitive substring match) in the last chat message; set by children classes.
    provide_reason (bool):
        When True, an empty answer is rejected with a dedicated message.
    **kwargs:
        Extra keyword arguments, stored as attributes of the instance.
    """

    def __init__(
        self,
        seed: int = None,
        instance: SNowInstance = None,
        start_rel_url: str = "/now/nav/ui/home",
        fixed_config: list[AbstractServiceNowTask] = None,
        level: int = 2,
        protocol_name: str = "",
        user_roles: List[str] = ["admin"],
        infeasible_reasons: List[str] = [""],
        provide_reason: bool = False,
        **kwargs,
    ) -> None:
        super().__init__(
            seed=seed,
            instance=instance,
            start_rel_url=start_rel_url,
            fixed_config=fixed_config,
            level=level,
            protocol_name=protocol_name,
            user_roles=user_roles,
        )
        # Set the private task as closed-skipped in L3 tasks
        self.set_private_task_as_completed = False
        self.provide_reason = provide_reason
        # Store a copy: keeping a reference to the default list would share one
        # mutable object between every instance created without this argument.
        self.infeasible_reasons = (
            infeasible_reasons if infeasible_reasons is None else list(infeasible_reasons)
        )
        # flag to check if the infeasible reason was found in the chat messages
        self.infeasible_reason_found = False
        self.__dict__.update(kwargs)

    def cheat(self, page: Page, chat_messages: list[dict], subtask_idx: int) -> None:
        """Add the infeasible reasons to the chat messages"""
        # Index of the infeasible task that requires sending a message in the chat for the cheat
        # for L2 tasks, the last task is the infeasible one, whereas for L3 tasks, third from last is infeasible
        cheat_index = len(self.subtasks) - 1 if self.level == 2 else len(self.subtasks) - 3
        # Every other subtask is solved through the regular cheat of the parent class
        if subtask_idx != cheat_index:
            super().cheat(page, chat_messages, subtask_idx)
        else:
            message = ", ".join(self.infeasible_reasons)
            chat_messages.append({"role": "infeasible", "message": message})

    def validate(self, page: Page, chat_messages: list[dict]) -> Tuple[float, bool, str, dict]:
        """
        Check if the infeasible reasons are in the chat messages

        The last chat message must have the "infeasible" role and contain one of the
        expected reasons; once a reason has been found, the result is remembered and the
        validation of the parent class is used.
        """
        if chat_messages and chat_messages[-1]["role"] == "infeasible":
            answer = chat_messages[-1]["message"].lower()
        else:
            return (
                0,
                False,
                "",
                {"message": "The assistant did not provide an answer."},
            )
        if self.provide_reason and answer == "":
            return (
                0,
                False,
                "",
                {"message": "The assistant did not provide a reason for the infeasibility."},
            )
        if not self.infeasible_reason_found:
            # Case-insensitive substring match against each expected reason
            self.infeasible_reason_found = any(
                reason.lower() in answer for reason in self.infeasible_reasons
            )
            if not self.infeasible_reason_found:
                return (
                    0,
                    False,
                    "",
                    {"message": "The assistant did not provide the correct answer."},
                )

        return super().validate(page, chat_messages)
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
class HumanEvalTask:
    """
    Marker base class used to label tasks suitable for human evaluation.

    It defines no attributes and no behavior of its own.
    """
|