browsergym-workarena 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. browsergym/workarena/__init__.py +13 -1
  2. browsergym/workarena/api/category.py +74 -0
  3. browsergym/workarena/api/change_request.py +87 -0
  4. browsergym/workarena/api/computer_asset.py +90 -0
  5. browsergym/workarena/api/cost_center.py +19 -0
  6. browsergym/workarena/api/expense_line.py +89 -0
  7. browsergym/workarena/api/incident.py +45 -0
  8. browsergym/workarena/api/knowledge.py +29 -0
  9. browsergym/workarena/api/problem.py +90 -0
  10. browsergym/workarena/api/report.py +183 -0
  11. browsergym/workarena/api/requested_items.py +63 -0
  12. browsergym/workarena/api/user.py +11 -8
  13. browsergym/workarena/api/utils.py +47 -3
  14. browsergym/workarena/config.py +21 -1
  15. browsergym/workarena/data_files/setup_files/forms/expected_incident_form_fields.json +1 -1
  16. browsergym/workarena/data_files/setup_files/forms/expected_request_item_form_fields.json +1 -0
  17. browsergym/workarena/data_files/setup_files/knowledge/protocols.json +46 -0
  18. browsergym/workarena/data_files/setup_files/knowledge/test.html +1 -0
  19. browsergym/workarena/data_files/setup_files/lists/expected_asset_list_columns.json +2 -24
  20. browsergym/workarena/data_files/setup_files/lists/expected_change_request_list_columns.json +4 -40
  21. browsergym/workarena/data_files/setup_files/lists/expected_expense_line_list_columns.json +12 -0
  22. browsergym/workarena/data_files/setup_files/lists/expected_hardware_list_columns.json +1 -42
  23. browsergym/workarena/data_files/setup_files/lists/expected_incident_list_columns.json +2 -18
  24. browsergym/workarena/data_files/setup_files/lists/expected_problem_list_columns.json +12 -0
  25. browsergym/workarena/data_files/setup_files/lists/expected_requested_items_list_columns.json +12 -0
  26. browsergym/workarena/data_files/setup_files/lists/expected_service_catalog_list_columns.json +2 -19
  27. browsergym/workarena/data_files/setup_files/lists/expected_user_list_columns.json +3 -50
  28. browsergym/workarena/data_files/task_configs/all_menu.json +1 -1
  29. browsergym/workarena/data_files/task_configs/dashboard_retrieval_minmax_task.json +1 -1
  30. browsergym/workarena/data_files/task_configs/dashboard_retrieval_value_task.json +1 -1
  31. browsergym/workarena/data_files/task_configs/filter_service_catalog_item_list_task.json +1 -1
  32. browsergym/workarena/data_files/task_configs/impersonation_users.json +1 -1
  33. browsergym/workarena/data_files/task_configs/report_retrieval_minmax_task.json +1 -1
  34. browsergym/workarena/data_files/task_configs/report_retrieval_value_task.json +1 -1
  35. browsergym/workarena/human_eval/console.js +176 -0
  36. browsergym/workarena/human_eval/tool.py +366 -0
  37. browsergym/workarena/install.py +81 -20
  38. browsergym/workarena/tasks/base.py +55 -20
  39. browsergym/workarena/tasks/comp_building_block.py +4 -0
  40. browsergym/workarena/tasks/compositional/__init__.py +76 -0
  41. browsergym/workarena/tasks/compositional/base.py +364 -0
  42. browsergym/workarena/tasks/compositional/dash_do_base.py +1366 -0
  43. browsergym/workarena/tasks/compositional/dash_do_catalog.py +1127 -0
  44. browsergym/workarena/tasks/compositional/dash_do_catalog_infeasible.py +2047 -0
  45. browsergym/workarena/tasks/compositional/dash_do_create_incident.py +403 -0
  46. browsergym/workarena/tasks/compositional/dash_do_create_incident_infeasible.py +278 -0
  47. browsergym/workarena/tasks/compositional/dash_do_create_problem.py +336 -0
  48. browsergym/workarena/tasks/compositional/dash_do_create_problem_infeasible.py +235 -0
  49. browsergym/workarena/tasks/compositional/dash_do_filter.py +1600 -0
  50. browsergym/workarena/tasks/compositional/dash_do_request_item.py +1315 -0
  51. browsergym/workarena/tasks/compositional/dash_do_request_item_infeasible.py +693 -0
  52. browsergym/workarena/tasks/compositional/delete_record.py +341 -0
  53. browsergym/workarena/tasks/compositional/edit_knowledge_base.py +457 -0
  54. browsergym/workarena/tasks/compositional/expense_management.py +598 -0
  55. browsergym/workarena/tasks/compositional/filter_and_do.py +139 -0
  56. browsergym/workarena/tasks/compositional/find_and_order_item.py +345 -0
  57. browsergym/workarena/tasks/compositional/manage_change_request_schedule.py +1417 -0
  58. browsergym/workarena/tasks/compositional/mark_duplicate_problems.py +499 -0
  59. browsergym/workarena/tasks/compositional/maximize_investment_return.py +1763 -0
  60. browsergym/workarena/tasks/compositional/navigate_and_do.py +1151 -0
  61. browsergym/workarena/tasks/compositional/navigate_and_do_infeasible.py +2100 -0
  62. browsergym/workarena/tasks/compositional/offboard_user.py +207 -0
  63. browsergym/workarena/tasks/compositional/onboard_user.py +226 -0
  64. browsergym/workarena/tasks/compositional/update_task.py +145 -0
  65. browsergym/workarena/tasks/compositional/utils/curriculum.py +215 -0
  66. browsergym/workarena/tasks/compositional/utils/infeasible_configs.py +151 -0
  67. browsergym/workarena/tasks/compositional/utils/knapsack.py +192 -0
  68. browsergym/workarena/tasks/compositional/warranty_check.py +227 -0
  69. browsergym/workarena/tasks/compositional/work_assignment.py +804 -0
  70. browsergym/workarena/tasks/compositional/workload_balancing.py +396 -0
  71. browsergym/workarena/tasks/dashboard.py +188 -8
  72. browsergym/workarena/tasks/form.py +1024 -232
  73. browsergym/workarena/tasks/knowledge.py +216 -25
  74. browsergym/workarena/tasks/list.py +519 -102
  75. browsergym/workarena/tasks/mark_duplicate_problem.py +171 -0
  76. browsergym/workarena/tasks/navigation.py +55 -13
  77. browsergym/workarena/tasks/scripts/extract_all_menu_items.py +9 -2
  78. browsergym/workarena/tasks/scripts/generate_dashboard_configs.py +6 -5
  79. browsergym/workarena/tasks/scripts/service_catalog.py +2 -1
  80. browsergym/workarena/tasks/scripts/validate.py +8 -2
  81. browsergym/workarena/tasks/send_chat_message.py +90 -0
  82. browsergym/workarena/tasks/service_catalog.py +94 -26
  83. browsergym/workarena/tasks/utils/form.py +1 -4
  84. browsergym/workarena/tasks/utils/private_tasks.py +63 -0
  85. browsergym/workarena/tasks/utils/utils.py +13 -0
  86. {browsergym_workarena-0.2.1.dist-info → browsergym_workarena-0.3.0.dist-info}/METADATA +19 -18
  87. browsergym_workarena-0.3.0.dist-info/RECORD +138 -0
  88. {browsergym_workarena-0.2.1.dist-info → browsergym_workarena-0.3.0.dist-info}/entry_points.txt +1 -0
  89. browsergym_workarena-0.2.1.dist-info/RECORD +0 -85
  90. {browsergym_workarena-0.2.1.dist-info → browsergym_workarena-0.3.0.dist-info}/WHEEL +0 -0
  91. {browsergym_workarena-0.2.1.dist-info → browsergym_workarena-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,364 @@
1
+ import json
2
+ import time
3
+ import warnings
4
+
5
+ from typing import List, Tuple
6
+ from playwright.sync_api._generated import Page
7
+
8
+ from browsergym.workarena.config import PROTOCOL_KB_FILEPATH
9
+
10
+ from .update_task import UpdatePrivateTask
11
+
12
+ from ..base import AbstractServiceNowTask
13
+ from ..navigation import AllMenuTask
14
+
15
+ from ...instance import SNowInstance
16
+
17
+
18
class CompositionalTask(AbstractServiceNowTask):
    """
    Task composed of an ordered list of subtasks.

    The subtasks are set up together in setup_goal(), solved one at a time through
    cheat(), and validated in order through validate(). Level 3 tasks additionally
    end with two extra subtasks: navigating to the "My Work" list and updating the
    private task that describes the work.
    """

    # Final private task instructions
    final_private_task_instructions = 'Don\'t forget to mark this task as "Closed - complete" once successfully completed. If the task appears infeasible, mark the task as "Closed - skipped" .'

    def __init__(
        self,
        seed: int = None,
        instance: SNowInstance = None,
        start_rel_url: str = "/now/nav/ui/home",
        fixed_config: list[AbstractServiceNowTask] = None,
        level: int = 2,
        protocol_name: str = "",
        user_roles: List[str] = None,
    ) -> None:
        """
        Create a compositional task with specific subtasks

        Parameters:
        -----------
        seed: int
            Seed forwarded to the base class and stored on the task.
        instance: SNowInstance
            The ServiceNow instance to run the task on.
        start_rel_url: str
            The relative URL to start the task from.
        fixed_config: list[AbstractServiceNowTask]
            A list of subtasks.
        level: int
            The level of the task; choice between 2 and 3. L2 will have all the info in the goal and start in the SNOW home page.
            L3 will start in a private task page describing the information needed to complete the task and the related company protocol
            to complete it.
        protocol_name: str
            The name of the protocol to follow to complete the task; only used for level 3 tasks.
        user_roles: list[str]
            The roles to assign to the user (default: ["admin"])
        """
        # Build the default per call so that instances never share one list object
        if user_roles is None:
            user_roles = ["admin"]

        super().__init__(
            seed=seed, instance=instance, start_rel_url=start_rel_url, user_roles=user_roles
        )
        # Set the task as completed in L3
        self.set_private_task_as_completed = True
        self.seed = seed

        self.fixed_config = fixed_config
        self.protocol_name = protocol_name
        self.task_description = ""
        self.short_description = ""

        assert level in [2, 3], "Level must be either 2 or 3"
        self.level = level
        # NOTE: earlier revisions rebound the local `start_rel_url` here, after it had
        # already been handed to the base class; those assignments had no effect and
        # were dropped. For level 3, the start URL is set in setup_goal() once the
        # sys ID of the private task is known.
        if self.level == 3:
            self.private_task_id = "PTSK" + str(id(self) % (10**8)).zfill(8)
            self.sys_id = None

    def __len__(self) -> int:
        """Number of subtasks; only available once setup_goal() has built the subtask list."""
        return len(self.subtasks)

    def setup_goal(
        self,
        page: Page,
        config: list[AbstractServiceNowTask],
        build_pretty_print_description: bool = True,
    ) -> tuple[str, dict]:
        """
        Set up every subtask and build the goal of the compositional task.

        Parameters:
        -----------
        page: Page
            The page on which the subtasks are set up.
        config: list[AbstractServiceNowTask]
            The candidate subtasks, in the order they must be solved.
        build_pretty_print_description: bool
            Level 3 only; whether to build the private task description from the subtasks.

        Returns:
        --------
        goal: str
            The goal of the task.
        info: dict
            Always an empty dictionary.
        """
        super().setup_goal(page=page)
        # Index to keep track of the task we are currently validating
        self.valid_index = 0

        # Setup all the subtasks
        self.subtasks = []
        self.subgoals = []
        for task in config:
            # Skip tasks that are not used in level 2; e.g. navigate to the company protocol
            if self.level == 2 and not task.used_in_level_2:
                continue
            self.subtasks.append(task)
            self.subgoals.append(task.setup(page=page, do_start=False)[0])

        if self.level == 3:
            if build_pretty_print_description:
                self._build_pretty_printed_description(config)
            level_3_final_tasks = [
                # Navigate to the My Work task list
                AllMenuTask(
                    instance=self.instance,
                    fixed_config={
                        "application": "Service Desk",
                        "module": "My Work",
                        "url": "/now/nav/ui/classic/params/target/task_list.do%3Fsysparm_userpref_module%3D1523b8d4c611227b00be8216ec331b9a%26sysparm_query%3Dactive%253Dtrue%255Eassigned_to%253Djavascript%253AgetMyAssignments%2528%2529%255Estate%2521%253D-5%255EEQ",
                    },
                    is_validated=False,
                    used_in_level_2=False,
                ),
                # Close the private task
                UpdatePrivateTask(
                    instance=self.instance,
                    fixed_config={
                        "task_description": self.task_description,
                        "short_description": self.short_description,
                    },
                    set_as_completed=self.set_private_task_as_completed,
                    is_validated=True,
                    used_in_level_2=False,
                ),
            ]
            self.subtasks.extend(level_3_final_tasks)
            # Set identical user credentials for all subtasks
            for task in self.subtasks:
                task._base_initial_instance = self.instance
                task._base_user_name, task._base_user_password, task._base_user_sysid = (
                    self._base_user_name,
                    self._base_user_password,
                    self._base_user_sysid,
                )
                task.instance = self.instance
                task.instance.snow_credentials = (self._base_user_name, self._base_user_password)

            # Finish the setup with the L3-specific tasks
            for task in self.subtasks[-2:]:
                task.setup(page=page, do_start=False)
            # The sys ID of the private task is the sys ID of the last task in the list
            self.sys_id = level_3_final_tasks[-1].sys_id

            self.start_url = (
                self.instance.snow_url
                + f"/now/nav/ui/classic/params/target/vtb_task.do%3Fsys_id%3D{self.sys_id}"
            )

        # For level 2, include all substeps in the goal
        # For level 3, the goal is already set in the private task
        if self.level == 2:
            # In some cases, more than one subtasks with identical subgoals are present and the duplicated tasks have empty goals
            # These multiple tasks are used to provide a complete cheat for the tasks like ManageChangeRequestScheduleTask subclasses
            # To avoid having empty steps in the enumeration, empty subgoals are left out before numbering
            steps = [subgoal for subgoal in self.subgoals if subgoal]
            goal = self.short_description + "\n"
            goal += " \n Concretely, you need to complete the following steps:"
            goal += "".join(f"\n{i}. {subgoal}" for i, subgoal in enumerate(steps, start=1))

        elif self.level == 3:
            goal = "Please complete the following task."

        return goal, {}

    def _get_config(self) -> list[AbstractServiceNowTask]:
        """
        Get a configuration for a given compositional task, in the form of a list of subtasks.
        """
        raise NotImplementedError("This method should be implemented in a subclass")

    def cheat(self, page: Page, chat_messages: list[str], subtask_idx: int) -> None:
        """
        Solve a single subtask of the task

        Parameters:
        ----------
        page: Page
            The page to solve the task on
        chat_messages: list[str]
            The list of messages in the chat
        subtask_idx: int
            The index of the subtask to solve.

        Note:
        -----
        * We proceed separately for each subtask since this enables validation of each subtask separately.
          This is useful for certifying the feasibility of tasks in the benchmark. Otherwise, cheat would
          bring us to the final state of the task, which would make it impossible to validate subtasks.
        * Use len(self) to get the number of subtasks in the task.

        """
        super().cheat(page, chat_messages)
        self.subtasks[subtask_idx].cheat(page, chat_messages)

    def _build_pretty_printed_description(self, config: list[AbstractServiceNowTask]) -> str:
        """
        Get the task information for the private task description; used for level 3 tasks.

        The text is appended to whatever self.task_description already contains.

        Args:
            config: list[AbstractServiceNowTask]
                The list of subtasks in the task

        Returns:
            The updated self.task_description.
        """
        # Only subtasks that are validated or explicitly carry a description contribute
        described = [
            subtask.get_pretty_printed_description() + "\n"
            for subtask in config
            if subtask.is_validated or subtask.has_description
        ]
        self.task_description += "".join(described)
        self.task_description += self.final_private_task_instructions

        return self.task_description

    def validate(self, page: Page, chat_messages: list[str]) -> Tuple[float, bool, str, dict]:
        """
        Validate the subtask the task is currently at and move on to the next one on success.

        Subtasks that are not flagged as validated are skipped. A reward of 1 is only
        returned once every subtask has been passed; a successful intermediate step
        returns a reward of 0 without stopping.

        Returns:
        --------
        reward: float
        stop: bool
        message: str
        info: dict
        """
        super().validate(page, chat_messages)

        # Initialize the index of the first subtask that requires validation
        while (
            self.valid_index < len(self.subtasks)
            and not self.subtasks[self.valid_index].is_validated
        ):
            self.valid_index += 1

        if self.valid_index == len(self.subtasks):
            return (
                1,
                True,
                "Nice work, thank you!",
                {"message": "Task completed successfully."},
            )
        # Validate the current subtask
        subtask = self.subtasks[self.valid_index]
        reward, stop, message, info = subtask.validate(page, chat_messages)
        step_number = self.valid_index + 1

        # If the subtask is valid
        if reward >= 1.0:
            # ... override the message and info to avoid success messages from the subtask
            message = info["message"] = f"Step {step_number} has been completed successfully."
            # ... this is a subtask, so we don't want to stop
            stop = False
            # ... increment index to flag this one as solved
            self.valid_index += 1

        # If the subtask is not valid
        else:
            # ... contextualize the message and info per subtask
            message = f"Step {step_number}: " + message
            info["message"] = f"Step {step_number}: " + info.get("message", "")

        # Check if all subtasks are solved
        if self.valid_index == len(self.subtasks):
            return (
                1,
                True,
                "Nice work, thank you!",
                {"message": "Task completed successfully."},
            )

        return 0, stop, message, info

    def teardown(self) -> None:
        """Tear down every subtask, then the compositional task itself."""
        # XXX: In base.py we define the teardown method as being independent of the
        #      current state of the page. This means that we can just call all the
        #      subtasks' teardown methods.
        # The subtask list only exists once setup_goal() has run; fall back to an empty
        # list so that tearing down a task that was never set up does not fail here.
        for task in getattr(self, "subtasks", []):
            task.teardown()
        super().teardown()
271
+
272
+
273
class InfeasibleCompositionalTask(CompositionalTask):
    """
    Base class for infeasible tasks.

    Parameters:
    -----------
    infeasible_reasons: list[str]
        The reasons why the task is infeasible. The validation looks for one of the reasons
        in the last chat message; set by children classes (default: [""]).
    provide_reason: bool
        If True, an empty answer is rejected during validation.
    **kwargs:
        Any extra keyword argument is stored as an attribute of the instance.
    """

    def __init__(
        self,
        seed: int = None,
        instance: SNowInstance = None,
        start_rel_url: str = "/now/nav/ui/home",
        fixed_config: list[AbstractServiceNowTask] = None,
        level: int = 2,
        protocol_name: str = "",
        user_roles: List[str] = None,
        infeasible_reasons: List[str] = None,
        provide_reason: bool = False,
        **kwargs,
    ) -> None:
        # Build the defaults per call so that instances never share one list object
        if user_roles is None:
            user_roles = ["admin"]
        if infeasible_reasons is None:
            infeasible_reasons = [""]

        super().__init__(
            seed=seed,
            instance=instance,
            start_rel_url=start_rel_url,
            fixed_config=fixed_config,
            level=level,
            protocol_name=protocol_name,
            user_roles=user_roles,
        )
        # Set the private task as closed-skipped in L3 tasks
        self.set_private_task_as_completed = False
        self.provide_reason = provide_reason
        self.infeasible_reasons = infeasible_reasons
        # Flag to check if the infeasible reason was found in the chat messages
        self.infeasible_reason_found = False
        self.__dict__.update(kwargs)

    def cheat(self, page: Page, chat_messages: list[dict], subtask_idx: int) -> None:
        """Solve the given subtask, or report the infeasible reasons in the chat for the infeasible one"""
        # Index of the infeasible task that requires sending a message in the chat for the cheat
        # for L2 tasks, the last task is the infeasible one, whereas for L3 tasks, third from last is infeasible
        if self.level == 2:
            cheat_index = len(self.subtasks) - 1
        else:
            cheat_index = len(self.subtasks) - 3

        # Every other subtask is solved normally
        if subtask_idx != cheat_index:
            super().cheat(page, chat_messages, subtask_idx)
            return

        # The infeasible subtask is not solved; the reasons are sent to the chat instead
        message = ", ".join(self.infeasible_reasons)
        chat_messages.append({"role": "infeasible", "message": message})

    def validate(self, page: Page, chat_messages: list[dict]) -> Tuple[float, bool, str, dict]:
        """
        Check if the infeasible reasons are in the chat messages

        The last chat message must have the "infeasible" role and, unless a reason was
        already found by an earlier call, contain one of the expected reasons
        (case-insensitive). Once this holds, the regular subtask validation is run.
        """
        if not chat_messages or chat_messages[-1]["role"] != "infeasible":
            return (
                0,
                False,
                "",
                {"message": "The assistant did not provide an answer."},
            )
        answer = chat_messages[-1]["message"].lower()

        if self.provide_reason and answer == "":
            return (
                0,
                False,
                "",
                {"message": "The assistant did not provide a reason for the infeasibility."},
            )

        # The flag is sticky: once a reason has been found, it is not searched for again
        if not self.infeasible_reason_found:
            self.infeasible_reason_found = any(
                reason.lower() in answer for reason in self.infeasible_reasons
            )
            if not self.infeasible_reason_found:
                return (
                    0,
                    False,
                    "",
                    {"message": "The assistant did not provide the correct answer."},
                )

        return super().validate(page, chat_messages)
359
+
360
+
361
class HumanEvalTask:
    """Marker base class: inherit from it to label a task as suitable for human evaluation."""