browsergym-workarena 0.3.1__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- browsergym/workarena/__init__.py +125 -2
- browsergym/workarena/tasks/dashboard.py +3 -1
- {browsergym_workarena-0.3.1.dist-info → browsergym_workarena-0.4.1.dist-info}/METADATA +111 -12
- {browsergym_workarena-0.3.1.dist-info → browsergym_workarena-0.4.1.dist-info}/RECORD +7 -7
- {browsergym_workarena-0.3.1.dist-info → browsergym_workarena-0.4.1.dist-info}/WHEEL +1 -1
- {browsergym_workarena-0.3.1.dist-info → browsergym_workarena-0.4.1.dist-info}/entry_points.txt +0 -0
- {browsergym_workarena-0.3.1.dist-info → browsergym_workarena-0.4.1.dist-info}/licenses/LICENSE +0 -0
browsergym/workarena/__init__.py
CHANGED
|
@@ -1,11 +1,23 @@
|
|
|
1
|
-
__version__ = "0.
|
|
1
|
+
__version__ = "0.4.1"
|
|
2
2
|
|
|
3
3
|
import inspect
|
|
4
|
-
|
|
4
|
+
from logging import warning
|
|
5
5
|
|
|
6
|
+
import numpy as np
|
|
6
7
|
from browsergym.core.registration import register_task
|
|
7
8
|
|
|
8
9
|
from .tasks.comp_building_block import CompositionalBuildingBlockTask
|
|
10
|
+
from .tasks.compositional import (
|
|
11
|
+
AGENT_CURRICULUM_L2,
|
|
12
|
+
AGENT_CURRICULUM_L3,
|
|
13
|
+
ALL_COMPOSITIONAL_TASKS,
|
|
14
|
+
ALL_COMPOSITIONAL_TASKS_L2,
|
|
15
|
+
ALL_COMPOSITIONAL_TASKS_L3,
|
|
16
|
+
HUMAN_CURRICULUM_L2,
|
|
17
|
+
HUMAN_CURRICULUM_L3,
|
|
18
|
+
)
|
|
19
|
+
from .tasks.compositional.base import CompositionalTask, HumanEvalTask
|
|
20
|
+
from .tasks.compositional.update_task import __TASKS__ as UPDATE_TASKS
|
|
9
21
|
from .tasks.dashboard import __TASKS__ as DASHBOARD_TASKS
|
|
10
22
|
from .tasks.form import __TASKS__ as FORM_TASKS
|
|
11
23
|
from .tasks.knowledge import __TASKS__ as KB_TASKS
|
|
@@ -15,12 +27,15 @@ from .tasks.service_catalog import __TASKS__ as SERVICE_CATALOG_TASKS
|
|
|
15
27
|
from .tasks.compositional.base import CompositionalTask
|
|
16
28
|
|
|
17
29
|
ALL_WORKARENA_TASKS = [
|
|
30
|
+
*ALL_COMPOSITIONAL_TASKS_L2,
|
|
31
|
+
*ALL_COMPOSITIONAL_TASKS_L3,
|
|
18
32
|
*DASHBOARD_TASKS,
|
|
19
33
|
*FORM_TASKS,
|
|
20
34
|
*KB_TASKS,
|
|
21
35
|
*LIST_TASKS,
|
|
22
36
|
*NAVIGATION_TASKS,
|
|
23
37
|
*SERVICE_CATALOG_TASKS,
|
|
38
|
+
*UPDATE_TASKS,
|
|
24
39
|
]
|
|
25
40
|
ATOMIC_TASKS = [
|
|
26
41
|
task
|
|
@@ -30,9 +45,117 @@ ATOMIC_TASKS = [
|
|
|
30
45
|
and not issubclass(task, CompositionalBuildingBlockTask)
|
|
31
46
|
]
|
|
32
47
|
|
|
48
|
+
|
|
33
49
|
# register the WorkArena benchmark
|
|
34
50
|
for task in ALL_WORKARENA_TASKS:
|
|
35
51
|
register_task(
|
|
36
52
|
task.get_task_id(),
|
|
37
53
|
task,
|
|
38
54
|
)
|
|
55
|
+
|
|
56
|
+
workarena_tasks_all = [task_class.get_task_id() for task_class in ALL_WORKARENA_TASKS]
|
|
57
|
+
workarena_tasks_atomic = [task_class.get_task_id() for task_class in ATOMIC_TASKS]
|
|
58
|
+
|
|
59
|
+
TASK_CATEGORY_MAP = {
|
|
60
|
+
"workarena.servicenow.all-menu": "menu",
|
|
61
|
+
"workarena.servicenow.create-change-request": "form",
|
|
62
|
+
"workarena.servicenow.create-hardware-asset": "form",
|
|
63
|
+
"workarena.servicenow.create-incident": "form",
|
|
64
|
+
"workarena.servicenow.create-problem": "form",
|
|
65
|
+
"workarena.servicenow.create-user": "form",
|
|
66
|
+
"workarena.servicenow.filter-asset-list": "list-filter",
|
|
67
|
+
"workarena.servicenow.filter-change-request-list": "list-filter",
|
|
68
|
+
"workarena.servicenow.filter-hardware-list": "list-filter",
|
|
69
|
+
"workarena.servicenow.filter-incident-list": "list-filter",
|
|
70
|
+
"workarena.servicenow.filter-service-catalog-item-list": "list-filter",
|
|
71
|
+
"workarena.servicenow.filter-user-list": "list-filter",
|
|
72
|
+
"workarena.servicenow.impersonation": "menu",
|
|
73
|
+
"workarena.servicenow.knowledge-base-search": "knowledge",
|
|
74
|
+
"workarena.servicenow.order-apple-mac-book-pro15": "service catalog",
|
|
75
|
+
"workarena.servicenow.order-apple-watch": "service catalog",
|
|
76
|
+
"workarena.servicenow.order-developer-laptop": "service catalog",
|
|
77
|
+
"workarena.servicenow.order-development-laptop-p-c": "service catalog",
|
|
78
|
+
"workarena.servicenow.order-ipad-mini": "service catalog",
|
|
79
|
+
"workarena.servicenow.order-ipad-pro": "service catalog",
|
|
80
|
+
"workarena.servicenow.order-loaner-laptop": "service catalog",
|
|
81
|
+
"workarena.servicenow.order-sales-laptop": "service catalog",
|
|
82
|
+
"workarena.servicenow.order-standard-laptop": "service catalog",
|
|
83
|
+
"workarena.servicenow.sort-asset-list": "list-sort",
|
|
84
|
+
"workarena.servicenow.sort-change-request-list": "list-sort",
|
|
85
|
+
"workarena.servicenow.sort-hardware-list": "list-sort",
|
|
86
|
+
"workarena.servicenow.sort-incident-list": "list-sort",
|
|
87
|
+
"workarena.servicenow.sort-service-catalog-item-list": "list-sort",
|
|
88
|
+
"workarena.servicenow.sort-user-list": "list-sort",
|
|
89
|
+
"workarena.servicenow.multi-chart-min-max-retrieval": "dashboard",
|
|
90
|
+
"workarena.servicenow.multi-chart-value-retrieval": "dashboard",
|
|
91
|
+
"workarena.servicenow.single-chart-value-retrieval": "dashboard",
|
|
92
|
+
"workarena.servicenow.single-chart-min-max-retrieval": "dashboard",
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
workarena_tasks_l1 = list(TASK_CATEGORY_MAP.keys())
|
|
97
|
+
workarena_task_categories = {}
|
|
98
|
+
for task in workarena_tasks_atomic:
|
|
99
|
+
if task not in TASK_CATEGORY_MAP:
|
|
100
|
+
warning(f"Atomic task {task} not found in TASK_CATEGORY_MAP")
|
|
101
|
+
continue
|
|
102
|
+
cat = TASK_CATEGORY_MAP[task]
|
|
103
|
+
if cat in workarena_task_categories:
|
|
104
|
+
workarena_task_categories[cat].append(task)
|
|
105
|
+
else:
|
|
106
|
+
workarena_task_categories[cat] = [task]
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def get_task_category(task_name):
|
|
110
|
+
benchmark = task_name.split(".")[0]
|
|
111
|
+
return benchmark, TASK_CATEGORY_MAP.get(task_name, None)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def get_all_tasks_agents(filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curriculum=True):
|
|
115
|
+
OFFSET = 42
|
|
116
|
+
all_task_tuples = []
|
|
117
|
+
filter = filter.split(".")
|
|
118
|
+
if len(filter) > 2:
|
|
119
|
+
raise Exception("Unsupported filter used.")
|
|
120
|
+
if len(filter) == 1:
|
|
121
|
+
level = filter[0]
|
|
122
|
+
if level not in ["l1", "l2", "l3"]:
|
|
123
|
+
raise Exception("Unsupported category of tasks.")
|
|
124
|
+
else:
|
|
125
|
+
rng = np.random.RandomState(meta_seed)
|
|
126
|
+
if level == "l1":
|
|
127
|
+
for task in ATOMIC_TASKS:
|
|
128
|
+
for seed in rng.randint(0, 1000, n_seed_l1):
|
|
129
|
+
all_task_tuples.append((task, int(seed)))
|
|
130
|
+
|
|
131
|
+
return all_task_tuples
|
|
132
|
+
|
|
133
|
+
if len(filter) == 2:
|
|
134
|
+
level, filter_category = filter[0], filter[1]
|
|
135
|
+
if filter_category not in list(AGENT_CURRICULUM_L2.keys()):
|
|
136
|
+
raise Exception("Unsupported category of tasks.")
|
|
137
|
+
else:
|
|
138
|
+
filter_category = None
|
|
139
|
+
|
|
140
|
+
if is_agent_curriculum:
|
|
141
|
+
if level == "l2":
|
|
142
|
+
ALL_COMPOSITIONAL_TASKS_CATEGORIES = AGENT_CURRICULUM_L2
|
|
143
|
+
else:
|
|
144
|
+
ALL_COMPOSITIONAL_TASKS_CATEGORIES = AGENT_CURRICULUM_L3
|
|
145
|
+
else:
|
|
146
|
+
if level == "l2":
|
|
147
|
+
ALL_COMPOSITIONAL_TASKS_CATEGORIES = HUMAN_CURRICULUM_L2
|
|
148
|
+
else:
|
|
149
|
+
ALL_COMPOSITIONAL_TASKS_CATEGORIES = HUMAN_CURRICULUM_L3
|
|
150
|
+
|
|
151
|
+
for category, items in ALL_COMPOSITIONAL_TASKS_CATEGORIES.items():
|
|
152
|
+
if filter_category and category != filter_category:
|
|
153
|
+
continue
|
|
154
|
+
for curr_seed in rng.randint(0, 1000, items["num_seeds"]):
|
|
155
|
+
random_gen = np.random.RandomState(curr_seed)
|
|
156
|
+
for task_set, count in zip(items["buckets"], items["weights"]):
|
|
157
|
+
tasks = random_gen.choice(task_set, count, replace=False)
|
|
158
|
+
for task in tasks:
|
|
159
|
+
all_task_tuples.append((task, int(curr_seed)))
|
|
160
|
+
|
|
161
|
+
return all_task_tuples
|
|
@@ -3,9 +3,9 @@ import logging
|
|
|
3
3
|
import numpy as np
|
|
4
4
|
import playwright.sync_api
|
|
5
5
|
import re
|
|
6
|
-
import tenacity
|
|
7
6
|
|
|
8
7
|
from abc import ABC, abstractmethod
|
|
8
|
+
from tenacity import retry, stop_after_attempt, wait_fixed
|
|
9
9
|
from typing import List, Tuple
|
|
10
10
|
from urllib import parse
|
|
11
11
|
|
|
@@ -179,6 +179,8 @@ class DashboardRetrievalTask(AbstractServiceNowTask, ABC):
|
|
|
179
179
|
|
|
180
180
|
return type, data
|
|
181
181
|
|
|
182
|
+
# retry because sometimes the page is not fully loaded
|
|
183
|
+
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
|
|
182
184
|
def _get_chart_by_title(
|
|
183
185
|
self, page: playwright.sync_api.Page, title: str = None
|
|
184
186
|
) -> Tuple[str, dict]:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: browsergym-workarena
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.4.1
|
|
4
4
|
Summary: WorkArena benchmark for BrowserGym
|
|
5
5
|
Project-URL: homepage, https://github.com/ServiceNow/WorkArena
|
|
6
6
|
Author: Léo Boisvert, Alex Drouin, Maxime Gasse, Alex Lacoste, Manuel Del Verme, Megh Thakkar
|
|
@@ -22,9 +22,14 @@ Requires-Dist: tenacity>=8.2.3
|
|
|
22
22
|
Requires-Dist: tqdm>=4.66.2
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
|
|
25
|
-
# WorkArena:
|
|
25
|
+
# WorkArena: A Benchmark for Evaluating Agents on Knowledge Work Tasks
|
|
26
|
+
[[Benchmark Contents]](#benchmark-contents) ♦ [[Getting Started]](#getting-started) ♦ [[Live Demo]](#live-demo) ♦ [[BrowserGym]](https://github.com/ServiceNow/BrowserGym) ♦ [[Citing This Work]](#citing-this-work)
|
|
26
27
|
|
|
27
|
-
|
|
28
|
+
### Papers
|
|
29
|
+
* [ICML 2024] WorkArena: How Capable are Web Agents at Solving Common Knowledge Work Tasks? [[Paper]](https://arxiv.org/abs/2403.07718)
|
|
30
|
+
|
|
31
|
+
* WorkArena++: Towards Compositional Planning and Reasoning-based Common Knowledge Work Tasks [[Paper]](https://arxiv.org/abs/2407.05291)
|
|
32
|
+
|
|
28
33
|
|
|
29
34
|
`WorkArena` is a suite of browser-based tasks tailored to gauge web agents' effectiveness in supporting routine tasks for knowledge workers.
|
|
30
35
|
By harnessing the ubiquitous [ServiceNow](https://www.servicenow.com/what-is-servicenow.html) platform, this benchmark will be instrumental in assessing the widespread state of such automations in modern knowledge work environments.
|
|
@@ -34,9 +39,47 @@ WorkArena is included in [BrowserGym](https://github.com/ServiceNow/BrowserGym),
|
|
|
34
39
|
|
|
35
40
|
https://github.com/ServiceNow/WorkArena/assets/2374980/68640f09-7d6f-4eb1-b556-c294a6afef70
|
|
36
41
|
|
|
42
|
+
## Getting Started
|
|
43
|
+
|
|
44
|
+
To set up WorkArena, you will need to get your own ServiceNow instance, install our Python package, and upload some data to your instance. Follow the steps below to achieve this.
|
|
45
|
+
|
|
46
|
+
### a) Create a ServiceNow Developer Instance
|
|
47
|
+
|
|
48
|
+
1. Go to https://developer.servicenow.com/ and create an account.
|
|
49
|
+
2. Click on `Request an instance` and select the `Washington` release (initializing the instance will take a few minutes)
|
|
50
|
+
3. Once the instance is ready, you should see your instance URL and credentials. If not, click _Return to the Developer Portal_, then navigate to _Manage instance password_ and click _Reset instance password_.
|
|
51
|
+
4. You should now see your URL and credentials. Based on this information, set the following environment variables:
|
|
52
|
+
* `SNOW_INSTANCE_URL`: The URL of your ServiceNow developer instance
|
|
53
|
+
* `SNOW_INSTANCE_UNAME`: The username, should be "admin"
|
|
54
|
+
* `SNOW_INSTANCE_PWD`: The password, make sure you place the value in single quotes '' and be mindful of [escaping special shell characters](https://onlinelinuxtools.com/escape-shell-characters). Running `echo $SNOW_INSTANCE_PWD` should print the correct password.
|
|
55
|
+
5. Log into your instance via a browser using the admin credentials. Close any popup that appears on the main screen (e.g., agreeing to analytics).
|
|
56
|
+
|
|
57
|
+
**Warning:** Feel free to look around the platform, but please make sure you revert any changes (e.g., changes to list views, pinning some menus, etc.) as these changes will be persistent and affect the benchmarking process.
|
|
58
|
+
|
|
59
|
+
### b) Install WorkArena and Initialize your Instance
|
|
60
|
+
|
|
61
|
+
Run the following command to install WorkArena in the [BrowserGym](https://github.com/servicenow/browsergym) environment:
|
|
62
|
+
```
|
|
63
|
+
pip install browsergym
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Then, install [Playwright](https://github.com/microsoft/playwright):
|
|
67
|
+
```
|
|
68
|
+
playwright install
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Finally, run this command in a terminal to upload the benchmark data to your ServiceNow instance:
|
|
72
|
+
```
|
|
73
|
+
workarena-install
|
|
74
|
+
```
|
|
75
|
+
Your installation is now complete! 🎉
|
|
76
|
+
|
|
77
|
+
|
|
37
78
|
## Benchmark Contents
|
|
38
79
|
|
|
39
|
-
At the moment, WorkArena includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface
|
|
80
|
+
At the moment, WorkArena-L1 includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface, otherwise referred to as "atomic" tasks. WorkArena++ contains 682 tasks, each one sampling among thousands of potential configurations. WorkArena++ uses the atomic components presented in WorkArena, and composes them into real-world use cases evaluating planning, reasoning, and memorizing abilities of agents.
|
|
81
|
+
|
|
82
|
+
The following videos show an agent built on `GPT-4-vision` interacting with every atomic component of the benchmark. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
|
|
40
83
|
|
|
41
84
|
### Knowledge Bases
|
|
42
85
|
|
|
@@ -80,7 +123,6 @@ https://github.com/ServiceNow/WorkArena/assets/1726818/ca26dfaf-2358-4418-855f-8
|
|
|
80
123
|
|
|
81
124
|
https://github.com/ServiceNow/WorkArena/assets/1726818/0023232c-081f-4be4-99bd-f60c766e6c3f
|
|
82
125
|
|
|
83
|
-
|
|
84
126
|
## Getting Started
|
|
85
127
|
|
|
86
128
|
To set up WorkArena, you will need to get your own ServiceNow instance, install our Python package, and upload some data to your instance. Follow the steps below to achieve this.
|
|
@@ -93,7 +135,7 @@ To setup WorkArena, you will need to get your own ServiceNow instance, install o
|
|
|
93
135
|
4. You should now see your URL and credentials. Based on this information, set the following environment variables:
|
|
94
136
|
* `SNOW_INSTANCE_URL`: The URL of your ServiceNow developer instance
|
|
95
137
|
* `SNOW_INSTANCE_UNAME`: The username, should be "admin"
|
|
96
|
-
* `SNOW_INSTANCE_PWD`: The password, make sure you place the value in quotes
|
|
138
|
+
* `SNOW_INSTANCE_PWD`: The password, make sure you place the value in single quotes '' and be mindful of [escaping special shell characters](https://onlinelinuxtools.com/escape-shell-characters). Running `echo $SNOW_INSTANCE_PWD` should print the correct password.
|
|
97
139
|
5. Log into your instance via a browser using the admin credentials. Close any popup that appears on the main screen (e.g., agreeing to analytics).
|
|
98
140
|
|
|
99
141
|
**Warning:** Feel free to look around the platform, but please make sure you revert any changes (e.g., changes to list views, pinning some menus, etc.) as these changes will be persistent and affect the benchmarking process.
|
|
@@ -105,25 +147,24 @@ Run the following command to install WorkArena in the [BrowswerGym](https://gith
|
|
|
105
147
|
pip install browsergym-workarena
|
|
106
148
|
```
|
|
107
149
|
|
|
108
|
-
Then,
|
|
150
|
+
Then, install [Playwright](https://github.com/microsoft/playwright):
|
|
109
151
|
```
|
|
110
|
-
|
|
152
|
+
playwright install
|
|
111
153
|
```
|
|
112
154
|
|
|
113
|
-
Finally,
|
|
155
|
+
Finally, run this command in a terminal to upload the benchmark data to your ServiceNow instance:
|
|
114
156
|
```
|
|
115
|
-
|
|
157
|
+
workarena-install
|
|
116
158
|
```
|
|
117
|
-
|
|
118
159
|
Your installation is now complete! 🎉
|
|
119
160
|
|
|
120
|
-
|
|
121
161
|
## Live Demo
|
|
122
162
|
|
|
123
163
|
Run this code to see WorkArena in action.
|
|
124
164
|
|
|
125
165
|
Note: the following example executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
|
|
126
166
|
|
|
167
|
+
- To run a demo of WorkArena-L1 (ICML 2024) tasks using BrowserGym, use the following script:
|
|
127
168
|
```python
|
|
128
169
|
import random
|
|
129
170
|
|
|
@@ -165,9 +206,55 @@ for task in ALL_WORKARENA_TASKS:
|
|
|
165
206
|
```
|
|
166
207
|
|
|
167
208
|
|
|
209
|
+
|
|
210
|
+
- To run a demo of WorkArena-L2 (WorkArena++) tasks using BrowserGym, use the following script. Change the filter on line 6 to `l3` to sample L3 tasks.
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
import random
|
|
214
|
+
|
|
215
|
+
from browsergym.core.env import BrowserEnv
|
|
216
|
+
from browsergym.workarena import get_all_tasks_agents
|
|
217
|
+
|
|
218
|
+
AGENT_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2")
|
|
219
|
+
|
|
220
|
+
AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS = [sampled_set[0] for sampled_set in AGENT_L2_SAMPLED_SET], [
|
|
221
|
+
sampled_set[1] for sampled_set in AGENT_L2_SAMPLED_SET
|
|
222
|
+
]
|
|
223
|
+
from time import sleep
|
|
224
|
+
|
|
225
|
+
for (task, seed) in zip(AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS):
|
|
226
|
+
print("Task:", task)
|
|
227
|
+
|
|
228
|
+
# Instantiate a new environment
|
|
229
|
+
env = BrowserEnv(task_entrypoint=task,
|
|
230
|
+
headless=False)
|
|
231
|
+
env.reset()
|
|
232
|
+
|
|
233
|
+
# Cheat functions use Playwright to automatically solve the task
|
|
234
|
+
env.chat.add_message(role="assistant", msg="On it. Please wait...")
|
|
235
|
+
|
|
236
|
+
for i in range(len(env.task)):
|
|
237
|
+
sleep(1)
|
|
238
|
+
env.task.cheat(page=env.page, chat_messages=env.chat.messages, subtask_idx=i)
|
|
239
|
+
sleep(1)
|
|
240
|
+
reward, done, message, info = env.task.validate(page=env.page, chat_messages=env.chat.messages)
|
|
241
|
+
|
|
242
|
+
if reward == 1:
|
|
243
|
+
env.chat.add_message(role="user", msg="Yes, that works. Thanks!")
|
|
244
|
+
else:
|
|
245
|
+
env.chat.add_message(role="user", msg=f"No, that doesn't work. {info.get('message', '')}")
|
|
246
|
+
|
|
247
|
+
sleep(3)
|
|
248
|
+
env.close()
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
Note: the preceding example executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
|
|
252
|
+
|
|
168
253
|
## Citing This Work
|
|
169
254
|
|
|
170
255
|
Please use the following BibTeX to cite our work:
|
|
256
|
+
|
|
257
|
+
### WorkArena
|
|
171
258
|
```
|
|
172
259
|
@misc{workarena2024,
|
|
173
260
|
title={WorkArena: How Capable Are Web Agents at Solving Common Knowledge Work Tasks?},
|
|
@@ -178,3 +265,15 @@ Please use the following BibTeX to cite our work:
|
|
|
178
265
|
primaryClass={cs.LG}
|
|
179
266
|
}
|
|
180
267
|
```
|
|
268
|
+
### WorkArena++
|
|
269
|
+
```
|
|
270
|
+
@misc{boisvert2024workarenacompositionalplanningreasoningbased,
|
|
271
|
+
title={WorkArena++: Towards Compositional Planning and Reasoning-based Common Knowledge Work Tasks},
|
|
272
|
+
author={Léo Boisvert and Megh Thakkar and Maxime Gasse and Massimo Caccia and Thibault Le Sellier De Chezelles and Quentin Cappart and Nicolas Chapados and Alexandre Lacoste and Alexandre Drouin},
|
|
273
|
+
year={2024},
|
|
274
|
+
eprint={2407.05291},
|
|
275
|
+
archivePrefix={arXiv},
|
|
276
|
+
primaryClass={cs.AI},
|
|
277
|
+
url={https://arxiv.org/abs/2407.05291},
|
|
278
|
+
}
|
|
279
|
+
```
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
browsergym/workarena/__init__.py,sha256=
|
|
1
|
+
browsergym/workarena/__init__.py,sha256=ocdVJcRZysM8quznRst33KAV39ubpZuvVgjjwQXmKtw,6289
|
|
2
2
|
browsergym/workarena/config.py,sha256=tblmOUpqSoL3qlQHK_TFEDSFbC3o2kuRP_GFpoTNsX4,8522
|
|
3
3
|
browsergym/workarena/install.py,sha256=UaPE1K70xJB-2Gr1P5rJbcolkwMeWyRt04F7_5gpR4E,39341
|
|
4
4
|
browsergym/workarena/instance.py,sha256=Qw4lzHhgnl8IuiWOelsmzCJce3jXYivYYwtfTPt2H-s,4314
|
|
@@ -76,7 +76,7 @@ browsergym/workarena/human_eval/tool.py,sha256=SwPqArNnvEeOPLRgem6kwl8ho345o-1f3
|
|
|
76
76
|
browsergym/workarena/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
77
77
|
browsergym/workarena/tasks/base.py,sha256=Ikh_A5I9_9acHFQCcnVMEnlBg3u3QHQD2I_NbGvD6SE,6411
|
|
78
78
|
browsergym/workarena/tasks/comp_building_block.py,sha256=Lg3KbAWrxzAHe5XbPN6L8bvdu7mfJpmBvI7jXeSDwKE,194
|
|
79
|
-
browsergym/workarena/tasks/dashboard.py,sha256=
|
|
79
|
+
browsergym/workarena/tasks/dashboard.py,sha256=HDGygBVtUM88lWKkUjyd43JvqmGUOPjmGfmRPkTJruE,34199
|
|
80
80
|
browsergym/workarena/tasks/form.py,sha256=_s07yZ-zcZbi5v6VK6km1BPzUfIFfMEVWFm56QhoznM,64141
|
|
81
81
|
browsergym/workarena/tasks/knowledge.py,sha256=kANjlC7DpptMbRlUlZGdDjqZeWIwwyJzozV58qEA6KU,13751
|
|
82
82
|
browsergym/workarena/tasks/list.py,sha256=4Ov7fHD4smr_L_EB9og7j7pWTQ2zKAI8LWRrr-7ryiA,53389
|
|
@@ -131,8 +131,8 @@ browsergym/workarena/tasks/utils/js_utils.js,sha256=n97fmY2Jkr59rEcQSuSbCnn1L2ZN
|
|
|
131
131
|
browsergym/workarena/tasks/utils/private_tasks.py,sha256=r7Z9SnBMuZdZ2i-tK6eULj0q8hclANXFSzdLl49KYHI,2128
|
|
132
132
|
browsergym/workarena/tasks/utils/string.py,sha256=ir5_ASD9QSFMZ9kuHo2snSXRuSfv_wROH6nxBLOTP4I,330
|
|
133
133
|
browsergym/workarena/tasks/utils/utils.py,sha256=xQD-njEwgN7qxfn1dLBN8MYfd3kl3TuVfpmI1yxML9k,955
|
|
134
|
-
browsergym_workarena-0.
|
|
135
|
-
browsergym_workarena-0.
|
|
136
|
-
browsergym_workarena-0.
|
|
137
|
-
browsergym_workarena-0.
|
|
138
|
-
browsergym_workarena-0.
|
|
134
|
+
browsergym_workarena-0.4.1.dist-info/METADATA,sha256=dboAv2_pwEwNrxbHQKrgKHnG2oxLHq_iB5qO5oAeUms,12498
|
|
135
|
+
browsergym_workarena-0.4.1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
136
|
+
browsergym_workarena-0.4.1.dist-info/entry_points.txt,sha256=1lCeAbQFCcU6UTFwS5QIA3TKhT2P9ZabaZKT7sIShKc,137
|
|
137
|
+
browsergym_workarena-0.4.1.dist-info/licenses/LICENSE,sha256=sZLFiZHo_1hcxXRhXUDnQYVATUuWwRCdQjBxqxNnNEs,579
|
|
138
|
+
browsergym_workarena-0.4.1.dist-info/RECORD,,
|
{browsergym_workarena-0.3.1.dist-info → browsergym_workarena-0.4.1.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{browsergym_workarena-0.3.1.dist-info → browsergym_workarena-0.4.1.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|