PyPI - browsergym-workarena - Versions diffs - 0.3.1__py3-none-any.whl → 0.4.1__py3-none-any.whl - Mend

browsergym-workarena 0.3.1__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

browsergym/workarena/__init__.py CHANGED Viewed

@@ -1,11 +1,23 @@
-__version__ = "0.3.1"
+__version__ = "0.4.1"
 import inspect
-import numpy as np
+from logging import warning
+import numpy as np
 from browsergym.core.registration import register_task
 from .tasks.comp_building_block import CompositionalBuildingBlockTask
+from .tasks.compositional import (
+    AGENT_CURRICULUM_L2,
+    AGENT_CURRICULUM_L3,
+    ALL_COMPOSITIONAL_TASKS,
+    ALL_COMPOSITIONAL_TASKS_L2,
+    ALL_COMPOSITIONAL_TASKS_L3,
+    HUMAN_CURRICULUM_L2,
+    HUMAN_CURRICULUM_L3,
+)
+from .tasks.compositional.base import CompositionalTask, HumanEvalTask
+from .tasks.compositional.update_task import __TASKS__ as UPDATE_TASKS
 from .tasks.dashboard import __TASKS__ as DASHBOARD_TASKS
 from .tasks.form import __TASKS__ as FORM_TASKS
 from .tasks.knowledge import __TASKS__ as KB_TASKS
@@ -15,12 +27,15 @@ from .tasks.service_catalog import __TASKS__ as SERVICE_CATALOG_TASKS
 from .tasks.compositional.base import CompositionalTask
 ALL_WORKARENA_TASKS = [
+    *ALL_COMPOSITIONAL_TASKS_L2,
+    *ALL_COMPOSITIONAL_TASKS_L3,
     *DASHBOARD_TASKS,
     *FORM_TASKS,
     *KB_TASKS,
     *LIST_TASKS,
     *NAVIGATION_TASKS,
     *SERVICE_CATALOG_TASKS,
+    *UPDATE_TASKS,
 ]
 ATOMIC_TASKS = [
     task
@@ -30,9 +45,117 @@ ATOMIC_TASKS = [
     and not issubclass(task, CompositionalBuildingBlockTask)
 ]
 # register the WorkArena benchmark
 for task in ALL_WORKARENA_TASKS:
     register_task(
         task.get_task_id(),
         task,
     )
+# Task ids of every WorkArena task, and of the atomic tasks only.
+workarena_tasks_all = [cls.get_task_id() for cls in ALL_WORKARENA_TASKS]
+workarena_tasks_atomic = [cls.get_task_id() for cls in ATOMIC_TASKS]
+# Category name for each atomic (L1) task id. Atomic task ids missing from this
+# map only trigger a warning and are left out of workarena_task_categories.
+TASK_CATEGORY_MAP = {
+    "workarena.servicenow.all-menu": "menu",
+    "workarena.servicenow.create-change-request": "form",
+    "workarena.servicenow.create-hardware-asset": "form",
+    "workarena.servicenow.create-incident": "form",
+    "workarena.servicenow.create-problem": "form",
+    "workarena.servicenow.create-user": "form",
+    "workarena.servicenow.filter-asset-list": "list-filter",
+    "workarena.servicenow.filter-change-request-list": "list-filter",
+    "workarena.servicenow.filter-hardware-list": "list-filter",
+    "workarena.servicenow.filter-incident-list": "list-filter",
+    "workarena.servicenow.filter-service-catalog-item-list": "list-filter",
+    "workarena.servicenow.filter-user-list": "list-filter",
+    "workarena.servicenow.impersonation": "menu",
+    "workarena.servicenow.knowledge-base-search": "knowledge",
+    "workarena.servicenow.order-apple-mac-book-pro15": "service catalog",
+    "workarena.servicenow.order-apple-watch": "service catalog",
+    "workarena.servicenow.order-developer-laptop": "service catalog",
+    "workarena.servicenow.order-development-laptop-p-c": "service catalog",
+    "workarena.servicenow.order-ipad-mini": "service catalog",
+    "workarena.servicenow.order-ipad-pro": "service catalog",
+    "workarena.servicenow.order-loaner-laptop": "service catalog",
+    "workarena.servicenow.order-sales-laptop": "service catalog",
+    "workarena.servicenow.order-standard-laptop": "service catalog",
+    "workarena.servicenow.sort-asset-list": "list-sort",
+    "workarena.servicenow.sort-change-request-list": "list-sort",
+    "workarena.servicenow.sort-hardware-list": "list-sort",
+    "workarena.servicenow.sort-incident-list": "list-sort",
+    "workarena.servicenow.sort-service-catalog-item-list": "list-sort",
+    "workarena.servicenow.sort-user-list": "list-sort",
+    "workarena.servicenow.multi-chart-min-max-retrieval": "dashboard",
+    "workarena.servicenow.multi-chart-value-retrieval": "dashboard",
+    "workarena.servicenow.single-chart-value-retrieval": "dashboard",
+    "workarena.servicenow.single-chart-min-max-retrieval": "dashboard",
+}
+# The L1 task ids are exactly the ids that have a category in TASK_CATEGORY_MAP.
+workarena_tasks_l1 = list(TASK_CATEGORY_MAP)
+# Group the atomic task ids by category: category name -> list of task ids.
+workarena_task_categories = {}
+for task in workarena_tasks_atomic:
+    if task in TASK_CATEGORY_MAP:
+        cat = TASK_CATEGORY_MAP[task]
+        workarena_task_categories.setdefault(cat, []).append(task)
+    else:
+        # Not fatal: the task is only left out of the per-category grouping.
+        warning(f"Atomic task {task} not found in TASK_CATEGORY_MAP")
+def get_task_category(task_name):
+    """Return ``(benchmark, category)`` for a task id.
+
+    The benchmark is the text before the first "." of ``task_name``. The category
+    is looked up in TASK_CATEGORY_MAP and is None when the id is not listed there.
+    """
+    benchmark, _, _ = task_name.partition(".")
+    category = TASK_CATEGORY_MAP.get(task_name)
+    return benchmark, category
+def get_all_tasks_agents(filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curriculum=True):
+    """Sample the (task, seed) pairs that make up a WorkArena curriculum.
+
+    Args:
+        filter: "l1", "l2" or "l3", optionally followed by "." and the name of a
+            curriculum category (e.g. "l2.<category>") to keep only that category.
+            Categories cannot be combined with "l1".
+        meta_seed: Seed of the generator from which all task seeds are drawn.
+        n_seed_l1: Number of seeds drawn for each atomic task when the level is "l1".
+        is_agent_curriculum: Use the agent curricula when True, the human ones otherwise.
+
+    Returns:
+        A list of (task, seed) tuples, where seed is a plain int.
+
+    Raises:
+        Exception: If the filter has more than two parts, names an unknown level or
+            category, or combines "l1" with a category.
+    """
+    parts = filter.split(".")
+    if len(parts) > 2:
+        raise Exception("Unsupported filter used.")
+    level = parts[0]
+    filter_category = parts[1] if len(parts) == 2 else None
+    # Validate the level for one- and two-part filters alike; an unknown level used
+    # to fall through to the L3 curriculum when a category was given.
+    if level not in ("l1", "l2", "l3"):
+        raise Exception("Unsupported category of tasks.")
+    if filter_category is not None:
+        if level == "l1":
+            raise Exception("Unsupported filter used.")
+        if filter_category not in AGENT_CURRICULUM_L2:
+            raise Exception("Unsupported category of tasks.")
+    # Create the generator on every path: it was previously only created for
+    # one-part filters, so "l2.<category>" failed with a NameError below.
+    rng = np.random.RandomState(meta_seed)
+    if level == "l1":
+        # Fresh seeds are drawn for each atomic task, in task order.
+        return [
+            (task, int(seed))
+            for task in ATOMIC_TASKS
+            for seed in rng.randint(0, 1000, n_seed_l1)
+        ]
+    if is_agent_curriculum:
+        curriculum = AGENT_CURRICULUM_L2 if level == "l2" else AGENT_CURRICULUM_L3
+    else:
+        curriculum = HUMAN_CURRICULUM_L2 if level == "l2" else HUMAN_CURRICULUM_L3
+    all_task_tuples = []
+    for category, items in curriculum.items():
+        if filter_category and category != filter_category:
+            continue
+        for curr_seed in rng.randint(0, 1000, items["num_seeds"]):
+            random_gen = np.random.RandomState(curr_seed)
+            # Each "weights" entry is the number of tasks drawn, without
+            # replacement, from the bucket at the same position.
+            for task_set, count in zip(items["buckets"], items["weights"]):
+                for task in random_gen.choice(task_set, count, replace=False):
+                    all_task_tuples.append((task, int(curr_seed)))
+    return all_task_tuples

browsergym/workarena/tasks/dashboard.py CHANGED Viewed

@@ -3,9 +3,9 @@ import logging
 import numpy as np
 import playwright.sync_api
 import re
-import tenacity
 from abc import ABC, abstractmethod
+from tenacity import retry, stop_after_attempt, wait_fixed
 from typing import List, Tuple
 from urllib import parse
@@ -179,6 +179,8 @@ class DashboardRetrievalTask(AbstractServiceNowTask, ABC):
         return type, data
+    # retry because sometimes the page is not fully loaded
+    @retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
     def _get_chart_by_title(
         self, page: playwright.sync_api.Page, title: str = None
     ) -> Tuple[str, dict]:

{browsergym_workarena-0.3.1.dist-info → browsergym_workarena-0.4.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: browsergym-workarena
-Version: 0.3.1
+Version: 0.4.1
 Summary: WorkArena benchmark for BrowserGym
 Project-URL: homepage, https://github.com/ServiceNow/WorkArena
 Author: Léo Boisvert, Alex Drouin, Maxime Gasse, Alex Lacoste, Manuel Del Verme, Megh Thakkar
@@ -22,9 +22,14 @@ Requires-Dist: tenacity>=8.2.3
 Requires-Dist: tqdm>=4.66.2
 Description-Content-Type: text/markdown
-# WorkArena: How Capable are Web Agents at Solving Common Knowledge Work Tasks?
+# WorkArena: A Benchmark for Evaluating Agents on Knowledge Work Tasks
+[[Benchmark Contents]](#benchmark-contents) ♦ [[Getting Started]](#getting-started) ♦ [[Live Demo]](#live-demo) ♦ [[BrowserGym]](https://github.com/ServiceNow/BrowserGym) ♦ [[Citing This Work]](#citing-this-work)
-[[Paper]](https://arxiv.org/abs/2403.07718) ♦ [[Benchmark Contents]](#benchmark-contents) ♦ [[Getting Started]](#getting-started) ♦ [[Live Demo]](#live-demo) ♦ [[BrowserGym]](https://github.com/ServiceNow/BrowserGym) ♦ [[Citing This Work]](#citing-this-work)
+### Papers
+*  [ICML 2024] WorkArena: How Capable are Web Agents at Solving Common Knowledge Work Tasks? [[Paper]](https://arxiv.org/abs/2403.07718)
+*  WorkArena++: Towards Compositional Planning and Reasoning-based Common Knowledge Work Tasks [[Paper]](https://arxiv.org/abs/2407.05291)
 `WorkArena` is a suite of browser-based tasks tailored to gauge web agents' effectiveness in supporting routine tasks for knowledge workers.
 By harnessing the ubiquitous [ServiceNow](https://www.servicenow.com/what-is-servicenow.html) platform, this benchmark will be instrumental in assessing the widespread state of such automations in modern knowledge work environments.
@@ -34,9 +39,47 @@ WorkArena is included in [BrowserGym](https://github.com/ServiceNow/BrowserGym),
 https://github.com/ServiceNow/WorkArena/assets/2374980/68640f09-7d6f-4eb1-b556-c294a6afef70
+## Getting Started
+To set up WorkArena, you will need to get your own ServiceNow instance, install our Python package, and upload some data to your instance. Follow the steps below to achieve this.
+### a) Create a ServiceNow Developer Instance
+1. Go to https://developer.servicenow.com/ and create an account.
+2. Click on `Request an instance` and select the `Washington` release (initializing the instance will take a few minutes)
+3. Once the instance is ready, you should see your instance URL and credentials. If not, click _Return to the Developer Portal_, then navigate to _Manage instance password_ and click _Reset instance password_.
+4. You should now see your URL and credentials. Based on this information, set the following environment variables:
+    * `SNOW_INSTANCE_URL`: The URL of your ServiceNow developer instance
+    * `SNOW_INSTANCE_UNAME`: The username, should be "admin"
+    * `SNOW_INSTANCE_PWD`: The password, make sure you place the value in quotes "" and be mindful of [escaping special shell characters](https://onlinelinuxtools.com/escape-shell-characters). Running `echo $SNOW_INSTANCE_PWD` should print the correct password.
+5. Log into your instance via a browser using the admin credentials. Close any popup that appears on the main screen (e.g., agreeing to analytics).
+**Warning:** Feel free to look around the platform, but please make sure you revert any changes (e.g., changes to list views, pinning some menus, etc.) as these changes will be persistent and affect the benchmarking process.
+### b) Install WorkArena and Initialize your Instance
+Run the following command to install WorkArena in the [BrowserGym](https://github.com/servicenow/browsergym) environment:
+```
+pip install browsergym
+```
+Then, install [Playwright](https://github.com/microsoft/playwright):
+```
+playwright install
+```
+Finally, run this command in a terminal to upload the benchmark data to your ServiceNow instance:
+```
+workarena-install
+```
+Your installation is now complete! 🎉
 ## Benchmark Contents
-At the moment, WorkArena includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface. The following videos show an agent built on `GPT-4-vision` interacting with every such component. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
+At the moment, WorkArena-L1 includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface, otherwise referred to as "atomic" tasks. WorkArena++ contains 682 tasks, each one sampling among thousands of potential configurations. WorkArena++ uses the atomic components presented in WorkArena, and composes them into real-world use cases evaluating planning, reasoning, and memorizing abilities of agents.
+The following videos show an agent built on `GPT-4-vision` interacting with every atomic component of the benchmark. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
 ### Knowledge Bases
@@ -80,7 +123,6 @@ https://github.com/ServiceNow/WorkArena/assets/1726818/ca26dfaf-2358-4418-855f-8
 https://github.com/ServiceNow/WorkArena/assets/1726818/0023232c-081f-4be4-99bd-f60c766e6c3f
 ## Getting Started
 To set up WorkArena, you will need to get your own ServiceNow instance, install our Python package, and upload some data to your instance. Follow the steps below to achieve this.
@@ -93,7 +135,7 @@ To setup WorkArena, you will need to get your own ServiceNow instance, install o
 4. You should now see your URL and credentials. Based on this information, set the following environment variables:
     * `SNOW_INSTANCE_URL`: The URL of your ServiceNow developer instance
     * `SNOW_INSTANCE_UNAME`: The username, should be "admin"
-    * `SNOW_INSTANCE_PWD`: The password, make sure you place the value in quotes "" and be mindful of [escaping special shell characters](https://onlinelinuxtools.com/escape-shell-characters). Running `echo $SNOW_INSTANCE_PWD` should print the correct password.
+    * `SNOW_INSTANCE_PWD`: The password, make sure you place the value in single quotes '' and be mindful of [escaping special shell characters](https://onlinelinuxtools.com/escape-shell-characters). Running `echo $SNOW_INSTANCE_PWD` should print the correct password.
 5. Log into your instance via a browser using the admin credentials. Close any popup that appears on the main screen (e.g., agreeing to analytics).
 **Warning:** Feel free to look around the platform, but please make sure you revert any changes (e.g., changes to list views, pinning some menus, etc.) as these changes will be persistent and affect the benchmarking process.
@@ -105,25 +147,24 @@ Run the following command to install WorkArena in the [BrowswerGym](https://gith
 pip install browsergym-workarena
 ```
-Then, run this command in a terminal to upload the benchmark data to your ServiceNow instance:
+Then, install [Playwright](https://github.com/microsoft/playwright):
 ```
-workarena-install
+playwright install
 ```
-Finally, install [Playwright](https://github.com/microsoft/playwright):
+Finally, run this command in a terminal to upload the benchmark data to your ServiceNow instance:
 ```
-playwright install
+workarena-install
 ```
 Your installation is now complete! 🎉
 ## Live Demo
 Run this code to see WorkArena in action.
 Note: the following example executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
+- To run a demo of WorkArena-L1 (ICML 2024) tasks using BrowserGym, use the following script:
 ```python
 import random
@@ -165,9 +206,55 @@ for task in ALL_WORKARENA_TASKS:
 ```
+- To run a demo of WorkArena-L2 (WorkArena++) tasks using BrowserGym, use the following script. Change the filter on line 6 to `l3` to sample L3 tasks.
+```python
+import random
+from browsergym.core.env import BrowserEnv
+from browsergym.workarena import get_all_tasks_agents
+AGENT_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2")
+AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS = [sampled_set[0] for sampled_set in AGENT_L2_SAMPLED_SET], [
+    sampled_set[1] for sampled_set in AGENT_L2_SAMPLED_SET
+]
+from time import sleep
+for (task, seed) in zip(AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS):
+    print("Task:", task)
+    # Instantiate a new environment
+    env = BrowserEnv(task_entrypoint=task,
+                    headless=False)
+    env.reset()
+    # Cheat functions use Playwright to automatically solve the task
+    env.chat.add_message(role="assistant", msg="On it. Please wait...")
+    for i in range(len(env.task)):
+        sleep(1)
+        env.task.cheat(page=env.page, chat_messages=env.chat.messages, subtask_idx=i)
+        sleep(1)
+        reward, done, message, info = env.task.validate(page=env.page, chat_messages=env.chat.messages)
+    if reward == 1:
+        env.chat.add_message(role="user", msg="Yes, that works. Thanks!")
+    else:
+        env.chat.add_message(role="user", msg=f"No, that doesn't work. {info.get('message', '')}")
+    sleep(3)
+    env.close()
+```
+Note: the example above executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
 ## Citing This Work
 Please use the following BibTeX to cite our work:
+### WorkArena
 ```
 @misc{workarena2024,
       title={WorkArena: How Capable Are Web Agents at Solving Common Knowledge Work Tasks?},
@@ -178,3 +265,15 @@ Please use the following BibTeX to cite our work:
       primaryClass={cs.LG}
 }
 ```
+### WorkArena++
+```
+@misc{boisvert2024workarenacompositionalplanningreasoningbased,
+      title={WorkArena++: Towards Compositional Planning and Reasoning-based Common Knowledge Work Tasks},
+      author={Léo Boisvert and Megh Thakkar and Maxime Gasse and Massimo Caccia and Thibault Le Sellier De Chezelles and Quentin Cappart and Nicolas Chapados and Alexandre Lacoste and Alexandre Drouin},
+      year={2024},
+      eprint={2407.05291},
+      archivePrefix={arXiv},
+      primaryClass={cs.AI},
+      url={https://arxiv.org/abs/2407.05291},
+}
+```

{browsergym_workarena-0.3.1.dist-info → browsergym_workarena-0.4.1.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-browsergym/workarena/__init__.py,sha256=OP6OKzK3jaEKhZaa5dDvs1R_rWOdk2cziV9zjIaFvEc,1062
+browsergym/workarena/__init__.py,sha256=ocdVJcRZysM8quznRst33KAV39ubpZuvVgjjwQXmKtw,6289
 browsergym/workarena/config.py,sha256=tblmOUpqSoL3qlQHK_TFEDSFbC3o2kuRP_GFpoTNsX4,8522
 browsergym/workarena/install.py,sha256=UaPE1K70xJB-2Gr1P5rJbcolkwMeWyRt04F7_5gpR4E,39341
 browsergym/workarena/instance.py,sha256=Qw4lzHhgnl8IuiWOelsmzCJce3jXYivYYwtfTPt2H-s,4314
@@ -76,7 +76,7 @@ browsergym/workarena/human_eval/tool.py,sha256=SwPqArNnvEeOPLRgem6kwl8ho345o-1f3
 browsergym/workarena/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 browsergym/workarena/tasks/base.py,sha256=Ikh_A5I9_9acHFQCcnVMEnlBg3u3QHQD2I_NbGvD6SE,6411
 browsergym/workarena/tasks/comp_building_block.py,sha256=Lg3KbAWrxzAHe5XbPN6L8bvdu7mfJpmBvI7jXeSDwKE,194
-browsergym/workarena/tasks/dashboard.py,sha256=c-C6Cl3YH4SMZr8IRFf3qFKvT-p8t80gub-5K3B9nHE,34038
+browsergym/workarena/tasks/dashboard.py,sha256=HDGygBVtUM88lWKkUjyd43JvqmGUOPjmGfmRPkTJruE,34199
 browsergym/workarena/tasks/form.py,sha256=_s07yZ-zcZbi5v6VK6km1BPzUfIFfMEVWFm56QhoznM,64141
 browsergym/workarena/tasks/knowledge.py,sha256=kANjlC7DpptMbRlUlZGdDjqZeWIwwyJzozV58qEA6KU,13751
 browsergym/workarena/tasks/list.py,sha256=4Ov7fHD4smr_L_EB9og7j7pWTQ2zKAI8LWRrr-7ryiA,53389
@@ -131,8 +131,8 @@ browsergym/workarena/tasks/utils/js_utils.js,sha256=n97fmY2Jkr59rEcQSuSbCnn1L2ZN
 browsergym/workarena/tasks/utils/private_tasks.py,sha256=r7Z9SnBMuZdZ2i-tK6eULj0q8hclANXFSzdLl49KYHI,2128
 browsergym/workarena/tasks/utils/string.py,sha256=ir5_ASD9QSFMZ9kuHo2snSXRuSfv_wROH6nxBLOTP4I,330
 browsergym/workarena/tasks/utils/utils.py,sha256=xQD-njEwgN7qxfn1dLBN8MYfd3kl3TuVfpmI1yxML9k,955
-browsergym_workarena-0.3.1.dist-info/METADATA,sha256=AkQsXf7XNV6KRmQHi4zErlG1ukK0iJoicWnFJytTAbU,7707
-browsergym_workarena-0.3.1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
-browsergym_workarena-0.3.1.dist-info/entry_points.txt,sha256=1lCeAbQFCcU6UTFwS5QIA3TKhT2P9ZabaZKT7sIShKc,137
-browsergym_workarena-0.3.1.dist-info/licenses/LICENSE,sha256=sZLFiZHo_1hcxXRhXUDnQYVATUuWwRCdQjBxqxNnNEs,579
-browsergym_workarena-0.3.1.dist-info/RECORD,,
+browsergym_workarena-0.4.1.dist-info/METADATA,sha256=dboAv2_pwEwNrxbHQKrgKHnG2oxLHq_iB5qO5oAeUms,12498
+browsergym_workarena-0.4.1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+browsergym_workarena-0.4.1.dist-info/entry_points.txt,sha256=1lCeAbQFCcU6UTFwS5QIA3TKhT2P9ZabaZKT7sIShKc,137
+browsergym_workarena-0.4.1.dist-info/licenses/LICENSE,sha256=sZLFiZHo_1hcxXRhXUDnQYVATUuWwRCdQjBxqxNnNEs,579
+browsergym_workarena-0.4.1.dist-info/RECORD,,

{browsergym_workarena-0.3.1.dist-info → browsergym_workarena-0.4.1.dist-info}/WHEEL RENAMED Viewed

@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: hatchling 1.24.2
+Generator: hatchling 1.25.0
 Root-Is-Purelib: true
 Tag: py3-none-any

{browsergym_workarena-0.3.1.dist-info → browsergym_workarena-0.4.1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{browsergym_workarena-0.3.1.dist-info → browsergym_workarena-0.4.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

browsergym-workarena 0.3.1__py3-none-any.whl → 0.4.1__py3-none-any.whl

browsergym-workarena 0.3.1__py3-none-any.whl → 0.4.1__py3-none-any.whl