PyPI - browsergym-workarena - Versions diffs - 0.3.1__tar.gz → 0.3.2__tar.gz - Mend

browsergym-workarena 0.3.1tar.gz → 0.3.2tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (164) hide show

{browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/.github/workflows/unit_tests.yml RENAMED Viewed

@@ -5,6 +5,8 @@ on:
     branches:
       - main
   pull_request:
+  schedule:
+    - cron: '59 23 * * SUN'  # Runs at 23:59 UTC every Sunday
 jobs:
@@ -101,4 +103,32 @@ jobs:
           SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
           SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
           SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
-        run: pytest -n 5 --durations=10 -m 'slow and not pricy' --slowmo 1000 -v tests
+        run: pytest -n 5 --durations=10 -m 'slow and not pricy' --slowmo 1000 -v tests
+  end-to-end-tests:
+    runs-on: ubuntu-latest
+    if: github.event_name == 'schedule'
+    defaults:
+      run:
+        shell: bash -l {0}
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+      - name: Pip install
+        working-directory: ./dev
+        run: pip install -r requirements.txt
+      - name: Pip list
+        run: pip list
+      - name: Install Playwright
+        run: playwright install --with-deps
+      - name: Run E2E Tests
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+        run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests

browsergym_workarena-0.3.2/.gitignore ADDED Viewed

@@ -0,0 +1,196 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+results/
+.vscode
+*.csv
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+venv/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# MacOS
+**/.DS_Store
+.vscode
+allowed_selenium.json
+# Torchtune
+finetuning/torchtune
+# PyLLMD repo for finetuning
+pyllmd_tune/research-pyllmd/
+pyllmd_tune/data/
+datasets/*
+_sandbox.py
+node_modules/
+/test-results/
+/playwright-report/
+/blob-report/
+/playwright/.cache/
+/test-results/
+/playwright-report/
+/blob-report/
+/playwright/.cache/
+results/
+# personal (optimass)
+ICML_deadline/
+mass_utils/
+pyllmd_tune/
+# don't ignore the miniwob_tasks_all.csv file
+!miniwob_tasks_all.csv

{browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: browsergym-workarena
-Version: 0.3.1
+Version: 0.3.2
 Summary: WorkArena benchmark for BrowserGym
 Project-URL: homepage, https://github.com/ServiceNow/WorkArena
 Author: Léo Boisvert, Alex Drouin, Maxime Gasse, Alex Lacoste, Manuel Del Verme, Megh Thakkar
@@ -22,9 +22,14 @@ Requires-Dist: tenacity>=8.2.3
 Requires-Dist: tqdm>=4.66.2
 Description-Content-Type: text/markdown
-# WorkArena: How Capable are Web Agents at Solving Common Knowledge Work Tasks?
+# WorkArena: A Benchmark for Evaluating Agents on Knowledge Work Tasks
+[[Benchmark Contents]](#benchmark-contents) ♦ [[Getting Started]](#getting-started) ♦ [[Live Demo]](#live-demo) ♦ [[BrowserGym]](https://github.com/ServiceNow/BrowserGym) ♦ [[Citing This Work]](#citing-this-work)
-[[Paper]](https://arxiv.org/abs/2403.07718) ♦ [[Benchmark Contents]](#benchmark-contents) ♦ [[Getting Started]](#getting-started) ♦ [[Live Demo]](#live-demo) ♦ [[BrowserGym]](https://github.com/ServiceNow/BrowserGym) ♦ [[Citing This Work]](#citing-this-work)
+### Papers
+*  [ICML 2024] WorkArena: How Capable are Web Agents at Solving Common Knowledge Work Tasks? [[Paper]](https://arxiv.org/abs/2403.07718)
+*  WorkArena++: Towards Compositional Planning and Reasoning-based Common Knowledge Work Tasks [[Paper]](https://arxiv.org/abs/2407.05291)
 `WorkArena` is a suite of browser-based tasks tailored to gauge web agents' effectiveness in supporting routine tasks for knowledge workers.
 By harnessing the ubiquitous [ServiceNow](https://www.servicenow.com/what-is-servicenow.html) platform, this benchmark will be instrumental in assessing the widespread state of such automations in modern knowledge work environments.
@@ -34,9 +39,47 @@ WorkArena is included in [BrowserGym](https://github.com/ServiceNow/BrowserGym),
 https://github.com/ServiceNow/WorkArena/assets/2374980/68640f09-7d6f-4eb1-b556-c294a6afef70
+## Getting Started
+To set up WorkArena, you will need to get your own ServiceNow instance, install our Python package, and upload some data to your instance. Follow the steps below to achieve this.
+### a) Create a ServiceNow Developer Instance
+1. Go to https://developer.servicenow.com/ and create an account.
+2. Click on `Request an instance` and select the `Washington` release (initializing the instance will take a few minutes)
+3. Once the instance is ready, you should see your instance URL and credentials. If not, click _Return to the Developer Portal_, then navigate to _Manage instance password_ and click _Reset instance password_.
+4. You should now see your URL and credentials. Based on this information, set the following environment variables:
+    * `SNOW_INSTANCE_URL`: The URL of your ServiceNow developer instance
+    * `SNOW_INSTANCE_UNAME`: The username, should be "admin"
+    * `SNOW_INSTANCE_PWD`: The password, make sure you place the value in single quotes '' and be mindful of [escaping special shell characters](https://onlinelinuxtools.com/escape-shell-characters). Running `echo $SNOW_INSTANCE_PWD` should print the correct password.
+6. Log into your instance via a browser using the admin credentials. Close any popup that appears on the main screen (e.g., agreeing to analytics).
+**Warning:** Feel free to look around the platform, but please make sure you revert any changes (e.g., changes to list views, pinning some menus, etc.) as these changes will be persistent and affect the benchmarking process.
+### b) Install WorkArena and Initialize your Instance
+Run the following command to install WorkArena in the [BrowserGym](https://github.com/servicenow/browsergym) environment:
+```
+pip install browsergym
+```
+Then, install [Playwright](https://github.com/microsoft/playwright):
+```
+playwright install
+```
+Finally, run this command in a terminal to upload the benchmark data to your ServiceNow instance:
+```
+workarena-install
+```
+Your installation is now complete! 🎉
 ## Benchmark Contents
-At the moment, WorkArena includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface. The following videos show an agent built on `GPT-4-vision` interacting with every such component. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
+At the moment, WorkArena-L1 includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface, otherwise referred to as "atomic" tasks. WorkArena++ contains 682 tasks, each one sampling among thousands of potential configurations. WorkArena++ uses the atomic components presented in WorkArena, and composes them into real-world use cases evaluating planning, reasoning, and memorizing abilities of agents.
+The following videos show an agent built on `GPT-4-vision` interacting with every atomic component of the benchmark. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
 ### Knowledge Bases
@@ -80,7 +123,6 @@ https://github.com/ServiceNow/WorkArena/assets/1726818/ca26dfaf-2358-4418-855f-8
 https://github.com/ServiceNow/WorkArena/assets/1726818/0023232c-081f-4be4-99bd-f60c766e6c3f
 ## Getting Started
 To set up WorkArena, you will need to get your own ServiceNow instance, install our Python package, and upload some data to your instance. Follow the steps below to achieve this.
@@ -93,7 +135,7 @@ To setup WorkArena, you will need to get your own ServiceNow instance, install o
 4. You should now see your URL and credentials. Based on this information, set the following environment variables:
     * `SNOW_INSTANCE_URL`: The URL of your ServiceNow developer instance
     * `SNOW_INSTANCE_UNAME`: The username, should be "admin"
-    * `SNOW_INSTANCE_PWD`: The password, make sure you place the value in quotes "" and be mindful of [escaping special shell characters](https://onlinelinuxtools.com/escape-shell-characters). Running `echo $SNOW_INSTANCE_PWD` should print the correct password.
+    * `SNOW_INSTANCE_PWD`: The password, make sure you place the value in single quotes '' and be mindful of [escaping special shell characters](https://onlinelinuxtools.com/escape-shell-characters). Running `echo $SNOW_INSTANCE_PWD` should print the correct password.
 6. Log into your instance via a browser using the admin credentials. Close any popup that appears on the main screen (e.g., agreeing to analytics).
 **Warning:** Feel free to look around the platform, but please make sure you revert any changes (e.g., changes to list views, pinning some menus, etc.) as these changes will be persistent and affect the benchmarking process.
@@ -105,25 +147,24 @@ Run the following command to install WorkArena in the [BrowswerGym](https://gith
 pip install browsergym-workarena
 ```
-Then, run this command in a terminal to upload the benchmark data to your ServiceNow instance:
+Then, install [Playwright](https://github.com/microsoft/playwright):
 ```
-workarena-install
+playwright install
 ```
-Finally, install [Playwright](https://github.com/microsoft/playwright):
+Finally, run this command in a terminal to upload the benchmark data to your ServiceNow instance:
 ```
-playwright install
+workarena-install
 ```
 Your installation is now complete! 🎉
 ## Live Demo
 Run this code to see WorkArena in action.
 Note: the following example executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
+- To run a demo of WorkArena-L1 (ICML 2024) tasks using BrowserGym, use the following script:
 ```python
 import random
@@ -165,9 +206,55 @@ for task in ALL_WORKARENA_TASKS:
 ```
+- To run a demo of WorkArena-L2 (WorkArena++) tasks using BrowserGym, use the following script. Change the filter on line 6 to `l3` to sample L3 tasks.
+```python
+import random
+from browsergym.core.env import BrowserEnv
+from browsergym.workarena import get_all_tasks_agents
+AGENT_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2")
+AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS = [sampled_set[0] for sampled_set in AGENT_L2_SAMPLED_SET], [
+    sampled_set[1] for sampled_set in AGENT_L2_SAMPLED_SET
+]
+from time import sleep
+for (task, seed) in zip(AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS):
+    print("Task:", task)
+    # Instantiate a new environment
+    env = BrowserEnv(task_entrypoint=task,
+                    headless=False)
+    env.reset()
+    # Cheat functions use Playwright to automatically solve the task
+    env.chat.add_message(role="assistant", msg="On it. Please wait...")
+    for i in range(len(env.task)):
+        sleep(1)
+        env.task.cheat(page=env.page, chat_messages=env.chat.messages, subtask_idx=i)
+        sleep(1)
+        reward, done, message, info = env.task.validate(page=env.page, chat_messages=env.chat.messages)
+    if reward == 1:
+        env.chat.add_message(role="user", msg="Yes, that works. Thanks!")
+    else:
+        env.chat.add_message(role="user", msg=f"No, that doesn't work. {info.get('message', '')}")
+    sleep(3)
+    env.close()
+```
+Note: the example above executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
 ## Citing This Work
 Please use the following BibTeX to cite our work:
+### WorkArena
 ```
 @misc{workarena2024,
       title={WorkArena: How Capable Are Web Agents at Solving Common Knowledge Work Tasks?},
@@ -178,3 +265,15 @@ Please use the following BibTeX to cite our work:
       primaryClass={cs.LG}
 }
 ```
+### WorkArena++
+```
+@misc{boisvert2024workarenacompositionalplanningreasoningbased,
+      title={WorkArena++: Towards Compositional Planning and Reasoning-based Common Knowledge Work Tasks},
+      author={Léo Boisvert and Megh Thakkar and Maxime Gasse and Massimo Caccia and Thibault Le Sellier De Chezelles and Quentin Cappart and Nicolas Chapados and Alexandre Lacoste and Alexandre Drouin},
+      year={2024},
+      eprint={2407.05291},
+      archivePrefix={arXiv},
+      primaryClass={cs.AI},
+      url={https://arxiv.org/abs/2407.05291},
+}
+```

{browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/README.md RENAMED Viewed

@@ -1,6 +1,11 @@
-# WorkArena: How Capable are Web Agents at Solving Common Knowledge Work Tasks?
+# WorkArena: A Benchmark for Evaluating Agents on Knowledge Work Tasks
+[[Benchmark Contents]](#benchmark-contents) ♦ [[Getting Started]](#getting-started) ♦ [[Live Demo]](#live-demo) ♦ [[BrowserGym]](https://github.com/ServiceNow/BrowserGym) ♦ [[Citing This Work]](#citing-this-work)
-[[Paper]](https://arxiv.org/abs/2403.07718) ♦ [[Benchmark Contents]](#benchmark-contents) ♦ [[Getting Started]](#getting-started) ♦ [[Live Demo]](#live-demo) ♦ [[BrowserGym]](https://github.com/ServiceNow/BrowserGym) ♦ [[Citing This Work]](#citing-this-work)
+### Papers
+*  [ICML 2024] WorkArena: How Capable are Web Agents at Solving Common Knowledge Work Tasks? [[Paper]](https://arxiv.org/abs/2403.07718)
+*  WorkArena++: Towards Compositional Planning and Reasoning-based Common Knowledge Work Tasks [[Paper]](https://arxiv.org/abs/2407.05291)
 `WorkArena` is a suite of browser-based tasks tailored to gauge web agents' effectiveness in supporting routine tasks for knowledge workers.
 By harnessing the ubiquitous [ServiceNow](https://www.servicenow.com/what-is-servicenow.html) platform, this benchmark will be instrumental in assessing the widespread state of such automations in modern knowledge work environments.
@@ -10,9 +15,47 @@ WorkArena is included in [BrowserGym](https://github.com/ServiceNow/BrowserGym),
 https://github.com/ServiceNow/WorkArena/assets/2374980/68640f09-7d6f-4eb1-b556-c294a6afef70
+## Getting Started
+To set up WorkArena, you will need to get your own ServiceNow instance, install our Python package, and upload some data to your instance. Follow the steps below to achieve this.
+### a) Create a ServiceNow Developer Instance
+1. Go to https://developer.servicenow.com/ and create an account.
+2. Click on `Request an instance` and select the `Washington` release (initializing the instance will take a few minutes)
+3. Once the instance is ready, you should see your instance URL and credentials. If not, click _Return to the Developer Portal_, then navigate to _Manage instance password_ and click _Reset instance password_.
+4. You should now see your URL and credentials. Based on this information, set the following environment variables:
+    * `SNOW_INSTANCE_URL`: The URL of your ServiceNow developer instance
+    * `SNOW_INSTANCE_UNAME`: The username, should be "admin"
+    * `SNOW_INSTANCE_PWD`: The password, make sure you place the value in single quotes '' and be mindful of [escaping special shell characters](https://onlinelinuxtools.com/escape-shell-characters). Running `echo $SNOW_INSTANCE_PWD` should print the correct password.
+6. Log into your instance via a browser using the admin credentials. Close any popup that appears on the main screen (e.g., agreeing to analytics).
+**Warning:** Feel free to look around the platform, but please make sure you revert any changes (e.g., changes to list views, pinning some menus, etc.) as these changes will be persistent and affect the benchmarking process.
+### b) Install WorkArena and Initialize your Instance
+Run the following command to install WorkArena in the [BrowserGym](https://github.com/servicenow/browsergym) environment:
+```
+pip install browsergym
+```
+Then, install [Playwright](https://github.com/microsoft/playwright):
+```
+playwright install
+```
+Finally, run this command in a terminal to upload the benchmark data to your ServiceNow instance:
+```
+workarena-install
+```
+Your installation is now complete! 🎉
 ## Benchmark Contents
-At the moment, WorkArena includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface. The following videos show an agent built on `GPT-4-vision` interacting with every such component. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
+At the moment, WorkArena-L1 includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface, otherwise referred to as "atomic" tasks. WorkArena++ contains 682 tasks, each one sampling among thousands of potential configurations. WorkArena++ uses the atomic components presented in WorkArena, and composes them into real-world use cases evaluating planning, reasoning, and memorizing abilities of agents.
+The following videos show an agent built on `GPT-4-vision` interacting with every atomic component of the benchmark. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
 ### Knowledge Bases
@@ -56,7 +99,6 @@ https://github.com/ServiceNow/WorkArena/assets/1726818/ca26dfaf-2358-4418-855f-8
 https://github.com/ServiceNow/WorkArena/assets/1726818/0023232c-081f-4be4-99bd-f60c766e6c3f
 ## Getting Started
 To set up WorkArena, you will need to get your own ServiceNow instance, install our Python package, and upload some data to your instance. Follow the steps below to achieve this.
@@ -69,7 +111,7 @@ To setup WorkArena, you will need to get your own ServiceNow instance, install o
 4. You should now see your URL and credentials. Based on this information, set the following environment variables:
     * `SNOW_INSTANCE_URL`: The URL of your ServiceNow developer instance
     * `SNOW_INSTANCE_UNAME`: The username, should be "admin"
-    * `SNOW_INSTANCE_PWD`: The password, make sure you place the value in quotes "" and be mindful of [escaping special shell characters](https://onlinelinuxtools.com/escape-shell-characters). Running `echo $SNOW_INSTANCE_PWD` should print the correct password.
+    * `SNOW_INSTANCE_PWD`: The password, make sure you place the value in single quotes '' and be mindful of [escaping special shell characters](https://onlinelinuxtools.com/escape-shell-characters). Running `echo $SNOW_INSTANCE_PWD` should print the correct password.
 6. Log into your instance via a browser using the admin credentials. Close any popup that appears on the main screen (e.g., agreeing to analytics).
 **Warning:** Feel free to look around the platform, but please make sure you revert any changes (e.g., changes to list views, pinning some menus, etc.) as these changes will be persistent and affect the benchmarking process.
@@ -81,25 +123,24 @@ Run the following command to install WorkArena in the [BrowswerGym](https://gith
 pip install browsergym-workarena
 ```
-Then, run this command in a terminal to upload the benchmark data to your ServiceNow instance:
+Then, install [Playwright](https://github.com/microsoft/playwright):
 ```
-workarena-install
+playwright install
 ```
-Finally, install [Playwright](https://github.com/microsoft/playwright):
+Finally, run this command in a terminal to upload the benchmark data to your ServiceNow instance:
 ```
-playwright install
+workarena-install
 ```
 Your installation is now complete! 🎉
 ## Live Demo
 Run this code to see WorkArena in action.
 Note: the following example executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
+- To run a demo of WorkArena-L1 (ICML 2024) tasks using BrowserGym, use the following script:
 ```python
 import random
@@ -141,9 +182,55 @@ for task in ALL_WORKARENA_TASKS:
 ```
+- To run a demo of WorkArena-L2 (WorkArena++) tasks using BrowserGym, use the following script. Change the filter on line 6 to `l3` to sample L3 tasks.
+```python
+import random
+from browsergym.core.env import BrowserEnv
+from browsergym.workarena import get_all_tasks_agents
+AGENT_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2")
+AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS = [sampled_set[0] for sampled_set in AGENT_L2_SAMPLED_SET], [
+    sampled_set[1] for sampled_set in AGENT_L2_SAMPLED_SET
+]
+from time import sleep
+for (task, seed) in zip(AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS):
+    print("Task:", task)
+    # Instantiate a new environment
+    env = BrowserEnv(task_entrypoint=task,
+                    headless=False)
+    env.reset()
+    # Cheat functions use Playwright to automatically solve the task
+    env.chat.add_message(role="assistant", msg="On it. Please wait...")
+    for i in range(len(env.task)):
+        sleep(1)
+        env.task.cheat(page=env.page, chat_messages=env.chat.messages, subtask_idx=i)
+        sleep(1)
+        reward, done, message, info = env.task.validate(page=env.page, chat_messages=env.chat.messages)
+    if reward == 1:
+        env.chat.add_message(role="user", msg="Yes, that works. Thanks!")
+    else:
+        env.chat.add_message(role="user", msg=f"No, that doesn't work. {info.get('message', '')}")
+    sleep(3)
+    env.close()
+```
+Note: the example above executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
 ## Citing This Work
 Please use the following BibTeX to cite our work:
+### WorkArena
 ```
 @misc{workarena2024,
       title={WorkArena: How Capable Are Web Agents at Solving Common Knowledge Work Tasks?},
@@ -154,3 +241,15 @@ Please use the following BibTeX to cite our work:
       primaryClass={cs.LG}
 }
 ```
+### WorkArena++
+```
+@misc{boisvert2024workarenacompositionalplanningreasoningbased,
+      title={WorkArena++: Towards Compositional Planning and Reasoning-based Common Knowledge Work Tasks},
+      author={Léo Boisvert and Megh Thakkar and Maxime Gasse and Massimo Caccia and Thibault Le Sellier De Chezelles and Quentin Cappart and Nicolas Chapados and Alexandre Lacoste and Alexandre Drouin},
+      year={2024},
+      eprint={2407.05291},
+      archivePrefix={arXiv},
+      primaryClass={cs.AI},
+      url={https://arxiv.org/abs/2407.05291},
+}
+```

browsergym_workarena-0.3.2/dcat-metadata.jsonld ADDED Viewed

@@ -0,0 +1,32 @@
+{
+    "@context": {
+      "dcat": "http://www.w3.org/ns/dcat#"
+  ,
+      "dct": "http://purl.org/dc/terms/"
+  ,
+      "foaf": "http://xmlns.com/foaf/0.1/"
+    },
+    "@type": "dcat:Dataset",
+    "dct:title": "WorkArena++",
+    "dct:description": "Benchmark to evaluate the reasoning, retrieval, planning, and decision making abilities of LLM and VLM-based agents",
+    "dct:identifier": "https://github.com/ServiceNow/WorkArena/tree/workarena-plus-plus"
+  ,
+    "dct:issued": "2024-06-12",
+    "dct:modified": "2024-06-12",
+    "dct:publisher": {
+      "@type": "foaf:Organization",
+      "foaf:name": "ServiceNow Research"
+    },
+    "dct:contactPoint": {
+      "@type": "vcard:Contact",
+      "vcard:fn": "Alexandre Drouin",
+      "vcard:hasEmail": "mailto:alexandre.drouin@servicenow.com"
+    },
+    "dcat:distribution": [
+      {
+        "@type": "dcat:Distribution",
+        "dct:format": "text/csv",
+        "dcat:accessURL": "https://github.com/ServiceNow/WorkArena/tree/workarena-plus-plus/src/browsergym/workarena/tasks/compositional"
+      }
+    ]
+  }

browsergym-workarena 0.3.1__tar.gz → 0.3.2__tar.gz

browsergym-workarena 0.3.1tar.gz → 0.3.2tar.gz