PyPI - browsergym-workarena - Versions diffs - 0.2.0__tar.gz → 0.3.0__tar.gz - Mend

browsergym-workarena 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (168) hide show

{browsergym_workarena-0.2.0 → browsergym_workarena-0.3.0}/.github/workflows/pypi.yml RENAMED Viewed

@@ -48,10 +48,11 @@ jobs:
           uses: pypa/gh-action-pypi-publish@release/v1
     github-release:
-      name: Sign with Sigstore and upload them to GitHub Release
+      name: Sign packages with Sigstore and upload them to GitHub Release
       needs:
       - publish-to-pypi
       runs-on: ubuntu-latest
       permissions:
         contents: write  # IMPORTANT: mandatory for making GitHub Releases
         id-token: write  # IMPORTANT: mandatory for sigstore
@@ -64,7 +65,7 @@ jobs:
           path: dist/
       - name: Sign the dists with Sigstore
-        uses: sigstore/gh-action-sigstore-python@v1.2.3
+        uses: sigstore/gh-action-sigstore-python@v2.1.1
         with:
           inputs: >-
             ./dist/*.tar.gz

{browsergym_workarena-0.2.0 → browsergym_workarena-0.3.0}/.github/workflows/unit_tests.yml RENAMED Viewed

@@ -34,38 +34,39 @@ jobs:
         run: black . --check
   browsergym-workarena-fast:
-      runs-on: ubuntu-latest
+    runs-on: ubuntu-latest
-      defaults:
-        run:
-          shell: bash -l {0}
+    defaults:
+      run:
+        shell: bash -l {0}
-      steps:
+    steps:
-        - name: Checkout Repository
-          uses: actions/checkout@v4
+      - name: Checkout Repository
+        uses: actions/checkout@v4
-        - name: Set up Python
-          uses: actions/setup-python@v5
-          with:
-            python-version: '3.10'
-            cache: 'pip' # caching pip dependencies
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip' # caching pip dependencies
-        - name: Pip install
-          run: pip install -r requirements.txt
+      - name: Pip install
+        working-directory: ./dev
+        run: pip install -r requirements.txt
-        - name: Pip list
-          run: pip list
+      - name: Pip list
+        run: pip list
-        - name: Install Playwright
-          run: playwright install --with-deps
+      - name: Install Playwright
+        run: playwright install --with-deps
-        - name: Run non-slow browsergym-workarena Unit Tests
-          env:
-            SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
-            SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
-            SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
-          run: pytest -n 5 --durations=10 -m 'not slow and not pricy' --slowmo 1000 -v tests
+      - name: Run non-slow browsergym-workarena Unit Tests
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+        run: pytest -n 5 --durations=10 -m 'not slow and not pricy' --slowmo 1000 -v tests
   browsergym-workarena-slow:
     runs-on: ubuntu-latest
@@ -86,6 +87,7 @@ jobs:
           cache: 'pip' # caching pip dependencies
       - name: Pip install
+        working-directory: ./dev
         run: pip install -r requirements.txt
       - name: Pip list

{browsergym_workarena-0.2.0 → browsergym_workarena-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,9 +1,9 @@
 Metadata-Version: 2.3
 Name: browsergym-workarena
-Version: 0.2.0
+Version: 0.3.0
 Summary: WorkArena benchmark for BrowserGym
 Project-URL: homepage, https://github.com/ServiceNow/WorkArena
-Author: Léo Boisvert, Alex Drouin, Maxime Gasse, Alex Lacoste, Manuel Del Verme
+Author: Léo Boisvert, Alex Drouin, Maxime Gasse, Alex Lacoste, Manuel Del Verme, Megh Thakkar
 License: Apache-2.0
 License-File: LICENSE
 Classifier: Development Status :: 2 - Pre-Alpha
@@ -13,9 +13,9 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >3.7
-Requires-Dist: browsergym-core==0.2.0
+Requires-Dist: browsergym-core>=0.2
 Requires-Dist: english-words>=2.0.1
-Requires-Dist: faker>=24.11.0
+Requires-Dist: faker>=24.8.0
 Requires-Dist: numpy>=1.14
 Requires-Dist: requests>=2.31
 Requires-Dist: tenacity>=8.2.3
@@ -34,12 +34,9 @@ WorkArena is included in [BrowserGym](https://github.com/ServiceNow/BrowserGym),
 https://github.com/ServiceNow/WorkArena/assets/2374980/68640f09-7d6f-4eb1-b556-c294a6afef70
-## ⚠️ Pre-Release warning ⚠️
-Please note that the WorkArena benchmark is still undergoing minor bug fixes and updates, which may cause discrepancies with results reported in our latest arXiv preprint. We plan to release soon a stable version of WorkArena v0.1.0 with enhanced stability, and a final version v1.0.0 with a new suite of tasks.
 ## Benchmark Contents
-At the moment, WorkArena includes `18,050` task instances drawn from `29` tasks that cover the main components of the ServiceNow user interface. The following videos show an agent built on `GPT-4-vision` interacting with every such component. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
+At the moment, WorkArena includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface. The following videos show an agent built on `GPT-4-vision` interacting with every such component. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
 ### Knowledge Bases
@@ -75,6 +72,15 @@ https://github.com/ServiceNow/WorkArena/assets/1726818/7538b3ef-d39b-4978-b9ea-8
 https://github.com/ServiceNow/WorkArena/assets/1726818/ca26dfaf-2358-4418-855f-80e482435e6e
+### Dashboards
+**Goal:** The agent must answer a question that requires reading charts and (optionally) performing simple reasoning over them.
+*Note: For demonstration purposes, a human is controlling the cursor since this is a pure retrieval task*
+https://github.com/ServiceNow/WorkArena/assets/1726818/0023232c-081f-4be4-99bd-f60c766e6c3f
 ## Getting Started
 To setup WorkArena, you will need to get your own ServiceNow instance, install our Python package, and upload some data to your instance. Follow the steps below to achieve this.
@@ -82,7 +88,7 @@ To setup WorkArena, you will need to get your own ServiceNow instance, install o
 ### a) Create a ServiceNow Developer Instance
 1. Go to https://developer.servicenow.com/ and create an account.
-2. Click on `Request an instance` and select the `Utah` release (initializing the instance will take a few minutes)
+2. Click on `Request an instance` and select the `Washington` release (initializing the instance will take a few minutes)
 3. Once the instance is ready, you should see your instance URL and credentials. If not, click _Return to the Developer Portal_, then navigate to _Manage instance password_ and click _Reset instance password_.
 4. You should now see your URL and credentials. Based on this information, set the following environment variables:
     * `SNOW_INSTANCE_URL`: The URL of your ServiceNow developer instance
@@ -116,6 +122,8 @@ Your installation is now complete! 🎉
 Run this code to see WorkArena in action.
+Note: the following example executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
 ```python
 import random
@@ -130,28 +138,27 @@ for task in ALL_WORKARENA_TASKS:
     # Instantiate a new environment
     env = BrowserEnv(task_entrypoint=task,
-                    headless=False,
-                    slow_mo=1000)
+                    headless=False)
     env.reset()
     # Cheat functions use Playwright to automatically solve the task
     env.chat.add_message(role="assistant", msg="On it. Please wait...")
-    env.task.cheat(env.page, env.chat.messages)
+    cheat_messages = []
+    env.task.cheat(env.page, cheat_messages)
+    # Send cheat messages to chat
+    for cheat_msg in cheat_messages:
+        env.chat.add_message(role=cheat_msg["role"], msg=cheat_msg["message"])
     # Post solution to chat
-    if "KnowledgeBaseSearchTask" in str(task):
-        answer = env.chat.messages[-1]["message"]
-        env.chat.add_message(role="assistant", msg=f"The answer is:")
-        env.chat.add_message(role="assistant", msg=answer)
-    else:
-        env.chat.add_message(role="assistant", msg="I'm done!")
+    env.chat.add_message(role="assistant", msg="I'm done!")
     # Validate the solution
-    reward, stop, info, message = env.task.validate(env.page, env.chat.messages)
+    reward, stop, message, info = env.task.validate(env.page, cheat_messages)
     if reward == 1:
         env.chat.add_message(role="user", msg="Yes, that works. Thanks!")
     else:
-        env.chat.add_message(role="user", msg=f"No, that doesn't work. {message.get('message', '')}")
+        env.chat.add_message(role="user", msg=f"No, that doesn't work. {info.get('message', '')}")
     sleep(3)
     env.close()

{browsergym_workarena-0.2.0 → browsergym_workarena-0.3.0}/README.md RENAMED Viewed

@@ -10,12 +10,9 @@ WorkArena is included in [BrowserGym](https://github.com/ServiceNow/BrowserGym),
 https://github.com/ServiceNow/WorkArena/assets/2374980/68640f09-7d6f-4eb1-b556-c294a6afef70
-## ⚠️ Pre-Release warning ⚠️
-Please note that the WorkArena benchmark is still undergoing minor bug fixes and updates, which may cause discrepancies with results reported in our latest arXiv preprint. We plan to release soon a stable version of WorkArena v0.1.0 with enhanced stability, and a final version v1.0.0 with a new suite of tasks.
 ## Benchmark Contents
-At the moment, WorkArena includes `18,050` task instances drawn from `29` tasks that cover the main components of the ServiceNow user interface. The following videos show an agent built on `GPT-4-vision` interacting with every such component. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
+At the moment, WorkArena includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface. The following videos show an agent built on `GPT-4-vision` interacting with every such component. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
 ### Knowledge Bases
@@ -51,6 +48,15 @@ https://github.com/ServiceNow/WorkArena/assets/1726818/7538b3ef-d39b-4978-b9ea-8
 https://github.com/ServiceNow/WorkArena/assets/1726818/ca26dfaf-2358-4418-855f-80e482435e6e
+### Dashboards
+**Goal:** The agent must answer a question that requires reading charts and (optionally) performing simple reasoning over them.
+*Note: For demonstration purposes, a human is controlling the cursor since this is a pure retrieval task*
+https://github.com/ServiceNow/WorkArena/assets/1726818/0023232c-081f-4be4-99bd-f60c766e6c3f
 ## Getting Started
 To setup WorkArena, you will need to get your own ServiceNow instance, install our Python package, and upload some data to your instance. Follow the steps below to achieve this.
@@ -58,7 +64,7 @@ To setup WorkArena, you will need to get your own ServiceNow instance, install o
 ### a) Create a ServiceNow Developer Instance
 1. Go to https://developer.servicenow.com/ and create an account.
-2. Click on `Request an instance` and select the `Utah` release (initializing the instance will take a few minutes)
+2. Click on `Request an instance` and select the `Washington` release (initializing the instance will take a few minutes)
 3. Once the instance is ready, you should see your instance URL and credentials. If not, click _Return to the Developer Portal_, then navigate to _Manage instance password_ and click _Reset instance password_.
 4. You should now see your URL and credentials. Based on this information, set the following environment variables:
     * `SNOW_INSTANCE_URL`: The URL of your ServiceNow developer instance
@@ -92,6 +98,8 @@ Your installation is now complete! 🎉
 Run this code to see WorkArena in action.
+Note: the following example executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
 ```python
 import random
@@ -106,28 +114,27 @@ for task in ALL_WORKARENA_TASKS:
     # Instantiate a new environment
     env = BrowserEnv(task_entrypoint=task,
-                    headless=False,
-                    slow_mo=1000)
+                    headless=False)
     env.reset()
     # Cheat functions use Playwright to automatically solve the task
     env.chat.add_message(role="assistant", msg="On it. Please wait...")
-    env.task.cheat(env.page, env.chat.messages)
+    cheat_messages = []
+    env.task.cheat(env.page, cheat_messages)
+    # Send cheat messages to chat
+    for cheat_msg in cheat_messages:
+        env.chat.add_message(role=cheat_msg["role"], msg=cheat_msg["message"])
     # Post solution to chat
-    if "KnowledgeBaseSearchTask" in str(task):
-        answer = env.chat.messages[-1]["message"]
-        env.chat.add_message(role="assistant", msg=f"The answer is:")
-        env.chat.add_message(role="assistant", msg=answer)
-    else:
-        env.chat.add_message(role="assistant", msg="I'm done!")
+    env.chat.add_message(role="assistant", msg="I'm done!")
     # Validate the solution
-    reward, stop, info, message = env.task.validate(env.page, env.chat.messages)
+    reward, stop, message, info = env.task.validate(env.page, cheat_messages)
     if reward == 1:
         env.chat.add_message(role="user", msg="Yes, that works. Thanks!")
     else:
-        env.chat.add_message(role="user", msg=f"No, that doesn't work. {message.get('message', '')}")
+        env.chat.add_message(role="user", msg=f"No, that doesn't work. {info.get('message', '')}")
     sleep(3)
     env.close()

browsergym_workarena-0.3.0/dev/environment.yaml ADDED Viewed

@@ -0,0 +1,13 @@
+name: workarena-dev
+channels:
+  - huggingface
+  - conda-forge
+  - defaults
+dependencies:
+  - python>=3.10
+  - pip
+  - pip:
+      - -r requirements.txt

browsergym_workarena-0.3.0/dev/requirements.txt ADDED Viewed

@@ -0,0 +1,9 @@
+black[jupyter]==24.2.0
+blacken-docs
+pre-commit
+pytest==7.3.2
+pytest-xdist
+pytest-playwright
+tenacity
+browsergym-core
+-e .. # local package

{browsergym_workarena-0.2.0 → browsergym_workarena-0.3.0}/pyproject.toml RENAMED Viewed

@@ -11,6 +11,7 @@ authors = [
     {name = "Maxime Gasse"},
     {name = "Alex Lacoste"},
     {name = "Manuel Del Verme"},
+    {name = "Megh Thakkar"},
 ]
 readme = "README.md"
 requires-python = ">3.7"
@@ -30,6 +31,7 @@ homepage = "https://github.com/ServiceNow/WorkArena"
 [project.scripts]
 workarena-install = "browsergym.workarena.install:main"
+workarena-human-eval = "browsergym.workarena.human_eval.tool:main"
 [tool.hatch.version]
 path = "src/browsergym/workarena/__init__.py"
@@ -39,3 +41,30 @@ files = ["requirements.txt"]
 [tool.hatch.build.targets.wheel]
 packages = ["src/browsergym"]
+[tool.black]
+line-length = 100
+include = '\.pyi?$'
+exclude = '''
+/(
+    \.eggs
+  | \.git
+  | \.hg
+  | \.mypy_cache
+  | \.nox
+  | \.tox
+  | \.venv
+  | _build
+  | buck-out
+  | build
+  | dist
+)/
+'''
+[tool.pytest.ini_options]
+filterwarnings = [
+    'ignore::UserWarning:gymnasium.*:',  # too many "The obs is not within the observation space." warnings.
+]
+markers = [
+    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
+]

{browsergym_workarena-0.2.0 → browsergym_workarena-0.3.0}/requirements.txt RENAMED Viewed

@@ -1,6 +1,6 @@
-browsergym-core==0.2.0
+browsergym-core>=0.2
 english-words>=2.0.1
-faker>=24.11.0
+Faker>=24.8.0
 numpy>=1.14
 requests>=2.31
 tenacity>=8.2.3  # only used in cheat() -> move to tests?

browsergym_workarena-0.3.0/scripts/extract_finetuning_traces.py ADDED Viewed

@@ -0,0 +1,131 @@
+"""
+A demonstration of how observation/action traces can be extracted
+for WorkArena tasks without modifying the task code.
+Author: Alexandre Drouin (alexandre.drouin@servicenow.com)
+Notes:
+- This approach relies on monkey patching the playwright actions to log the actions and observations.
+  It has not been tested for parallel execution. It might work with multiprocessing, but it will for
+  sure not work with multithreading.
+"""
+import importlib
+import logging
+import os
+import pickle
+import playwright.sync_api as playwright_sync
+from browsergym.core.env import BrowserEnv
+from browsergym.workarena import ALL_WORKARENA_TASKS
+from collections import defaultdict
+from tenacity import retry, stop_after_attempt, wait_fixed
+from time import time
+N_PER_TASK = 10
+def monkey_patch_playwright(observation_callback, trace_storage):
+    """
+    A function that overrides the default playwright actions to log the actions and observations.
+    Parameters:
+    ------------
+    observation_callback: callable
+        A function that returns the observation of the environment.
+    trace_storage: list
+        A list to store the trace of the actions and observations.
+        These will be appended in-place.
+    """
+    def wrapper(func, interface):
+        def wrapped(*args, **kwargs):
+            # Get the observation
+            obs = observation_callback()
+            # Get the BID of the element on which we are acting.
+            if interface.__name__ == "Locator":
+                # Get the locator
+                locator = args[0]
+                # Get the BID
+                bid = locator.element_handle().evaluate('(el) => el.getAttribute("bid")')
+            elif interface.__name__ == "Keyboard":
+                # Get the BID of the element
+                bid = "keyboard"
+            else:
+                # Get the BID of the element
+                bid = args[0].evaluate('(el) => el.getAttribute("bid")')
+            logging.info(f"Action: {func.__name__} BID: {bid}  --   Args: {args[1:]} {kwargs}")
+            trace_storage.append(
+                {
+                    "obs": obs,
+                    "action": func.__name__,
+                    "args": args[1:],
+                    "kwargs": kwargs,
+                    "bid": bid,
+                    "time": time(),
+                }
+            )
+            # Resume action
+            return func(*args, **kwargs)
+        return wrapped
+    # Interfaces and actions we want to monkey patch
+    importlib.reload(playwright_sync)
+    from playwright.sync_api import Page, Frame, Locator, Keyboard, ElementHandle
+    # TODO: Make sure the list of interfaces and actions is exhaustive
+    #       It covers all that is used in WorkArena cheats as of April 11, 2024
+    interfaces = [Page, Frame, Locator, Keyboard, ElementHandle]
+    actions = ["click", "select_option", "set_checked", "fill", "press", "type", "down", "up"]
+    for interface in interfaces:
+        for action in actions:
+            if hasattr(interface, action):
+                setattr(interface, action, wrapper(getattr(interface, action), interface))
+                print(f"Monkey patched {interface.__name__}.{action}")
+@retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
+def extract_trace(task_cls, headless=True):
+    """
+    Extracts the trace of actions and observations for a given task.
+    Parameters:
+    ------------
+    task_cls: class
+        The class of the task to extract the trace from.
+    """
+    # Instantiate a new environment
+    env = BrowserEnv(task_entrypoint=task_cls, headless=headless, slow_mo=1000)
+    # Setup customized tracing
+    trace = []
+    monkey_patch_playwright(observation_callback=env._get_obs, trace_storage=trace)
+    env.reset()
+    env.task.cheat(env.page, env.chat.messages)
+    env.close()
+    return trace
+if __name__ == "__main__":
+    os.makedirs("trace_profiling", exist_ok=True)
+    task_traces = defaultdict(list)
+    for task in ALL_WORKARENA_TASKS:
+        print("Task:", task)
+        for i in range(N_PER_TASK):
+            print(f"Extracting trace {i+1}/{N_PER_TASK}")
+            trace = extract_trace(task, headless=True)
+            task_traces[task].append(trace)
+    pickle.dump(task_traces, open("trace_profiling/task_traces.pkl", "wb"))

browsergym-workarena 0.2.0__tar.gz → 0.3.0__tar.gz

browsergym-workarena 0.2.0__tar.gz → 0.3.0__tar.gz