browsergym-workarena 0.3.1__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/.github/workflows/unit_tests.yml +31 -1
- browsergym_workarena-0.3.2/.gitignore +196 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/PKG-INFO +111 -12
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/README.md +110 -11
- browsergym_workarena-0.3.2/dcat-metadata.jsonld +32 -0
- browsergym_workarena-0.3.2/make_human_eval_curriculum.py +44 -0
- browsergym_workarena-0.3.2/scripts/generate_knowledge_base.ipynb +1499 -0
- browsergym_workarena-0.3.2/src/browsergym/workarena/__init__.py +161 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/dashboard.py +3 -1
- browsergym_workarena-0.3.2/src/wa_action_traces.py +131 -0
- browsergym_workarena-0.3.2/src/workarena_test.py +37 -0
- browsergym_workarena-0.3.2/tests/test_compositional.py +169 -0
- browsergym_workarena-0.3.1/.gitignore +0 -3
- browsergym_workarena-0.3.1/src/browsergym/workarena/__init__.py +0 -38
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/.github/workflows/pypi.yml +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/LICENSE +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/dev/environment.yaml +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/dev/requirements.txt +0 -0
- {browsergym_workarena-0.3.1/scripts → browsergym_workarena-0.3.2}/generate_knowledge_base.ipynb +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/pyproject.toml +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/requirements.txt +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/scripts/extract_finetuning_traces.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/scripts/make_human_eval_curriculum.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/__init__.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/category.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/change_request.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/computer_asset.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/cost_center.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/expense_line.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/incident.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/knowledge.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/problem.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/report.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/requested_items.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/requests.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/ui_themes.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/user.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/utils.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/config.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/forms/expected_change_request_form_fields.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/forms/expected_hardware_form_fields.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/forms/expected_incident_form_fields.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/forms/expected_problem_form_fields.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/forms/expected_request_item_form_fields.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/forms/expected_user_form_fields.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/knowledge/kb_autopublish_workflow.xml +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/knowledge/knowledge_base.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/knowledge/protocols.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/knowledge/test.html +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/lists/expected_asset_list_columns.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/lists/expected_change_request_list_columns.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/lists/expected_expense_line_list_columns.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/lists/expected_hardware_list_columns.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/lists/expected_incident_list_columns.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/lists/expected_problem_list_columns.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/lists/expected_requested_items_list_columns.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/lists/expected_service_catalog_list_columns.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/lists/expected_user_list_columns.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/ui_themes/workarena_themes.xml +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/all_menu.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/create_change_request_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/create_hardware_asset_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/create_incident_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/create_problem_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/create_user_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/dashboard_retrieval_minmax_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/dashboard_retrieval_value_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/filter_asset_list_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/filter_change_request_list_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/filter_hardware_list_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/filter_incident_list_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/filter_service_catalog_item_list_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/filter_user_list_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/impersonation_users.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/knowledge_base_configs.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/order_apple_mac_book_pro15_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/order_apple_watch_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/order_developer_laptop_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/order_development_laptop_pc_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/order_ipad_mini_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/order_ipad_pro_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/order_loaner_laptop_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/order_sales_laptop_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/order_standard_laptop_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/report_retrieval_minmax_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/report_retrieval_value_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/sort_asset_list_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/sort_change_request_list_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/sort_hardware_list_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/sort_incident_list_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/sort_service_catalog_item_list_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/sort_user_list_task.json +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/human_eval/console.js +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/human_eval/tool.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/install.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/instance.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/__init__.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/base.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/comp_building_block.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/__init__.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/base.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/dash_do_base.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/dash_do_catalog.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/dash_do_catalog_infeasible.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/dash_do_create_incident.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/dash_do_create_incident_infeasible.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/dash_do_create_problem.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/dash_do_create_problem_infeasible.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/dash_do_filter.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/dash_do_request_item.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/dash_do_request_item_infeasible.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/delete_record.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/edit_knowledge_base.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/expense_management.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/filter_and_do.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/find_and_order_item.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/manage_change_request_schedule.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/mark_duplicate_problems.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/maximize_investment_return.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/navigate_and_do.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/navigate_and_do_infeasible.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/offboard_user.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/onboard_user.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/update_task.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/utils/curriculum.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/utils/infeasible_configs.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/utils/knapsack.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/warranty_check.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/work_assignment.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/workload_balancing.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/form.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/knowledge.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/list.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/mark_duplicate_problem.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/navigation.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/scripts/README.md +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/scripts/extract_all_menu_items.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/scripts/generate_dashboard_configs.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/scripts/generate_forms.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/scripts/knowledge.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/scripts/list.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/scripts/navigation.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/scripts/service_catalog.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/scripts/validate.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/send_chat_message.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/service_catalog.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/utils/__init__.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/utils/debug.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/utils/form.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/utils/js_utils.js +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/utils/private_tasks.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/utils/string.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/utils/utils.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/utils.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/tests/test_api.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/tests/test_compositional_utils.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/tests/test_random_config_generation.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/tests/test_snow_instance.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/tests/test_task_from_config.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/tests/test_task_general.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/tests/test_task_setup.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/tests/test_utils.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/tests/test_validate.py +0 -0
- {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/tests/utils.py +0 -0
|
@@ -5,6 +5,8 @@ on:
|
|
|
5
5
|
branches:
|
|
6
6
|
- main
|
|
7
7
|
pull_request:
|
|
8
|
+
schedule:
|
|
9
|
+
- cron: '59 23 * * SUN' # Runs at 23:59 UTC every Sunday
|
|
8
10
|
|
|
9
11
|
jobs:
|
|
10
12
|
|
|
@@ -101,4 +103,32 @@ jobs:
|
|
|
101
103
|
SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
|
|
102
104
|
SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
|
|
103
105
|
SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
|
|
104
|
-
run: pytest -n 5 --durations=10 -m 'slow and not pricy' --slowmo 1000 -v tests
|
|
106
|
+
run: pytest -n 5 --durations=10 -m 'slow and not pricy' --slowmo 1000 -v tests
|
|
107
|
+
|
|
108
|
+
end-to-end-tests:
|
|
109
|
+
runs-on: ubuntu-latest
|
|
110
|
+
if: github.event_name == 'schedule'
|
|
111
|
+
defaults:
|
|
112
|
+
run:
|
|
113
|
+
shell: bash -l {0}
|
|
114
|
+
steps:
|
|
115
|
+
- name: Checkout Repository
|
|
116
|
+
uses: actions/checkout@v4
|
|
117
|
+
- name: Set up Python
|
|
118
|
+
uses: actions/setup-python@v5
|
|
119
|
+
with:
|
|
120
|
+
python-version: '3.10'
|
|
121
|
+
cache: 'pip'
|
|
122
|
+
- name: Pip install
|
|
123
|
+
working-directory: ./dev
|
|
124
|
+
run: pip install -r requirements.txt
|
|
125
|
+
- name: Pip list
|
|
126
|
+
run: pip list
|
|
127
|
+
- name: Install Playwright
|
|
128
|
+
run: playwright install --with-deps
|
|
129
|
+
- name: Run E2E Tests
|
|
130
|
+
env:
|
|
131
|
+
SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
|
|
132
|
+
SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
|
|
133
|
+
SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
|
|
134
|
+
run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
results/
|
|
6
|
+
.vscode
|
|
7
|
+
*.csv
|
|
8
|
+
# C extensions
|
|
9
|
+
*.so
|
|
10
|
+
# Distribution / packaging
|
|
11
|
+
.Python
|
|
12
|
+
build/
|
|
13
|
+
develop-eggs/
|
|
14
|
+
dist/
|
|
15
|
+
downloads/
|
|
16
|
+
eggs/
|
|
17
|
+
.eggs/
|
|
18
|
+
lib/
|
|
19
|
+
lib64/
|
|
20
|
+
parts/
|
|
21
|
+
sdist/
|
|
22
|
+
var/
|
|
23
|
+
wheels/
|
|
24
|
+
share/python-wheels/
|
|
25
|
+
*.egg-info/
|
|
26
|
+
.installed.cfg
|
|
27
|
+
*.egg
|
|
28
|
+
MANIFEST
|
|
29
|
+
|
|
30
|
+
# PyInstaller
|
|
31
|
+
# Usually these files are written by a python script from a template
|
|
32
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
33
|
+
*.manifest
|
|
34
|
+
*.spec
|
|
35
|
+
|
|
36
|
+
# Installer logs
|
|
37
|
+
pip-log.txt
|
|
38
|
+
pip-delete-this-directory.txt
|
|
39
|
+
|
|
40
|
+
# Unit test / coverage reports
|
|
41
|
+
htmlcov/
|
|
42
|
+
.tox/
|
|
43
|
+
.nox/
|
|
44
|
+
.coverage
|
|
45
|
+
.coverage.*
|
|
46
|
+
.cache
|
|
47
|
+
nosetests.xml
|
|
48
|
+
coverage.xml
|
|
49
|
+
*.cover
|
|
50
|
+
*.py,cover
|
|
51
|
+
.hypothesis/
|
|
52
|
+
.pytest_cache/
|
|
53
|
+
cover/
|
|
54
|
+
|
|
55
|
+
# Translations
|
|
56
|
+
*.mo
|
|
57
|
+
*.pot
|
|
58
|
+
|
|
59
|
+
# Django stuff:
|
|
60
|
+
*.log
|
|
61
|
+
local_settings.py
|
|
62
|
+
db.sqlite3
|
|
63
|
+
db.sqlite3-journal
|
|
64
|
+
|
|
65
|
+
# Flask stuff:
|
|
66
|
+
instance/
|
|
67
|
+
.webassets-cache
|
|
68
|
+
|
|
69
|
+
# Scrapy stuff:
|
|
70
|
+
.scrapy
|
|
71
|
+
|
|
72
|
+
# Sphinx documentation
|
|
73
|
+
docs/_build/
|
|
74
|
+
|
|
75
|
+
# PyBuilder
|
|
76
|
+
.pybuilder/
|
|
77
|
+
target/
|
|
78
|
+
|
|
79
|
+
# Jupyter Notebook
|
|
80
|
+
.ipynb_checkpoints
|
|
81
|
+
|
|
82
|
+
# IPython
|
|
83
|
+
profile_default/
|
|
84
|
+
ipython_config.py
|
|
85
|
+
|
|
86
|
+
# pyenv
|
|
87
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
88
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
89
|
+
# .python-version
|
|
90
|
+
|
|
91
|
+
# pipenv
|
|
92
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
93
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
94
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
95
|
+
# install all needed dependencies.
|
|
96
|
+
#Pipfile.lock
|
|
97
|
+
|
|
98
|
+
# poetry
|
|
99
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
100
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
101
|
+
# commonly ignored for libraries.
|
|
102
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
103
|
+
#poetry.lock
|
|
104
|
+
|
|
105
|
+
# pdm
|
|
106
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
107
|
+
#pdm.lock
|
|
108
|
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
|
109
|
+
# in version control.
|
|
110
|
+
# https://pdm.fming.dev/#use-with-ide
|
|
111
|
+
.pdm.toml
|
|
112
|
+
|
|
113
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
114
|
+
__pypackages__/
|
|
115
|
+
|
|
116
|
+
# Celery stuff
|
|
117
|
+
celerybeat-schedule
|
|
118
|
+
celerybeat.pid
|
|
119
|
+
|
|
120
|
+
# SageMath parsed files
|
|
121
|
+
*.sage.py
|
|
122
|
+
|
|
123
|
+
# Environments
|
|
124
|
+
.env
|
|
125
|
+
.venv
|
|
126
|
+
venv/
|
|
127
|
+
env.bak/
|
|
128
|
+
venv.bak/
|
|
129
|
+
|
|
130
|
+
# Spyder project settings
|
|
131
|
+
.spyderproject
|
|
132
|
+
.spyproject
|
|
133
|
+
|
|
134
|
+
# Rope project settings
|
|
135
|
+
.ropeproject
|
|
136
|
+
|
|
137
|
+
# mkdocs documentation
|
|
138
|
+
/site
|
|
139
|
+
|
|
140
|
+
# mypy
|
|
141
|
+
.mypy_cache/
|
|
142
|
+
.dmypy.json
|
|
143
|
+
dmypy.json
|
|
144
|
+
|
|
145
|
+
# Pyre type checker
|
|
146
|
+
.pyre/
|
|
147
|
+
|
|
148
|
+
# pytype static type analyzer
|
|
149
|
+
.pytype/
|
|
150
|
+
|
|
151
|
+
# Cython debug symbols
|
|
152
|
+
cython_debug/
|
|
153
|
+
|
|
154
|
+
# PyCharm
|
|
155
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
156
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
157
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
158
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
159
|
+
#.idea/
|
|
160
|
+
|
|
161
|
+
# MacOS
|
|
162
|
+
**/.DS_Store
|
|
163
|
+
|
|
164
|
+
.vscode
|
|
165
|
+
allowed_selenium.json
|
|
166
|
+
|
|
167
|
+
# Torchtune
|
|
168
|
+
finetuning/torchtune
|
|
169
|
+
|
|
170
|
+
# PyLLMD repo for finetuning
|
|
171
|
+
pyllmd_tune/research-pyllmd/
|
|
172
|
+
pyllmd_tune/data/
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
datasets/*
|
|
176
|
+
_sandbox.py
|
|
177
|
+
node_modules/
|
|
178
|
+
/test-results/
|
|
179
|
+
/playwright-report/
|
|
180
|
+
/blob-report/
|
|
181
|
+
/playwright/.cache/
|
|
182
|
+
/test-results/
|
|
183
|
+
/playwright-report/
|
|
184
|
+
/blob-report/
|
|
185
|
+
/playwright/.cache/
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
results/
|
|
189
|
+
|
|
190
|
+
# personal (optimass)
|
|
191
|
+
ICML_deadline/
|
|
192
|
+
mass_utils/
|
|
193
|
+
pyllmd_tune/
|
|
194
|
+
|
|
195
|
+
# don't ignore the miniwob_tasks_all.csv file
|
|
196
|
+
!miniwob_tasks_all.csv
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: browsergym-workarena
|
|
3
|
-
Version: 0.3.1
|
|
3
|
+
Version: 0.3.2
|
|
4
4
|
Summary: WorkArena benchmark for BrowserGym
|
|
5
5
|
Project-URL: homepage, https://github.com/ServiceNow/WorkArena
|
|
6
6
|
Author: Léo Boisvert, Alex Drouin, Maxime Gasse, Alex Lacoste, Manuel Del Verme, Megh Thakkar
|
|
@@ -22,9 +22,14 @@ Requires-Dist: tenacity>=8.2.3
|
|
|
22
22
|
Requires-Dist: tqdm>=4.66.2
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
|
|
25
|
-
# WorkArena:
|
|
25
|
+
# WorkArena: A Benchmark for Evaluating Agents on Knowledge Work Tasks
|
|
26
|
+
[[Benchmark Contents]](#benchmark-contents) ♦ [[Getting Started]](#getting-started) ♦ [[Live Demo]](#live-demo) ♦ [[BrowserGym]](https://github.com/ServiceNow/BrowserGym) ♦ [[Citing This Work]](#citing-this-work)
|
|
26
27
|
|
|
27
|
-
|
|
28
|
+
### Papers
|
|
29
|
+
* [ICML 2024] WorkArena: How Capable are Web Agents at Solving Common Knowledge Work Tasks? [[Paper]](https://arxiv.org/abs/2403.07718)
|
|
30
|
+
|
|
31
|
+
* WorkArena++: Towards Compositional Planning and Reasoning-based Common Knowledge Work Tasks [[Paper]](https://arxiv.org/abs/2407.05291)
|
|
32
|
+
|
|
28
33
|
|
|
29
34
|
`WorkArena` is a suite of browser-based tasks tailored to gauge web agents' effectiveness in supporting routine tasks for knowledge workers.
|
|
30
35
|
By harnessing the ubiquitous [ServiceNow](https://www.servicenow.com/what-is-servicenow.html) platform, this benchmark will be instrumental in assessing the widespread state of such automations in modern knowledge work environments.
|
|
@@ -34,9 +39,47 @@ WorkArena is included in [BrowserGym](https://github.com/ServiceNow/BrowserGym),
|
|
|
34
39
|
|
|
35
40
|
https://github.com/ServiceNow/WorkArena/assets/2374980/68640f09-7d6f-4eb1-b556-c294a6afef70
|
|
36
41
|
|
|
42
|
+
## Getting Started
|
|
43
|
+
|
|
44
|
+
To set up WorkArena, you will need to get your own ServiceNow instance, install our Python package, and upload some data to your instance. Follow the steps below to achieve this.
|
|
45
|
+
|
|
46
|
+
### a) Create a ServiceNow Developer Instance
|
|
47
|
+
|
|
48
|
+
1. Go to https://developer.servicenow.com/ and create an account.
|
|
49
|
+
2. Click on `Request an instance` and select the `Washington` release (initializing the instance will take a few minutes)
|
|
50
|
+
3. Once the instance is ready, you should see your instance URL and credentials. If not, click _Return to the Developer Portal_, then navigate to _Manage instance password_ and click _Reset instance password_.
|
|
51
|
+
4. You should now see your URL and credentials. Based on this information, set the following environment variables:
|
|
52
|
+
* `SNOW_INSTANCE_URL`: The URL of your ServiceNow developer instance
|
|
53
|
+
* `SNOW_INSTANCE_UNAME`: The username, should be "admin"
|
|
54
|
+
* `SNOW_INSTANCE_PWD`: The password, make sure you place the value in quotes "" and be mindful of [escaping special shell characters](https://onlinelinuxtools.com/escape-shell-characters). Running `echo $SNOW_INSTANCE_PWD` should print the correct password.
|
|
55
|
+
5. Log in to your instance via a browser using the admin credentials. Close any popup that appears on the main screen (e.g., agreeing to analytics).
|
|
56
|
+
|
|
57
|
+
**Warning:** Feel free to look around the platform, but please make sure you revert any changes (e.g., changes to list views, pinning some menus, etc.) as these changes will be persistent and affect the benchmarking process.
|
|
58
|
+
|
|
59
|
+
### b) Install WorkArena and Initialize your Instance
|
|
60
|
+
|
|
61
|
+
Run the following command to install WorkArena in the [BrowserGym](https://github.com/servicenow/browsergym) environment:
|
|
62
|
+
```
|
|
63
|
+
pip install browsergym
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Then, install [Playwright](https://github.com/microsoft/playwright):
|
|
67
|
+
```
|
|
68
|
+
playwright install
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Finally, run this command in a terminal to upload the benchmark data to your ServiceNow instance:
|
|
72
|
+
```
|
|
73
|
+
workarena-install
|
|
74
|
+
```
|
|
75
|
+
Your installation is now complete! 🎉
|
|
76
|
+
|
|
77
|
+
|
|
37
78
|
## Benchmark Contents
|
|
38
79
|
|
|
39
|
-
At the moment, WorkArena includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface
|
|
80
|
+
At the moment, WorkArena-L1 includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface, otherwise referred to as "atomic" tasks. WorkArena++ contains 682 tasks, each one sampling among thousands of potential configurations. WorkArena++ uses the atomic components presented in WorkArena, and composes them into real-world use cases evaluating planning, reasoning, and memorizing abilities of agents.
|
|
81
|
+
|
|
82
|
+
The following videos show an agent built on `GPT-4-vision` interacting with every atomic component of the benchmark. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
|
|
40
83
|
|
|
41
84
|
### Knowledge Bases
|
|
42
85
|
|
|
@@ -80,7 +123,6 @@ https://github.com/ServiceNow/WorkArena/assets/1726818/ca26dfaf-2358-4418-855f-8
|
|
|
80
123
|
|
|
81
124
|
https://github.com/ServiceNow/WorkArena/assets/1726818/0023232c-081f-4be4-99bd-f60c766e6c3f
|
|
82
125
|
|
|
83
|
-
|
|
84
126
|
## Getting Started
|
|
85
127
|
|
|
86
128
|
To set up WorkArena, you will need to get your own ServiceNow instance, install our Python package, and upload some data to your instance. Follow the steps below to achieve this.
|
|
@@ -93,7 +135,7 @@ To setup WorkArena, you will need to get your own ServiceNow instance, install o
|
|
|
93
135
|
4. You should now see your URL and credentials. Based on this information, set the following environment variables:
|
|
94
136
|
* `SNOW_INSTANCE_URL`: The URL of your ServiceNow developer instance
|
|
95
137
|
* `SNOW_INSTANCE_UNAME`: The username, should be "admin"
|
|
96
|
-
* `SNOW_INSTANCE_PWD`: The password, make sure you place the value in quotes
|
|
138
|
+
* `SNOW_INSTANCE_PWD`: The password, make sure you place the value in single quotes '' and be mindful of [escaping special shell characters](https://onlinelinuxtools.com/escape-shell-characters). Running `echo $SNOW_INSTANCE_PWD` should print the correct password.
|
|
97
139
|
5. Log into your instance via a browser using the admin credentials. Close any popup that appears on the main screen (e.g., agreeing to analytics).
|
|
98
140
|
|
|
99
141
|
**Warning:** Feel free to look around the platform, but please make sure you revert any changes (e.g., changes to list views, pinning some menus, etc.) as these changes will be persistent and affect the benchmarking process.
|
|
@@ -105,25 +147,24 @@ Run the following command to install WorkArena in the [BrowswerGym](https://gith
|
|
|
105
147
|
pip install browsergym-workarena
|
|
106
148
|
```
|
|
107
149
|
|
|
108
|
-
Then,
|
|
150
|
+
Then, install [Playwright](https://github.com/microsoft/playwright):
|
|
109
151
|
```
|
|
110
|
-
|
|
152
|
+
playwright install
|
|
111
153
|
```
|
|
112
154
|
|
|
113
|
-
Finally,
|
|
155
|
+
Finally, run this command in a terminal to upload the benchmark data to your ServiceNow instance:
|
|
114
156
|
```
|
|
115
|
-
|
|
157
|
+
workarena-install
|
|
116
158
|
```
|
|
117
|
-
|
|
118
159
|
Your installation is now complete! 🎉
|
|
119
160
|
|
|
120
|
-
|
|
121
161
|
## Live Demo
|
|
122
162
|
|
|
123
163
|
Run this code to see WorkArena in action.
|
|
124
164
|
|
|
125
165
|
Note: the following example executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
|
|
126
166
|
|
|
167
|
+
- To run a demo of WorkArena-L1 (ICML 2024) tasks using BrowserGym, use the following script:
|
|
127
168
|
```python
|
|
128
169
|
import random
|
|
129
170
|
|
|
@@ -165,9 +206,55 @@ for task in ALL_WORKARENA_TASKS:
|
|
|
165
206
|
```
|
|
166
207
|
|
|
167
208
|
|
|
209
|
+
|
|
210
|
+
- To run a demo of WorkArena-L2 (WorkArena++) tasks using BrowserGym, use the following script. Change the filter on line 6 to `l3` to sample L3 tasks.
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
import random
|
|
214
|
+
|
|
215
|
+
from browsergym.core.env import BrowserEnv
|
|
216
|
+
from browsergym.workarena import get_all_tasks_agents
|
|
217
|
+
|
|
218
|
+
AGENT_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2")
|
|
219
|
+
|
|
220
|
+
AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS = [sampled_set[0] for sampled_set in AGENT_L2_SAMPLED_SET], [
|
|
221
|
+
sampled_set[1] for sampled_set in AGENT_L2_SAMPLED_SET
|
|
222
|
+
]
|
|
223
|
+
from time import sleep
|
|
224
|
+
|
|
225
|
+
for (task, seed) in zip(AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS):
|
|
226
|
+
print("Task:", task)
|
|
227
|
+
|
|
228
|
+
# Instantiate a new environment
|
|
229
|
+
env = BrowserEnv(task_entrypoint=task,
|
|
230
|
+
headless=False)
|
|
231
|
+
env.reset()
|
|
232
|
+
|
|
233
|
+
# Cheat functions use Playwright to automatically solve the task
|
|
234
|
+
env.chat.add_message(role="assistant", msg="On it. Please wait...")
|
|
235
|
+
|
|
236
|
+
for i in range(len(env.task)):
|
|
237
|
+
sleep(1)
|
|
238
|
+
env.task.cheat(page=env.page, chat_messages=env.chat.messages, subtask_idx=i)
|
|
239
|
+
sleep(1)
|
|
240
|
+
reward, done, message, info = env.task.validate(page=env.page, chat_messages=env.chat.messages)
|
|
241
|
+
|
|
242
|
+
if reward == 1:
|
|
243
|
+
env.chat.add_message(role="user", msg="Yes, that works. Thanks!")
|
|
244
|
+
else:
|
|
245
|
+
env.chat.add_message(role="user", msg=f"No, that doesn't work. {info.get('message', '')}")
|
|
246
|
+
|
|
247
|
+
sleep(3)
|
|
248
|
+
env.close()
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
Note: the example above executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
|
|
252
|
+
|
|
168
253
|
## Citing This Work
|
|
169
254
|
|
|
170
255
|
Please use the following BibTeX to cite our work:
|
|
256
|
+
|
|
257
|
+
### WorkArena
|
|
171
258
|
```
|
|
172
259
|
@misc{workarena2024,
|
|
173
260
|
title={WorkArena: How Capable Are Web Agents at Solving Common Knowledge Work Tasks?},
|
|
@@ -178,3 +265,15 @@ Please use the following BibTeX to cite our work:
|
|
|
178
265
|
primaryClass={cs.LG}
|
|
179
266
|
}
|
|
180
267
|
```
|
|
268
|
+
### WorkArena++
|
|
269
|
+
```
|
|
270
|
+
@misc{boisvert2024workarenacompositionalplanningreasoningbased,
|
|
271
|
+
title={WorkArena++: Towards Compositional Planning and Reasoning-based Common Knowledge Work Tasks},
|
|
272
|
+
author={Léo Boisvert and Megh Thakkar and Maxime Gasse and Massimo Caccia and Thibault Le Sellier De Chezelles and Quentin Cappart and Nicolas Chapados and Alexandre Lacoste and Alexandre Drouin},
|
|
273
|
+
year={2024},
|
|
274
|
+
eprint={2407.05291},
|
|
275
|
+
archivePrefix={arXiv},
|
|
276
|
+
primaryClass={cs.AI},
|
|
277
|
+
url={https://arxiv.org/abs/2407.05291},
|
|
278
|
+
}
|
|
279
|
+
```
|
|
@@ -1,6 +1,11 @@
|
|
|
1
|
-
# WorkArena:
|
|
1
|
+
# WorkArena: A Benchmark for Evaluating Agents on Knowledge Work Tasks
|
|
2
|
+
[[Benchmark Contents]](#benchmark-contents) ♦ [[Getting Started]](#getting-started) ♦ [[Live Demo]](#live-demo) ♦ [[BrowserGym]](https://github.com/ServiceNow/BrowserGym) ♦ [[Citing This Work]](#citing-this-work)
|
|
2
3
|
|
|
3
|
-
|
|
4
|
+
### Papers
|
|
5
|
+
* [ICML 2024] WorkArena: How Capable are Web Agents at Solving Common Knowledge Work Tasks? [[Paper]](https://arxiv.org/abs/2403.07718)
|
|
6
|
+
|
|
7
|
+
* WorkArena++: Towards Compositional Planning and Reasoning-based Common Knowledge Work Tasks [[Paper]](https://arxiv.org/abs/2407.05291)
|
|
8
|
+
|
|
4
9
|
|
|
5
10
|
`WorkArena` is a suite of browser-based tasks tailored to gauge web agents' effectiveness in supporting routine tasks for knowledge workers.
|
|
6
11
|
By harnessing the ubiquitous [ServiceNow](https://www.servicenow.com/what-is-servicenow.html) platform, this benchmark will be instrumental in assessing the widespread state of such automations in modern knowledge work environments.
|
|
@@ -10,9 +15,47 @@ WorkArena is included in [BrowserGym](https://github.com/ServiceNow/BrowserGym),
|
|
|
10
15
|
|
|
11
16
|
https://github.com/ServiceNow/WorkArena/assets/2374980/68640f09-7d6f-4eb1-b556-c294a6afef70
|
|
12
17
|
|
|
18
|
+
## Getting Started
|
|
19
|
+
|
|
20
|
+
To set up WorkArena, you will need to get your own ServiceNow instance, install our Python package, and upload some data to your instance. Follow the steps below to achieve this.
|
|
21
|
+
|
|
22
|
+
### a) Create a ServiceNow Developer Instance
|
|
23
|
+
|
|
24
|
+
1. Go to https://developer.servicenow.com/ and create an account.
|
|
25
|
+
2. Click on `Request an instance` and select the `Washington` release (initializing the instance will take a few minutes)
|
|
26
|
+
3. Once the instance is ready, you should see your instance URL and credentials. If not, click _Return to the Developer Portal_, then navigate to _Manage instance password_ and click _Reset instance password_.
|
|
27
|
+
4. You should now see your URL and credentials. Based on this information, set the following environment variables:
|
|
28
|
+
* `SNOW_INSTANCE_URL`: The URL of your ServiceNow developer instance
|
|
29
|
+
* `SNOW_INSTANCE_UNAME`: The username, should be "admin"
|
|
30
|
+
* `SNOW_INSTANCE_PWD`: The password, make sure you place the value in single quotes '' and be mindful of [escaping special shell characters](https://onlinelinuxtools.com/escape-shell-characters). Running `echo $SNOW_INSTANCE_PWD` should print the correct password.
|
|
31
|
+
5. Log into your instance via a browser using the admin credentials. Close any popup that appears on the main screen (e.g., agreeing to analytics).
|
|
32
|
+
|
|
33
|
+
**Warning:** Feel free to look around the platform, but please make sure you revert any changes (e.g., changes to list views, pinning some menus, etc.) as these changes will be persistent and affect the benchmarking process.
|
|
34
|
+
|
|
35
|
+
### b) Install WorkArena and Initialize your Instance
|
|
36
|
+
|
|
37
|
+
Run the following command to install WorkArena in the [BrowserGym](https://github.com/servicenow/browsergym) environment:
|
|
38
|
+
```
|
|
39
|
+
pip install browsergym
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
Then, install [Playwright](https://github.com/microsoft/playwright):
|
|
43
|
+
```
|
|
44
|
+
playwright install
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Finally, run this command in a terminal to upload the benchmark data to your ServiceNow instance:
|
|
48
|
+
```
|
|
49
|
+
workarena-install
|
|
50
|
+
```
|
|
51
|
+
Your installation is now complete! 🎉
|
|
52
|
+
|
|
53
|
+
|
|
13
54
|
## Benchmark Contents
|
|
14
55
|
|
|
15
|
-
At the moment, WorkArena includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface
|
|
56
|
+
At the moment, WorkArena-L1 includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface, otherwise referred to as "atomic" tasks. WorkArena++ contains 682 tasks, each one sampling among thousands of potential configurations. WorkArena++ uses the atomic components presented in WorkArena, and composes them into real-world use cases evaluating planning, reasoning, and memorizing abilities of agents.
|
|
57
|
+
|
|
58
|
+
The following videos show an agent built on `GPT-4-vision` interacting with every atomic component of the benchmark. As our results emphasize, this benchmark is not solved; the agent's performance is therefore not always on point.
|
|
16
59
|
|
|
17
60
|
### Knowledge Bases
|
|
18
61
|
|
|
@@ -56,7 +99,6 @@ https://github.com/ServiceNow/WorkArena/assets/1726818/ca26dfaf-2358-4418-855f-8
|
|
|
56
99
|
|
|
57
100
|
https://github.com/ServiceNow/WorkArena/assets/1726818/0023232c-081f-4be4-99bd-f60c766e6c3f
|
|
58
101
|
|
|
59
|
-
|
|
60
102
|
## Getting Started
|
|
61
103
|
|
|
62
104
|
To set up WorkArena, you will need to get your own ServiceNow instance, install our Python package, and upload some data to your instance. Follow the steps below to achieve this.
|
|
@@ -69,7 +111,7 @@ To setup WorkArena, you will need to get your own ServiceNow instance, install o
|
|
|
69
111
|
4. You should now see your URL and credentials. Based on this information, set the following environment variables:
|
|
70
112
|
* `SNOW_INSTANCE_URL`: The URL of your ServiceNow developer instance
|
|
71
113
|
* `SNOW_INSTANCE_UNAME`: The username, should be "admin"
|
|
72
|
-
* `SNOW_INSTANCE_PWD`: The password, make sure you place the value in quotes
|
|
114
|
+
* `SNOW_INSTANCE_PWD`: The password, make sure you place the value in single quotes '' and be mindful of [escaping special shell characters](https://onlinelinuxtools.com/escape-shell-characters). Running `echo $SNOW_INSTANCE_PWD` should print the correct password.
|
|
73
115
|
5. Log into your instance via a browser using the admin credentials. Close any popup that appears on the main screen (e.g., agreeing to analytics).
|
|
74
116
|
|
|
75
117
|
**Warning:** Feel free to look around the platform, but please make sure you revert any changes (e.g., changes to list views, pinning some menus, etc.) as these changes will be persistent and affect the benchmarking process.
|
|
@@ -81,25 +123,24 @@ Run the following command to install WorkArena in the [BrowswerGym](https://gith
|
|
|
81
123
|
pip install browsergym-workarena
|
|
82
124
|
```
|
|
83
125
|
|
|
84
|
-
Then,
|
|
126
|
+
Then, install [Playwright](https://github.com/microsoft/playwright):
|
|
85
127
|
```
|
|
86
|
-
|
|
128
|
+
playwright install
|
|
87
129
|
```
|
|
88
130
|
|
|
89
|
-
Finally,
|
|
131
|
+
Finally, run this command in a terminal to upload the benchmark data to your ServiceNow instance:
|
|
90
132
|
```
|
|
91
|
-
|
|
133
|
+
workarena-install
|
|
92
134
|
```
|
|
93
|
-
|
|
94
135
|
Your installation is now complete! 🎉
|
|
95
136
|
|
|
96
|
-
|
|
97
137
|
## Live Demo
|
|
98
138
|
|
|
99
139
|
Run this code to see WorkArena in action.
|
|
100
140
|
|
|
101
141
|
Note: the following example executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
|
|
102
142
|
|
|
143
|
+
- To run a demo of WorkArena-L1 (ICML 2024) tasks using BrowserGym, use the following script:
|
|
103
144
|
```python
|
|
104
145
|
import random
|
|
105
146
|
|
|
@@ -141,9 +182,55 @@ for task in ALL_WORKARENA_TASKS:
|
|
|
141
182
|
```
|
|
142
183
|
|
|
143
184
|
|
|
185
|
+
|
|
186
|
+
- To run a demo of WorkArena-L2 (WorkArena++) tasks using BrowserGym, use the following script. Change the filter on line 6 to `l3` to sample L3 tasks.
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
import random
|
|
190
|
+
|
|
191
|
+
from browsergym.core.env import BrowserEnv
|
|
192
|
+
from browsergym.workarena import get_all_tasks_agents
|
|
193
|
+
|
|
194
|
+
AGENT_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2")
|
|
195
|
+
|
|
196
|
+
AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS = [sampled_set[0] for sampled_set in AGENT_L2_SAMPLED_SET], [
|
|
197
|
+
sampled_set[1] for sampled_set in AGENT_L2_SAMPLED_SET
|
|
198
|
+
]
|
|
199
|
+
from time import sleep
|
|
200
|
+
|
|
201
|
+
for (task, seed) in zip(AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS):
|
|
202
|
+
print("Task:", task)
|
|
203
|
+
|
|
204
|
+
# Instantiate a new environment
|
|
205
|
+
env = BrowserEnv(task_entrypoint=task,
|
|
206
|
+
headless=False)
|
|
207
|
+
env.reset()
|
|
208
|
+
|
|
209
|
+
# Cheat functions use Playwright to automatically solve the task
|
|
210
|
+
env.chat.add_message(role="assistant", msg="On it. Please wait...")
|
|
211
|
+
|
|
212
|
+
for i in range(len(env.task)):
|
|
213
|
+
sleep(1)
|
|
214
|
+
env.task.cheat(page=env.page, chat_messages=env.chat.messages, subtask_idx=i)
|
|
215
|
+
sleep(1)
|
|
216
|
+
reward, done, message, info = env.task.validate(page=env.page, chat_messages=env.chat.messages)
|
|
217
|
+
|
|
218
|
+
if reward == 1:
|
|
219
|
+
env.chat.add_message(role="user", msg="Yes, that works. Thanks!")
|
|
220
|
+
else:
|
|
221
|
+
env.chat.add_message(role="user", msg=f"No, that doesn't work. {info.get('message', '')}")
|
|
222
|
+
|
|
223
|
+
sleep(3)
|
|
224
|
+
env.close()
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
Note: the example above executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
|
|
228
|
+
|
|
144
229
|
## Citing This Work
|
|
145
230
|
|
|
146
231
|
Please use the following BibTeX to cite our work:
|
|
232
|
+
|
|
233
|
+
### WorkArena
|
|
147
234
|
```
|
|
148
235
|
@misc{workarena2024,
|
|
149
236
|
title={WorkArena: How Capable Are Web Agents at Solving Common Knowledge Work Tasks?},
|
|
@@ -154,3 +241,15 @@ Please use the following BibTeX to cite our work:
|
|
|
154
241
|
primaryClass={cs.LG}
|
|
155
242
|
}
|
|
156
243
|
```
|
|
244
|
+
### WorkArena++
|
|
245
|
+
```
|
|
246
|
+
@misc{boisvert2024workarenacompositionalplanningreasoningbased,
|
|
247
|
+
title={WorkArena++: Towards Compositional Planning and Reasoning-based Common Knowledge Work Tasks},
|
|
248
|
+
author={Léo Boisvert and Megh Thakkar and Maxime Gasse and Massimo Caccia and Thibault Le Sellier De Chezelles and Quentin Cappart and Nicolas Chapados and Alexandre Lacoste and Alexandre Drouin},
|
|
249
|
+
year={2024},
|
|
250
|
+
eprint={2407.05291},
|
|
251
|
+
archivePrefix={arXiv},
|
|
252
|
+
primaryClass={cs.AI},
|
|
253
|
+
url={https://arxiv.org/abs/2407.05291},
|
|
254
|
+
}
|
|
255
|
+
```
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"@context": {
|
|
3
|
+
"dcat": "http://www.w3.org/ns/dcat#"
|
|
4
|
+
,
|
|
5
|
+
"dct": "http://purl.org/dc/terms/"
|
|
6
|
+
,
|
|
7
|
+
"foaf": "http://xmlns.com/foaf/0.1/"
|
|
8
|
+
},
|
|
9
|
+
"@type": "dcat:Dataset",
|
|
10
|
+
"dct:title": "WorkArena++",
|
|
11
|
+
"dct:description": "Benchmark to evaluate the reasoning, retrieval, planning, and decision making abilities of LLM and VLM-based agents",
|
|
12
|
+
"dct:identifier": "https://github.com/ServiceNow/WorkArena/tree/workarena-plus-plus"
|
|
13
|
+
,
|
|
14
|
+
"dct:issued": "2024-06-12",
|
|
15
|
+
"dct:modified": "2024-06-12",
|
|
16
|
+
"dct:publisher": {
|
|
17
|
+
"@type": "foaf:Organization",
|
|
18
|
+
"foaf:name": "ServiceNow Research"
|
|
19
|
+
},
|
|
20
|
+
"dcat:contactPoint": {
|
|
21
|
+
"@type": "vcard:Contact",
|
|
22
|
+
"vcard:fn": "Alexandre Drouin",
|
|
23
|
+
"vcard:hasEmail": "mailto:alexandre.drouin@servicenow.com"
|
|
24
|
+
},
|
|
25
|
+
"dcat:distribution": [
|
|
26
|
+
{
|
|
27
|
+
"@type": "dcat:Distribution",
|
|
28
|
+
"dct:format": "text/csv",
|
|
29
|
+
"dcat:accessURL": "https://github.com/ServiceNow/WorkArena/tree/workarena-plus-plus/src/browsergym/workarena/tasks/compositional"
|
|
30
|
+
}
|
|
31
|
+
]
|
|
32
|
+
}
|