browsergym-workarena 0.2.1__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/PKG-INFO +19 -18
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/README.md +17 -16
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/pyproject.toml +1 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/requirements.txt +1 -1
- browsergym_workarena-0.3.0/scripts/extract_finetuning_traces.py +131 -0
- browsergym_workarena-0.3.0/scripts/generate_knowledge_base.ipynb +1499 -0
- browsergym_workarena-0.3.0/scripts/make_human_eval_curriculum.py +54 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/__init__.py +13 -1
- browsergym_workarena-0.3.0/src/browsergym/workarena/api/category.py +74 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/api/change_request.py +87 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/api/computer_asset.py +90 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/api/cost_center.py +19 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/api/expense_line.py +89 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/api/incident.py +45 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/api/knowledge.py +29 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/api/problem.py +90 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/api/report.py +183 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/api/requested_items.py +63 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/api/user.py +11 -8
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/api/utils.py +47 -3
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/config.py +21 -1
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/setup_files/forms/expected_incident_form_fields.json +1 -1
- browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/forms/expected_request_item_form_fields.json +1 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/knowledge/protocols.json +46 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/knowledge/test.html +1 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/lists/expected_asset_list_columns.json +12 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/lists/expected_change_request_list_columns.json +12 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/lists/expected_expense_line_list_columns.json +12 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/lists/expected_hardware_list_columns.json +12 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/lists/expected_incident_list_columns.json +12 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/lists/expected_problem_list_columns.json +12 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/lists/expected_requested_items_list_columns.json +12 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/lists/expected_service_catalog_list_columns.json +12 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/lists/expected_user_list_columns.json +12 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/all_menu.json +1 -1
- browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/task_configs/dashboard_retrieval_minmax_task.json +1 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/task_configs/dashboard_retrieval_value_task.json +1 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/filter_service_catalog_item_list_task.json +1 -1
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/impersonation_users.json +1 -1
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/report_retrieval_minmax_task.json +1 -1
- browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/task_configs/report_retrieval_value_task.json +1 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/human_eval/console.js +176 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/human_eval/tool.py +366 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/install.py +81 -20
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/base.py +55 -20
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/comp_building_block.py +4 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/__init__.py +76 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/base.py +364 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/dash_do_base.py +1366 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/dash_do_catalog.py +1127 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/dash_do_catalog_infeasible.py +2047 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/dash_do_create_incident.py +403 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/dash_do_create_incident_infeasible.py +278 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/dash_do_create_problem.py +336 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/dash_do_create_problem_infeasible.py +235 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/dash_do_filter.py +1600 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/dash_do_request_item.py +1315 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/dash_do_request_item_infeasible.py +693 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/delete_record.py +341 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/edit_knowledge_base.py +457 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/expense_management.py +598 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/filter_and_do.py +139 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/find_and_order_item.py +345 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/manage_change_request_schedule.py +1417 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/mark_duplicate_problems.py +499 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/maximize_investment_return.py +1763 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/navigate_and_do.py +1151 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/navigate_and_do_infeasible.py +2100 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/offboard_user.py +207 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/onboard_user.py +226 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/update_task.py +145 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/utils/curriculum.py +215 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/utils/infeasible_configs.py +151 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/utils/knapsack.py +192 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/warranty_check.py +227 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/work_assignment.py +804 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/workload_balancing.py +396 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/dashboard.py +188 -8
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/form.py +1593 -0
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/knowledge.py +359 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/list.py +519 -102
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/mark_duplicate_problem.py +171 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/navigation.py +55 -13
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/scripts/extract_all_menu_items.py +9 -2
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/scripts/generate_dashboard_configs.py +6 -5
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/scripts/service_catalog.py +2 -1
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/scripts/validate.py +8 -2
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/send_chat_message.py +90 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/service_catalog.py +94 -26
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/utils/form.py +1 -4
- browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/utils/private_tasks.py +63 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/utils/utils.py +13 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/tests/test_api.py +1 -0
- browsergym_workarena-0.3.0/tests/test_compositional_utils.py +92 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/tests/test_random_config_generation.py +24 -23
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/tests/test_task_from_config.py +37 -3
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/tests/test_task_general.py +4 -9
- browsergym_workarena-0.2.1/scripts/generate_knowledge_base.ipynb +0 -1374
- browsergym_workarena-0.2.1/src/browsergym/workarena/data_files/setup_files/lists/expected_asset_list_columns.json +0 -34
- browsergym_workarena-0.2.1/src/browsergym/workarena/data_files/setup_files/lists/expected_change_request_list_columns.json +0 -48
- browsergym_workarena-0.2.1/src/browsergym/workarena/data_files/setup_files/lists/expected_hardware_list_columns.json +0 -53
- browsergym_workarena-0.2.1/src/browsergym/workarena/data_files/setup_files/lists/expected_incident_list_columns.json +0 -28
- browsergym_workarena-0.2.1/src/browsergym/workarena/data_files/setup_files/lists/expected_service_catalog_list_columns.json +0 -29
- browsergym_workarena-0.2.1/src/browsergym/workarena/data_files/setup_files/lists/expected_user_list_columns.json +0 -59
- browsergym_workarena-0.2.1/src/browsergym/workarena/data_files/task_configs/dashboard_retrieval_minmax_task.json +0 -1
- browsergym_workarena-0.2.1/src/browsergym/workarena/data_files/task_configs/dashboard_retrieval_value_task.json +0 -1
- browsergym_workarena-0.2.1/src/browsergym/workarena/data_files/task_configs/report_retrieval_value_task.json +0 -1
- browsergym_workarena-0.2.1/src/browsergym/workarena/tasks/form.py +0 -801
- browsergym_workarena-0.2.1/src/browsergym/workarena/tasks/knowledge.py +0 -168
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/.github/workflows/pypi.yml +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/.github/workflows/unit_tests.yml +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/.gitignore +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/LICENSE +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/dev/environment.yaml +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/dev/requirements.txt +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/api/__init__.py +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/api/requests.py +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/api/ui_themes.py +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/setup_files/forms/expected_change_request_form_fields.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/setup_files/forms/expected_hardware_form_fields.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/setup_files/forms/expected_problem_form_fields.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/setup_files/forms/expected_user_form_fields.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/setup_files/knowledge/kb_autopublish_workflow.xml +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/setup_files/knowledge/knowledge_base.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/setup_files/ui_themes/workarena_themes.xml +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/create_change_request_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/create_hardware_asset_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/create_incident_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/create_problem_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/create_user_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/filter_asset_list_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/filter_change_request_list_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/filter_hardware_list_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/filter_incident_list_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/filter_user_list_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/knowledge_base_configs.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/order_apple_mac_book_pro15_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/order_apple_watch_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/order_developer_laptop_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/order_development_laptop_pc_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/order_ipad_mini_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/order_ipad_pro_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/order_loaner_laptop_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/order_sales_laptop_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/order_standard_laptop_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/sort_asset_list_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/sort_change_request_list_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/sort_hardware_list_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/sort_incident_list_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/sort_service_catalog_item_list_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/sort_user_list_task.json +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/instance.py +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/__init__.py +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/scripts/README.md +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/scripts/generate_forms.py +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/scripts/knowledge.py +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/scripts/list.py +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/scripts/navigation.py +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/utils/__init__.py +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/utils/debug.py +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/utils/js_utils.js +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/utils/string.py +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/utils.py +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/tests/test_snow_instance.py +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/tests/test_task_setup.py +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/tests/test_utils.py +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/tests/test_validate.py +0 -0
- {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/tests/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: browsergym-workarena
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: WorkArena benchmark for BrowserGym
|
|
5
5
|
Project-URL: homepage, https://github.com/ServiceNow/WorkArena
|
|
6
6
|
Author: Léo Boisvert, Alex Drouin, Maxime Gasse, Alex Lacoste, Manuel Del Verme, Megh Thakkar
|
|
@@ -15,7 +15,7 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
15
15
|
Requires-Python: >3.7
|
|
16
16
|
Requires-Dist: browsergym-core>=0.2
|
|
17
17
|
Requires-Dist: english-words>=2.0.1
|
|
18
|
-
Requires-Dist: faker>=24.
|
|
18
|
+
Requires-Dist: faker>=24.8.0
|
|
19
19
|
Requires-Dist: numpy>=1.14
|
|
20
20
|
Requires-Dist: requests>=2.31
|
|
21
21
|
Requires-Dist: tenacity>=8.2.3
|
|
@@ -34,12 +34,9 @@ WorkArena is included in [BrowserGym](https://github.com/ServiceNow/BrowserGym),
|
|
|
34
34
|
|
|
35
35
|
https://github.com/ServiceNow/WorkArena/assets/2374980/68640f09-7d6f-4eb1-b556-c294a6afef70
|
|
36
36
|
|
|
37
|
-
## ⚠️ Pre-Release warning ⚠️
|
|
38
|
-
Please note that the WorkArena benchmark is still undergoing minor bug fixes and updates, which may cause discrepancies with results reported in our latest arXiv preprint. We plan to release soon a stable version of WorkArena with enhanced stability, and a final version v1.0.0 with a new suite of tasks.
|
|
39
|
-
|
|
40
37
|
## Benchmark Contents
|
|
41
38
|
|
|
42
|
-
At the moment, WorkArena includes `
|
|
39
|
+
At the moment, WorkArena includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface. The following videos show an agent built on `GPT-4-vision` interacting with every such component. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
|
|
43
40
|
|
|
44
41
|
### Knowledge Bases
|
|
45
42
|
|
|
@@ -77,8 +74,11 @@ https://github.com/ServiceNow/WorkArena/assets/1726818/ca26dfaf-2358-4418-855f-8
|
|
|
77
74
|
|
|
78
75
|
### Dashboards
|
|
79
76
|
|
|
80
|
-
**Goal:** The agent must
|
|
77
|
+
**Goal:** The agent must answer a question that requires reading charts and (optionally) performing simple reasoning over them.
|
|
78
|
+
|
|
79
|
+
*Note: For demonstration purposes, a human is controlling the cursor since this is a pure retrieval task*
|
|
81
80
|
|
|
81
|
+
https://github.com/ServiceNow/WorkArena/assets/1726818/0023232c-081f-4be4-99bd-f60c766e6c3f
|
|
82
82
|
|
|
83
83
|
|
|
84
84
|
## Getting Started
|
|
@@ -122,6 +122,8 @@ Your installation is now complete! 🎉
|
|
|
122
122
|
|
|
123
123
|
Run this code to see WorkArena in action.
|
|
124
124
|
|
|
125
|
+
Note: the following example executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
|
|
126
|
+
|
|
125
127
|
```python
|
|
126
128
|
import random
|
|
127
129
|
|
|
@@ -136,28 +138,27 @@ for task in ALL_WORKARENA_TASKS:
|
|
|
136
138
|
|
|
137
139
|
# Instantiate a new environment
|
|
138
140
|
env = BrowserEnv(task_entrypoint=task,
|
|
139
|
-
headless=False
|
|
140
|
-
slow_mo=1000)
|
|
141
|
+
headless=False)
|
|
141
142
|
env.reset()
|
|
142
143
|
|
|
143
144
|
# Cheat functions use Playwright to automatically solve the task
|
|
144
145
|
env.chat.add_message(role="assistant", msg="On it. Please wait...")
|
|
145
|
-
|
|
146
|
+
cheat_messages = []
|
|
147
|
+
env.task.cheat(env.page, cheat_messages)
|
|
148
|
+
|
|
149
|
+
# Send cheat messages to chat
|
|
150
|
+
for cheat_msg in cheat_messages:
|
|
151
|
+
env.chat.add_message(role=cheat_msg["role"], msg=cheat_msg["message"])
|
|
146
152
|
|
|
147
153
|
# Post solution to chat
|
|
148
|
-
|
|
149
|
-
answer = env.chat.messages[-1]["message"]
|
|
150
|
-
env.chat.add_message(role="assistant", msg=f"The answer is:")
|
|
151
|
-
env.chat.add_message(role="assistant", msg=answer)
|
|
152
|
-
else:
|
|
153
|
-
env.chat.add_message(role="assistant", msg="I'm done!")
|
|
154
|
+
env.chat.add_message(role="assistant", msg="I'm done!")
|
|
154
155
|
|
|
155
156
|
# Validate the solution
|
|
156
|
-
reward, stop,
|
|
157
|
+
reward, stop, message, info = env.task.validate(env.page, cheat_messages)
|
|
157
158
|
if reward == 1:
|
|
158
159
|
env.chat.add_message(role="user", msg="Yes, that works. Thanks!")
|
|
159
160
|
else:
|
|
160
|
-
env.chat.add_message(role="user", msg=f"No, that doesn't work. {
|
|
161
|
+
env.chat.add_message(role="user", msg=f"No, that doesn't work. {info.get('message', '')}")
|
|
161
162
|
|
|
162
163
|
sleep(3)
|
|
163
164
|
env.close()
|
|
@@ -10,12 +10,9 @@ WorkArena is included in [BrowserGym](https://github.com/ServiceNow/BrowserGym),
|
|
|
10
10
|
|
|
11
11
|
https://github.com/ServiceNow/WorkArena/assets/2374980/68640f09-7d6f-4eb1-b556-c294a6afef70
|
|
12
12
|
|
|
13
|
-
## ⚠️ Pre-Release warning ⚠️
|
|
14
|
-
Please note that the WorkArena benchmark is still undergoing minor bug fixes and updates, which may cause discrepancies with results reported in our latest arXiv preprint. We plan to release soon a stable version of WorkArena with enhanced stability, and a final version v1.0.0 with a new suite of tasks.
|
|
15
|
-
|
|
16
13
|
## Benchmark Contents
|
|
17
14
|
|
|
18
|
-
At the moment, WorkArena includes `
|
|
15
|
+
At the moment, WorkArena includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface. The following videos show an agent built on `GPT-4-vision` interacting with every such component. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
|
|
19
16
|
|
|
20
17
|
### Knowledge Bases
|
|
21
18
|
|
|
@@ -53,8 +50,11 @@ https://github.com/ServiceNow/WorkArena/assets/1726818/ca26dfaf-2358-4418-855f-8
|
|
|
53
50
|
|
|
54
51
|
### Dashboards
|
|
55
52
|
|
|
56
|
-
**Goal:** The agent must
|
|
53
|
+
**Goal:** The agent must answer a question that requires reading charts and (optionally) performing simple reasoning over them.
|
|
54
|
+
|
|
55
|
+
*Note: For demonstration purposes, a human is controlling the cursor since this is a pure retrieval task*
|
|
57
56
|
|
|
57
|
+
https://github.com/ServiceNow/WorkArena/assets/1726818/0023232c-081f-4be4-99bd-f60c766e6c3f
|
|
58
58
|
|
|
59
59
|
|
|
60
60
|
## Getting Started
|
|
@@ -98,6 +98,8 @@ Your installation is now complete! 🎉
|
|
|
98
98
|
|
|
99
99
|
Run this code to see WorkArena in action.
|
|
100
100
|
|
|
101
|
+
Note: the following example executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
|
|
102
|
+
|
|
101
103
|
```python
|
|
102
104
|
import random
|
|
103
105
|
|
|
@@ -112,28 +114,27 @@ for task in ALL_WORKARENA_TASKS:
|
|
|
112
114
|
|
|
113
115
|
# Instantiate a new environment
|
|
114
116
|
env = BrowserEnv(task_entrypoint=task,
|
|
115
|
-
headless=False
|
|
116
|
-
slow_mo=1000)
|
|
117
|
+
headless=False)
|
|
117
118
|
env.reset()
|
|
118
119
|
|
|
119
120
|
# Cheat functions use Playwright to automatically solve the task
|
|
120
121
|
env.chat.add_message(role="assistant", msg="On it. Please wait...")
|
|
121
|
-
|
|
122
|
+
cheat_messages = []
|
|
123
|
+
env.task.cheat(env.page, cheat_messages)
|
|
124
|
+
|
|
125
|
+
# Send cheat messages to chat
|
|
126
|
+
for cheat_msg in cheat_messages:
|
|
127
|
+
env.chat.add_message(role=cheat_msg["role"], msg=cheat_msg["message"])
|
|
122
128
|
|
|
123
129
|
# Post solution to chat
|
|
124
|
-
|
|
125
|
-
answer = env.chat.messages[-1]["message"]
|
|
126
|
-
env.chat.add_message(role="assistant", msg=f"The answer is:")
|
|
127
|
-
env.chat.add_message(role="assistant", msg=answer)
|
|
128
|
-
else:
|
|
129
|
-
env.chat.add_message(role="assistant", msg="I'm done!")
|
|
130
|
+
env.chat.add_message(role="assistant", msg="I'm done!")
|
|
130
131
|
|
|
131
132
|
# Validate the solution
|
|
132
|
-
reward, stop,
|
|
133
|
+
reward, stop, message, info = env.task.validate(env.page, cheat_messages)
|
|
133
134
|
if reward == 1:
|
|
134
135
|
env.chat.add_message(role="user", msg="Yes, that works. Thanks!")
|
|
135
136
|
else:
|
|
136
|
-
env.chat.add_message(role="user", msg=f"No, that doesn't work. {
|
|
137
|
+
env.chat.add_message(role="user", msg=f"No, that doesn't work. {info.get('message', '')}")
|
|
137
138
|
|
|
138
139
|
sleep(3)
|
|
139
140
|
env.close()
|
|
@@ -31,6 +31,7 @@ homepage = "https://github.com/ServiceNow/WorkArena"
|
|
|
31
31
|
|
|
32
32
|
[project.scripts]
|
|
33
33
|
workarena-install = "browsergym.workarena.install:main"
|
|
34
|
+
workarena-human-eval = "browsergym.workarena.human_eval.tool:main"
|
|
34
35
|
|
|
35
36
|
[tool.hatch.version]
|
|
36
37
|
path = "src/browsergym/workarena/__init__.py"
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""
|
|
2
|
+
A demonstration of how observation/action traces can be extracted
|
|
3
|
+
for WorkArena tasks without modifying the task code.
|
|
4
|
+
|
|
5
|
+
Author: Alexandre Drouin (alexandre.drouin@servicenow.com)
|
|
6
|
+
|
|
7
|
+
Notes:
|
|
8
|
+
- This approach relies on monkey patching the playwright actions to log the actions and observations.
|
|
9
|
+
It has not been tested for parallel execution. It might work with multiprocessing, but it will for
|
|
10
|
+
sure not work with multithreading.
|
|
11
|
+
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import importlib
|
|
15
|
+
import logging
|
|
16
|
+
import os
|
|
17
|
+
import pickle
|
|
18
|
+
import playwright.sync_api as playwright_sync
|
|
19
|
+
|
|
20
|
+
from browsergym.core.env import BrowserEnv
|
|
21
|
+
from browsergym.workarena import ALL_WORKARENA_TASKS
|
|
22
|
+
from collections import defaultdict
|
|
23
|
+
from tenacity import retry, stop_after_attempt, wait_fixed
|
|
24
|
+
from time import time
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
N_PER_TASK = 10
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def monkey_patch_playwright(observation_callback, trace_storage):
    """
    A function that overrides the default playwright actions to log the actions and observations.

    The patch is installed on the playwright classes themselves (not on individual
    instances). If an action was already patched by an earlier call to this function,
    the earlier patch is replaced instead of being wrapped a second time, so every
    action is recorded exactly once, into the most recent `trace_storage`.

    Parameters:
    ------------
    observation_callback: callable
        A function that returns the observation of the environment.
        It is called without arguments right before each patched action runs.
    trace_storage: list
        A list to store the trace of the actions and observations.
        These will be appended in-place. Each entry is a dict with the keys
        "obs", "action", "args", "kwargs", "bid" and "time".

    """
    import functools

    def wrapper(func, interface):
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            # Get the observation (taken before the action is executed)
            obs = observation_callback()

            # Get the BID of the element on which we are acting.
            # args[0] is the object the patched method was called on.
            if interface.__name__ == "Locator":
                locator = args[0]
                bid = locator.element_handle().evaluate('(el) => el.getAttribute("bid")')
            elif interface.__name__ == "Keyboard":
                # Keyboard actions are not tied to an element: store a fixed marker
                bid = "keyboard"
            else:
                # NOTE(review): presumably the receiver is an ElementHandle here;
                # confirm that this expression also yields a bid for Page and Frame.
                bid = args[0].evaluate('(el) => el.getAttribute("bid")')

            logging.info(f"Action: {func.__name__} BID: {bid} -- Args: {args[1:]} {kwargs}")
            trace_storage.append(
                {
                    "obs": obs,
                    "action": func.__name__,
                    "args": args[1:],
                    "kwargs": kwargs,
                    "bid": bid,
                    "time": time(),
                }
            )

            # Resume action
            return func(*args, **kwargs)

        # Keep a handle on the unpatched callable so that a later call to
        # monkey_patch_playwright can unwrap it instead of stacking wrappers.
        wrapped._trace_unpatched = func
        return wrapped

    # Interfaces and actions we want to monkey patch
    importlib.reload(playwright_sync)
    from playwright.sync_api import Page, Frame, Locator, Keyboard, ElementHandle

    # TODO: Make sure the list of interfaces and actions is exhaustive
    #       It covers all that is used in WorkArena cheats as of April 11, 2024
    interfaces = [Page, Frame, Locator, Keyboard, ElementHandle]
    actions = ["click", "select_option", "set_checked", "fill", "press", "type", "down", "up"]

    for interface in interfaces:
        for action in actions:
            if hasattr(interface, action):
                current = getattr(interface, action)
                # If a previous call already patched this action, start again from
                # the unpatched callable so that stale callbacks and trace lists
                # from that earlier call are no longer invoked.
                original = getattr(current, "_trace_unpatched", current)
                setattr(interface, action, wrapper(original, interface))
                print(f"Monkey patched {interface.__name__}.{action}")
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
def extract_trace(task_cls, headless=True):
    """
    Extracts the trace of actions and observations for a given task.

    The task is solved by its own `cheat` method while the playwright actions are
    patched to record every action together with the observation that preceded it.
    The whole extraction is attempted up to 3 times, with a 2 second wait between
    attempts; every attempt starts from a fresh environment and an empty trace.

    Parameters:
    ------------
    task_cls: class
        The class of the task to extract the trace from.
    headless: bool
        Passed to BrowserEnv as its `headless` argument (default: True).

    Returns:
    --------
    trace: list
        The entries recorded by the patched playwright actions, in the order
        in which the actions were executed.

    """
    # Instantiate a new environment
    env = BrowserEnv(task_entrypoint=task_cls, headless=headless, slow_mo=1000)

    # Setup customized tracing
    trace = []
    monkey_patch_playwright(observation_callback=env._get_obs, trace_storage=trace)

    try:
        env.reset()
        env.task.cheat(env.page, env.chat.messages)
    finally:
        # Always release the environment, even when the attempt fails; otherwise
        # every failed attempt would leave its browser resources open.
        env.close()

    return trace
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
if __name__ == "__main__":
    # Script entry point: extract N_PER_TASK traces for every WorkArena task and
    # store them in a single pickle file, keyed by task class.
    os.makedirs("trace_profiling", exist_ok=True)

    task_traces = defaultdict(list)
    for task in ALL_WORKARENA_TASKS:
        print("Task:", task)
        for i in range(N_PER_TASK):
            print(f"Extracting trace {i+1}/{N_PER_TASK}")
            trace = extract_trace(task, headless=True)
            task_traces[task].append(trace)

        # Re-write the pickle after every task so that partial results survive a
        # crash; the last write contains the traces of all tasks. The context
        # manager makes sure the file handle is flushed and closed.
        with open("trace_profiling/task_traces.pkl", "wb") as trace_file:
            pickle.dump(task_traces, trace_file)
|