browsergym-workarena 0.3.1__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164) hide show
  1. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/.github/workflows/unit_tests.yml +31 -1
  2. browsergym_workarena-0.3.2/.gitignore +196 -0
  3. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/PKG-INFO +111 -12
  4. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/README.md +110 -11
  5. browsergym_workarena-0.3.2/dcat-metadata.jsonld +32 -0
  6. browsergym_workarena-0.3.2/make_human_eval_curriculum.py +44 -0
  7. browsergym_workarena-0.3.2/scripts/generate_knowledge_base.ipynb +1499 -0
  8. browsergym_workarena-0.3.2/src/browsergym/workarena/__init__.py +161 -0
  9. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/dashboard.py +3 -1
  10. browsergym_workarena-0.3.2/src/wa_action_traces.py +131 -0
  11. browsergym_workarena-0.3.2/src/workarena_test.py +37 -0
  12. browsergym_workarena-0.3.2/tests/test_compositional.py +169 -0
  13. browsergym_workarena-0.3.1/.gitignore +0 -3
  14. browsergym_workarena-0.3.1/src/browsergym/workarena/__init__.py +0 -38
  15. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/.github/workflows/pypi.yml +0 -0
  16. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/LICENSE +0 -0
  17. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/dev/environment.yaml +0 -0
  18. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/dev/requirements.txt +0 -0
  19. {browsergym_workarena-0.3.1/scripts → browsergym_workarena-0.3.2}/generate_knowledge_base.ipynb +0 -0
  20. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/pyproject.toml +0 -0
  21. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/requirements.txt +0 -0
  22. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/scripts/extract_finetuning_traces.py +0 -0
  23. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/scripts/make_human_eval_curriculum.py +0 -0
  24. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/__init__.py +0 -0
  25. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/category.py +0 -0
  26. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/change_request.py +0 -0
  27. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/computer_asset.py +0 -0
  28. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/cost_center.py +0 -0
  29. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/expense_line.py +0 -0
  30. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/incident.py +0 -0
  31. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/knowledge.py +0 -0
  32. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/problem.py +0 -0
  33. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/report.py +0 -0
  34. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/requested_items.py +0 -0
  35. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/requests.py +0 -0
  36. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/ui_themes.py +0 -0
  37. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/user.py +0 -0
  38. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/api/utils.py +0 -0
  39. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/config.py +0 -0
  40. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/forms/expected_change_request_form_fields.json +0 -0
  41. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/forms/expected_hardware_form_fields.json +0 -0
  42. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/forms/expected_incident_form_fields.json +0 -0
  43. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/forms/expected_problem_form_fields.json +0 -0
  44. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/forms/expected_request_item_form_fields.json +0 -0
  45. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/forms/expected_user_form_fields.json +0 -0
  46. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/knowledge/kb_autopublish_workflow.xml +0 -0
  47. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/knowledge/knowledge_base.json +0 -0
  48. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/knowledge/protocols.json +0 -0
  49. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/knowledge/test.html +0 -0
  50. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/lists/expected_asset_list_columns.json +0 -0
  51. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/lists/expected_change_request_list_columns.json +0 -0
  52. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/lists/expected_expense_line_list_columns.json +0 -0
  53. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/lists/expected_hardware_list_columns.json +0 -0
  54. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/lists/expected_incident_list_columns.json +0 -0
  55. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/lists/expected_problem_list_columns.json +0 -0
  56. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/lists/expected_requested_items_list_columns.json +0 -0
  57. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/lists/expected_service_catalog_list_columns.json +0 -0
  58. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/lists/expected_user_list_columns.json +0 -0
  59. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/setup_files/ui_themes/workarena_themes.xml +0 -0
  60. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/all_menu.json +0 -0
  61. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/create_change_request_task.json +0 -0
  62. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/create_hardware_asset_task.json +0 -0
  63. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/create_incident_task.json +0 -0
  64. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/create_problem_task.json +0 -0
  65. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/create_user_task.json +0 -0
  66. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/dashboard_retrieval_minmax_task.json +0 -0
  67. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/dashboard_retrieval_value_task.json +0 -0
  68. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/filter_asset_list_task.json +0 -0
  69. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/filter_change_request_list_task.json +0 -0
  70. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/filter_hardware_list_task.json +0 -0
  71. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/filter_incident_list_task.json +0 -0
  72. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/filter_service_catalog_item_list_task.json +0 -0
  73. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/filter_user_list_task.json +0 -0
  74. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/impersonation_users.json +0 -0
  75. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/knowledge_base_configs.json +0 -0
  76. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/order_apple_mac_book_pro15_task.json +0 -0
  77. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/order_apple_watch_task.json +0 -0
  78. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/order_developer_laptop_task.json +0 -0
  79. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/order_development_laptop_pc_task.json +0 -0
  80. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/order_ipad_mini_task.json +0 -0
  81. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/order_ipad_pro_task.json +0 -0
  82. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/order_loaner_laptop_task.json +0 -0
  83. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/order_sales_laptop_task.json +0 -0
  84. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/order_standard_laptop_task.json +0 -0
  85. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/report_retrieval_minmax_task.json +0 -0
  86. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/report_retrieval_value_task.json +0 -0
  87. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/sort_asset_list_task.json +0 -0
  88. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/sort_change_request_list_task.json +0 -0
  89. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/sort_hardware_list_task.json +0 -0
  90. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/sort_incident_list_task.json +0 -0
  91. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/sort_service_catalog_item_list_task.json +0 -0
  92. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/data_files/task_configs/sort_user_list_task.json +0 -0
  93. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/human_eval/console.js +0 -0
  94. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/human_eval/tool.py +0 -0
  95. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/install.py +0 -0
  96. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/instance.py +0 -0
  97. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/__init__.py +0 -0
  98. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/base.py +0 -0
  99. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/comp_building_block.py +0 -0
  100. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/__init__.py +0 -0
  101. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/base.py +0 -0
  102. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/dash_do_base.py +0 -0
  103. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/dash_do_catalog.py +0 -0
  104. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/dash_do_catalog_infeasible.py +0 -0
  105. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/dash_do_create_incident.py +0 -0
  106. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/dash_do_create_incident_infeasible.py +0 -0
  107. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/dash_do_create_problem.py +0 -0
  108. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/dash_do_create_problem_infeasible.py +0 -0
  109. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/dash_do_filter.py +0 -0
  110. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/dash_do_request_item.py +0 -0
  111. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/dash_do_request_item_infeasible.py +0 -0
  112. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/delete_record.py +0 -0
  113. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/edit_knowledge_base.py +0 -0
  114. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/expense_management.py +0 -0
  115. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/filter_and_do.py +0 -0
  116. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/find_and_order_item.py +0 -0
  117. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/manage_change_request_schedule.py +0 -0
  118. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/mark_duplicate_problems.py +0 -0
  119. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/maximize_investment_return.py +0 -0
  120. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/navigate_and_do.py +0 -0
  121. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/navigate_and_do_infeasible.py +0 -0
  122. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/offboard_user.py +0 -0
  123. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/onboard_user.py +0 -0
  124. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/update_task.py +0 -0
  125. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/utils/curriculum.py +0 -0
  126. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/utils/infeasible_configs.py +0 -0
  127. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/utils/knapsack.py +0 -0
  128. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/warranty_check.py +0 -0
  129. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/work_assignment.py +0 -0
  130. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/compositional/workload_balancing.py +0 -0
  131. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/form.py +0 -0
  132. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/knowledge.py +0 -0
  133. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/list.py +0 -0
  134. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/mark_duplicate_problem.py +0 -0
  135. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/navigation.py +0 -0
  136. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/scripts/README.md +0 -0
  137. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/scripts/extract_all_menu_items.py +0 -0
  138. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/scripts/generate_dashboard_configs.py +0 -0
  139. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/scripts/generate_forms.py +0 -0
  140. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/scripts/knowledge.py +0 -0
  141. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/scripts/list.py +0 -0
  142. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/scripts/navigation.py +0 -0
  143. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/scripts/service_catalog.py +0 -0
  144. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/scripts/validate.py +0 -0
  145. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/send_chat_message.py +0 -0
  146. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/service_catalog.py +0 -0
  147. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/utils/__init__.py +0 -0
  148. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/utils/debug.py +0 -0
  149. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/utils/form.py +0 -0
  150. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/utils/js_utils.js +0 -0
  151. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/utils/private_tasks.py +0 -0
  152. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/utils/string.py +0 -0
  153. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/tasks/utils/utils.py +0 -0
  154. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/src/browsergym/workarena/utils.py +0 -0
  155. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/tests/test_api.py +0 -0
  156. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/tests/test_compositional_utils.py +0 -0
  157. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/tests/test_random_config_generation.py +0 -0
  158. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/tests/test_snow_instance.py +0 -0
  159. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/tests/test_task_from_config.py +0 -0
  160. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/tests/test_task_general.py +0 -0
  161. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/tests/test_task_setup.py +0 -0
  162. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/tests/test_utils.py +0 -0
  163. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/tests/test_validate.py +0 -0
  164. {browsergym_workarena-0.3.1 → browsergym_workarena-0.3.2}/tests/utils.py +0 -0
@@ -5,6 +5,8 @@ on:
5
5
  branches:
6
6
  - main
7
7
  pull_request:
8
+ schedule:
9
+ - cron: '59 23 * * SUN' # Runs at 23:59 UTC every Sunday
8
10
 
9
11
  jobs:
10
12
 
@@ -101,4 +103,32 @@ jobs:
101
103
  SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
102
104
  SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
103
105
  SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
104
- run: pytest -n 5 --durations=10 -m 'slow and not pricy' --slowmo 1000 -v tests
106
+ run: pytest -n 5 --durations=10 -m 'slow and not pricy' --slowmo 1000 -v tests
107
+
108
+ end-to-end-tests:
109
+ runs-on: ubuntu-latest
110
+ if: github.event_name == 'schedule'
111
+ defaults:
112
+ run:
113
+ shell: bash -l {0}
114
+ steps:
115
+ - name: Checkout Repository
116
+ uses: actions/checkout@v4
117
+ - name: Set up Python
118
+ uses: actions/setup-python@v5
119
+ with:
120
+ python-version: '3.10'
121
+ cache: 'pip'
122
+ - name: Pip install
123
+ working-directory: ./dev
124
+ run: pip install -r requirements.txt
125
+ - name: Pip list
126
+ run: pip list
127
+ - name: Install Playwright
128
+ run: playwright install --with-deps
129
+ - name: Run E2E Tests
130
+ env:
131
+ SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
132
+ SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
133
+ SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
134
+ run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
@@ -0,0 +1,196 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ results/
6
+ .vscode
7
+ *.csv
8
+ # C extensions
9
+ *.so
10
+ # Distribution / packaging
11
+ .Python
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ wheels/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+ cover/
54
+
55
+ # Translations
56
+ *.mo
57
+ *.pot
58
+
59
+ # Django stuff:
60
+ *.log
61
+ local_settings.py
62
+ db.sqlite3
63
+ db.sqlite3-journal
64
+
65
+ # Flask stuff:
66
+ instance/
67
+ .webassets-cache
68
+
69
+ # Scrapy stuff:
70
+ .scrapy
71
+
72
+ # Sphinx documentation
73
+ docs/_build/
74
+
75
+ # PyBuilder
76
+ .pybuilder/
77
+ target/
78
+
79
+ # Jupyter Notebook
80
+ .ipynb_checkpoints
81
+
82
+ # IPython
83
+ profile_default/
84
+ ipython_config.py
85
+
86
+ # pyenv
87
+ # For a library or package, you might want to ignore these files since the code is
88
+ # intended to run in multiple environments; otherwise, check them in:
89
+ # .python-version
90
+
91
+ # pipenv
92
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
94
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
95
+ # install all needed dependencies.
96
+ #Pipfile.lock
97
+
98
+ # poetry
99
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
100
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
101
+ # commonly ignored for libraries.
102
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
103
+ #poetry.lock
104
+
105
+ # pdm
106
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
107
+ #pdm.lock
108
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
109
+ # in version control.
110
+ # https://pdm.fming.dev/#use-with-ide
111
+ .pdm.toml
112
+
113
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
114
+ __pypackages__/
115
+
116
+ # Celery stuff
117
+ celerybeat-schedule
118
+ celerybeat.pid
119
+
120
+ # SageMath parsed files
121
+ *.sage.py
122
+
123
+ # Environments
124
+ .env
125
+ .venv
126
+ venv/
127
+ env.bak/
128
+ venv.bak/
129
+
130
+ # Spyder project settings
131
+ .spyderproject
132
+ .spyproject
133
+
134
+ # Rope project settings
135
+ .ropeproject
136
+
137
+ # mkdocs documentation
138
+ /site
139
+
140
+ # mypy
141
+ .mypy_cache/
142
+ .dmypy.json
143
+ dmypy.json
144
+
145
+ # Pyre type checker
146
+ .pyre/
147
+
148
+ # pytype static type analyzer
149
+ .pytype/
150
+
151
+ # Cython debug symbols
152
+ cython_debug/
153
+
154
+ # PyCharm
155
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
156
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
157
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
158
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
159
+ #.idea/
160
+
161
+ # MacOS
162
+ **/.DS_Store
163
+
164
+ .vscode
165
+ allowed_selenium.json
166
+
167
+ # Torchtune
168
+ finetuning/torchtune
169
+
170
+ # PyLLMD repo for finetuning
171
+ pyllmd_tune/research-pyllmd/
172
+ pyllmd_tune/data/
173
+
174
+
175
+ datasets/*
176
+ _sandbox.py
177
+ node_modules/
178
+ /test-results/
179
+ /playwright-report/
180
+ /blob-report/
181
+ /playwright/.cache/
182
+ /test-results/
183
+ /playwright-report/
184
+ /blob-report/
185
+ /playwright/.cache/
186
+
187
+
188
+ results/
189
+
190
+ # personal (optimass)
191
+ ICML_deadline/
192
+ mass_utils/
193
+ pyllmd_tune/
194
+
195
+ # don't ignore the miniwob_tasks_all.csv file
196
+ !miniwob_tasks_all.csv
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: browsergym-workarena
3
- Version: 0.3.1
3
+ Version: 0.3.2
4
4
  Summary: WorkArena benchmark for BrowserGym
5
5
  Project-URL: homepage, https://github.com/ServiceNow/WorkArena
6
6
  Author: Léo Boisvert, Alex Drouin, Maxime Gasse, Alex Lacoste, Manuel Del Verme, Megh Thakkar
@@ -22,9 +22,14 @@ Requires-Dist: tenacity>=8.2.3
22
22
  Requires-Dist: tqdm>=4.66.2
23
23
  Description-Content-Type: text/markdown
24
24
 
25
- # WorkArena: How Capable are Web Agents at Solving Common Knowledge Work Tasks?
25
+ # WorkArena: A Benchmark for Evaluating Agents on Knowledge Work Tasks
26
+ [[Benchmark Contents]](#benchmark-contents) ♦ [[Getting Started]](#getting-started) ♦ [[Live Demo]](#live-demo) ♦ [[BrowserGym]](https://github.com/ServiceNow/BrowserGym) ♦ [[Citing This Work]](#citing-this-work)
26
27
 
27
- [[Paper]](https://arxiv.org/abs/2403.07718) ♦ [[Benchmark Contents]](#benchmark-contents) ♦ [[Getting Started]](#getting-started) ♦ [[Live Demo]](#live-demo) ♦ [[BrowserGym]](https://github.com/ServiceNow/BrowserGym) ♦ [[Citing This Work]](#citing-this-work)
28
+ ### Papers
29
+ * [ICML 2024] WorkArena: How Capable are Web Agents at Solving Common Knowledge Work Tasks? [[Paper]](https://arxiv.org/abs/2403.07718)
30
+
31
+ * WorkArena++: Towards Compositional Planning and Reasoning-based Common Knowledge Work Tasks [[Paper]](https://arxiv.org/abs/2407.05291)
32
+
28
33
 
29
34
  `WorkArena` is a suite of browser-based tasks tailored to gauge web agents' effectiveness in supporting routine tasks for knowledge workers.
30
35
  By harnessing the ubiquitous [ServiceNow](https://www.servicenow.com/what-is-servicenow.html) platform, this benchmark will be instrumental in assessing the widespread state of such automations in modern knowledge work environments.
@@ -34,9 +39,47 @@ WorkArena is included in [BrowserGym](https://github.com/ServiceNow/BrowserGym),
34
39
 
35
40
  https://github.com/ServiceNow/WorkArena/assets/2374980/68640f09-7d6f-4eb1-b556-c294a6afef70
36
41
 
42
+ ## Getting Started
43
+
44
+ To set up WorkArena, you will need to get your own ServiceNow instance, install our Python package, and upload some data to your instance. Follow the steps below to achieve this.
45
+
46
+ ### a) Create a ServiceNow Developer Instance
47
+
48
+ 1. Go to https://developer.servicenow.com/ and create an account.
49
+ 2. Click on `Request an instance` and select the `Washington` release (initializing the instance will take a few minutes)
50
+ 3. Once the instance is ready, you should see your instance URL and credentials. If not, click _Return to the Developer Portal_, then navigate to _Manage instance password_ and click _Reset instance password_.
51
+ 4. You should now see your URL and credentials. Based on this information, set the following environment variables:
52
+ * `SNOW_INSTANCE_URL`: The URL of your ServiceNow developer instance
53
+ * `SNOW_INSTANCE_UNAME`: The username, should be "admin"
54
+ * `SNOW_INSTANCE_PWD`: The password, make sure you place the value in single quotes '' and be mindful of [escaping special shell characters](https://onlinelinuxtools.com/escape-shell-characters). Running `echo $SNOW_INSTANCE_PWD` should print the correct password.
55
+ 5. Log into your instance via a browser using the admin credentials. Close any popup that appears on the main screen (e.g., agreeing to analytics).
56
+
57
+ **Warning:** Feel free to look around the platform, but please make sure you revert any changes (e.g., changes to list views, pinning some menus, etc.) as these changes will be persistent and affect the benchmarking process.
58
+
59
+ ### b) Install WorkArena and Initialize your Instance
60
+
61
+ Run the following command to install WorkArena in the [BrowserGym](https://github.com/servicenow/browsergym) environment:
62
+ ```
63
+ pip install browsergym
64
+ ```
65
+
66
+ Then, install [Playwright](https://github.com/microsoft/playwright):
67
+ ```
68
+ playwright install
69
+ ```
70
+
71
+ Finally, run this command in a terminal to upload the benchmark data to your ServiceNow instance:
72
+ ```
73
+ workarena-install
74
+ ```
75
+ Your installation is now complete! 🎉
76
+
77
+
37
78
  ## Benchmark Contents
38
79
 
39
- At the moment, WorkArena includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface. The following videos show an agent built on `GPT-4-vision` interacting with every such component. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
80
+ At the moment, WorkArena-L1 includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface, otherwise referred to as "atomic" tasks. WorkArena++ contains 682 tasks, each one sampling among thousands of potential configurations. WorkArena++ uses the atomic components presented in WorkArena, and composes them into real-world use cases evaluating planning, reasoning, and memorizing abilities of agents.
81
+
82
+ The following videos show an agent built on `GPT-4-vision` interacting with every atomic component of the benchmark. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
40
83
 
41
84
  ### Knowledge Bases
42
85
 
@@ -80,7 +123,6 @@ https://github.com/ServiceNow/WorkArena/assets/1726818/ca26dfaf-2358-4418-855f-8
80
123
 
81
124
  https://github.com/ServiceNow/WorkArena/assets/1726818/0023232c-081f-4be4-99bd-f60c766e6c3f
82
125
 
83
-
84
126
  ## Getting Started
85
127
 
86
128
  To set up WorkArena, you will need to get your own ServiceNow instance, install our Python package, and upload some data to your instance. Follow the steps below to achieve this.
@@ -93,7 +135,7 @@ To setup WorkArena, you will need to get your own ServiceNow instance, install o
93
135
  4. You should now see your URL and credentials. Based on this information, set the following environment variables:
94
136
  * `SNOW_INSTANCE_URL`: The URL of your ServiceNow developer instance
95
137
  * `SNOW_INSTANCE_UNAME`: The username, should be "admin"
96
- * `SNOW_INSTANCE_PWD`: The password, make sure you place the value in quotes "" and be mindful of [escaping special shell characters](https://onlinelinuxtools.com/escape-shell-characters). Running `echo $SNOW_INSTANCE_PWD` should print the correct password.
138
+ * `SNOW_INSTANCE_PWD`: The password, make sure you place the value in single quotes '' and be mindful of [escaping special shell characters](https://onlinelinuxtools.com/escape-shell-characters). Running `echo $SNOW_INSTANCE_PWD` should print the correct password.
97
139
  5. Log into your instance via a browser using the admin credentials. Close any popup that appears on the main screen (e.g., agreeing to analytics).
98
140
 
99
141
  **Warning:** Feel free to look around the platform, but please make sure you revert any changes (e.g., changes to list views, pinning some menus, etc.) as these changes will be persistent and affect the benchmarking process.
@@ -105,25 +147,24 @@ Run the following command to install WorkArena in the [BrowswerGym](https://gith
105
147
  pip install browsergym-workarena
106
148
  ```
107
149
 
108
- Then, run this command in a terminal to upload the benchmark data to your ServiceNow instance:
150
+ Then, install [Playwright](https://github.com/microsoft/playwright):
109
151
  ```
110
- workarena-install
152
+ playwright install
111
153
  ```
112
154
 
113
- Finally, install [Playwright](https://github.com/microsoft/playwright):
155
+ Finally, run this command in a terminal to upload the benchmark data to your ServiceNow instance:
114
156
  ```
115
- playwright install
157
+ workarena-install
116
158
  ```
117
-
118
159
  Your installation is now complete! 🎉
119
160
 
120
-
121
161
  ## Live Demo
122
162
 
123
163
  Run this code to see WorkArena in action.
124
164
 
125
165
  Note: the following example executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
126
166
 
167
+ - To run a demo of WorkArena-L1 (ICML 2024) tasks using BrowserGym, use the following script:
127
168
  ```python
128
169
  import random
129
170
 
@@ -165,9 +206,55 @@ for task in ALL_WORKARENA_TASKS:
165
206
  ```
166
207
 
167
208
 
209
+
210
+ - To run a demo of WorkArena-L2 (WorkArena++) tasks using BrowserGym, use the following script. Change the filter on line 6 to `l3` to sample L3 tasks.
211
+
212
+ ```python
213
+ import random
214
+
215
+ from browsergym.core.env import BrowserEnv
216
+ from browsergym.workarena import get_all_tasks_agents
217
+
218
+ AGENT_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2")
219
+
220
+ AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS = [sampled_set[0] for sampled_set in AGENT_L2_SAMPLED_SET], [
221
+ sampled_set[1] for sampled_set in AGENT_L2_SAMPLED_SET
222
+ ]
223
+ from time import sleep
224
+
225
+ for (task, seed) in zip(AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS):
226
+ print("Task:", task)
227
+
228
+ # Instantiate a new environment
229
+ env = BrowserEnv(task_entrypoint=task,
230
+ headless=False)
231
+ env.reset()
232
+
233
+ # Cheat functions use Playwright to automatically solve the task
234
+ env.chat.add_message(role="assistant", msg="On it. Please wait...")
235
+
236
+ for i in range(len(env.task)):
237
+ sleep(1)
238
+ env.task.cheat(page=env.page, chat_messages=env.chat.messages, subtask_idx=i)
239
+ sleep(1)
240
+ reward, done, message, info = env.task.validate(page=env.page, chat_messages=env.chat.messages)
241
+
242
+ if reward == 1:
243
+ env.chat.add_message(role="user", msg="Yes, that works. Thanks!")
244
+ else:
245
+ env.chat.add_message(role="user", msg=f"No, that doesn't work. {info.get('message', '')}")
246
+
247
+ sleep(3)
248
+ env.close()
249
+ ```
250
+
251
+ Note: the example above executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
252
+
168
253
  ## Citing This Work
169
254
 
170
255
  Please use the following BibTeX to cite our work:
256
+
257
+ ### WorkArena
171
258
  ```
172
259
  @misc{workarena2024,
173
260
  title={WorkArena: How Capable Are Web Agents at Solving Common Knowledge Work Tasks?},
@@ -178,3 +265,15 @@ Please use the following BibTeX to cite our work:
178
265
  primaryClass={cs.LG}
179
266
  }
180
267
  ```
268
+ ### WorkArena++
269
+ ```
270
+ @misc{boisvert2024workarenacompositionalplanningreasoningbased,
271
+ title={WorkArena++: Towards Compositional Planning and Reasoning-based Common Knowledge Work Tasks},
272
+ author={Léo Boisvert and Megh Thakkar and Maxime Gasse and Massimo Caccia and Thibault Le Sellier De Chezelles and Quentin Cappart and Nicolas Chapados and Alexandre Lacoste and Alexandre Drouin},
273
+ year={2024},
274
+ eprint={2407.05291},
275
+ archivePrefix={arXiv},
276
+ primaryClass={cs.AI},
277
+ url={https://arxiv.org/abs/2407.05291},
278
+ }
279
+ ```
@@ -1,6 +1,11 @@
1
- # WorkArena: How Capable are Web Agents at Solving Common Knowledge Work Tasks?
1
+ # WorkArena: A Benchmark for Evaluating Agents on Knowledge Work Tasks
2
+ [[Benchmark Contents]](#benchmark-contents) ♦ [[Getting Started]](#getting-started) ♦ [[Live Demo]](#live-demo) ♦ [[BrowserGym]](https://github.com/ServiceNow/BrowserGym) ♦ [[Citing This Work]](#citing-this-work)
2
3
 
3
- [[Paper]](https://arxiv.org/abs/2403.07718) ♦ [[Benchmark Contents]](#benchmark-contents) ♦ [[Getting Started]](#getting-started) ♦ [[Live Demo]](#live-demo) ♦ [[BrowserGym]](https://github.com/ServiceNow/BrowserGym) ♦ [[Citing This Work]](#citing-this-work)
4
+ ### Papers
5
+ * [ICML 2024] WorkArena: How Capable are Web Agents at Solving Common Knowledge Work Tasks? [[Paper]](https://arxiv.org/abs/2403.07718)
6
+
7
+ * WorkArena++: Towards Compositional Planning and Reasoning-based Common Knowledge Work Tasks [[Paper]](https://arxiv.org/abs/2407.05291)
8
+
4
9
 
5
10
  `WorkArena` is a suite of browser-based tasks tailored to gauge web agents' effectiveness in supporting routine tasks for knowledge workers.
6
11
  By harnessing the ubiquitous [ServiceNow](https://www.servicenow.com/what-is-servicenow.html) platform, this benchmark will be instrumental in assessing the widespread state of such automations in modern knowledge work environments.
@@ -10,9 +15,47 @@ WorkArena is included in [BrowserGym](https://github.com/ServiceNow/BrowserGym),
10
15
 
11
16
  https://github.com/ServiceNow/WorkArena/assets/2374980/68640f09-7d6f-4eb1-b556-c294a6afef70
12
17
 
18
+ ## Getting Started
19
+
20
+ To set up WorkArena, you will need to get your own ServiceNow instance, install our Python package, and upload some data to your instance. Follow the steps below to achieve this.
21
+
22
+ ### a) Create a ServiceNow Developer Instance
23
+
24
+ 1. Go to https://developer.servicenow.com/ and create an account.
25
+ 2. Click on `Request an instance` and select the `Washington` release (initializing the instance will take a few minutes)
26
+ 3. Once the instance is ready, you should see your instance URL and credentials. If not, click _Return to the Developer Portal_, then navigate to _Manage instance password_ and click _Reset instance password_.
27
+ 4. You should now see your URL and credentials. Based on this information, set the following environment variables:
28
+ * `SNOW_INSTANCE_URL`: The URL of your ServiceNow developer instance
29
+ * `SNOW_INSTANCE_UNAME`: The username, should be "admin"
30
+ * `SNOW_INSTANCE_PWD`: The password, make sure you place the value in quotes "" and be mindful of [escaping special shell characters](https://onlinelinuxtools.com/escape-shell-characters). Running `echo $SNOW_INSTANCE_PWD` should print the correct password.
31
+ 5. Log into your instance via a browser using the admin credentials. Close any popup that appears on the main screen (e.g., agreeing to analytics).
32
+
33
+ **Warning:** Feel free to look around the platform, but please make sure you revert any changes (e.g., changes to list views, pinning some menus, etc.) as these changes will be persistent and affect the benchmarking process.
34
+
35
+ ### b) Install WorkArena and Initialize your Instance
36
+
37
+ Run the following command to install WorkArena in the [BrowserGym](https://github.com/servicenow/browsergym) environment:
38
+ ```
39
+ pip install browsergym
40
+ ```
41
+
42
+ Then, install [Playwright](https://github.com/microsoft/playwright):
43
+ ```
44
+ playwright install
45
+ ```
46
+
47
+ Finally, run this command in a terminal to upload the benchmark data to your ServiceNow instance:
48
+ ```
49
+ workarena-install
50
+ ```
51
+ Your installation is now complete! 🎉
52
+
53
+
13
54
  ## Benchmark Contents
14
55
 
15
- At the moment, WorkArena includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface. The following videos show an agent built on `GPT-4-vision` interacting with every such component. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
56
+ At the moment, WorkArena-L1 includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface, otherwise referred to as "atomic" tasks. WorkArena++ contains 682 tasks, each one sampling among thousands of potential configurations. WorkArena++ uses the atomic components presented in WorkArena, and composes them into real-world use cases evaluating planning, reasoning, and memorizing abilities of agents.
57
+
58
+ The following videos show an agent built on `GPT-4-vision` interacting with every atomic component of the benchmark. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
16
59
 
17
60
  ### Knowledge Bases
18
61
 
@@ -56,7 +99,6 @@ https://github.com/ServiceNow/WorkArena/assets/1726818/ca26dfaf-2358-4418-855f-8
56
99
 
57
100
  https://github.com/ServiceNow/WorkArena/assets/1726818/0023232c-081f-4be4-99bd-f60c766e6c3f
58
101
 
59
-
60
102
  ## Getting Started
61
103
 
62
104
  To set up WorkArena, you will need to get your own ServiceNow instance, install our Python package, and upload some data to your instance. Follow the steps below to achieve this.
@@ -69,7 +111,7 @@ To setup WorkArena, you will need to get your own ServiceNow instance, install o
69
111
  4. You should now see your URL and credentials. Based on this information, set the following environment variables:
70
112
  * `SNOW_INSTANCE_URL`: The URL of your ServiceNow developer instance
71
113
  * `SNOW_INSTANCE_UNAME`: The username, should be "admin"
72
- * `SNOW_INSTANCE_PWD`: The password, make sure you place the value in quotes "" and be mindful of [escaping special shell characters](https://onlinelinuxtools.com/escape-shell-characters). Running `echo $SNOW_INSTANCE_PWD` should print the correct password.
114
+ * `SNOW_INSTANCE_PWD`: The password, make sure you place the value in single quotes '' and be mindful of [escaping special shell characters](https://onlinelinuxtools.com/escape-shell-characters). Running `echo $SNOW_INSTANCE_PWD` should print the correct password.
73
115
  5. Log into your instance via a browser using the admin credentials. Close any popup that appears on the main screen (e.g., agreeing to analytics).
74
116
 
75
117
  **Warning:** Feel free to look around the platform, but please make sure you revert any changes (e.g., changes to list views, pinning some menus, etc.) as these changes will be persistent and affect the benchmarking process.
@@ -81,25 +123,24 @@ Run the following command to install WorkArena in the [BrowswerGym](https://gith
81
123
  pip install browsergym-workarena
82
124
  ```
83
125
 
84
- Then, run this command in a terminal to upload the benchmark data to your ServiceNow instance:
126
+ Then, install [Playwright](https://github.com/microsoft/playwright):
85
127
  ```
86
- workarena-install
128
+ playwright install
87
129
  ```
88
130
 
89
- Finally, install [Playwright](https://github.com/microsoft/playwright):
131
+ Finally, run this command in a terminal to upload the benchmark data to your ServiceNow instance:
90
132
  ```
91
- playwright install
133
+ workarena-install
92
134
  ```
93
-
94
135
  Your installation is now complete! 🎉
95
136
 
96
-
97
137
  ## Live Demo
98
138
 
99
139
  Run this code to see WorkArena in action.
100
140
 
101
141
  Note: the following example executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
102
142
 
143
+ - To run a demo of WorkArena-L1 (ICML 2024) tasks using BrowserGym, use the following script:
103
144
  ```python
104
145
  import random
105
146
 
@@ -141,9 +182,55 @@ for task in ALL_WORKARENA_TASKS:
141
182
  ```
142
183
 
143
184
 
185
+
186
+ - To run a demo of WorkArena-L2 (WorkArena++) tasks using BrowserGym, use the following script. Change the filter on line 6 to `l3` to sample L3 tasks.
187
+
188
+ ```python
189
+ import random
190
+
191
+ from browsergym.core.env import BrowserEnv
192
+ from browsergym.workarena import get_all_tasks_agents
193
+
194
+ AGENT_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2")
195
+
196
+ AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS = [sampled_set[0] for sampled_set in AGENT_L2_SAMPLED_SET], [
197
+ sampled_set[1] for sampled_set in AGENT_L2_SAMPLED_SET
198
+ ]
199
+ from time import sleep
200
+
201
+ for (task, seed) in zip(AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS):
202
+ print("Task:", task)
203
+
204
+ # Instantiate a new environment
205
+ env = BrowserEnv(task_entrypoint=task,
206
+ headless=False)
207
+ env.reset()
208
+
209
+ # Cheat functions use Playwright to automatically solve the task
210
+ env.chat.add_message(role="assistant", msg="On it. Please wait...")
211
+
212
+ for i in range(len(env.task)):
213
+ sleep(1)
214
+ env.task.cheat(page=env.page, chat_messages=env.chat.messages, subtask_idx=i)
215
+ sleep(1)
216
+ reward, done, message, info = env.task.validate(page=env.page, chat_messages=env.chat.messages)
217
+
218
+ if reward == 1:
219
+ env.chat.add_message(role="user", msg="Yes, that works. Thanks!")
220
+ else:
221
+ env.chat.add_message(role="user", msg=f"No, that doesn't work. {info.get('message', '')}")
222
+
223
+ sleep(3)
224
+ env.close()
225
+ ```
226
+
227
+ Note: the example above executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
228
+
144
229
  ## Citing This Work
145
230
 
146
231
  Please use the following BibTeX to cite our work:
232
+
233
+ ### WorkArena
147
234
  ```
148
235
  @misc{workarena2024,
149
236
  title={WorkArena: How Capable Are Web Agents at Solving Common Knowledge Work Tasks?},
@@ -154,3 +241,15 @@ Please use the following BibTeX to cite our work:
154
241
  primaryClass={cs.LG}
155
242
  }
156
243
  ```
244
+ ### WorkArena++
245
+ ```
246
+ @misc{boisvert2024workarenacompositionalplanningreasoningbased,
247
+ title={WorkArena++: Towards Compositional Planning and Reasoning-based Common Knowledge Work Tasks},
248
+ author={Léo Boisvert and Megh Thakkar and Maxime Gasse and Massimo Caccia and Thibault Le Sellier De Chezelles and Quentin Cappart and Nicolas Chapados and Alexandre Lacoste and Alexandre Drouin},
249
+ year={2024},
250
+ eprint={2407.05291},
251
+ archivePrefix={arXiv},
252
+ primaryClass={cs.AI},
253
+ url={https://arxiv.org/abs/2407.05291},
254
+ }
255
+ ```
@@ -0,0 +1,32 @@
1
+ {
2
+ "@context": {
3
+ "dcat": "http://www.w3.org/ns/dcat#"
4
+ ,
5
+ "dct": "http://purl.org/dc/terms/"
6
+ ,
7
+ "foaf": "http://xmlns.com/foaf/0.1/"
8
+ },
9
+ "@type": "dcat:Dataset",
10
+ "dct:title": "WorkArena++",
11
+ "dct:description": "Benchmark to evaluate the reasoning, retrieval, planning, and decision making abilities of LLM and VLM-based agents",
12
+ "dct:identifier": "https://github.com/ServiceNow/WorkArena/tree/workarena-plus-plus"
13
+ ,
14
+ "dct:issued": "2024-06-12",
15
+ "dct:modified": "2024-06-12",
16
+ "dct:publisher": {
17
+ "@type": "foaf:Organization",
18
+ "foaf:name": "ServiceNow Research"
19
+ },
20
+ "dcat:contactPoint": {
21
+ "@type": "vcard:Contact",
22
+ "vcard:fn": "Alexandre Drouin",
23
+ "vcard:hasEmail": "mailto:alexandre.drouin@servicenow.com"
24
+ },
25
+ "dcat:distribution": [
26
+ {
27
+ "@type": "dcat:Distribution",
28
+ "dct:format": "text/csv",
29
+ "dcat:accessURL": "https://github.com/ServiceNow/WorkArena/tree/workarena-plus-plus/src/browsergym/workarena/tasks/compositional"
30
+ }
31
+ ]
32
+ }