browsergym-workarena 0.2.1__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168)
  1. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/PKG-INFO +19 -18
  2. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/README.md +17 -16
  3. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/pyproject.toml +1 -0
  4. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/requirements.txt +1 -1
  5. browsergym_workarena-0.3.0/scripts/extract_finetuning_traces.py +131 -0
  6. browsergym_workarena-0.3.0/scripts/generate_knowledge_base.ipynb +1499 -0
  7. browsergym_workarena-0.3.0/scripts/make_human_eval_curriculum.py +54 -0
  8. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/__init__.py +13 -1
  9. browsergym_workarena-0.3.0/src/browsergym/workarena/api/category.py +74 -0
  10. browsergym_workarena-0.3.0/src/browsergym/workarena/api/change_request.py +87 -0
  11. browsergym_workarena-0.3.0/src/browsergym/workarena/api/computer_asset.py +90 -0
  12. browsergym_workarena-0.3.0/src/browsergym/workarena/api/cost_center.py +19 -0
  13. browsergym_workarena-0.3.0/src/browsergym/workarena/api/expense_line.py +89 -0
  14. browsergym_workarena-0.3.0/src/browsergym/workarena/api/incident.py +45 -0
  15. browsergym_workarena-0.3.0/src/browsergym/workarena/api/knowledge.py +29 -0
  16. browsergym_workarena-0.3.0/src/browsergym/workarena/api/problem.py +90 -0
  17. browsergym_workarena-0.3.0/src/browsergym/workarena/api/report.py +183 -0
  18. browsergym_workarena-0.3.0/src/browsergym/workarena/api/requested_items.py +63 -0
  19. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/api/user.py +11 -8
  20. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/api/utils.py +47 -3
  21. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/config.py +21 -1
  22. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/setup_files/forms/expected_incident_form_fields.json +1 -1
  23. browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/forms/expected_request_item_form_fields.json +1 -0
  24. browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/knowledge/protocols.json +46 -0
  25. browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/knowledge/test.html +1 -0
  26. browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/lists/expected_asset_list_columns.json +12 -0
  27. browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/lists/expected_change_request_list_columns.json +12 -0
  28. browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/lists/expected_expense_line_list_columns.json +12 -0
  29. browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/lists/expected_hardware_list_columns.json +12 -0
  30. browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/lists/expected_incident_list_columns.json +12 -0
  31. browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/lists/expected_problem_list_columns.json +12 -0
  32. browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/lists/expected_requested_items_list_columns.json +12 -0
  33. browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/lists/expected_service_catalog_list_columns.json +12 -0
  34. browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/setup_files/lists/expected_user_list_columns.json +12 -0
  35. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/all_menu.json +1 -1
  36. browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/task_configs/dashboard_retrieval_minmax_task.json +1 -0
  37. browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/task_configs/dashboard_retrieval_value_task.json +1 -0
  38. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/filter_service_catalog_item_list_task.json +1 -1
  39. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/impersonation_users.json +1 -1
  40. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/report_retrieval_minmax_task.json +1 -1
  41. browsergym_workarena-0.3.0/src/browsergym/workarena/data_files/task_configs/report_retrieval_value_task.json +1 -0
  42. browsergym_workarena-0.3.0/src/browsergym/workarena/human_eval/console.js +176 -0
  43. browsergym_workarena-0.3.0/src/browsergym/workarena/human_eval/tool.py +366 -0
  44. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/install.py +81 -20
  45. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/base.py +55 -20
  46. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/comp_building_block.py +4 -0
  47. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/__init__.py +76 -0
  48. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/base.py +364 -0
  49. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/dash_do_base.py +1366 -0
  50. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/dash_do_catalog.py +1127 -0
  51. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/dash_do_catalog_infeasible.py +2047 -0
  52. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/dash_do_create_incident.py +403 -0
  53. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/dash_do_create_incident_infeasible.py +278 -0
  54. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/dash_do_create_problem.py +336 -0
  55. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/dash_do_create_problem_infeasible.py +235 -0
  56. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/dash_do_filter.py +1600 -0
  57. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/dash_do_request_item.py +1315 -0
  58. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/dash_do_request_item_infeasible.py +693 -0
  59. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/delete_record.py +341 -0
  60. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/edit_knowledge_base.py +457 -0
  61. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/expense_management.py +598 -0
  62. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/filter_and_do.py +139 -0
  63. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/find_and_order_item.py +345 -0
  64. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/manage_change_request_schedule.py +1417 -0
  65. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/mark_duplicate_problems.py +499 -0
  66. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/maximize_investment_return.py +1763 -0
  67. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/navigate_and_do.py +1151 -0
  68. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/navigate_and_do_infeasible.py +2100 -0
  69. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/offboard_user.py +207 -0
  70. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/onboard_user.py +226 -0
  71. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/update_task.py +145 -0
  72. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/utils/curriculum.py +215 -0
  73. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/utils/infeasible_configs.py +151 -0
  74. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/utils/knapsack.py +192 -0
  75. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/warranty_check.py +227 -0
  76. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/work_assignment.py +804 -0
  77. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/compositional/workload_balancing.py +396 -0
  78. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/dashboard.py +188 -8
  79. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/form.py +1593 -0
  80. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/knowledge.py +359 -0
  81. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/list.py +519 -102
  82. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/mark_duplicate_problem.py +171 -0
  83. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/navigation.py +55 -13
  84. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/scripts/extract_all_menu_items.py +9 -2
  85. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/scripts/generate_dashboard_configs.py +6 -5
  86. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/scripts/service_catalog.py +2 -1
  87. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/scripts/validate.py +8 -2
  88. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/send_chat_message.py +90 -0
  89. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/service_catalog.py +94 -26
  90. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/utils/form.py +1 -4
  91. browsergym_workarena-0.3.0/src/browsergym/workarena/tasks/utils/private_tasks.py +63 -0
  92. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/utils/utils.py +13 -0
  93. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/tests/test_api.py +1 -0
  94. browsergym_workarena-0.3.0/tests/test_compositional_utils.py +92 -0
  95. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/tests/test_random_config_generation.py +24 -23
  96. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/tests/test_task_from_config.py +37 -3
  97. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/tests/test_task_general.py +4 -9
  98. browsergym_workarena-0.2.1/scripts/generate_knowledge_base.ipynb +0 -1374
  99. browsergym_workarena-0.2.1/src/browsergym/workarena/data_files/setup_files/lists/expected_asset_list_columns.json +0 -34
  100. browsergym_workarena-0.2.1/src/browsergym/workarena/data_files/setup_files/lists/expected_change_request_list_columns.json +0 -48
  101. browsergym_workarena-0.2.1/src/browsergym/workarena/data_files/setup_files/lists/expected_hardware_list_columns.json +0 -53
  102. browsergym_workarena-0.2.1/src/browsergym/workarena/data_files/setup_files/lists/expected_incident_list_columns.json +0 -28
  103. browsergym_workarena-0.2.1/src/browsergym/workarena/data_files/setup_files/lists/expected_service_catalog_list_columns.json +0 -29
  104. browsergym_workarena-0.2.1/src/browsergym/workarena/data_files/setup_files/lists/expected_user_list_columns.json +0 -59
  105. browsergym_workarena-0.2.1/src/browsergym/workarena/data_files/task_configs/dashboard_retrieval_minmax_task.json +0 -1
  106. browsergym_workarena-0.2.1/src/browsergym/workarena/data_files/task_configs/dashboard_retrieval_value_task.json +0 -1
  107. browsergym_workarena-0.2.1/src/browsergym/workarena/data_files/task_configs/report_retrieval_value_task.json +0 -1
  108. browsergym_workarena-0.2.1/src/browsergym/workarena/tasks/form.py +0 -801
  109. browsergym_workarena-0.2.1/src/browsergym/workarena/tasks/knowledge.py +0 -168
  110. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/.github/workflows/pypi.yml +0 -0
  111. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/.github/workflows/unit_tests.yml +0 -0
  112. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/.gitignore +0 -0
  113. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/LICENSE +0 -0
  114. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/dev/environment.yaml +0 -0
  115. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/dev/requirements.txt +0 -0
  116. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/api/__init__.py +0 -0
  117. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/api/requests.py +0 -0
  118. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/api/ui_themes.py +0 -0
  119. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/setup_files/forms/expected_change_request_form_fields.json +0 -0
  120. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/setup_files/forms/expected_hardware_form_fields.json +0 -0
  121. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/setup_files/forms/expected_problem_form_fields.json +0 -0
  122. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/setup_files/forms/expected_user_form_fields.json +0 -0
  123. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/setup_files/knowledge/kb_autopublish_workflow.xml +0 -0
  124. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/setup_files/knowledge/knowledge_base.json +0 -0
  125. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/setup_files/ui_themes/workarena_themes.xml +0 -0
  126. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/create_change_request_task.json +0 -0
  127. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/create_hardware_asset_task.json +0 -0
  128. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/create_incident_task.json +0 -0
  129. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/create_problem_task.json +0 -0
  130. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/create_user_task.json +0 -0
  131. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/filter_asset_list_task.json +0 -0
  132. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/filter_change_request_list_task.json +0 -0
  133. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/filter_hardware_list_task.json +0 -0
  134. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/filter_incident_list_task.json +0 -0
  135. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/filter_user_list_task.json +0 -0
  136. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/knowledge_base_configs.json +0 -0
  137. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/order_apple_mac_book_pro15_task.json +0 -0
  138. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/order_apple_watch_task.json +0 -0
  139. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/order_developer_laptop_task.json +0 -0
  140. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/order_development_laptop_pc_task.json +0 -0
  141. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/order_ipad_mini_task.json +0 -0
  142. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/order_ipad_pro_task.json +0 -0
  143. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/order_loaner_laptop_task.json +0 -0
  144. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/order_sales_laptop_task.json +0 -0
  145. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/order_standard_laptop_task.json +0 -0
  146. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/sort_asset_list_task.json +0 -0
  147. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/sort_change_request_list_task.json +0 -0
  148. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/sort_hardware_list_task.json +0 -0
  149. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/sort_incident_list_task.json +0 -0
  150. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/sort_service_catalog_item_list_task.json +0 -0
  151. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/data_files/task_configs/sort_user_list_task.json +0 -0
  152. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/instance.py +0 -0
  153. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/__init__.py +0 -0
  154. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/scripts/README.md +0 -0
  155. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/scripts/generate_forms.py +0 -0
  156. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/scripts/knowledge.py +0 -0
  157. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/scripts/list.py +0 -0
  158. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/scripts/navigation.py +0 -0
  159. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/utils/__init__.py +0 -0
  160. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/utils/debug.py +0 -0
  161. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/utils/js_utils.js +0 -0
  162. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/tasks/utils/string.py +0 -0
  163. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/src/browsergym/workarena/utils.py +0 -0
  164. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/tests/test_snow_instance.py +0 -0
  165. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/tests/test_task_setup.py +0 -0
  166. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/tests/test_utils.py +0 -0
  167. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/tests/test_validate.py +0 -0
  168. {browsergym_workarena-0.2.1 → browsergym_workarena-0.3.0}/tests/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: browsergym-workarena
3
- Version: 0.2.1
3
+ Version: 0.3.0
4
4
  Summary: WorkArena benchmark for BrowserGym
5
5
  Project-URL: homepage, https://github.com/ServiceNow/WorkArena
6
6
  Author: Léo Boisvert, Alex Drouin, Maxime Gasse, Alex Lacoste, Manuel Del Verme, Megh Thakkar
@@ -15,7 +15,7 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
15
  Requires-Python: >3.7
16
16
  Requires-Dist: browsergym-core>=0.2
17
17
  Requires-Dist: english-words>=2.0.1
18
- Requires-Dist: faker>=24.11.0
18
+ Requires-Dist: faker>=24.8.0
19
19
  Requires-Dist: numpy>=1.14
20
20
  Requires-Dist: requests>=2.31
21
21
  Requires-Dist: tenacity>=8.2.3
@@ -34,12 +34,9 @@ WorkArena is included in [BrowserGym](https://github.com/ServiceNow/BrowserGym),
34
34
 
35
35
  https://github.com/ServiceNow/WorkArena/assets/2374980/68640f09-7d6f-4eb1-b556-c294a6afef70
36
36
 
37
- ## ⚠️ Pre-Release warning ⚠️
38
- Please note that the WorkArena benchmark is still undergoing minor bug fixes and updates, which may cause discrepancies with results reported in our latest arXiv preprint. We plan to release soon a stable version of WorkArena with enhanced stability, and a final version v1.0.0 with a new suite of tasks.
39
-
40
37
  ## Benchmark Contents
41
38
 
42
- At the moment, WorkArena includes `18,050` task instances drawn from `33` tasks that cover the main components of the ServiceNow user interface. The following videos show an agent built on `GPT-4-vision` interacting with every such component. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
39
+ At the moment, WorkArena includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface. The following videos show an agent built on `GPT-4-vision` interacting with every such component. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
43
40
 
44
41
  ### Knowledge Bases
45
42
 
@@ -77,8 +74,11 @@ https://github.com/ServiceNow/WorkArena/assets/1726818/ca26dfaf-2358-4418-855f-8
77
74
 
78
75
  ### Dashboards
79
76
 
80
- **Goal:** The agent must extract information from a dashboard.
77
+ **Goal:** The agent must answer a question that requires reading charts and (optionally) performing simple reasoning over them.
78
+
79
+ *Note: For demonstration purposes, a human is controlling the cursor since this is a pure retrieval task*
81
80
 
81
+ https://github.com/ServiceNow/WorkArena/assets/1726818/0023232c-081f-4be4-99bd-f60c766e6c3f
82
82
 
83
83
 
84
84
  ## Getting Started
@@ -122,6 +122,8 @@ Your installation is now complete! 🎉
122
122
 
123
123
  Run this code to see WorkArena in action.
124
124
 
125
+ Note: the following example executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
126
+
125
127
  ```python
126
128
  import random
127
129
 
@@ -136,28 +138,27 @@ for task in ALL_WORKARENA_TASKS:
136
138
 
137
139
  # Instantiate a new environment
138
140
  env = BrowserEnv(task_entrypoint=task,
139
- headless=False,
140
- slow_mo=1000)
141
+ headless=False)
141
142
  env.reset()
142
143
 
143
144
  # Cheat functions use Playwright to automatically solve the task
144
145
  env.chat.add_message(role="assistant", msg="On it. Please wait...")
145
- env.task.cheat(env.page, env.chat.messages)
146
+ cheat_messages = []
147
+ env.task.cheat(env.page, cheat_messages)
148
+
149
+ # Send cheat messages to chat
150
+ for cheat_msg in cheat_messages:
151
+ env.chat.add_message(role=cheat_msg["role"], msg=cheat_msg["message"])
146
152
 
147
153
  # Post solution to chat
148
- if "KnowledgeBaseSearchTask" in str(task):
149
- answer = env.chat.messages[-1]["message"]
150
- env.chat.add_message(role="assistant", msg=f"The answer is:")
151
- env.chat.add_message(role="assistant", msg=answer)
152
- else:
153
- env.chat.add_message(role="assistant", msg="I'm done!")
154
+ env.chat.add_message(role="assistant", msg="I'm done!")
154
155
 
155
156
  # Validate the solution
156
- reward, stop, info, message = env.task.validate(env.page, env.chat.messages)
157
+ reward, stop, message, info = env.task.validate(env.page, cheat_messages)
157
158
  if reward == 1:
158
159
  env.chat.add_message(role="user", msg="Yes, that works. Thanks!")
159
160
  else:
160
- env.chat.add_message(role="user", msg=f"No, that doesn't work. {message.get('message', '')}")
161
+ env.chat.add_message(role="user", msg=f"No, that doesn't work. {info.get('message', '')}")
161
162
 
162
163
  sleep(3)
163
164
  env.close()
@@ -10,12 +10,9 @@ WorkArena is included in [BrowserGym](https://github.com/ServiceNow/BrowserGym),
10
10
 
11
11
  https://github.com/ServiceNow/WorkArena/assets/2374980/68640f09-7d6f-4eb1-b556-c294a6afef70
12
12
 
13
- ## ⚠️ Pre-Release warning ⚠️
14
- Please note that the WorkArena benchmark is still undergoing minor bug fixes and updates, which may cause discrepancies with results reported in our latest arXiv preprint. We plan to release soon a stable version of WorkArena with enhanced stability, and a final version v1.0.0 with a new suite of tasks.
15
-
16
13
  ## Benchmark Contents
17
14
 
18
- At the moment, WorkArena includes `18,050` task instances drawn from `33` tasks that cover the main components of the ServiceNow user interface. The following videos show an agent built on `GPT-4-vision` interacting with every such component. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
15
+ At the moment, WorkArena includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface. The following videos show an agent built on `GPT-4-vision` interacting with every such component. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
19
16
 
20
17
  ### Knowledge Bases
21
18
 
@@ -53,8 +50,11 @@ https://github.com/ServiceNow/WorkArena/assets/1726818/ca26dfaf-2358-4418-855f-8
53
50
 
54
51
  ### Dashboards
55
52
 
56
- **Goal:** The agent must extract information from a dashboard.
53
+ **Goal:** The agent must answer a question that requires reading charts and (optionally) performing simple reasoning over them.
54
+
55
+ *Note: For demonstration purposes, a human is controlling the cursor since this is a pure retrieval task*
57
56
 
57
+ https://github.com/ServiceNow/WorkArena/assets/1726818/0023232c-081f-4be4-99bd-f60c766e6c3f
58
58
 
59
59
 
60
60
  ## Getting Started
@@ -98,6 +98,8 @@ Your installation is now complete! 🎉
98
98
 
99
99
  Run this code to see WorkArena in action.
100
100
 
101
+ Note: the following example executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.
102
+
101
103
  ```python
102
104
  import random
103
105
 
@@ -112,28 +114,27 @@ for task in ALL_WORKARENA_TASKS:
112
114
 
113
115
  # Instantiate a new environment
114
116
  env = BrowserEnv(task_entrypoint=task,
115
- headless=False,
116
- slow_mo=1000)
117
+ headless=False)
117
118
  env.reset()
118
119
 
119
120
  # Cheat functions use Playwright to automatically solve the task
120
121
  env.chat.add_message(role="assistant", msg="On it. Please wait...")
121
- env.task.cheat(env.page, env.chat.messages)
122
+ cheat_messages = []
123
+ env.task.cheat(env.page, cheat_messages)
124
+
125
+ # Send cheat messages to chat
126
+ for cheat_msg in cheat_messages:
127
+ env.chat.add_message(role=cheat_msg["role"], msg=cheat_msg["message"])
122
128
 
123
129
  # Post solution to chat
124
- if "KnowledgeBaseSearchTask" in str(task):
125
- answer = env.chat.messages[-1]["message"]
126
- env.chat.add_message(role="assistant", msg=f"The answer is:")
127
- env.chat.add_message(role="assistant", msg=answer)
128
- else:
129
- env.chat.add_message(role="assistant", msg="I'm done!")
130
+ env.chat.add_message(role="assistant", msg="I'm done!")
130
131
 
131
132
  # Validate the solution
132
- reward, stop, info, message = env.task.validate(env.page, env.chat.messages)
133
+ reward, stop, message, info = env.task.validate(env.page, cheat_messages)
133
134
  if reward == 1:
134
135
  env.chat.add_message(role="user", msg="Yes, that works. Thanks!")
135
136
  else:
136
- env.chat.add_message(role="user", msg=f"No, that doesn't work. {message.get('message', '')}")
137
+ env.chat.add_message(role="user", msg=f"No, that doesn't work. {info.get('message', '')}")
137
138
 
138
139
  sleep(3)
139
140
  env.close()
@@ -31,6 +31,7 @@ homepage = "https://github.com/ServiceNow/WorkArena"
31
31
 
32
32
  [project.scripts]
33
33
  workarena-install = "browsergym.workarena.install:main"
34
+ workarena-human-eval = "browsergym.workarena.human_eval.tool:main"
34
35
 
35
36
  [tool.hatch.version]
36
37
  path = "src/browsergym/workarena/__init__.py"
@@ -1,6 +1,6 @@
1
1
  browsergym-core>=0.2
2
2
  english-words>=2.0.1
3
- faker>=24.11.0
3
+ Faker>=24.8.0
4
4
  numpy>=1.14
5
5
  requests>=2.31
6
6
  tenacity>=8.2.3 # only used in cheat() -> move to tests?
@@ -0,0 +1,131 @@
1
+ """
2
+ A demonstration of how observation/action traces can be extracted
3
+ for WorkArena tasks without modifying the task code.
4
+
5
+ Author: Alexandre Drouin (alexandre.drouin@servicenow.com)
6
+
7
+ Notes:
8
+ - This approach relies on monkey patching the playwright actions to log the actions and observations.
9
+ It has not been tested for parallel execution. It might work with multiprocessing, but it will for
10
+ sure not work with multithreading.
11
+
12
+ """
13
+
14
+ import importlib
15
+ import logging
16
+ import os
17
+ import pickle
18
+ import playwright.sync_api as playwright_sync
19
+
20
+ from browsergym.core.env import BrowserEnv
21
+ from browsergym.workarena import ALL_WORKARENA_TASKS
22
+ from collections import defaultdict
23
+ from tenacity import retry, stop_after_attempt, wait_fixed
24
+ from time import time
25
+
26
+
27
+ N_PER_TASK = 10
28
+
29
+
30
def monkey_patch_playwright(observation_callback, trace_storage):
    """
    Override the default playwright actions to log the actions and observations.

    Each patched action first appends an entry to `trace_storage` and then
    delegates to the playwright method it replaced.

    Parameters:
    ------------
    observation_callback: callable
        A function that takes no arguments and returns the observation of the
        environment. It is called right before every patched action runs.
    trace_storage: list
        A list to store the trace of the actions and observations.
        Entries are appended in-place as dicts with the keys
        "obs", "action", "args", "kwargs", "bid" and "time".

    """
    import functools

    def wrapper(func, interface):
        # functools.wraps copies the metadata (__name__, __doc__, ...) of the
        # method being replaced onto the wrapper. Without it, the patched
        # attribute is named "wrapped", and wrapping an already patched method
        # would record "wrapped" as the action name.
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            # Get the observation before the action is executed
            obs = observation_callback()

            # Get the BID of the element on which we are acting.
            # args[0] is the instance the patched method was called on.
            if interface.__name__ == "Locator":
                # Resolve the locator to an element handle to read its attribute
                locator = args[0]
                bid = locator.element_handle().evaluate('(el) => el.getAttribute("bid")')
            elif interface.__name__ == "Keyboard":
                # Keyboard actions are not tied to an element
                bid = "keyboard"
            else:
                # NOTE(review): this assumes args[0].evaluate passes an element to
                # the JS function; confirm this holds for Page and Frame.
                bid = args[0].evaluate('(el) => el.getAttribute("bid")')

            logging.info(f"Action: {func.__name__} BID: {bid} -- Args: {args[1:]} {kwargs}")
            trace_storage.append(
                {
                    "obs": obs,
                    "action": func.__name__,
                    "args": args[1:],
                    "kwargs": kwargs,
                    "bid": bid,
                    "time": time(),
                }
            )

            # Resume action
            return func(*args, **kwargs)

        return wrapped

    # Interfaces and actions we want to monkey patch
    importlib.reload(playwright_sync)
    from playwright.sync_api import Page, Frame, Locator, Keyboard, ElementHandle

    # TODO: Make sure the list of interfaces and actions is exhaustive
    #       It covers all that is used in WorkArena cheats as of April 11, 2024
    interfaces = [Page, Frame, Locator, Keyboard, ElementHandle]
    actions = ["click", "select_option", "set_checked", "fill", "press", "type", "down", "up"]

    for interface in interfaces:
        for action in actions:
            # Not every interface exposes every action (e.g. "down" / "up")
            if hasattr(interface, action):
                setattr(interface, action, wrapper(getattr(interface, action), interface))
                print(f"Monkey patched {interface.__name__}.{action}")
93
+
94
+
95
@retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
def extract_trace(task_cls, headless=True):
    """
    Extracts the trace of actions and observations for a given task.

    The task is solved with its own `cheat` function while the patched
    playwright actions record every step. The whole extraction is retried
    up to 3 times, waiting 2 seconds between attempts.

    Parameters:
    ------------
    task_cls: class
        The class of the task to extract the trace from.
    headless: bool
        Forwarded to BrowserEnv as its `headless` argument.

    Returns:
    --------
    trace: list
        The entries recorded by the patched playwright actions during the cheat.

    """
    # Instantiate a new environment
    env = BrowserEnv(task_entrypoint=task_cls, headless=headless, slow_mo=1000)

    try:
        # Setup customized tracing
        trace = []
        monkey_patch_playwright(observation_callback=env._get_obs, trace_storage=trace)

        env.reset()
        env.task.cheat(env.page, env.chat.messages)
    finally:
        # Always release the environment: a failed attempt would otherwise leave
        # it open while the retry decorator builds a fresh one.
        env.close()

    return trace
118
+
119
+
120
if __name__ == "__main__":
    # Output directory for the pickled traces
    os.makedirs("trace_profiling", exist_ok=True)

    # Maps each task class to the list of traces extracted for it
    task_traces = defaultdict(list)
    for task in ALL_WORKARENA_TASKS:
        print("Task:", task)
        for i in range(N_PER_TASK):
            print(f"Extracting trace {i+1}/{N_PER_TASK}")
            trace = extract_trace(task, headless=True)
            task_traces[task].append(trace)

    # Use a context manager so the file is flushed and closed even if pickling fails
    with open("trace_profiling/task_traces.pkl", "wb") as trace_file:
        pickle.dump(task_traces, trace_file)