browsergym-workarena 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. browsergym/workarena/__init__.py +13 -1
  2. browsergym/workarena/api/category.py +74 -0
  3. browsergym/workarena/api/change_request.py +87 -0
  4. browsergym/workarena/api/computer_asset.py +90 -0
  5. browsergym/workarena/api/cost_center.py +19 -0
  6. browsergym/workarena/api/expense_line.py +89 -0
  7. browsergym/workarena/api/incident.py +45 -0
  8. browsergym/workarena/api/knowledge.py +29 -0
  9. browsergym/workarena/api/problem.py +90 -0
  10. browsergym/workarena/api/report.py +183 -0
  11. browsergym/workarena/api/requested_items.py +63 -0
  12. browsergym/workarena/api/user.py +11 -8
  13. browsergym/workarena/api/utils.py +47 -3
  14. browsergym/workarena/config.py +21 -1
  15. browsergym/workarena/data_files/setup_files/forms/expected_incident_form_fields.json +1 -1
  16. browsergym/workarena/data_files/setup_files/forms/expected_request_item_form_fields.json +1 -0
  17. browsergym/workarena/data_files/setup_files/knowledge/protocols.json +46 -0
  18. browsergym/workarena/data_files/setup_files/knowledge/test.html +1 -0
  19. browsergym/workarena/data_files/setup_files/lists/expected_asset_list_columns.json +2 -24
  20. browsergym/workarena/data_files/setup_files/lists/expected_change_request_list_columns.json +4 -40
  21. browsergym/workarena/data_files/setup_files/lists/expected_expense_line_list_columns.json +12 -0
  22. browsergym/workarena/data_files/setup_files/lists/expected_hardware_list_columns.json +1 -42
  23. browsergym/workarena/data_files/setup_files/lists/expected_incident_list_columns.json +2 -18
  24. browsergym/workarena/data_files/setup_files/lists/expected_problem_list_columns.json +12 -0
  25. browsergym/workarena/data_files/setup_files/lists/expected_requested_items_list_columns.json +12 -0
  26. browsergym/workarena/data_files/setup_files/lists/expected_service_catalog_list_columns.json +2 -19
  27. browsergym/workarena/data_files/setup_files/lists/expected_user_list_columns.json +3 -50
  28. browsergym/workarena/data_files/task_configs/all_menu.json +95 -95
  29. browsergym/workarena/data_files/task_configs/dashboard_retrieval_minmax_task.json +1 -1
  30. browsergym/workarena/data_files/task_configs/dashboard_retrieval_value_task.json +1 -1
  31. browsergym/workarena/data_files/task_configs/filter_service_catalog_item_list_task.json +7986 -7982
  32. browsergym/workarena/data_files/task_configs/impersonation_users.json +3 -3
  33. browsergym/workarena/data_files/task_configs/report_retrieval_minmax_task.json +1 -1
  34. browsergym/workarena/data_files/task_configs/report_retrieval_value_task.json +1 -1
  35. browsergym/workarena/human_eval/console.js +176 -0
  36. browsergym/workarena/human_eval/tool.py +366 -0
  37. browsergym/workarena/install.py +81 -20
  38. browsergym/workarena/tasks/base.py +55 -20
  39. browsergym/workarena/tasks/comp_building_block.py +4 -0
  40. browsergym/workarena/tasks/compositional/__init__.py +76 -0
  41. browsergym/workarena/tasks/compositional/base.py +364 -0
  42. browsergym/workarena/tasks/compositional/dash_do_base.py +1366 -0
  43. browsergym/workarena/tasks/compositional/dash_do_catalog.py +1127 -0
  44. browsergym/workarena/tasks/compositional/dash_do_catalog_infeasible.py +2047 -0
  45. browsergym/workarena/tasks/compositional/dash_do_create_incident.py +403 -0
  46. browsergym/workarena/tasks/compositional/dash_do_create_incident_infeasible.py +278 -0
  47. browsergym/workarena/tasks/compositional/dash_do_create_problem.py +336 -0
  48. browsergym/workarena/tasks/compositional/dash_do_create_problem_infeasible.py +235 -0
  49. browsergym/workarena/tasks/compositional/dash_do_filter.py +1600 -0
  50. browsergym/workarena/tasks/compositional/dash_do_request_item.py +1315 -0
  51. browsergym/workarena/tasks/compositional/dash_do_request_item_infeasible.py +693 -0
  52. browsergym/workarena/tasks/compositional/delete_record.py +341 -0
  53. browsergym/workarena/tasks/compositional/edit_knowledge_base.py +457 -0
  54. browsergym/workarena/tasks/compositional/expense_management.py +598 -0
  55. browsergym/workarena/tasks/compositional/filter_and_do.py +139 -0
  56. browsergym/workarena/tasks/compositional/find_and_order_item.py +345 -0
  57. browsergym/workarena/tasks/compositional/manage_change_request_schedule.py +1417 -0
  58. browsergym/workarena/tasks/compositional/mark_duplicate_problems.py +499 -0
  59. browsergym/workarena/tasks/compositional/maximize_investment_return.py +1763 -0
  60. browsergym/workarena/tasks/compositional/navigate_and_do.py +1151 -0
  61. browsergym/workarena/tasks/compositional/navigate_and_do_infeasible.py +2100 -0
  62. browsergym/workarena/tasks/compositional/offboard_user.py +207 -0
  63. browsergym/workarena/tasks/compositional/onboard_user.py +226 -0
  64. browsergym/workarena/tasks/compositional/update_task.py +145 -0
  65. browsergym/workarena/tasks/compositional/utils/curriculum.py +215 -0
  66. browsergym/workarena/tasks/compositional/utils/infeasible_configs.py +151 -0
  67. browsergym/workarena/tasks/compositional/utils/knapsack.py +192 -0
  68. browsergym/workarena/tasks/compositional/warranty_check.py +227 -0
  69. browsergym/workarena/tasks/compositional/work_assignment.py +804 -0
  70. browsergym/workarena/tasks/compositional/workload_balancing.py +396 -0
  71. browsergym/workarena/tasks/dashboard.py +188 -8
  72. browsergym/workarena/tasks/form.py +1024 -232
  73. browsergym/workarena/tasks/knowledge.py +216 -25
  74. browsergym/workarena/tasks/list.py +519 -102
  75. browsergym/workarena/tasks/mark_duplicate_problem.py +171 -0
  76. browsergym/workarena/tasks/navigation.py +55 -13
  77. browsergym/workarena/tasks/scripts/extract_all_menu_items.py +9 -2
  78. browsergym/workarena/tasks/scripts/generate_dashboard_configs.py +6 -5
  79. browsergym/workarena/tasks/scripts/service_catalog.py +2 -1
  80. browsergym/workarena/tasks/scripts/validate.py +8 -2
  81. browsergym/workarena/tasks/send_chat_message.py +90 -0
  82. browsergym/workarena/tasks/service_catalog.py +94 -26
  83. browsergym/workarena/tasks/utils/form.py +1 -4
  84. browsergym/workarena/tasks/utils/private_tasks.py +63 -0
  85. browsergym/workarena/tasks/utils/utils.py +13 -0
  86. {browsergym_workarena-0.2.0.dist-info → browsergym_workarena-0.3.0.dist-info}/METADATA +27 -20
  87. browsergym_workarena-0.3.0.dist-info/RECORD +138 -0
  88. {browsergym_workarena-0.2.0.dist-info → browsergym_workarena-0.3.0.dist-info}/entry_points.txt +1 -0
  89. browsergym_workarena-0.2.0.dist-info/RECORD +0 -85
  90. {browsergym_workarena-0.2.0.dist-info → browsergym_workarena-0.3.0.dist-info}/WHEEL +0 -0
  91. {browsergym_workarena-0.2.0.dist-info → browsergym_workarena-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -10,6 +10,9 @@ from typing import List, Tuple
10
10
  from urllib import parse
11
11
 
12
12
  from .base import AbstractServiceNowTask
13
+ from .comp_building_block import CompositionalBuildingBlockTask
14
+ from .utils.utils import check_url_suffix_match
15
+
13
16
  from ..api.utils import table_api_call, table_column_info
14
17
  from ..config import (
15
18
  DASHBOARD_RETRIEVAL_MINMAX_CONFIG_PATH,
@@ -21,6 +24,7 @@ from ..config import (
21
24
  )
22
25
  from ..instance import SNowInstance
23
26
  from .utils.string import share_tri_gram
27
+ from .utils.utils import check_url_suffix_match
24
28
 
25
29
  # XXX: Some notes on plot types
26
30
  # - We currently don't support maps because they are clickable and would require a more evolved cheat function
@@ -33,10 +37,17 @@ class DashboardRetrievalTask(AbstractServiceNowTask, ABC):
33
37
 
34
38
  """
35
39
 
36
- def __init__(self, seed: int, instance: SNowInstance = None, fixed_config: dict = None) -> None:
40
+ def __init__(
41
+ self, seed: int = None, instance: SNowInstance = None, fixed_config: dict = None, **kwargs
42
+ ) -> None:
37
43
  super().__init__(seed=seed, instance=instance, start_rel_url="")
38
44
  self.iframe_id = "gsft_main"
39
45
  self.fixed_config = fixed_config
46
+ self.__dict__.update(kwargs)
47
+
48
+ @abstractmethod
49
+ def all_configs(self) -> List[dict]:
50
+ pass
40
51
 
41
52
  @abstractmethod
42
53
  def all_configs(self) -> List[dict]:
@@ -228,12 +239,6 @@ class DashboardRetrievalTask(AbstractServiceNowTask, ABC):
228
239
  logging.debug("All plots loaded")
229
240
 
230
241
  def get_init_scripts(self) -> List[str]:
231
- # Configure to page type
232
- # ... extract URL suffix
233
- url_suffix = parse.unquote(
234
- parse.urlparse(self.config["url"].replace("%3F", "?")).path.split("/")[-1]
235
- )
236
-
237
242
  return super().get_init_scripts() + [
238
243
  "registerGsftMainLoaded();",
239
244
  f"""
@@ -263,7 +268,28 @@ class DashboardRetrievalTask(AbstractServiceNowTask, ABC):
263
268
  waLog('All charts loaded', 'loadAllCharts');
264
269
  }});
265
270
  }}
266
- runInGsftMainOnlyAndProtectByURL(renderAllCharts, '{url_suffix}');
271
+ // Run on both dashboard and reports pages
272
+ runInGsftMainOnlyAndProtectByURL(renderAllCharts, 'pa_dashboard.do');
273
+ runInGsftMainOnlyAndProtectByURL(renderAllCharts, 'sys_report_template.do');
274
+ """,
275
+ f"""
276
+ function purifyReportUIButtons() {{
277
+ // Delete a lot of UI features that were causing issues due to the report refreshing without
278
+ // reloading the page. This makes the task easier, but it doesn't matter because we really
279
+ // want to evaluate retrieval and this doesn't prevent that.
280
+ document.querySelectorAll('[ng-click*="main.runReport"], #sidebar, #nlq-over-cb, #open-tree-navigation-button, .data-filtering-wrap').forEach(element => {{
281
+ if (element && element.parentNode) {{
282
+ element.parentNode.removeChild(element);
283
+ }}
284
+ }});
285
+ document.addEventListener('click', function(event) {{
286
+ event.stopPropagation();
287
+ event.preventDefault();
288
+ }}, true);
289
+ waLog('Purified report UI.', 'purifyReportUIButtons');
290
+ }}
291
+ // Run it only on the reports page
292
+ runInGsftMainOnlyAndProtectByURL(purifyReportUIButtons, 'sys_report_template.do');
267
293
  """,
268
294
  ]
269
295
 
@@ -295,6 +321,12 @@ class DashboardRetrievalTask(AbstractServiceNowTask, ABC):
295
321
  goal = f"What is the maximum value in {chart_locator}? Give me both the label and the count. If there are many, pick one."
296
322
  elif self.config["question"] == "min":
297
323
  goal = f"What is the minimum value in {chart_locator}? Give me both the label and the count. If there are many, pick one."
324
+ elif self.config["question"] == "mean":
325
+ goal = f"What is the average value in {chart_locator}? Round off to the next highest integer."
326
+ elif self.config["question"] == "median":
327
+ goal = f"What is the median value in {chart_locator}?"
328
+ elif self.config["question"] == "mode":
329
+ goal = f"What is the mode value in {chart_locator}?"
298
330
  else:
299
331
  raise NotImplementedError(f"Question type {self.config['question']} not supported")
300
332
 
@@ -302,6 +334,34 @@ class DashboardRetrievalTask(AbstractServiceNowTask, ABC):
302
334
 
303
335
  def cheat(self, page: playwright.sync_api.Page, chat_messages: list[str]) -> None:
304
336
  super().cheat(page, chat_messages)
337
+ # Check if the page is the report list view. If so, open the report
338
+ page_is_report_list_view = check_url_suffix_match(
339
+ page, "/now/nav/ui/classic/params/target/sys_report_list.do", self
340
+ )
341
+ chart_title = self.config["chart_title"]
342
+ if page_is_report_list_view:
343
+ # Open the report
344
+ frame = page.wait_for_selector('iframe[name="gsft_main"]').content_frame()
345
+ # Search for the report by title
346
+ frame.get_by_label("Search a specific field of the Reports list").select_option("Title")
347
+ search_input = frame.locator('input[aria-label="Search"]')
348
+ search_input.click()
349
+ search_input.fill(chart_title)
350
+ search_input.press("Enter")
351
+ page.wait_for_function(
352
+ "typeof window.gsft_main !== 'undefined' && window.gsft_main.WORKARENA_LOAD_COMPLETE"
353
+ )
354
+ # Click on the chart preview to open it
355
+ frame.wait_for_selector(f'a[aria-label="Preview record: {chart_title}"]').click()
356
+ page.wait_for_timeout(1000)
357
+ page.keyboard.press("Enter")
358
+ # Now in the form view, wait for the page to load and click to view the report
359
+ page.wait_for_function(
360
+ "typeof window.gsft_main !== 'undefined' && window.gsft_main.WORKARENA_LOAD_COMPLETE"
361
+ )
362
+ frame = page.wait_for_selector('iframe[name="gsft_main"]').content_frame()
363
+ frame.get_by_text("View Report").first.click()
364
+
305
365
  self._wait_for_ready(page)
306
366
 
307
367
  # Get the chart data
@@ -345,6 +405,37 @@ class DashboardRetrievalTask(AbstractServiceNowTask, ABC):
345
405
  chat_messages.append(
346
406
  {"message": f"{min_point['label']}, {min_point['count']}", "role": "assistant"}
347
407
  )
408
+ elif self.config["question"] == "mean":
409
+ counts = [data["count"] for data in chart_data]
410
+ target_count = np.mean(counts)
411
+ chat_messages.append({"message": f"Mean / Average {target_count}", "role": "assistant"})
412
+ elif self.config["question"] == "median":
413
+ counts = [data["count"] for data in chart_data]
414
+ target_count = np.median(counts)
415
+ chat_messages.append({"message": f"Median {target_count}", "role": "assistant"})
416
+ elif self.config["question"] == "mode":
417
+ counts = [data["count"] for data in chart_data]
418
+ # We select the maximum value if there are two or more modes
419
+ frequencies = {}
420
+ for count in counts:
421
+ if count not in frequencies:
422
+ frequencies[count] = 1
423
+ else:
424
+ frequencies[count] += 1
425
+ sorted_frequencies = {
426
+ count: frequency
427
+ for count, frequency in sorted(
428
+ frequencies.items(), key=lambda item: item[1], reverse=True
429
+ )
430
+ }
431
+ max_frequency = list(sorted_frequencies.values())[0]
432
+ max_frequencies = [
433
+ count
434
+ for count, frequency in sorted_frequencies.items()
435
+ if frequency == max_frequency
436
+ ]
437
+ target_count = max(max_frequencies)
438
+ chat_messages.append({"message": f"Mode {target_count}", "role": "assistant"})
348
439
  else:
349
440
  raise NotImplementedError(f"Question type \"{self.config['question']}\" not supported")
350
441
 
@@ -352,12 +443,28 @@ class DashboardRetrievalTask(AbstractServiceNowTask, ABC):
352
443
  self, page: playwright.sync_api.Page, chat_messages: list[str]
353
444
  ) -> Tuple[float, bool, str, dict]:
354
445
  super().validate(page, chat_messages)
446
+
447
+ # Check if the page is in the right URL
448
+ logging.debug("Checking if the page is in the right URL to validate the task")
449
+ right_url = check_url_suffix_match(page, expected_url=self.start_url, task=self)
450
+ if not right_url:
451
+ return (
452
+ 0,
453
+ False,
454
+ "",
455
+ {
456
+ "message": f"The page is not in the right URL to validate task {self.__class__.__name__}."
457
+ },
458
+ )
459
+
355
460
  self._wait_for_ready(page)
356
461
 
357
462
  # Get the chart data
463
+ logging.debug("Extracting chart data")
358
464
  _, chart_data, _ = self._get_chart_by_title(page, self.config["chart_title"])
359
465
 
360
466
  # Extract the series
467
+ logging.debug("Extracting the series")
361
468
  if len(chart_data) == 1:
362
469
  chart_data = chart_data[0]["data"]
363
470
  else:
@@ -368,6 +475,7 @@ class DashboardRetrievalTask(AbstractServiceNowTask, ABC):
368
475
  ][0]
369
476
 
370
477
  # Extract the agent's response
478
+ logging.debug("Extracting the agent's response")
371
479
  if chat_messages and chat_messages[-1]["role"] == "assistant":
372
480
  response = chat_messages[-1]["message"]
373
481
  else:
@@ -379,6 +487,7 @@ class DashboardRetrievalTask(AbstractServiceNowTask, ABC):
379
487
  )
380
488
 
381
489
  # Extract all numbers mentioned by the agent
490
+ logging.debug("Extracting all numbers mentioned by the agent")
382
491
  # ... some value labels may contain numbers so we need to remove the labels from the response first
383
492
  labels = set([point["label"] for point in chart_data])
384
493
  response_ = str(response)
@@ -390,15 +499,29 @@ class DashboardRetrievalTask(AbstractServiceNowTask, ABC):
390
499
  )
391
500
  del response_
392
501
 
502
+ if len(response_floats) == 0:
503
+ return (
504
+ 0.0,
505
+ False,
506
+ "No number detected in the response.",
507
+ {"message": "No number detected in the response."},
508
+ )
509
+
393
510
  # Validate the response
511
+ logging.debug("Validating the response based on the question type")
394
512
  if self.config["question"].startswith("value"):
513
+ logging.debug("The question is a value question")
395
514
  # if more than one number is in the prompt, there is necessarily a false positive
396
515
  if len(response_floats) > 1:
397
516
  error_msg = "Incorrect answer. More than one number detected in the response."
398
517
  return 0.0, True, error_msg, {"message": error_msg}
399
518
 
519
+ logging.debug(
520
+ f"Extracting expected format and label from question for validation: {self.config['question']}"
521
+ )
400
522
  format = self.config["question"].split(";")[1].strip()
401
523
  label = self.config["question"].split(";")[2].strip()
524
+ logging.debug(f"Extracted format: {format}, label: {label}")
402
525
 
403
526
  expected_value = float(
404
527
  [
@@ -416,6 +539,7 @@ class DashboardRetrievalTask(AbstractServiceNowTask, ABC):
416
539
  elif "max" in self.config["question"] or "min" in self.config["question"]:
417
540
  # Determine whether to find max or min based on configuration
418
541
  target_func = max if self.config["question"] == "max" else min
542
+ logging.debug(f"The question is a {str(target_func)} question")
419
543
 
420
544
  # Get the target count value (max or min)
421
545
  target_count = float(target_func(chart_data, key=lambda x: x["count"])["count"])
@@ -437,6 +561,33 @@ class DashboardRetrievalTask(AbstractServiceNowTask, ABC):
437
561
 
438
562
  # If no correct point is mentioned in the response
439
563
  return 0.0, True, "Incorrect answer.", {"message": "Incorrect answer."}
564
+ # ... validate mean/median/mode responses
565
+ elif (
566
+ "mean" in self.config["question"]
567
+ or "median" in self.config["question"]
568
+ or "mode" in self.config["question"]
569
+ ):
570
+ counts = [data["count"] for data in chart_data]
571
+ if self.config["question"] == "mean":
572
+ target_count = np.mean(counts)
573
+ elif self.config["question"] == "median":
574
+ target_count = np.median(counts)
575
+ elif self.config["question"] == "mode":
576
+ _vals, _counts = np.unique(counts, return_counts=True)
577
+ max_frequency_index = np.argmax(_counts)
578
+ target_count = -_vals[max_frequency_index]
579
+
580
+ # if more than one number is in the prompt, there is necessarily a false positive
581
+ if len(response_floats) > 1:
582
+ error_msg = "Incorrect answer. More than one number detected in the response."
583
+ return 0.0, True, error_msg, {"message": error_msg}
584
+
585
+ # Check if any of these points are mentioned in the response
586
+ if np.isclose(target_count, response_floats[0]):
587
+ return 1.0, True, "Nice work, thank you!", {"message": "Correct answer."}
588
+
589
+ # If no correct point is mentioned in the response
590
+ return 0.0, True, "Incorrect answer.", {"message": "Incorrect answer."}
440
591
 
441
592
  else:
442
593
  raise NotImplementedError(f"Question type \"{self.config['question']}\" not supported")
@@ -611,10 +762,39 @@ class ReportMinMaxRetrievalTask(DashboardRetrievalTask):
611
762
  return json.load(open(REPORT_RETRIEVAL_MINMAX_CONFIG_PATH, "r"))
612
763
 
613
764
 
765
+ class ReportMeanMedianModeRetrievalTask(DashboardRetrievalTask, CompositionalBuildingBlockTask):
766
+ def all_configs(self):
767
+ return json.load(open(REPORT_RETRIEVAL_MINMAX_CONFIG_PATH, "r"))
768
+
769
+
770
+ class WorkLoadBalancingMinMaxRetrievalTask(
771
+ DashboardMinMaxRetrievalTask, CompositionalBuildingBlockTask
772
+ ):
773
+ def all_configs(self):
774
+ return json.load(open(REPORT_RETRIEVAL_MINMAX_CONFIG_PATH, "r"))
775
+
776
+ def setup_goal(self, page: playwright.sync_api.Page) -> Tuple[str | dict]:
777
+ super().setup_goal(page=page)
778
+
779
+ # Configure task
780
+ # ... sample a configuration
781
+ self.config = (
782
+ self.fixed_config if self.fixed_config else self.random.choice(self.all_configs())
783
+ )
784
+ # ... set start URL based on config
785
+ self.start_url = self.instance.snow_url + self.config["url"]
786
+
787
+ goal = f"Create a filter to find reports whose title contains hashtag {self.problem_hashtag} and open the report."
788
+ goal += " From the report, identify the user with the most assigned problems and the user with the least assigned problems."
789
+
790
+ return goal, {}
791
+
792
+
614
793
  __TASKS__ = [
615
794
  var
616
795
  for var in locals().values()
617
796
  if isinstance(var, type)
618
797
  and issubclass(var, DashboardRetrievalTask)
798
+ and not issubclass(var, CompositionalBuildingBlockTask)
619
799
  and var is not DashboardRetrievalTask
620
800
  ]