crfm-helm 0.5.8__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (121)
  1. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +3 -1
  2. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +117 -115
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/metrics/bbq_metrics.py +12 -0
  5. helm/benchmark/metrics/evaluate_reference_metrics.py +12 -0
  6. helm/benchmark/metrics/safety_metrics.py +13 -1
  7. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  8. helm/benchmark/presentation/run_display.py +13 -3
  9. helm/benchmark/presentation/run_entry.py +2 -2
  10. helm/benchmark/run.py +1 -1
  11. helm/benchmark/run_specs/arabic_run_specs.py +6 -0
  12. helm/benchmark/run_specs/medhelm_run_specs.py +2 -2
  13. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
  14. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  15. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
  16. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
  17. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
  18. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
  19. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
  20. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  21. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  22. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  23. helm/benchmark/scenarios/commonsense_scenario.py +7 -1
  24. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  25. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  26. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  27. helm/benchmark/scenarios/gsm_scenario.py +9 -3
  28. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  29. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  30. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
  31. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  32. helm/benchmark/scenarios/legalbench_scenario.py +6 -7
  33. helm/benchmark/scenarios/math_scenario.py +11 -4
  34. helm/benchmark/scenarios/med_qa_scenario.py +7 -1
  35. helm/benchmark/scenarios/medi_qa_scenario.py +2 -2
  36. helm/benchmark/scenarios/mmlu_scenario.py +8 -2
  37. helm/benchmark/scenarios/narrativeqa_scenario.py +3 -4
  38. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  39. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  40. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  41. helm/benchmark/scenarios/spider_scenario.py +18 -0
  42. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  43. helm/benchmark/scenarios/wmt_14_scenario.py +9 -2
  44. helm/benchmark/static/schema_long_context.yaml +12 -31
  45. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  46. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  47. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  48. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  49. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  50. helm/benchmark/static_build/index.html +5 -6
  51. helm/clients/ai21_client.py +2 -0
  52. helm/clients/aleph_alpha_client.py +2 -0
  53. helm/clients/anthropic_client.py +7 -1
  54. helm/clients/audio_language/diva_llama_client.py +2 -0
  55. helm/clients/audio_language/llama_omni_client.py +2 -1
  56. helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
  57. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  58. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  59. helm/clients/bedrock_client.py +2 -0
  60. helm/clients/cohere_client.py +3 -0
  61. helm/clients/google_client.py +2 -0
  62. helm/clients/http_model_client.py +2 -0
  63. helm/clients/huggingface_client.py +2 -1
  64. helm/clients/ibm_client.py +3 -1
  65. helm/clients/image_generation/adobe_vision_client.py +2 -0
  66. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  67. helm/clients/image_generation/cogview2_client.py +2 -1
  68. helm/clients/image_generation/dalle2_client.py +2 -0
  69. helm/clients/image_generation/dalle_mini_client.py +2 -1
  70. helm/clients/image_generation/deep_floyd_client.py +2 -0
  71. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  72. helm/clients/image_generation/lexica_client.py +2 -0
  73. helm/clients/image_generation/mindalle_client.py +2 -1
  74. helm/clients/image_generation/together_image_generation_client.py +2 -0
  75. helm/clients/megatron_client.py +2 -0
  76. helm/clients/mistral_client.py +2 -0
  77. helm/clients/moderation_api_client.py +2 -0
  78. helm/clients/openai_client.py +5 -1
  79. helm/clients/palmyra_client.py +2 -1
  80. helm/clients/reka_client.py +2 -1
  81. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  82. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  83. helm/clients/together_client.py +4 -0
  84. helm/clients/vertexai_client.py +4 -0
  85. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  86. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  87. helm/clients/vision_language/idefics_client.py +2 -1
  88. helm/clients/vision_language/open_flamingo_client.py +2 -1
  89. helm/clients/vision_language/paligemma_client.py +2 -1
  90. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  91. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  92. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  93. helm/clients/writer_client.py +2 -0
  94. helm/common/hierarchical_logger.py +20 -0
  95. helm/common/optional_dependencies.py +1 -1
  96. helm/common/test_general.py +4 -0
  97. helm/config/model_deployments.yaml +225 -0
  98. helm/config/model_metadata.yaml +232 -7
  99. helm/config/tokenizer_configs.yaml +74 -4
  100. helm/benchmark/static_build/assets/index-671a5e06.js +0 -10
  101. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  102. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  103. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  104. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
  105. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
  106. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
  107. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
  108. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  109. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  110. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  111. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  112. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  113. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  114. /helm/benchmark/static_build/assets/{index-9352595e.css → index-oIeiQW2g.css} +0 -0
  115. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  116. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  117. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  118. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  119. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  120. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  121. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0

helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py

@@ -1,6 +1,7 @@
 from typing import List
-import json
+import os

+from datasets import load_dataset
 from tqdm import tqdm

 from helm.benchmark.scenarios.scenario import (
@@ -13,8 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from huggingface_hub import snapshot_download
-from .ultra_suite_classification_scenario import find_audio_json_pairs
+from helm.common.audio_utils import ensure_audio_file_exists_from_array


 class UltraSuiteDisorderBreakdownScenario(Scenario):
@@ -38,46 +38,38 @@ class UltraSuiteDisorderBreakdownScenario(Scenario):
         - Audio files (e.g., .mp3)
         - A JSON file with annotations containing 'disorder_class' field
         """
-        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
-        data_path = snapshot_download(
-            repo_id="SAA-Lab/SLPHelmManualLabels",
-            repo_type="dataset",
-            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
-        )
+        audio_save_dir = os.path.join(output_path, "audio_files")
+        os.makedirs(audio_save_dir, exist_ok=True)
+
+        print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
+        dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")

         instances: List[Instance] = []
         split: str = TEST_SPLIT

-        # Find all pairs of audio and JSON files
-        pairs = find_audio_json_pairs(data_path)
-        print(f"Num pairs: {len(pairs)}")
-
-        for audio_path, json_path in tqdm(pairs):
+        for idx, row in enumerate(tqdm(dataset["train"])):
             # Load the annotation
-            with open(json_path, "r") as f:
-                annotation = json.load(f)
+            label = row["disorder_type"]
+            transcription = row["transcription"]

-            # Get the correct answer and convert to label
-            if "disorder_type" not in annotation or "transcription" not in annotation:
-                continue
-            label = annotation["disorder_type"]
-            prompt = annotation["transcription"]
+            unique_id = str(idx)
+            local_audio_name = f"{label}_{unique_id}.mp3"
+            local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])

             # Create references for each option
             references: List[Reference] = []
-            correct_label = 0
-            for option in ["typically_developing", "articulation", "phonological"]:
+            options = ["typically_developing", "articulation", "phonological"]
+            if label not in options:
+                continue
+            for option in options:
                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
                 references.append(reference)
-                if option == label:
-                    correct_label += 1
-            if correct_label == 0:
-                continue

             # Create the input with audio and instruction
             content = [
-                MediaObject(content_type="audio/mpeg", location=audio_path),
-                MediaObject(content_type="text/plain", text=self.get_instruction(prompt)),
+                MediaObject(content_type="audio/mpeg", location=local_audio_path),
+                MediaObject(content_type="text/plain", text=self.get_instruction(transcription)),
             ]

             input = Input(multimedia_content=MultimediaObject(content))
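
Note on the two UltraSuite diffs above and below: the scenarios now pull data with Hugging Face datasets and write each waveform to an mp3 file before wrapping it in a MediaObject, instead of pairing downloaded mp3/JSON files. A minimal standalone sketch of that flow (not part of the release; it assumes, as the diff suggests, that the SAA-Lab/SLPHelmUltraSuitePlus dataset exposes "audio", "disorder_type", and "transcription" columns):

# Sketch only: the loading pattern used by the new UltraSuite scenarios.
import os

from datasets import load_dataset
from helm.common.audio_utils import ensure_audio_file_exists_from_array


def materialize_ultra_suite_audio(output_path: str) -> None:
    audio_save_dir = os.path.join(output_path, "audio_files")
    os.makedirs(audio_save_dir, exist_ok=True)
    dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")
    for idx, row in enumerate(dataset["train"]):
        label = row["disorder_type"]
        # Write the decoded waveform to disk so a MediaObject can reference it by path.
        local_audio_path = os.path.join(audio_save_dir, f"{label}_{idx}.mp3")
        ensure_audio_file_exists_from_array(
            local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"]
        )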

helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py

@@ -1,7 +1,7 @@
-from typing import List, Tuple
+from typing import List
 import os
-import json

+from datasets import load_dataset
 from tqdm import tqdm

 from helm.benchmark.scenarios.scenario import (
@@ -14,38 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from huggingface_hub import snapshot_download
-
-
-def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
-    """
-    Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
-    Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
-
-    Args:
-        directory: Path to the directory containing the files
-
-    Returns:
-        List of tuples where each tuple contains (mp3_path, json_path)
-    """
-    pairs = []
-
-    # Walk through all directories and subdirectories
-    for root, _, files in os.walk(directory):
-        # Get all MP3 files in current directory
-        mp3_files = [f for f in files if f.endswith(".mp3")]
-
-        for mp3_file in mp3_files:
-            base_name = os.path.splitext(mp3_file)[0]
-            json_file = f"{base_name}.json"
-
-            # Check if corresponding JSON file exists in the same directory
-            if json_file in files:
-                mp3_path = os.path.join(root, mp3_file)
-                json_path = os.path.join(root, json_file)
-                pairs.append((mp3_path, json_path))
-
-    return pairs
+from helm.common.audio_utils import ensure_audio_file_exists_from_array


 class UltraSuiteDisorderSymptomsScenario(Scenario):
@@ -70,45 +39,37 @@ class UltraSuiteDisorderSymptomsScenario(Scenario):
         - Audio files (e.g., .mp3)
         - A JSON file with annotations containing 'answer' field
         """
-        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
-        data_path = snapshot_download(
-            repo_id="SAA-Lab/SLPHelmManualLabels",
-            repo_type="dataset",
-            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
-        )
+        audio_save_dir = os.path.join(output_path, "audio_files")
+        os.makedirs(audio_save_dir, exist_ok=True)
+
+        print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
+        dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")

         instances: List[Instance] = []
         split: str = TEST_SPLIT

-        # Find all pairs of audio and JSON files
-        pairs = find_audio_json_pairs(data_path)
-
-        for audio_path, json_path in tqdm(pairs):
+        for idx, row in enumerate(tqdm(dataset["train"])):
+            label = row["disorder_symptom"]
+            transcription = row["transcription"]

-            # Load the annotation
-            with open(json_path, "r") as f:
-                annotation = json.load(f)
+            unique_id = str(idx)
+            local_audio_name = f"{label}_{unique_id}.mp3"
+            local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])

-            # Get the correct answer and convert to label
-            if "disorder_symptom" not in annotation or "transcription" not in annotation:
-                continue
-            label = annotation["disorder_symptom"]
-            prompt = annotation["transcription"]
             # Create references for each option
             references: List[Reference] = []
-            correct_label = 0
-            for option in ["substitution", "omission", "addition", "typically_developing", "stuttering"]:
+            options = ["substitution", "omission", "addition", "typically_developing", "stuttering"]
+            if label not in options:
+                continue
+            for option in options:
                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
                 references.append(reference)
-                if option == label:
-                    correct_label += 1
-            if correct_label == 0:
-                continue

             # Create the input with audio and instruction
             content = [
-                MediaObject(content_type="audio/mpeg", location=audio_path),
-                MediaObject(content_type="text/plain", text=self.get_instruction(prompt)),
+                MediaObject(content_type="audio/mpeg", location=local_audio_path),
+                MediaObject(content_type="text/plain", text=self.get_instruction(transcription)),
            ]

             input = Input(multimedia_content=MultimediaObject(content))

helm/benchmark/scenarios/banking77_scenario.py

@@ -2,6 +2,7 @@ import datasets
 import os
 from typing import List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     TEST_SPLIT,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists

@@ -54,3 +56,22 @@ class Banking77Scenario(Scenario):
                 instance = Instance(input=input, references=references, split=split_name)
                 instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="banking77",
+            display_name="BANKING77",
+            short_display_name="BANKING77",
+            description="BANKING77 is a benchmark for intent classification of customer service queries "
+            "in the banking domain [(Casanueva et al., "
+            "2020)](https://aclanthology.org/2020.nlp4convai-1.5/).",
+            taxonomy=TaxonomyInfo(
+                task="text classification",
+                what="customer service queries in the banking domain",
+                when="During or before 2020",
+                who="banking customers",
+                language="English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
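
Several of the scenario hunks below follow the same pattern as the BANKING77 change above: import ScenarioMetadata (and TaxonomyInfo) and add a get_metadata() method describing the benchmark. A hedged usage sketch of reading that metadata, assuming Banking77Scenario takes no constructor arguments:

# Sketch only: inspecting the metadata exposed by the new get_metadata() methods.
from helm.benchmark.scenarios.banking77_scenario import Banking77Scenario

metadata = Banking77Scenario().get_metadata()  # assumes a no-argument constructor
print(metadata.display_name, metadata.main_metric, metadata.main_split)
print(metadata.taxonomy.task, metadata.taxonomy.language)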

helm/benchmark/scenarios/bbq_scenario.py

@@ -249,6 +249,6 @@ class BBQScenario(Scenario):
             "question answering in ambiguous and unambigous context [(Parrish et al., "
             "2022)](https://aclanthology.org/2022.findings-acl.165/).",
             taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
-            main_metric="quasi_exact_match",
+            main_metric="bbq_accuracy",
             main_split="test",
         )

helm/benchmark/scenarios/bird_sql_scenario.py

@@ -4,6 +4,7 @@ from typing import Dict, List

 from filelock import FileLock

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded, shell
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.bird_sql_scenario_helper import (  # type: ignore
@@ -18,6 +19,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )


@@ -92,3 +94,19 @@ INSERT_YOUR_SQL_QUERY_HERE
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="bird_sql",
+            display_name="BIRD-SQL (Dev)",
+            description="BIRD-SQL (Dev)",
+            taxonomy=TaxonomyInfo(
+                task="text-to-SQL",
+                what="databases from various domains",
+                when="?",
+                who="expert data scientists",
+                language="English",
+            ),
+            main_metric="execution_accuracy",
+            main_split="valid",
+        )

helm/benchmark/scenarios/commonsense_scenario.py

@@ -134,7 +134,13 @@ class OpenBookQA(Scenario):
             display_name="OpenbookQA",
             description="The OpenbookQA benchmark for commonsense-intensive open book question "
             "answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).",
-            taxonomy=TaxonomyInfo(task="question answering", what="?", when="?", who="?", language="English"),
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="elementary science",
+                when="2018",
+                who="Amazon Mechnical Turk workers",
+                language="English",
+            ),
             main_metric="exact_match",
             main_split="test",
         )

helm/benchmark/scenarios/czech_bank_qa_scenario.py

@@ -2,6 +2,7 @@ import datasets
 import os
 from typing import List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists

@@ -128,3 +130,19 @@ CREATE TABLE "trans" (
             instance = Instance(input=input, references=references, split=TEST_SPLIT)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="czech_bank_qa",
+            display_name="CzechBankQA",
+            description="The CzechBankQA",
+            taxonomy=TaxonomyInfo(
+                task="text-to-SQL",
+                what="queries from financial experts",
+                when="1999",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="error_rate",
+            main_split="test",
+        )

helm/benchmark/scenarios/fin_qa_scenario.py

@@ -2,6 +2,7 @@ import os
 import json
 from typing import List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     TRAIN_SPLIT,
     TEST_SPLIT,
     CORRECT_TAG,
+    ScenarioMetadata,
 )


@@ -117,3 +119,21 @@ class FinQAScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="fin_qa",
+            display_name="FinQA",
+            description="The FinQA benchmark for numeric reasoning over financial data, with question "
+            "answering pairs written by financial experts over financial reports [(Chen et "
+            "al., 2021)](https://arxiv.org/abs/2109.00122/).",
+            taxonomy=TaxonomyInfo(
+                task="question answering with numeric reasoning",
+                what="financial reports",
+                when="1999 to 2019",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="program_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/financebench_scenario.py

@@ -4,6 +4,7 @@ import os
 import random
 from typing import List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     TRAIN_SPLIT,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded

@@ -51,3 +53,22 @@ class FinanceBenchScenario(Scenario):
         for train_index in train_indexes:
             instances[train_index] = dataclasses.replace(instances[train_index], split=TRAIN_SPLIT)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="financebench",
+            display_name="FinanceBench",
+            description="FinanceBench is a benchmark for open book financial question answering. It "
+            "comprises 10,231 questions about publicly traded companies, with corresponding "
+            "answers and evidence strings [(Islam et al., "
+            "2023)](https://arxiv.org/abs/2311.11944/).",
+            taxonomy=TaxonomyInfo(
+                task="question answering with numeric reasoning",
+                what="financial reports",
+                when="2015 to 2023",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="annotation_financebench_label_correct_answer",
+            main_split="test",
+        )

helm/benchmark/scenarios/gsm_scenario.py

@@ -71,12 +71,18 @@ class GSM8KScenario(Scenario):
     def get_metadata(self) -> ScenarioMetadata:
         return ScenarioMetadata(
             name="gsm",
-            display_name="GSM8K (Grade school math word problems)",
+            display_name="GSM8K (Grade School Math)",
             short_display_name="GSM8K",
             description="The grade school math word problems dataset (GSM8K) for testing mathematical "
             "reasoning on grade-school math problems [(Cobbe et al., "
             "2021)](https://arxiv.org/pdf/2110.14168.pdf).",
-            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
-            main_metric="exact_match_indicator",
+            taxonomy=TaxonomyInfo(
+                task="numeric answer question answering",
+                what="grade school math word problems",
+                when="2021",
+                who="contractors on Upwork and Surge AI",
+                language="English",
+            ),
+            main_metric="final_number_exact_match",
             main_split="test",
         )

helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py

@@ -2,9 +2,10 @@ import os
 import pandas as pd
 from typing import List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded

-from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output, ScenarioMetadata


 class HarmBenchGCGTransferScenario(Scenario):
@@ -48,3 +49,13 @@ class HarmBenchGCGTransferScenario(Scenario):
             instance = Instance(input=input, split=TEST_SPLIT, references=references, id=id)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="harm_bench_gcg_transfer",
+            display_name="HarmBenchGCGTransfer",
+            description="HarmBenchGCGTransfer",
+            taxonomy=TaxonomyInfo(task="question answering", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="safety_score",
+            main_split="test",
+        )

helm/benchmark/scenarios/harm_bench_scenario.py

@@ -2,9 +2,10 @@ import os
 import pandas as pd
 from typing import List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded

-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output, ScenarioMetadata


 class HarmBenchScenario(Scenario):
@@ -57,3 +58,13 @@ class HarmBenchScenario(Scenario):
             instance = Instance(input=input, split=TEST_SPLIT, references=references, sub_split=tag, id=id)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="harm_bench",
+            display_name="HarmBench",
+            description="HarmBench",
+            taxonomy=TaxonomyInfo(task="question answering", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="safety_score",
+            main_split="test",
+        )

helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py

@@ -4,6 +4,7 @@ from typing import List

 from datasets import load_dataset, Features, Value, Sequence, Dataset

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
     CORRECT_TAG,
     TEST_SPLIT,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists

@@ -88,3 +90,22 @@ class InfiniteBenchEnMCScenario(Scenario):
             instances.append(instance)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="infinite_bench_en_mc",
+            display_name="∞Bench En.MC",
+            description="∞Bench En.MC is a multiple-choice question answering task that requires "
+            "locating and processing information within a novel, performing reasoning "
+            "through aggregation or filtering to derive answers. ([Zhang et al., "
+            "2024](https://arxiv.org/abs/2402.13718))",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="Novels",
+                when="Before 2024",
+                who="Novel authors",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py

@@ -2,6 +2,7 @@ import os
 import re
 from typing import List
 from datasets import load_dataset, Features, Value, Sequence, Dataset
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
     CORRECT_TAG,
     TEST_SPLIT,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists

@@ -77,3 +79,20 @@ class InfiniteBenchEnSumScenario(Scenario):
             instances.append(instance)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="infinite_bench_en_sum",
+            display_name="∞Bench En.Sum",
+            description="∞Bench En.Sum is a summarization task that requires generating a concise "
+            "summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))",
+            taxonomy=TaxonomyInfo(
+                task="multi-hop question answering",
+                what="Novels",
+                when="Before 2024",
+                who="Novel authors",
+                language="English",
+            ),
+            main_metric="rouge_l",
+            main_split="test",
+        )

helm/benchmark/scenarios/legalbench_scenario.py

@@ -149,15 +149,14 @@ class LegalBenchScenario(Scenario):

     def get_metadata(self) -> ScenarioMetadata:
         return ScenarioMetadata(
-            name="legalbench",
+            name=self.name,
             display_name="LegalBench",
-            description="LegalBench is a large collaboratively constructed benchmark of legal "
-            "reasoning. Five representative tasks are included here. See [(Guha et al, "
-            "2023)[https://arxiv.org/abs/2308.11462] for more details.",
+            description="LegalBench is a large collaboratively constructed benchmark of legal reasoning "
+            "tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).",
             taxonomy=TaxonomyInfo(
-                task="text classification",
-                what="fact patterns, questions, and legal documents",
-                when="n/a",
+                task="multiple-choice question answering",
+                what="public legal and admininstrative documents, manually " "constructed questions",
+                when="before 2023",
                 who="lawyers",
                 language="English",
             ),

helm/benchmark/scenarios/math_scenario.py

@@ -454,14 +454,21 @@ class MATHScenario(Scenario):
         return instances

     def get_metadata(self) -> ScenarioMetadata:
+        taxonomy = TaxonomyInfo(
+            task="numeric answer question answering",
+            what="math competitions (AMC, AIME, etc.)",
+            when="before 2021",
+            who="problem setters",
+            language="synthetic",
+        )
         if self.use_chain_of_thought:
             return ScenarioMetadata(
                 name="math_chain_of_thought",
-                display_name="MATH (chain-of-thought)",
+                display_name="MATH",
                 description="The MATH benchmark for measuring mathematical problem solving on competition "
                 "math problems with chain-of-thought style reasoning [(Hendrycks et al., "
-                "2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html).",
-                taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+                "2021)](https://arxiv.org/pdf/2103.03874.pdf).",
+                taxonomy=taxonomy,
                 main_metric="math_equiv_chain_of_thought",
                 main_split="test",
             )
@@ -472,7 +479,7 @@ class MATHScenario(Scenario):
                 description="The MATH benchmark for measuring mathematical problem solving on competition "
                 "math problems [(Hendrycks et al., "
                 "2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html).",
-                taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+                taxonomy=taxonomy,
                 main_metric="math_equiv",
                 main_split="test",
             )

helm/benchmark/scenarios/med_qa_scenario.py

@@ -113,7 +113,13 @@ class MedQAScenario(Scenario):
             description="MedQA is an open domain question answering dataset composed of questions from "
             "professional medical board exams ([Jin et al. "
             "2020](https://arxiv.org/pdf/2009.13081.pdf)).",
-            taxonomy=TaxonomyInfo(task="question answering", what="n/a", when="n/a", who="n/a", language="English"),
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="US medical licensing exams",
+                when="before 2020",
+                who="problem setters",
+                language="English",
+            ),
             main_metric="quasi_exact_match",
             main_split="test",
         )

helm/benchmark/scenarios/medi_qa_scenario.py

@@ -51,7 +51,7 @@ class MediQAScenario(Scenario):

     name = "medi_qa"
     description = (
-        "MEDIQA is a benchmark designed to evaluate a model's ability to retrieve and generate"
+        "MEDIQA is a benchmark designed to evaluate a model's ability to generate"
         "medically accurate answers to patient-generated questions. Each instance includes a"
         "consumer health question, a set of candidate answers (used in ranking tasks), relevance"
         "annotations, and optionally, additional context. The benchmark focuses on supporting"
@@ -124,7 +124,7 @@ class MediQAScenario(Scenario):
             "health communication.",
             taxonomy=TaxonomyInfo(
                 task="Text generation",
-                what="Retrieve and rank answers based on medical question " "understanding",
+                what="Generate medically accurate answers to patient-generated questions.",
                 when="Any",
                 who="Clinician, Medical Student",
                 language="English",