crfm-helm 0.5.8__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +3 -1
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +117 -115
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +12 -0
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/run.py +1 -1
- helm/benchmark/run_specs/arabic_run_specs.py +6 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +2 -2
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/commonsense_scenario.py +7 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/gsm_scenario.py +9 -3
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/legalbench_scenario.py +6 -7
- helm/benchmark/scenarios/math_scenario.py +11 -4
- helm/benchmark/scenarios/med_qa_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mmlu_scenario.py +8 -2
- helm/benchmark/scenarios/narrativeqa_scenario.py +3 -4
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +9 -2
- helm/benchmark/static/schema_long_context.yaml +12 -31
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/bedrock_client.py +2 -0
- helm/clients/cohere_client.py +3 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +2 -1
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +5 -1
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/together_client.py +4 -0
- helm/clients/vertexai_client.py +4 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/writer_client.py +2 -0
- helm/common/hierarchical_logger.py +20 -0
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/config/model_deployments.yaml +225 -0
- helm/config/model_metadata.yaml +232 -7
- helm/config/tokenizer_configs.yaml +74 -4
- helm/benchmark/static_build/assets/index-671a5e06.js +0 -10
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{index-9352595e.css → index-oIeiQW2g.css} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0

helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py

@@ -1,6 +1,7 @@
 from typing import List
-import
+import os
 
+from datasets import load_dataset
 from tqdm import tqdm
 
 from helm.benchmark.scenarios.scenario import (
@@ -13,8 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from
-from .ultra_suite_classification_scenario import find_audio_json_pairs
+from helm.common.audio_utils import ensure_audio_file_exists_from_array
 
 
 class UltraSuiteDisorderBreakdownScenario(Scenario):
@@ -38,46 +38,38 @@ class UltraSuiteDisorderBreakdownScenario(Scenario):
         - Audio files (e.g., .mp3)
         - A JSON file with annotations containing 'disorder_class' field
         """
-
-
-
-
-
-        )
+        audio_save_dir = os.path.join(output_path, "audio_files")
+        os.makedirs(audio_save_dir, exist_ok=True)
+
+        print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
+        dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")
 
         instances: List[Instance] = []
         split: str = TEST_SPLIT
 
-
-        pairs = find_audio_json_pairs(data_path)
-        print(f"Num pairs: {len(pairs)}")
-
-        for audio_path, json_path in tqdm(pairs):
+        for idx, row in enumerate(tqdm(dataset["train"])):
             # Load the annotation
-
-
+            label = row["disorder_type"]
+            transcription = row["transcription"]
 
-
-
-
-
-            prompt = annotation["transcription"]
+            unique_id = str(idx)
+            local_audio_name = f"{label}_{unique_id}.mp3"
+            local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
 
             # Create references for each option
             references: List[Reference] = []
-
-
+            options = ["typically_developing", "articulation", "phonological"]
+            if label not in options:
+                continue
+            for option in options:
                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
                 references.append(reference)
-                if option == label:
-                    correct_label += 1
-            if correct_label == 0:
-                continue
 
             # Create the input with audio and instruction
             content = [
-                MediaObject(content_type="audio/mpeg", location=
-                MediaObject(content_type="text/plain", text=self.get_instruction(
+                MediaObject(content_type="audio/mpeg", location=local_audio_path),
+                MediaObject(content_type="text/plain", text=self.get_instruction(transcription)),
             ]
 
             input = Input(multimedia_content=MultimediaObject(content))

helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py

@@ -1,7 +1,7 @@
-from typing import List, Tuple
+from typing import List
 import os
-import json
 
+from datasets import load_dataset
 from tqdm import tqdm
 
 from helm.benchmark.scenarios.scenario import (
@@ -14,38 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from
-
-
-def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
-    """
-    Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
-    Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
-
-    Args:
-        directory: Path to the directory containing the files
-
-    Returns:
-        List of tuples where each tuple contains (mp3_path, json_path)
-    """
-    pairs = []
-
-    # Walk through all directories and subdirectories
-    for root, _, files in os.walk(directory):
-        # Get all MP3 files in current directory
-        mp3_files = [f for f in files if f.endswith(".mp3")]
-
-        for mp3_file in mp3_files:
-            base_name = os.path.splitext(mp3_file)[0]
-            json_file = f"{base_name}.json"
-
-            # Check if corresponding JSON file exists in the same directory
-            if json_file in files:
-                mp3_path = os.path.join(root, mp3_file)
-                json_path = os.path.join(root, json_file)
-                pairs.append((mp3_path, json_path))
-
-    return pairs
+from helm.common.audio_utils import ensure_audio_file_exists_from_array
 
 
 class UltraSuiteDisorderSymptomsScenario(Scenario):
@@ -70,45 +39,37 @@ class UltraSuiteDisorderSymptomsScenario(Scenario):
         - Audio files (e.g., .mp3)
         - A JSON file with annotations containing 'answer' field
         """
-
-
-
-
-
-        )
+        audio_save_dir = os.path.join(output_path, "audio_files")
+        os.makedirs(audio_save_dir, exist_ok=True)
+
+        print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
+        dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")
 
         instances: List[Instance] = []
         split: str = TEST_SPLIT
 
-
-
-
-        for audio_path, json_path in tqdm(pairs):
+        for idx, row in enumerate(tqdm(dataset["train"])):
+            label = row["disorder_symptom"]
+            transcription = row["transcription"]
 
-
-
-
+            unique_id = str(idx)
+            local_audio_name = f"{label}_{unique_id}.mp3"
+            local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
 
-            # Get the correct answer and convert to label
-            if "disorder_symptom" not in annotation or "transcription" not in annotation:
-                continue
-            label = annotation["disorder_symptom"]
-            prompt = annotation["transcription"]
             # Create references for each option
             references: List[Reference] = []
-
-
+            options = ["substitution", "omission", "addition", "typically_developing", "stuttering"]
+            if label not in options:
+                continue
+            for option in options:
                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
                 references.append(reference)
-                if option == label:
-                    correct_label += 1
-            if correct_label == 0:
-                continue
 
             # Create the input with audio and instruction
             content = [
-                MediaObject(content_type="audio/mpeg", location=
-                MediaObject(content_type="text/plain", text=self.get_instruction(
+                MediaObject(content_type="audio/mpeg", location=local_audio_path),
+                MediaObject(content_type="text/plain", text=self.get_instruction(transcription)),
             ]
 
             input = Input(multimedia_content=MultimediaObject(content))

helm/benchmark/scenarios/banking77_scenario.py

@@ -2,6 +2,7 @@ import datasets
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     TEST_SPLIT,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -54,3 +56,22 @@ class Banking77Scenario(Scenario):
             instance = Instance(input=input, references=references, split=split_name)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="banking77",
+            display_name="BANKING77",
+            short_display_name="BANKING77",
+            description="BANKING77 is a benchmark for intent classification of customer service queries "
+            "in the banking domain [(Casanueva et al., "
+            "2020)](https://aclanthology.org/2020.nlp4convai-1.5/).",
+            taxonomy=TaxonomyInfo(
+                task="text classification",
+                what="customer service queries in the banking domain",
+                when="During or before 2020",
+                who="banking customers",
+                language="English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/bbq_scenario.py

@@ -249,6 +249,6 @@
             "question answering in ambiguous and unambigous context [(Parrish et al., "
             "2022)](https://aclanthology.org/2022.findings-acl.165/).",
             taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
-            main_metric="
+            main_metric="bbq_accuracy",
             main_split="test",
         )

helm/benchmark/scenarios/bird_sql_scenario.py

@@ -4,6 +4,7 @@ from typing import Dict, List
 
 from filelock import FileLock
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded, shell
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.bird_sql_scenario_helper import (  # type: ignore
@@ -18,6 +19,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -92,3 +94,19 @@ INSERT_YOUR_SQL_QUERY_HERE
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="bird_sql",
+            display_name="BIRD-SQL (Dev)",
+            description="BIRD-SQL (Dev)",
+            taxonomy=TaxonomyInfo(
+                task="text-to-SQL",
+                what="databases from various domains",
+                when="?",
+                who="expert data scientists",
+                language="English",
+            ),
+            main_metric="execution_accuracy",
+            main_split="valid",
+        )

helm/benchmark/scenarios/commonsense_scenario.py

@@ -134,7 +134,13 @@ class OpenBookQA(Scenario):
             display_name="OpenbookQA",
             description="The OpenbookQA benchmark for commonsense-intensive open book question "
             "answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).",
-            taxonomy=TaxonomyInfo(
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="elementary science",
+                when="2018",
+                who="Amazon Mechnical Turk workers",
+                language="English",
+            ),
             main_metric="exact_match",
             main_split="test",
         )

helm/benchmark/scenarios/czech_bank_qa_scenario.py

@@ -2,6 +2,7 @@ import datasets
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -128,3 +130,19 @@ CREATE TABLE "trans" (
             instance = Instance(input=input, references=references, split=TEST_SPLIT)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="czech_bank_qa",
+            display_name="CzechBankQA",
+            description="The CzechBankQA",
+            taxonomy=TaxonomyInfo(
+                task="text-to-SQL",
+                what="queries from financial experts",
+                when="1999",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="error_rate",
+            main_split="test",
+        )

helm/benchmark/scenarios/fin_qa_scenario.py

@@ -2,6 +2,7 @@ import os
 import json
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     TRAIN_SPLIT,
     TEST_SPLIT,
     CORRECT_TAG,
+    ScenarioMetadata,
 )
 
 
@@ -117,3 +119,21 @@ class FinQAScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="fin_qa",
+            display_name="FinQA",
+            description="The FinQA benchmark for numeric reasoning over financial data, with question "
+            "answering pairs written by financial experts over financial reports [(Chen et "
+            "al., 2021)](https://arxiv.org/abs/2109.00122/).",
+            taxonomy=TaxonomyInfo(
+                task="question answering with numeric reasoning",
+                what="financial reports",
+                when="1999 to 2019",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="program_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/financebench_scenario.py

@@ -4,6 +4,7 @@ import os
 import random
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     TRAIN_SPLIT,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 
@@ -51,3 +53,22 @@ class FinanceBenchScenario(Scenario):
         for train_index in train_indexes:
             instances[train_index] = dataclasses.replace(instances[train_index], split=TRAIN_SPLIT)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="financebench",
+            display_name="FinanceBench",
+            description="FinanceBench is a benchmark for open book financial question answering. It "
+            "comprises 10,231 questions about publicly traded companies, with corresponding "
+            "answers and evidence strings [(Islam et al., "
+            "2023)](https://arxiv.org/abs/2311.11944/).",
+            taxonomy=TaxonomyInfo(
+                task="question answering with numeric reasoning",
+                what="financial reports",
+                when="2015 to 2023",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="annotation_financebench_label_correct_answer",
+            main_split="test",
+        )

helm/benchmark/scenarios/gsm_scenario.py

@@ -71,12 +71,18 @@ class GSM8KScenario(Scenario):
     def get_metadata(self) -> ScenarioMetadata:
         return ScenarioMetadata(
             name="gsm",
-            display_name="GSM8K (Grade
+            display_name="GSM8K (Grade School Math)",
             short_display_name="GSM8K",
             description="The grade school math word problems dataset (GSM8K) for testing mathematical "
             "reasoning on grade-school math problems [(Cobbe et al., "
             "2021)](https://arxiv.org/pdf/2110.14168.pdf).",
-            taxonomy=TaxonomyInfo(
-
+            taxonomy=TaxonomyInfo(
+                task="numeric answer question answering",
+                what="grade school math word problems",
+                when="2021",
+                who="contractors on Upwork and Surge AI",
+                language="English",
+            ),
+            main_metric="final_number_exact_match",
             main_split="test",
         )

helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py

@@ -2,9 +2,10 @@ import os
 import pandas as pd
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 
-from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output, ScenarioMetadata
 
 
 class HarmBenchGCGTransferScenario(Scenario):
@@ -48,3 +49,13 @@ class HarmBenchGCGTransferScenario(Scenario):
             instance = Instance(input=input, split=TEST_SPLIT, references=references, id=id)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="harm_bench_gcg_transfer",
+            display_name="HarmBenchGCGTransfer",
+            description="HarmBenchGCGTransfer",
+            taxonomy=TaxonomyInfo(task="question answering", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="safety_score",
+            main_split="test",
+        )

helm/benchmark/scenarios/harm_bench_scenario.py

@@ -2,9 +2,10 @@ import os
 import pandas as pd
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output, ScenarioMetadata
 
 
 class HarmBenchScenario(Scenario):
@@ -57,3 +58,13 @@ class HarmBenchScenario(Scenario):
             instance = Instance(input=input, split=TEST_SPLIT, references=references, sub_split=tag, id=id)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="harm_bench",
+            display_name="HarmBench",
+            description="HarmBench",
+            taxonomy=TaxonomyInfo(task="question answering", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="safety_score",
+            main_split="test",
+        )

helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py

@@ -4,6 +4,7 @@ from typing import List
 
 from datasets import load_dataset, Features, Value, Sequence, Dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
     CORRECT_TAG,
     TEST_SPLIT,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -88,3 +90,22 @@ class InfiniteBenchEnMCScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="infinite_bench_en_mc",
+            display_name="∞Bench En.MC",
+            description="∞Bench En.MC is a multiple-choice question answering task that requires "
+            "locating and processing information within a novel, performing reasoning "
+            "through aggregation or filtering to derive answers. ([Zhang et al., "
+            "2024](https://arxiv.org/abs/2402.13718))",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="Novels",
+                when="Before 2024",
+                who="Novel authors",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py

@@ -2,6 +2,7 @@ import os
 import re
 from typing import List
 from datasets import load_dataset, Features, Value, Sequence, Dataset
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
     CORRECT_TAG,
     TEST_SPLIT,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -77,3 +79,20 @@ class InfiniteBenchEnSumScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="infinite_bench_en_sum",
+            display_name="∞Bench En.Sum",
+            description="∞Bench En.Sum is a summarization task that requires generating a concise "
+            "summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))",
+            taxonomy=TaxonomyInfo(
+                task="multi-hop question answering",
+                what="Novels",
+                when="Before 2024",
+                who="Novel authors",
+                language="English",
+            ),
+            main_metric="rouge_l",
+            main_split="test",
+        )

helm/benchmark/scenarios/legalbench_scenario.py

@@ -149,15 +149,14 @@ class LegalBenchScenario(Scenario):
 
     def get_metadata(self) -> ScenarioMetadata:
         return ScenarioMetadata(
-            name=
+            name=self.name,
             display_name="LegalBench",
-            description="LegalBench is a large collaboratively constructed benchmark of legal "
-            "
-            "2023)[https://arxiv.org/abs/2308.11462] for more details.",
+            description="LegalBench is a large collaboratively constructed benchmark of legal reasoning "
+            "tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).",
             taxonomy=TaxonomyInfo(
-                task="
-                what="
-                when="
+                task="multiple-choice question answering",
+                what="public legal and admininstrative documents, manually " "constructed questions",
+                when="before 2023",
                 who="lawyers",
                 language="English",
             ),

helm/benchmark/scenarios/math_scenario.py

@@ -454,14 +454,21 @@ class MATHScenario(Scenario):
         return instances
 
     def get_metadata(self) -> ScenarioMetadata:
+        taxonomy = TaxonomyInfo(
+            task="numeric answer question answering",
+            what="math competitions (AMC, AIME, etc.)",
+            when="before 2021",
+            who="problem setters",
+            language="synthetic",
+        )
         if self.use_chain_of_thought:
             return ScenarioMetadata(
                 name="math_chain_of_thought",
-                display_name="MATH
+                display_name="MATH",
                 description="The MATH benchmark for measuring mathematical problem solving on competition "
                 "math problems with chain-of-thought style reasoning [(Hendrycks et al., "
-                "2021)](https://
-                taxonomy=
+                "2021)](https://arxiv.org/pdf/2103.03874.pdf).",
+                taxonomy=taxonomy,
                 main_metric="math_equiv_chain_of_thought",
                 main_split="test",
             )
@@ -472,7 +479,7 @@ class MATHScenario(Scenario):
             description="The MATH benchmark for measuring mathematical problem solving on competition "
             "math problems [(Hendrycks et al., "
            "2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html).",
-            taxonomy=
+            taxonomy=taxonomy,
             main_metric="math_equiv",
             main_split="test",
         )

helm/benchmark/scenarios/med_qa_scenario.py

@@ -113,7 +113,13 @@ class MedQAScenario(Scenario):
             description="MedQA is an open domain question answering dataset composed of questions from "
             "professional medical board exams ([Jin et al. "
             "2020](https://arxiv.org/pdf/2009.13081.pdf)).",
-            taxonomy=TaxonomyInfo(
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="US medical licensing exams",
+                when="before 2020",
+                who="problem setters",
+                language="English",
+            ),
             main_metric="quasi_exact_match",
             main_split="test",
         )

helm/benchmark/scenarios/medi_qa_scenario.py

@@ -51,7 +51,7 @@ class MediQAScenario(Scenario):
 
     name = "medi_qa"
     description = (
-        "MEDIQA is a benchmark designed to evaluate a model's ability to
+        "MEDIQA is a benchmark designed to evaluate a model's ability to generate"
         "medically accurate answers to patient-generated questions. Each instance includes a"
         "consumer health question, a set of candidate answers (used in ranking tasks), relevance"
         "annotations, and optionally, additional context. The benchmark focuses on supporting"
@@ -124,7 +124,7 @@ class MediQAScenario(Scenario):
             "health communication.",
             taxonomy=TaxonomyInfo(
                 task="Text generation",
-                what="
+                what="Generate medically accurate answers to patient-generated questions.",
                 when="Any",
                 who="Clinician, Medical Student",
                 language="English",