crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +1 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +76 -59
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +78 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/long_context_run_specs.py +67 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/numeracy_scenario.py +2 -1
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +63 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +100 -54
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/together_client.py +31 -4
- helm/clients/vertexai_client.py +6 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/local_context.py +140 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/config/model_deployments.yaml +864 -193
- helm/config/model_metadata.yaml +667 -53
- helm/config/tokenizer_configs.yaml +144 -3
- helm/proxy/cli.py +3 -1
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
@@ -0,0 +1,109 @@
+from typing import List, Tuple
+import os
+import json
+
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+
+
+def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
+    """
+    Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
+    Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
+
+    Args:
+        directory: Path to the directory containing the files
+
+    Returns:
+        List of tuples where each tuple contains (mp3_path, json_path)
+    """
+    pairs = []
+
+    # Walk through all directories and subdirectories
+    for root, _, files in os.walk(directory):
+        # Get all MP3 files in current directory
+        mp3_files = [f for f in files if f.endswith(".mp3")]
+
+        for mp3_file in mp3_files:
+            base_name = os.path.splitext(mp3_file)[0]
+            json_file = f"{base_name}.json"
+
+            # Check if corresponding JSON file exists in the same directory
+            if json_file in files:
+                mp3_path = os.path.join(root, mp3_file)
+                json_path = os.path.join(root, json_file)
+                pairs.append((mp3_path, json_path))
+
+    return pairs
+
+
+class UltraSuiteDisorderSymptomsScenario(Scenario):
+    """
+    A scenario identifying features of speech disorders within the provided audio.
+    The audio files contain speech from children, potentially with an adult present.
+    """
+
+    name = "speech_disorder"
+    description = "A scenario for evaluating speech disorders in children"
+    tags = ["audio", "classification", "speech_disorder"]
+    HF_MAPPING_URL = "https://https://huggingface.co/datasets/SAA-Lab/SLPHelmManualLabels"
+
+    def get_instruction(self, words: str) -> str:
+        prompt = f"""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording will be provided, typically consisting of a speech prompt from a pathologist followed by a child's repetition. The prompt the child is trying to repeat is as follows: {words}. Based on your professional expertise: 1. Assess the child's speech in the recording and recognize any abnormal features in the child's speech. 2. These features can be on of the following: A - 'substitution', B - 'omission', C - 'addition', D - 'typically_developing', or E - 'stuttering'. Here, 'substitution' is when the child substitutes one word/phrase/syllable for another. 'omission' is when the child omits one word/phrase/syllable. 'addition' is when the child adds one word/phrase/syllable. 'typically_developing' is when the child's speech is typical of a child of their age. 'stuttering' is when the child stutters, has difficulty speaking, repeats sounds/words or prolongs sounds/words. 3. Provide your response as a single letter without any additional explanation, commentary, or unnecessary text."""  # noqa: E501
+
+        return prompt
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """
+        Create instances from the audio files and their corresponding JSON annotations.
+        The data directory should contain:
+        - Audio files (e.g., .mp3)
+        - A JSON file with annotations containing 'answer' field
+        """
+        print(f"Downloading dataset from {UltraSuiteDisorderSymptomsScenario.HF_MAPPING_URL} to {output_path}")
+        ensure_file_downloaded(source_url=UltraSuiteDisorderSymptomsScenario.HF_MAPPING_URL, target_path=output_path)
+
+        instances: List[Instance] = []
+        split: str = TEST_SPLIT
+
+        # Find all pairs of audio and JSON files
+        pairs = find_audio_json_pairs(output_path)
+
+        for audio_path, json_path in tqdm(pairs):
+
+            # Load the annotation
+            with open(json_path, "r") as f:
+                annotation = json.load(f)
+
+            # Get the correct answer and convert to label
+            if "disorder_symptom" not in annotation or "transcription" not in annotation:
+                continue
+            label = annotation["disorder_symptom"]
+            prompt = annotation["transcription"]
+            # Create references for each option
+            references: List[Reference] = []
+            for option in ["substitution", "omission", "addition", "typically_developing", "stuttering"]:
+                reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
+                references.append(reference)
+
+            # Create the input with audio and instruction
+            content = [
+                MediaObject(content_type="audio/mpeg", location=audio_path),
+                MediaObject(content_type="text/plain", text=self.get_instruction(prompt)),
+            ]
+
+            input = Input(multimedia_content=MultimediaObject(content))
+            instances.append(Instance(input=input, references=references, split=split))
+
+        return instances
@@ -46,6 +46,10 @@ class VocalSoundScenario(Scenario):
     description = "Classify an audio sample of a spoken digit ([Gong et al, 2022](https://arxiv.org/abs/2205.03433))."
     tags: List[str] = ["audio", "classification"]

+    def __init__(self, sound: str) -> None:
+        super().__init__()
+        self._sound: str = sound
+
     def get_instances(self, output_path: str) -> List[Instance]:
         instances: List[Instance] = []
         down_loading_path = os.path.join(output_path, "download")
@@ -53,7 +57,12 @@ class VocalSoundScenario(Scenario):
         wav_save_dir = os.path.join(down_loading_path, "audio_16k")
         for file_name in tqdm(os.listdir(wav_save_dir)):
             local_audio_path: str = os.path.join(wav_save_dir, file_name)
-            if
+            if (
+                not file_name.endswith(".wav")
+                or is_invalid_audio_file(local_audio_path)
+                # Skip this problematic file
+                or file_name == "m0083_0_sneeze.wav"
+            ):
                 continue

             input = Input(
@@ -61,9 +70,14 @@ class VocalSoundScenario(Scenario):
             )

             answer: str = file_name.split("_")[-1].split(".")[0]
+            if answer.lower() != self._sound:
+                continue
+
             if answer == "throatclearing":
                 answer = "throat clearing"

             references = [Reference(Output(text=str(answer)), tags=[CORRECT_TAG])]
             instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+
+        assert len(instances) > 0, f"No instances found for sound: {self._sound}"
         return instances
@@ -40,8 +40,7 @@ class VoxCeleb2Scenario(Scenario):
         "https://huggingface.co/datasets/LAOS-Y/VoxCeleb2-AudioIdentity/resolve/main/voxceleb2_audioidentity.csv"
     )
     IDENTITY_INSTRUCTION = (
-        "Listen to the audio and take your best guess to determine if the two speakers are the same person.
-        "Give just the letter of your answer and nothing else."
+        "Listen to the audio and take your best guess to determine if the two speakers are the same person."
     )

     name = "voxceleb2"
@@ -12,7 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import
+from helm.common.hierarchical_logger import hwarn


 class AutoBencherCapabilitiesScenario(Scenario):
@@ -61,7 +61,7 @@ class AutoBencherCapabilitiesScenario(Scenario):
             # References are category ID, followed by level 2, 3 and 4 category names.
             references = [Reference(output=Output(text=row["gold_answer"]), tags=[CORRECT_TAG])]
             if row["gold_answer"] is None:
-
+                hwarn(f"Row had no gold_answer: {row}")
                 continue
             instance = Instance(input=input, references=references, split=TEST_SPLIT)
             instances.append(instance)
@@ -1,8 +1,7 @@
-import os
 import pandas as pd
 from typing import List

-from helm.common.general import
+from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -61,28 +60,30 @@ Clinical Note:

 class CHWCarePlanScenario(Scenario):
     """
-    A scenario for
+    A scenario for a dataset containing free form text of a clinical health worker care plan, with the
+    associated goal being to restructure that text into a given format.

     - Input: The clinical note (column "MO Note").
     - Output: The clinical note (column "MO Note"). We will use this note as the reference for entailment.
     """

     name = "chw_care_plan"
-    description =
-
+    description = (
+        "NoteExtract is a benchmark that focuses on the structured extraction of information"
+        "from free-form clinical text. It provides care plan notes authored by health workers"
+        "and evaluates a model's ability to convert them into a predefined structured format,"
+        "such as fields for Chief Complaint and History of Present Illness. The benchmark"
+        "emphasizes faithful extraction without hallucination or inference."
+    )
     tags = ["question_answering", "biomedical"]

-    def __init__(self):
-        """
-        :param data_file: Path to the mimiciv_icd10.feather file.
-        """
+    def __init__(self, data_path: str):
         super().__init__()
-        self.
+        self.data_path = data_path

     def get_instances(self, output_path: str) -> List[Instance]:
-
-
-        df = pd.read_csv(self.data_file)  # columns: ["text", "target", ...]
+        check_file_exists(self.data_path, msg=f"[CHWCarePlanScenario] Required data file not found: '{self.data_path}'")
+        df = pd.read_csv(self.data_path)  # columns: ["text", "target", ...]

         instances: List[Instance] = []

@@ -2,7 +2,7 @@ import os
 import pandas as pd
 from typing import List

-from helm.common.general import
+from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -76,16 +76,21 @@ class CLEARScenario(Scenario):
         "unemployment": "unemployment",
     }

-    def __init__(self, condition: str):
+    def __init__(self, condition: str, data_path: str):
         """Initialize the scenario with a specific medical condition"""
         super().__init__()

         if condition not in self.CONDITIONS:
             raise ValueError(f"Condition '{condition}' not supported. Available conditions: {self.CONDITIONS}")
-
+        self.data_path = data_path
         self.condition = condition
         self.name = f"clear_{condition}"
-        self.description =
+        self.description = (
+            "CLEAR is a benchmark designed to evaluate models on their ability to detect medical"
+            "conditions from patient notes using categorical responses. Each instance consists of"
+            "a clinical note and a target condition, requiring the model to classify the patient's"
+            "history as either affirmative, negative, or uncertain."
+        )  # noqa: E501
         self.tags = ["classification", "biomedical", condition.replace("_", "-")]

     def get_answer_choices(self) -> List[str]:
@@ -95,9 +100,8 @@ class CLEARScenario(Scenario):

     def get_instances(self, output_path: str) -> List[Instance]:
         """Load and process the data for the specified conditon."""
-
-        excel_path =
-        ensure_directory_exists(os.path.dirname(excel_path))
+        excel_path = os.path.join(self.data_path, f"{self.condition}.xlsx")
+        check_file_exists(excel_path, msg=f"[CLEARScenario] Required data file not found: '{excel_path}'")

         df = pd.read_excel(excel_path)

@@ -1,5 +1,5 @@
 from typing import List
-from helm.common.general import
+from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -21,26 +21,34 @@ def file_preprocessing(data_path: str, task_objective: str) -> pd.DataFrame:
     data_path is directory that contains the downloaded files: '{base_dir}/physionet.org/'
     """
     # Load the first CSV file
-
-
+    diagnosis_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/diagnosis.csv.gz"
+    check_file_exists(
+        diagnosis_path, msg=f"[DischargeMeScenario] Required diagnosis file not found: '{diagnosis_path}'"
     )
-
-
+    discharge_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/discharge.csv.gz"
+    check_file_exists(
+        discharge_path, msg=f"[DischargeMeScenario] Required discharge file not found: '{discharge_path}'"
     )
+    target_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/discharge_target.csv.gz"
+    check_file_exists(target_path, msg=f"[DischargeMeScenario] Required target file not found: '{target_path}'")
+    radiology_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/radiology.csv.gz"
+    check_file_exists(
+        radiology_path, msg=f"[DischargeMeScenario] Required radiology file not found: '{radiology_path}'"
+    )
+    ed_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/edstays.csv.gz"
+    check_file_exists(ed_path, msg=f"[DischargeMeScenario] Required ed file not found: '{ed_path}'")
+    triage_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/triage.csv.gz"
+    check_file_exists(triage_path, msg=f"[DischargeMeScenario] Required triage file not found: '{triage_path}'")
+    df_diagnosis = pd.read_csv(diagnosis_path, compression="gzip", keep_default_na=False)
+    df_discharge = pd.read_csv(discharge_path, compression="gzip", keep_default_na=False)
     df_target = pd.read_csv(
-
+        target_path,
         compression="gzip",
         keep_default_na=False,
     )
-    df_radiology = pd.read_csv(
-
-    )
-    df_ed = pd.read_csv(
-        f"{data_path}/files/discharge-me/1.3/test_phase_1/edstays.csv.gz", compression="gzip", keep_default_na=False
-    )
-    df_triage = pd.read_csv(
-        f"{data_path}/files/discharge-me/1.3/test_phase_1/triage.csv.gz", compression="gzip", keep_default_na=False
-    )
+    df_radiology = pd.read_csv(radiology_path, compression="gzip", keep_default_na=False)
+    df_ed = pd.read_csv(ed_path, compression="gzip", keep_default_na=False)
+    df_triage = pd.read_csv(triage_path, compression="gzip", keep_default_na=False)
     df_diagnosis_triage = pd.merge(
         df_diagnosis, df_triage, on="subject_id", how="inner", suffixes=("_df_diagnosis", "_df_triage")
     )
@@ -113,16 +121,23 @@ class DischargeMeScenario(Scenario):
     """

     name = "dischargeme"
-    description =
-
+    description = (
+        "DischargeMe is a benchmark designed to evaluate clinical text generation. It pairs"
+        "discharge summaries and radiology reports from MIMIC-IV with generation tasks"
+        "such as writing discharge instructions or summarizing the brief hospital course. The"
+        "benchmark assesses a model's ability to generate patient-facing documentation that is"
+        "complete, empathetic, and clinically accurate."
+    )
     tags = ["biomedical"]

+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/share/pi/nigam/data/physionet.org"
-        ensure_directory_exists(data_path)
         instances: List[Instance] = []
-        df_bhc = file_preprocessing(data_path, "brief_hospital_course")
-        df_di = file_preprocessing(data_path, "discharge_instructions")
+        df_bhc = file_preprocessing(self.data_path, "brief_hospital_course")
+        df_di = file_preprocessing(self.data_path, "discharge_instructions")

         for i in range(df_bhc.shape[0]):
             prompt_bhc = create_prompt(
@@ -36,7 +36,13 @@ class EhrSqlScenario(Scenario):
     )

     name = "ehr_sql"
-    description =
+    description = (
+        "EHRSQL is a benchmark designed to evaluate models on generating structured queries"
+        "for clinical research. Each example includes a natural language question and a database"
+        "schema, and the task is to produce an SQL query that would return the correct result"
+        "for a biomedical research objective. This benchmark assesses a model's understanding"
+        "of medical terminology, data structures, and query construction."
+    )
     tags = ["sql", "medical", "reasoning"]

     def setup_database(self, output_path: str) -> str:
@@ -3,12 +3,11 @@ import os
 import pandas as pd
 import tiktoken

-from filelock import FileLock
 from functools import partial
 from tqdm import tqdm
 from typing import Any, Dict, List, Optional, Mapping

-from helm.common.general import ensure_directory_exists
+from helm.common.general import check_file_exists, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
@@ -1411,7 +1410,10 @@ class EHRSHOTScenario(Scenario):

     name = "ehrshot"
     description = (
-        "
+        "EHRSHOT is a benchmark designed to evaluate a model's ability to predict future"
+        "clinical events using structured EHR data. Each instance contains a patient's"
+        "historical EHR data and a forward-looking clinical question about whether a particular"
+        "diagnosis, lab result, or hospital event will occur."
     )
     tags = []  # TODO

@@ -1420,24 +1422,32 @@
         "no",
     ]

-    def __init__(self, subject: str, max_length: Optional[int] = None):
+    def __init__(self, subject: str, data_path: str, max_length: Optional[int] = None):
         super().__init__()
         self.subject: str = subject  # same as "task" or "labeling_function"
-        self.path_to_meds_dir: str = "/share/pi/nigam/data/medhelm/ehrshot/meds/"
-        self.path_to_tmp_dir: str = "/share/pi/nigam/data/medhelm/ehrshot/prompts/"
         self.max_length = max_length
+        self.data_path = data_path

-    def create_benchmark(self, n_procs: int = 4) -> Dict[str, str]:
+    def create_benchmark(self, output_path: str, n_procs: int = 4) -> Dict[str, str]:
         """Loads the MEDS dataset and converts it to prompts"""
-
         # Load MEDS EHRSHOT patient timelines
-
-
-
+        data_parquet_path = os.path.join(self.data_path, "data/data.parquet")
+        check_file_exists(
+            data_parquet_path, msg=f"[EHRSHOTScenario] Required parquet data file not found: '{data_parquet_path}'"
+        )
+        splits_parquet_path = os.path.join(self.data_path, "metadata/subject_splits.parquet")
+        check_file_exists(
+            splits_parquet_path, msg=f"[EHRSHOTScenario] Required splits file not found: '{splits_parquet_path}'"
+        )
+        df_data = pd.read_parquet(data_parquet_path)
+        df_splits = pd.read_parquet(splits_parquet_path)
         # Load MEDS EHRSHOT labels
-        tasks = sorted(os.listdir(os.path.join(self.
+        tasks = sorted(os.listdir(os.path.join(self.data_path, "labels")))
         for t in tasks:
-            path_to_labels: str = os.path.join(self.
+            path_to_labels: str = os.path.join(self.data_path, "labels", t, "labels.parquet")
+            check_file_exists(
+                path_to_labels, msg=f"[EHRSHOTScenario] Required labels file not found: '{path_to_labels}'"
+            )
             if t != self.subject or not os.path.exists(path_to_labels):
                 continue
             df_labels = pd.read_parquet(path_to_labels)
@@ -1470,18 +1480,16 @@
         df_labels["prompt"] = prompts

         # Save to parquet
-        path_to_output_dir: str = os.path.join(
+        path_to_output_dir: str = os.path.join(output_path, self.subject)
         ensure_directory_exists(path_to_output_dir)
         df_labels.to_parquet(os.path.join(path_to_output_dir, "medhelm_prompts.parquet"))
         return {"status": "success"}

     def get_instances(self, output_path: str) -> List[Instance]:
-        path_to_input_csv: str = os.path.join(
-
-
-
-        print(f"Creating benchmark from SCRATCH for {self.subject}...")
-        self.create_benchmark()  # Create benchmark from scratch
+        path_to_input_csv: str = os.path.join(output_path, self.subject, "medhelm_prompts.parquet")
+        if not os.path.exists(path_to_input_csv):
+            print(f"Creating benchmark from SCRATCH for {self.subject}...")
+            self.create_benchmark(output_path=output_path)  # Create benchmark from scratch

         # Load data for this task
         df = pd.read_parquet(path_to_input_csv)
@@ -1509,38 +1517,3 @@
         )

         return instances
-
-
-if __name__ == "__main__":
-    # Generate statistics on prompts
-    from transformers import AutoTokenizer
-
-    tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    tqdm.pandas()
-    n_procs: int = 10
-
-    os.makedirs("./ehrshot_stats", exist_ok=True)
-    for t in TASK_FULL_NAMES.keys():
-        # Skip if already exists
-        if os.path.exists(f"./ehrshot_stats/{t}.txt"):
-            print(f"Skipping {t} because it already exists")
-            continue
-
-        # Create benchmark
-        scenario = EHRSHOTScenario(subject=t)
-        scenario.create_benchmark(n_procs=n_procs)
-        instances = scenario.get_instances("test.csv")
-
-        # Calculate prompt token stats
-        path_to_input_csv = os.path.join(scenario.path_to_tmp_dir, scenario.subject, "medhelm_prompts.parquet")
-        df = pd.read_parquet(path_to_input_csv)
-        df["prompt_n_tokens"] = df["prompt"].progress_apply(lambda x: len(tokenizer.encode(x)))
-        with open(f"./ehrshot_stats/{t}.txt", "w") as f:
-            f.write("-" * 100 + "\n")
-            f.write(f"Task: {t}\n")
-            f.write(f"# of instances: {len(instances)}\n")
-            f.write(f"# of positives: {df['boolean_value'].sum()}\n")
-            f.write(f"Size of splits:\n{df['split'].value_counts()}\n")
-            f.write(f"# tokens per prompt:\n{df['prompt_n_tokens'].describe()}\n")
-            f.write("-" * 100 + "\n")
-        df.to_parquet(os.path.join(scenario.path_to_tmp_dir, scenario.subject, "medhelm_prompts.parquet"))
@@ -2,7 +2,7 @@ from collections import defaultdict
 from dataclasses import dataclass, field, replace
 from functools import cached_property
 from typing import List, Optional
-from helm.common.hierarchical_logger import
+from helm.common.hierarchical_logger import hwarn

 import dacite
 import re
@@ -111,7 +111,7 @@ def validate_grammar(grammar: Grammar):
         # Make sure all categories are defined
         for category in expansion.categories:
             if category not in grammar.category_to_rules:
-
+                hwarn(f"Category {category} is not defined")


 def read_grammar(path: str) -> Grammar:
@@ -57,7 +57,12 @@ class HeadQAScenario(Scenario):
     SKIP_TEXTQA: bool = False

     name = "head_qa"
-    description =
+    description = (
+        "HeadQA is a benchmark consisting of biomedical multiple-choice questions intended to"
+        "evaluate a model's medical knowledge and reasoning. Each instance presents a clinical"
+        "or scientific question with four answer options, requiring the model to select the most"
+        "appropriate answer."
+    )
     tags = ["question_answering", "biomedical", "medicine"]

     def __init__(self, language: str = "en", category: Optional[str] = None):
@@ -0,0 +1,85 @@
+import os
+import re
+from typing import List
+
+from datasets import load_dataset, Features, Value, Sequence, Dataset
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Reference,
+    Output,
+    CORRECT_TAG,
+    TEST_SPLIT,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class InfiniteBenchEnQAScenario(Scenario):
+    """InfiniteBench En.QA
+
+    InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
+    understand, and reason over long contexts (100k+ tokens). InfiniteBench En.QA is a subset of
+    InfiniteBench that requires models to perform open-form question answering on questions that necessitate
+    long-range dependency and reasoning, beyond simple short passage retrieval.
+    """
+
+    name = "infinite_bench_en_qa"
+    description = "∞Bench En.QA is a summarization task that requires generating a concise summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
+    tags = ["question_answering"]
+
+    def __init__(self, max_num_words: int):
+        self.max_num_words = max_num_words
+        super().__init__()
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Get InfiniteBench from HuggingFace
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+
+        # Define the features schema
+        ft = Features(
+            {
+                "id": Value("int64"),
+                "context": Value("string"),
+                "input": Value("string"),
+                "answer": Sequence(Value("string")),
+                "options": Sequence(Value("string")),
+            }
+        )
+
+        # Load the dataset with the specified features
+        dataset = load_dataset(
+            "xinrongzhang2022/InfiniteBench",
+            split="longbook_qa_eng",
+            features=ft,
+            revision="90f0394333616266d9fe85824ceaf505093cbaa5",
+        )
+
+        assert isinstance(dataset, Dataset)
+
+        def count_words(text: str) -> int:
+            return len(re.split(r"\s+", text.strip()))
+
+        dataset = dataset.filter(
+            lambda example: count_words(example["context"])
+            + count_words(example["input"])
+            + sum(count_words(option) for option in example["options"])
+            <= self.max_num_words
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row in dataset:
+            id = row["id"]
+            input = Input(text=row["context"] + "\n\n" + row["input"])
+            instance = Instance(
+                id=id,
+                input=input,
+                references=[Reference(Output(text=row["answer"][0]), tags=[CORRECT_TAG])],
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances