crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +1 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +76 -59
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +78 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/long_context_run_specs.py +67 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/numeracy_scenario.py +2 -1
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +63 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +100 -54
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/together_client.py +31 -4
- helm/clients/vertexai_client.py +6 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/local_context.py +140 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/config/model_deployments.yaml +864 -193
- helm/config/model_metadata.yaml +667 -53
- helm/config/tokenizer_configs.yaml +144 -3
- helm/proxy/cli.py +3 -1
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/scenarios/melt_translation_scenario.py (new file):

@@ -0,0 +1,152 @@
+from typing import Any, Dict, List, Optional
+
+from datasets import load_dataset, Dataset
+from helm.common.hierarchical_logger import htrack_block
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    VALID_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class MELTTranslationScenario(Scenario):
+    name = "melt_translation"
+    description = "Machine Translation scenario."
+    tags = ["machine_translation"]
+
+    def __init__(
+        self,
+        dataset_name: str,
+        revision: str,
+        source_language: str,
+        target_language: str,
+        subset: Optional[str] = None,
+        splits: Optional[Dict[str, str]] = None,
+    ):
+        """Initializes the question answering scenario.
+
+        Args:
+            dataset_name: The name of the dataset.
+            revision: The revision of the dataset to use.
+            source_language: The source language to use.
+            target_language: The target language to use.
+            subset: The subset of the dataset to use. Defaults to "".
+            splits: The splits to use for the dataset. Defaults to None.
+        """
+        super().__init__()
+        self.MAX_TRAIN_INSTANCES = 20_000
+        valid_languages = set(["vi", "en"])
+        self.dataset_name = dataset_name
+        self.subset = subset
+        self.revision = revision
+        self.splits = splits
+        self.source_language = source_language
+        self.target_language = target_language
+        if self.source_language not in valid_languages or self.target_language not in valid_languages:
+            raise ValueError("Supported languages: vi, en.")
+        if self.source_language == self.target_language:
+            raise ValueError("The source language and the target language should be different.")
+        if self.source_language != "en" and self.target_language != "en":
+            raise ValueError("One of the languages should be English.")
+
+    def get_instances_for_splits(self, splits: Dict[str, str]) -> List[Instance]:
+        """
+        Helper for generating instances for a split.
+        Args:
+            splits (dict): Which splits to partition the data into.
+        Returns:
+            List[Instance]: Instances from the file for the specified split.
+        """
+        with htrack_block("Loading the HuggingFace dataset. The first time could take several minutes."):
+            hf_dataset: Any = load_dataset(
+                self.dataset_name,
+                self.subset,
+                revision=self.revision,
+                trust_remote_code=True,
+            )
+
+        instances: List[Instance] = []
+
+        for dataset_split_name, helm_split_name in splits.items():
+            if helm_split_name == TRAIN_SPLIT:
+                hf_dataset[dataset_split_name] = hf_dataset[dataset_split_name].shuffle(seed=42)[
+                    : self.MAX_TRAIN_INSTANCES
+                ]
+                hf_dataset[dataset_split_name] = Dataset.from_dict(hf_dataset[dataset_split_name])
+
+            for example in hf_dataset[dataset_split_name]:
+                source_sentence = example[self.source_language]
+                target_sentence = example[self.target_language]
+                instances.append(
+                    Instance(
+                        input=Input(text=source_sentence),
+                        references=[Reference(Output(text=target_sentence), tags=[CORRECT_TAG])],
+                        split=helm_split_name,
+                    )
+                )
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        if self.splits is None:
+            splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
+        else:
+            splits = {}
+            if "train" in self.splits:
+                splits[self.splits[TRAIN_SPLIT]] = TRAIN_SPLIT
+            if "validation" in self.splits:
+                splits[self.splits[VALID_SPLIT]] = VALID_SPLIT
+            if "test" in self.splits:
+                splits[self.splits[TEST_SPLIT]] = TEST_SPLIT
+
+        instances: List[Instance] = self.get_instances_for_splits(splits=splits)
+        return instances
+
+
+class MELTTranslationOPUS100Scenario(MELTTranslationScenario):
+    """
+    Scenario for the OPUS100 dataset.
+    """
+
+    name = "melt_translation_opus100"
+    description = "OPUS100 dataset for machine translation."
+    tags = ["machine_translation"]
+
+    def __init__(self, **kwargs):
+        super().__init__(
+            dataset_name="vietgpt/opus100_envi",
+            revision="45df06fb0b31edc882d7c8d34389261f995e5208",
+            splits={
+                TRAIN_SPLIT: "train",
+                VALID_SPLIT: "validation",
+                TEST_SPLIT: "test",
+            },
+            **kwargs,
+        )
+
+
+class MELTTranslationPhoMTScenario(MELTTranslationScenario):
+    """
+    Scenario for the PhoMT dataset.
+    """
+
+    name = "melt_translation_phomt"
+    description = "PhoMT dataset for machine translation."
+    tags = ["machine_translation"]
+
+    def __init__(self, **kwargs):
+        super().__init__(
+            dataset_name="ura-hcmut/PhoMT",
+            revision="74386685db01dc038860ff0a90d9f5fbde284bf7",
+            splits={
+                TRAIN_SPLIT: "train",
+                VALID_SPLIT: "validation",
+                TEST_SPLIT: "test",
+            },
+            **kwargs,
+        )
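As a quick orientation (not part of the diff): the two subclasses above only pin the dataset, revision, and split names, so a caller still supplies the language direction. A minimal sketch, assuming the package is installed and the pinned HuggingFace datasets are reachable over the network:

```python
# Illustrative only; the class and constructor arguments come from the new file above.
from helm.benchmark.scenarios.melt_translation_scenario import MELTTranslationPhoMTScenario

# English -> Vietnamese translation instances from the pinned PhoMT revision.
scenario = MELTTranslationPhoMTScenario(source_language="en", target_language="vi")
instances = scenario.get_instances(output_path="./melt_phomt")  # output_path is unused here but required by the API
print(len(instances))
```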
helm/benchmark/scenarios/mental_health_scenario.py:

@@ -9,6 +9,7 @@ from helm.benchmark.scenarios.scenario import (
     PassageQuestionInput,
     Output,
 )
+from helm.common.general import check_file_exists
 
 
 class MentalHealthScenario(Scenario):
@@ -48,10 +49,19 @@ class MentalHealthScenario(Scenario):
     """
 
     name = "mental_health"
-    description =
-
+    description = (
+        "MentalHealth is a benchmark focused on evaluating empathetic communication in"
+        "mental health counseling. It includes simulated conversations between patients"
+        "and counselors, where the task is to generate compassionate and appropriate counselor"
+        "responses. The benchmark assesses a model's ability to support patients emotionally"
+        "and meaningfully engage in therapeutic conversations."
+    )
     tags = ["dialogue", "counseling", "mental_health", "empathy", "healthcare"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def process_dialogue_data(self, data: pd.DataFrame) -> List[Instance]:
         """
         Process the dialogue data into evaluation instances.
@@ -102,9 +112,10 @@ class MentalHealthScenario(Scenario):
         Returns:
             List[Instance]: List of processed instances for evaluation
         """
-
-
-
+        check_file_exists(
+            self.data_path, msg=f"[MentalHealthScenario] Required data file not found: '{self.data_path}'"
+        )
+        dialogue_data = pd.read_csv(self.data_path)
 
         # Process into instances
         instances = self.process_dialogue_data(dialogue_data)
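The same pattern repeats across the MedHELM scenarios below: hard-coded dataset locations are replaced by a `data_path` constructor argument that is validated with `check_file_exists`. A minimal sketch of the new calling convention (the CSV path below is hypothetical):

```python
# Illustrative only; MentalHealthScenario and its data_path argument are defined in the diff above.
from helm.benchmark.scenarios.mental_health_scenario import MentalHealthScenario

scenario = MentalHealthScenario(data_path="/data/medhelm/mental_health_dialogues.csv")  # hypothetical path
instances = scenario.get_instances(output_path="./mental_health")
# A missing file now fails fast with "[MentalHealthScenario] Required data file not found: ..."
```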
helm/benchmark/scenarios/mimic_bhc_scenario.py:

@@ -1,7 +1,7 @@
 import json
 from typing import Dict, List
 
-from helm.common.general import
+from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -59,15 +59,20 @@ class MIMICBHCScenario(Scenario):
 
     name = "mimic_bhc"
     description = (
-        "
-        "
+        "MIMIC-BHC is a benchmark focused on summarization of discharge notes into Brief"
+        "Hospital Course (BHC) sections. It consists of curated discharge notes from MIMIC-IV,"
+        "each paired with its corresponding BHC summary. The benchmark evaluates a model's"
+        "ability to condense detailed clinical information into accurate, concise summaries that"
+        "reflect the patient's hospital stay."
     )
     tags = ["summarization", "biomedical"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path =
-        ensure_directory_exists(data_path)
-        data_path = data_path + "mimic_iv_bhc.json"
+        check_file_exists(self.data_path, msg=f"[MIMICBHCScenario] Required data file not found: '{self.data_path}'")
 
         instances: List[Instance] = []
         # Limit to zero shot setting for now
@@ -77,7 +82,7 @@ class MIMICBHCScenario(Scenario):
             "test": TEST_SPLIT,
         }
 
-        with open(data_path, "r") as f:
+        with open(self.data_path, "r") as f:
            data = [json.loads(line) for line in f]
 
        for data_split, split in splits.items():
helm/benchmark/scenarios/mimic_rrs_scenario.py:

@@ -1,7 +1,7 @@
 import os
 from typing import Dict, List
 
-from helm.common.general import
+from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -44,15 +44,18 @@ class MIMICRRSScenario(Scenario):
 
     name = "mimic_rrs"
     description = (
-        "
-        "
+        "MIMIC-RRS is a benchmark constructed from radiology reports in the MIMIC-III"
+        "database. It contains pairs of 'Findings' and 'Impression' sections, enabling evaluation"
+        "of a model's ability to summarize diagnostic imaging observations into concise, clinically"
+        "relevant conclusions."
     )
     tags = ["question_answering", "biomedical"]
 
-    def
-
-
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
 
+    def get_instances(self, output_path: str) -> List[Instance]:
         instances: List[Instance] = []
         # Limit to zero shot setting for now
         splits: Dict[str, str] = {
@@ -64,8 +67,14 @@ class MIMICRRSScenario(Scenario):
         for data_split, split in splits.items():
             split_findings_name: str = f"{data_split}.findings.tok"
             split_impressions_name: str = f"{data_split}.impression.tok"
-            findings_path: str = os.path.join(data_path, split_findings_name)
-            impressions_path: str = os.path.join(data_path, split_impressions_name)
+            findings_path: str = os.path.join(self.data_path, split_findings_name)
+            impressions_path: str = os.path.join(self.data_path, split_impressions_name)
+            check_file_exists(
+                findings_path, msg=f"[MIMICRRSScenario] Required findings file not found: '{findings_path}'"
+            )
+            check_file_exists(
+                impressions_path, msg=f"[MIMICRRSScenario] Required impressions file not found: '{impressions_path}'"
+            )
             findings: List[str] = self.read_file(findings_path)
             impressions: List[str] = self.read_file(impressions_path)
             assert len(findings) == len(impressions), "Findings and impressions must have the same length"
helm/benchmark/scenarios/mimiciv_billing_code_scenario.py:

@@ -1,9 +1,8 @@
-import os
 import pandas as pd
 import numpy as np
 from typing import List
 
-from helm.common.general import
+from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -24,21 +23,28 @@ class MIMICIVBillingCodeScenario(Scenario):
     """
 
     name = "mimiciv_billing_code"
-    description =
+    description = (
+        "MIMIC-IV Billing Code is a benchmark derived from discharge summaries in the"
+        "MIMIC-IV database, paired with their corresponding ICD-10 billing codes. The task"
+        "requires models to extract structured billing codes based on free-text clinical notes,"
+        "reflecting real-world hospital coding tasks for financial reimbursement."
+    )
     tags = ["question_answering", "biomedical"]
 
-    def __init__(self,
+    def __init__(self, data_path: str):
         """
-        :param
+        :param data_path: Path to the mimiciv_icd10.feather file.
         """
         super().__init__()
-        self.
+        self.data_path = data_path
 
     def get_instances(self, output_path: str) -> List[Instance]:
-
+        check_file_exists(
+            self.data_path, msg=f"[MIMICIVBilligCodeScenario] Required data file not found: '{self.data_path}'"
+        )
 
         # Read the preprocessed MIMIC-IV data (.feather format)
-        df = pd.read_feather(self.
+        df = pd.read_feather(self.data_path)  # columns: ["text", "target", ...]
 
         instances: List[Instance] = []
 
helm/benchmark/scenarios/mtsamples_procedures_scenario.py:

@@ -31,9 +31,12 @@ class MTSamplesProceduresScenario(Scenario):
     GITHUB_DIR_URL = f"https://github.com/raulista1997/benchmarkdata/tree/{GIT_HASH}/mtsample_procedure"
     RAW_BASE_URL = f"https://raw.githubusercontent.com/raulista1997/benchmarkdata/{GIT_HASH}/mtsample_procedure/"
 
-    name = "
+    name = "mtsamples_procedures"
     description = (
-        "
+        "MTSamples Procedures is a benchmark composed of transcribed operative notes,"
+        "focused on documenting surgical procedures. Each example presents a brief patient case"
+        "involving a surgical intervention, and the model is tasked with generating a coherent"
+        "and clinically accurate procedural summary or treatment plan."
     )
     tags = ["medical", "transcription", "plan_generation"]
 
helm/benchmark/scenarios/mtsamples_replicate_scenario.py:

@@ -36,8 +36,9 @@ class MTSamplesReplicateScenario(Scenario):
 
     name = "mtsamples_replicate"
     description = (
-        "
-        "
+        "MTSamples Replicate is a benchmark that provides transcribed medical reports"
+        "from various specialties. It is used to evaluate a model's ability to generate clinically"
+        "appropriate treatment plans based on unstructured patient documentation"
     )
     tags = ["medical", "transcription", "plan_generation"]
 
helm/benchmark/scenarios/n2c2_ct_matching_scenario.py:

@@ -188,8 +188,13 @@ class N2C2CTMatchingScenario(Scenario):
     """
 
     name = "n2c2_ct_matching"
-    description =
-
+    description = (
+        "N2C2-CT is a benchmark designed to evaluate a model's ability to match patients to"
+        "appropriate clinical trials based on eligibility criteria. Each example includes a clinical"
+        "note and a trial description. The model is tasked with determining whether the patient"
+        "is a valid candidate for the trial. This benchmark supports automation and decision"
+        "support in clinical research enrollment."
+    )
     tags = []  # TODO
 
     POSSIBLE_ANSWER_CHOICES: List[str] = [
@@ -197,11 +202,12 @@ class N2C2CTMatchingScenario(Scenario):
         "no",
     ]
 
-    def __init__(self, subject: str):
+    def __init__(self, data_path: str, subject: str):
         super().__init__()
         self.subject: str = subject  # specific inclusion criterion to assess
-        self.
-        self.
+        self.data_path: str = data_path
+        self.path_to_train_dir: str = os.path.join(self.data_path, "train/")
+        self.path_to_test_dir: str = os.path.join(self.data_path, "test/")
 
     def create_prompt(self, patient: Dict[str, Any]) -> str:
         # Cast None values to empty strings during string formatting, but keep the original functions returning None
helm/benchmark/scenarios/numeracy_scenario.py:

@@ -11,6 +11,7 @@ from typing import List, Optional, Tuple, Dict
 
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.common.local_context import LocalContext
 from helm.benchmark.window_services.tokenizer_service import TokenizerService
 from helm.common.authentication import Authentication
 from helm.common.optional_dependencies import handle_module_not_found_error
@@ -39,7 +40,7 @@ except ModuleNotFoundError as e:
 # https://github.com/stanford-crfm/benchmarking/issues/569
 def get_test_tokenizer_service() -> TokenizerService:
     # Pointed to the default local path set in run.py (--local-path)
-    return TokenizerService(
+    return TokenizerService(LocalContext(base_path="prod_env"))
 
 
 SOLUTION_TAG: str = "solution"
helm/benchmark/scenarios/openai_mrcr_scenario.py (new file):

@@ -0,0 +1,79 @@
+import json
+import os
+import re
+from typing import List, Optional
+
+import datasets
+import tiktoken
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    Output,
+    Reference,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    Input,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class OpenAIMRCRScenario(Scenario):
+    """OpenAI MRCR scenario
+
+    OpenAI MRCR (Multi-round co-reference resolution) is a long context dataset for benchmarking
+    an LLM's ability to distinguish between multiple needles hidden in context. This eval is
+    inspired by the MRCR eval first introduced by Gemini (https://arxiv.org/pdf/2409.12640v2).
+
+    The task is as follows: The model is given a long, multi-turn, synthetically generated
+    conversation between user and model where the user asks for a piece of writing about a topic,
+    e.g. "write a poem about tapirs" or "write a blog post about rocks". Hidden in this conversation
+    are 2, 4, or 8 identical asks, and the model is ultimately prompted to return the i-th instance
+    of one of those asks. For example, "Return the 2nd poem about tapirs".
+
+    Reference: https://huggingface.co/datasets/openai/mrcr"""
+
+    name = "openai_mrcr"
+    description = "OpenAI MRCR (Multi-round co-reference resolution) is a long context dataset for benchmarking an LLM's ability to distinguish between multiple needles hidden in context. This eval is inspired by the MRCR eval first introduced by [Vodrahalli et al., 2024](https://arxiv.org/pdf/2409.12640v2)."  # noqa: E501
+    tags = ["long_context", "mrcr"]
+
+    NEEDLES_OPTIONS = [2, 4, 8]
+
+    def __init__(self, needles: int, max_num_words: Optional[int] = None):
+        super().__init__()
+        self.needles = needles
+        self.max_num_words = max_num_words
+        if needles not in self.NEEDLES_OPTIONS:
+            raise Exception(f"Needles must be one of {self.NEEDLES_OPTIONS}")
+        self.tokenizer = tiktoken.get_encoding("o200k_base")
+
+    def count_words(self, messages: list[dict]) -> int:
+        return sum([len(re.split(r"\s+", m["content"].strip())) for m in messages])
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "openai/mrcr",
+            cache_dir=cache_dir,
+            split="train",
+            data_files=[f"{self.needles}needle.parquet"],
+            revision="204b0d4e8d9ca5c0a90bf942fdb2a5969094adc0",
+        )
+        instances = []
+        for idx, row in enumerate(dataset):
+            messages = json.loads(row["prompt"])
+            if self.max_num_words and self.count_words(messages) > self.max_num_words:
+                continue
+            input = Input(messages=messages)
+            references = [Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])]
+            instance = Instance(
+                id=f"{self.needles}needle{idx}",
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+                extra_data={"random_string_to_prepend": row["random_string_to_prepend"]},
+            )
+            instances.append(instance)
+
+        return instances
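A quick sketch of how the new scenario would be driven (illustrative only; the needle count must be one of `NEEDLES_OPTIONS`, and the word cap below is an example value):

```python
# Illustrative only; OpenAIMRCRScenario is defined in the new file above.
from helm.benchmark.scenarios.openai_mrcr_scenario import OpenAIMRCRScenario

scenario = OpenAIMRCRScenario(needles=2, max_num_words=120_000)  # example values
instances = scenario.get_instances(output_path="./openai_mrcr")
# Each instance stores the synthetic multi-turn conversation in input.messages and the
# expected i-th piece of writing as its single CORRECT_TAG reference.
```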
helm/benchmark/scenarios/pubmed_qa_scenario.py:

@@ -125,7 +125,12 @@ class PubMedQAScenario(Scenario):
     """
 
     name = "pubmed_qa"
-    description =
+    description = (
+        "PubMedQA is a biomedical question-answering dataset that evaluates a model's"
+        "ability to interpret scientific literature. It consists of PubMed abstracts paired with"
+        "yes/no/maybe questions derived from the content. The benchmark assesses a model's"
+        "capability to reason over biomedical texts and provide factually grounded answers."
+    )
     tags = ["question_answering", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["yes", "no", "maybe"]
helm/benchmark/scenarios/race_based_med_scenario.py:

@@ -1,6 +1,6 @@
 import csv
+import os
 
-from filelock import FileLock
 from typing import Dict, List
 from docx import Document
 
@@ -13,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import ensure_file_downloaded
 
 
 def extract_red_text_runs(document):
@@ -87,10 +88,19 @@ class RaceBasedMedScenario(Scenario):
     """
 
     name = "race_based_med"
-    description =
-
+    description = (
+        "RaceBias is a benchmark used to evaluate language models for racially biased or"
+        "inappropriate content in medical question-answering scenarios. Each instance consists"
+        "of a medical question and a model-generated response. The task is to classify whether"
+        "the response contains race-based, harmful, or inaccurate content. This benchmark"
+        "supports research into bias detection and fairness in clinical AI systems."
+    )
     tags = ["knowledge", "reasoning", "biomedical"]
     POSSIBLE_ANSWER_CHOICES: List[str] = ["yes", "no"]
+    FILE_URL: str = (
+        "https://static-content.springer.com/esm/"
+        "art%3A10.1038%2Fs41746-023-00939-z/MediaObjects/41746_2023_939_MOESM1_ESM.docx"
+    )
 
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
@@ -114,12 +124,12 @@ class RaceBasedMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/share/pi/nigam/data/medhelm/race_based/race_based.csv"
         # Path to the word file from supplement: https://www.nature.com/articles/s41746-023-00939-z#Sec3
-
-
-
-
+        data_path = os.path.join(output_path, "race_based.csv")
+
+        if not os.path.exists(data_path):
+            word_file = os.path.join(output_path, "race_based.docx")
+            ensure_file_downloaded(source_url=self.FILE_URL, target_path=word_file, unpack=False)
             create_csv_from_word(word_file, data_path)
 
         instances: List[Instance] = []
helm/benchmark/scenarios/ruler_qa_scenario_helper.py:

@@ -133,7 +133,7 @@ def generate_samples(dataset: str, dataset_path: str, template: str, random_seed
         input_text, answer = generate_input_output(0, num_docs, template=template, random_seed=random_seed, qas=qas, docs=docs)
         # Calculate the number of tokens in the example
         total_tokens = len(_text_to_tokens(input_text + f' {answer}'))
-        print(f'Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Docs: {num_docs}')
+        # print(f'Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Docs: {num_docs}')
         if total_tokens + tokens_to_generate > max_seq_length:
             num_docs -= incremental
             break
@@ -142,7 +142,7 @@ def generate_samples(dataset: str, dataset_path: str, template: str, random_seed
         if num_docs > len(docs):
             num_docs = len(docs)
             break
-    print('Number of documents:', num_docs)
+    # print('Number of documents:', num_docs)
 
     # Generate samples
     for index in tqdm(range(num_samples)):
helm/benchmark/scenarios/ruler_qa_scenarios.py:

@@ -72,7 +72,7 @@ Question: {query} Answer:""" # noqa: E501
 
 class RULERHotpotQAScenario(_RULERQAScenario):
     name = "ruler_hotpotqa"
-    description = "
+    description = "RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question answering as a long-context scenario."  # noqa: E501
     tags = ["long_context", "rag"]
 
     def __init__(self, max_num_words: int):
@@ -81,7 +81,7 @@ class RULERHotpotQAScenario(_RULERQAScenario):
 
 class RULERSQuADScenario(_RULERQAScenario):
     name = "ruler_squad"
-    description = "
+    description = "RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., 2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question answering as a long-context scenario."  # noqa: E501
     tags = ["long_context", "rag"]
 
     def __init__(self, max_num_words: int):
|