crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +1 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +76 -59
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +78 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/long_context_run_specs.py +67 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/numeracy_scenario.py +2 -1
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +63 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +100 -54
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/together_client.py +31 -4
- helm/clients/vertexai_client.py +6 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/local_context.py +140 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/config/model_deployments.yaml +864 -193
- helm/config/model_metadata.yaml +667 -53
- helm/config/tokenizer_configs.yaml +144 -3
- helm/proxy/cli.py +3 -1
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/scenarios/shc_bmt_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -23,13 +24,19 @@ class SHCBMTMedScenario(Scenario):
 
     name = "shc_bmt_med"
     description = (
-        "
-        "
+        "BMT-Status is a benchmark composed of clinical notes and associated binary questions"
+        "related to bone marrow transplant (BMT), hematopoietic stem cell transplant (HSCT),"
+        "or hematopoietic cell transplant (HCT) status. The goal is to determine whether the"
+        "patient received a subsequent transplant based on the provided clinical documentation."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -39,7 +46,7 @@ class SHCBMTMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"Provide an answer to the following {question} with the following context: {context} "
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
                     ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
                     "details or response, just a simple A or B response."
                 )
@@ -47,10 +54,9 @@ class SHCBMTMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path =
-
+        check_file_exists(self.data_path, msg=f"[SHCBMTMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCBMTMedScenario.POSSIBLE_ANSWER_CHOICES
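Every SHC scenario in this release follows the same refactor shown above: the hard-coded data_path inside get_instances is replaced by a data_path constructor argument that is validated with check_file_exists before the CSV is parsed. A minimal sketch of how the refactored class could be exercised directly (the CSV path below is an illustrative assumption; in normal HELM runs the path is presumably supplied through the MedHELM run specs rather than by instantiating the scenario by hand):

# Illustrative smoke test of the new constructor-based data path (not part of the diff).
from helm.benchmark.scenarios.shc_bmt_scenario import SHCBMTMedScenario

scenario = SHCBMTMedScenario(data_path="/path/to/bmt_status.csv")  # hypothetical local CSV
instances = scenario.get_instances(output_path="/tmp/shc_bmt")  # output_path is not used for reading here
print(len(instances), instances[0].input.text[:80])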
helm/benchmark/scenarios/shc_cdi_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -24,13 +25,18 @@ class SHCCDIMedScenario(Scenario):
 
     name = "shc_cdi_med"
     description = (
-        "
-        "
+        "CDI-QA is a benchmark constructed from Clinical Documentation Integrity (CDI)"
+        "notes. It is used to evaluate a model's ability to verify clinical conditions based on"
+        "documented evidence in patient records."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -40,7 +46,7 @@ class SHCCDIMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"Provide an answer to the following {question} with the following context: {context} , "
+                    f"Provide an answer to the following question: {question} with the following context: {context} , "
                     "Answer the question with either 'A' for yes or 'B' for no. Do not provide any "
                     "additional details or response, just a simple A or B response."
                 )
@@ -48,10 +54,9 @@ class SHCCDIMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path =
-
+        check_file_exists(self.data_path, msg=f"[SHCCDIMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCCDIMedScenario.POSSIBLE_ANSWER_CHOICES
helm/benchmark/scenarios/shc_conf_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -24,13 +25,19 @@ class SHCCONFMedScenario(Scenario):
 
     name = "shc_conf_med"
     description = (
-        "
-        "
+        "MedConfInfo is a benchmark comprising clinical notes from adolescent patients. It is"
+        "used to evaluate whether the content contains sensitive protected health information"
+        "(PHI) that should be restricted from parental access, in accordance with adolescent"
+        "confidentiality policies in clinical care."
    )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -40,7 +47,7 @@ class SHCCONFMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"Provide an answer to the following {question} with the following context: {context} "
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
                     ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
                     "details or response, just a simple A or B response."
                 )
@@ -48,10 +55,9 @@ class SHCCONFMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path =
-
+        check_file_exists(self.data_path, msg=f"[SHCCONFMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCCONFMedScenario.POSSIBLE_ANSWER_CHOICES
helm/benchmark/scenarios/shc_ent_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -23,13 +24,18 @@ class SHCENTMedScenario(Scenario):
 
     name = "shc_ent_med"
     description = (
-        "
-        "
+        "ENT-Referral is a benchmark designed to evaluate whether a patient's clinical note"
+        "supports a referral to an Ear, Nose, and Throat (ENT) specialist. It helps assess"
+        "models' abilities to make referral decisions based on unstructured clinical text."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B", "C"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         counter = 1
@@ -41,7 +47,7 @@ class SHCENTMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"{counter} Provide an answer to the following {question} with the following context:"
+                    f"{counter} Provide an answer to the following question: {question} with the following context:"
                     f" {context} , Answer the question with either 'A' for yes, 'B' for no, or 'C' for no mention."
                     " Do not provide any additional details or response, just a simple A, B, or C response."
                 )
@@ -50,10 +56,9 @@ class SHCENTMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path =
-
+        check_file_exists(self.data_path, msg=f"[SHCENTMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCENTMedScenario.POSSIBLE_ANSWER_CHOICES
helm/benchmark/scenarios/shc_gip_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -22,11 +23,19 @@ class SHCGIPMedScenario(Scenario):
     """
 
     name = "shc_gip_med"
-    description =
+    description = (
+        "HospiceReferral is a benchmark that evaluates model performance in identifying"
+        "whether patients are eligible for hospice care based on palliative care clinical notes."
+        "The benchmark focuses on end-of-life care referral decisions."
+    )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -36,7 +45,7 @@ class SHCGIPMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"Provide an answer to the following {question} with the following context: {context} "
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
                     ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
                     "details or response, just a simple A or B response."
                 )
@@ -44,10 +53,9 @@ class SHCGIPMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path =
-
+        check_file_exists(self.data_path, msg=f"[SHCGIPMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCGIPMedScenario.POSSIBLE_ANSWER_CHOICES
helm/benchmark/scenarios/shc_privacy_scenario.py

@@ -0,0 +1,78 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import check_file_exists
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCPRIVACYMedScenario(Scenario):
+    """
+    This dataset features messages sent generated by an LLM from patient clinical notes data.
+    The scenario evaluates the ability of an LLM to determine if any potentially confidential
+    information about the patient was included. From publication: https://doi.org/10.1001/jamapediatrics.2024.4438
+    """
+
+    name = "shc_privacy_med"
+    description = (
+        "PrivacyDetection is a benchmark composed of patient portal messages submitted by"
+        "patients or caregivers. The task is to determine whether the message contains any"
+        "confidential or privacy-leaking information that should be protected."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    "You are reviewing clinical messages in order to determine if any confidential "
+                    f"information was included. Please answer the following question: {question} with the "
+                    f"following context: {context} , Answer the question with a 'A' for yes or 'B' "
+                    "for no. Do not provide any additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(
+            self.data_path, msg=f"[SHCPRIVACYMedScenario] Required data file not found: '{self.data_path}'"
+        )
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(self.data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCPRIVACYMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCPRIVACYMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
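Like the other SHC scenarios, the new PrivacyDetection scenario reads a local CSV whose columns must be named prompt, context, and label, with label drawn from POSSIBLE_ANSWER_CHOICES. A sketch of a file that create_benchmark above would accept (the row contents are invented for illustration; the real dataset is access-restricted and not distributed with the package):

import csv

# Column names mirror the row["prompt"], row["context"], row["label"] lookups in create_benchmark.
with open("shc_privacy_example.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["prompt", "context", "label"])
    writer.writeheader()
    writer.writerow(
        {
            "prompt": "Does this message contain confidential patient information?",  # invented question
            "context": "Example patient portal message text.",  # invented context
            "label": "B",  # must be "A" or "B" to pass the assertion in get_instances
        }
    )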
helm/benchmark/scenarios/shc_proxy_scenario.py

@@ -0,0 +1,76 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import check_file_exists
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCPROXYMedScenario(Scenario):
+    """
+    This dataset features messages sent by proxy users and non proxy users, for evaluation of
+    LLM capabilities to determine the sender. From publication: https://doi.org/10.1001/jamapediatrics.2024.4438
+    """
+
+    name = "shc_proxy_med"
+    description = (
+        "ProxySender is a benchmark composed of patient portal messages received by clinicians."
+        "It evaluates whether the message was sent by the patient or by a proxy user (e.g., parent,"
+        "spouse), which is critical for understanding who is communicating with healthcare"
+        "providers."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    "You are reviewing a clinical messages in order to determine if they have been "
+                    f"sent by a proxy user. Please determine the following: {question} with the "
+                    f"following context: {context} , Answer the question with a 'A' for yes or 'B' "
+                    "for no. Do not provide any additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(self.data_path, msg=f"[SHCPROXYMedScenario] Required data file not found: '{self.data_path}'")
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(self.data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCPROXYMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCPROXYMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
helm/benchmark/scenarios/shc_ptbm_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -27,14 +28,19 @@ class SHCPTBMMedScenario(Scenario):
 
     name = "shc_ptbm_med"
     description = (
-        "
-        "
-        "
+        "ADHD-Behavior is a benchmark that evaluates a model’s ability to detect whether"
+        "a clinician recommends parent training in behavior management, an evidence-based"
+        "first-line treatment for young children diagnosed with ADHD. Each instance includes"
+        "a clinical note from a pediatric visit and a binary classification task."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -46,7 +52,7 @@ class SHCPTBMMedScenario(Scenario):
                 prompt = (
                     "You are reviewing a clinical note from health records of children with "
                     "attention deficit hyperactivity disorder (ADHD) and classifying mentions of "
-                    f"behavioral therapy. Provide an answer to the following {question} with the "
+                    f"behavioral therapy. Provide an answer to the following question: {question} with the "
                     f"following context: {context} , Answer the question with a 'A' for yes or 'B' "
                     "for no. Do not provide any additional details or response, just a simple A or B response."
                 )
@@ -54,10 +60,9 @@ class SHCPTBMMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path =
-
+        check_file_exists(self.data_path, msg=f"[SHCPTBMMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCPTBMMedScenario.POSSIBLE_ANSWER_CHOICES
helm/benchmark/scenarios/shc_sei_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -30,14 +31,19 @@ class SHCSEIMedScenario(Scenario):
 
     name = "shc_sei_med"
     description = (
-        "
-        "
-        "
+        "ADHD-MedEffects is a benchmark designed to evaluate whether clinical notes for"
+        "pediatric ADHD visits document medication side effect monitoring, which is a key recommendation"
+        "in clinical practice guidelines. The dataset supports binary classification"
+        "to detect presence or absence of side effect inquiries (SEI) within notes."
    )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -59,7 +65,7 @@ class SHCSEIMedScenario(Scenario):
                     "categorized as SEI because they consist of a plan or an explanation about "
                     "side effects without actual side effect monitoring taking place, and "
                     "No Side Effects Inquiry (NSEI): No evidence of side effects monitoring. "
-                    f"Provide an answer to the following {question} with the following context: {context} "
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
                     ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
                     "details or response, just a simple A or B response."
                 )
@@ -67,10 +73,9 @@ class SHCSEIMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path =
-
+        check_file_exists(self.data_path, msg=f"[SHCSEIMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCSEIMedScenario.POSSIBLE_ANSWER_CHOICES
helm/benchmark/scenarios/shc_sequoia_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -22,12 +23,18 @@ class SHCSequoiaMedScenario(Scenario):
 
     name = "shc_sequoia_med"
     description = (
-        "
+        "ClinicReferral is a benchmark that determines patient eligibility for referral to the"
+        "Sequoia Clinic based on information from palliative care notes. The dataset provides"
+        "curated decisions on referral appropriateness to assist in automating clinic workflows."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         counter = 1
@@ -38,7 +45,7 @@ class SHCSequoiaMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f" {counter} Provide an answer to the following {question} with the following context:"
+                    f" {counter} Provide an answer to the following question: {question} with the following context:"
                     f" {context} , Answer the question with a 'A' for yes or 'B' for no. Do not provide any "
                     "additional details or response, just a simple A or B response."
                 )
@@ -47,10 +54,11 @@ class SHCSequoiaMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-
-
+        check_file_exists(
+            self.data_path, msg=f"[SHCSequoiaMedScenario] Required data file not found: '{self.data_path}'"
+        )
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCSequoiaMedScenario.POSSIBLE_ANSWER_CHOICES
helm/benchmark/scenarios/starr_patient_instructions_scenario.py

@@ -1,8 +1,7 @@
-import os
 import csv
 from typing import List
 
-from helm.common.general import
+from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -40,19 +39,27 @@ class StarrPatientInstructionsScenario(Scenario):
     """
 
     name = "starr_patient_instructions"
-    description =
+    description = (
+        "PatientInstruct is a benchmark designed to evaluate models on generating personalized"
+        "post-procedure instructions for patients. It includes real-world patient History & Physical"
+        "Note (H&P) and operative report, from which models must produce clear, actionable instructions"
+        "appropriate for patients recovering from medical interventions."
+    )
     tags = ["patient_communication", "healthcare", "instruction_generation", "surgery"]
 
-    def
-
-
-        ensure_directory_exists(os.path.dirname(csv_path))
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
 
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(
+            self.data_path, msg=f"[StarrPatientInstructiosScenario] Required data file not found: '{self.data_path}'"
+        )
         instances: List[Instance] = []
         # For now, we assign all instances to the test split (zero-shot setting).
         split = TEST_SPLIT
 
-        with open(
+        with open(self.data_path, "r", encoding="utf-8") as csvfile:
             reader = csv.DictReader(csvfile)
             for row in reader:
                 # Retrieve and strip the relevant fields.
helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py

@@ -0,0 +1,18 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.infinite_bench_en_qa_scenario import InfiniteBenchEnQAScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG
+
+
+@pytest.mark.scenarios
+def test_infinite_bench_en_qa_scenario():
+    with TemporaryDirectory() as tmpdir:
+        scenario = InfiniteBenchEnQAScenario(max_num_words=10000000)
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 351
+        assert instances[0].split == "test"
+        assert len(instances[0].input.text) == 381829
+        assert len(instances[0].references) == 1
+        assert len(instances[0].references[0].output.text) == 8
+        assert instances[0].references[0].tags == [CORRECT_TAG]
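The new test is tagged with the scenarios pytest marker and fetches the InfiniteBench English QA data into a temporary directory, so it can be selected the same way as HELM's other scenario tests (assuming the marker is registered in the project's pytest configuration), for example:

pytest -m scenarios helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py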