crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +2 -2
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +16 -26
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +43 -13
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +7 -1
- helm/benchmark/presentation/summarize.py +84 -61
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +84 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +114 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +81 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +102 -55
- helm/clients/openai_responses_client.py +176 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +31 -6
- helm/clients/vertexai_client.py +17 -9
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +0 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +104 -12
- helm/common/local_context.py +140 -0
- helm/common/object_spec.py +23 -8
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +995 -45
- helm/config/model_metadata.yaml +780 -59
- helm/config/tokenizer_configs.yaml +224 -3
- helm/proxy/cli.py +4 -2
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -793
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/scenarios/ruler_qa_scenario_helper.py

@@ -133,7 +133,7 @@ def generate_samples(dataset: str, dataset_path: str, template: str, random_seed
         input_text, answer = generate_input_output(0, num_docs, template=template, random_seed=random_seed, qas=qas, docs=docs)
         # Calculate the number of tokens in the example
         total_tokens = len(_text_to_tokens(input_text + f' {answer}'))
-        print(f'Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Docs: {num_docs}')
+        # print(f'Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Docs: {num_docs}')
         if total_tokens + tokens_to_generate > max_seq_length:
             num_docs -= incremental
             break
@@ -142,7 +142,7 @@ def generate_samples(dataset: str, dataset_path: str, template: str, random_seed
         if num_docs > len(docs):
            num_docs = len(docs)
            break
-    print('Number of documents:', num_docs)
+    # print('Number of documents:', num_docs)

    # Generate samples
    for index in tqdm(range(num_samples)):
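The two silenced print calls above drop per-sample sizing diagnostics from the RULER QA sample generator. If that output is still wanted, a hedged alternative (not part of this release) is to route it through HELM's hierarchical logger rather than delete it; the sketch below uses hlog from helm.common.hierarchical_logger, with placeholder values standing in for the locals of generate_samples:

# Sketch only: forward the former print() diagnostics to HELM's logger.
from helm.common.hierarchical_logger import hlog

# Placeholder values for the locals used inside generate_samples.
max_seq_length, tokens_to_generate, total_tokens, num_docs = 4096, 128, 3900, 12

hlog(f"Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Docs: {num_docs}")
hlog(f"Number of documents: {num_docs}")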
helm/benchmark/scenarios/ruler_qa_scenarios.py

@@ -72,7 +72,7 @@ Question: {query} Answer:""" # noqa: E501

 class RULERHotpotQAScenario(_RULERQAScenario):
     name = "ruler_hotpotqa"
-    description = "
+    description = "RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question answering as a long-context scenario." # noqa: E501
     tags = ["long_context", "rag"]

     def __init__(self, max_num_words: int):
@@ -81,7 +81,7 @@ class RULERHotpotQAScenario(_RULERQAScenario):

 class RULERSQuADScenario(_RULERQAScenario):
     name = "ruler_squad"
-    description = "
+    description = "RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., 2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question answering as a long-context scenario." # noqa: E501
     tags = ["long_context", "rag"]

     def __init__(self, max_num_words: int):
helm/benchmark/scenarios/seahelm_scenario.py

@@ -1750,7 +1750,7 @@ class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
             text_noun = self.prompt_components["text_noun"]
             instruction = self.prompt_components["single_instruction"]

-            passage = "{question}
+            passage = "{question}\\{text_noun}: {text}\n{instruction}".format(
                 question=question.format(row["question_translated"]),
                 text_noun=text_noun,
                 text=row["text"],
@@ -1898,7 +1898,7 @@ class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
             text_noun = self.prompt_components["text_noun"]
             instruction = self.prompt_components["single_instruction"]

-            passage = "{question}
+            passage = "{question}\\{text_noun}: {text}\n{instruction}".format(
                 question=question.format(row["question_translated"]),
                 text_noun=text_noun,
                 text=row["text"],
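The change above rewrites the passage template as a single format string with named placeholders (the removed line is shown truncated by the diff viewer). A self-contained sketch of that pattern with made-up component strings; the real question, text_noun, and instruction values come from the scenario's prompt_components and the dataset row:

# Illustrative only: invented strings standing in for LINDSEA prompt components.
template = "{question}\n{text_noun}: {text}\n{instruction}"
passage = template.format(
    question="Is the presupposition of the question true?",
    text_noun="Text",
    text="The king of the island gave a speech.",
    instruction="Answer Yes or No.",
)
print(passage)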
helm/benchmark/scenarios/shc_bmt_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists

 csv.field_size_limit(sys.maxsize)

@@ -23,13 +24,19 @@ class SHCBMTMedScenario(Scenario):

     name = "shc_bmt_med"
     description = (
-        "
-        "
+        "BMT-Status is a benchmark composed of clinical notes and associated binary questions"
+        "related to bone marrow transplant (BMT), hematopoietic stem cell transplant (HSCT),"
+        "or hematopoietic cell transplant (HCT) status. The goal is to determine whether the"
+        "patient received a subsequent transplant based on the provided clinical documentation."
     )
     tags = ["knowledge", "reasoning", "biomedical"]

     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]

+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -39,7 +46,7 @@ class SHCBMTMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"Provide an answer to the following {question} with the following context: {context} "
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
                     ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
                     "details or response, just a simple A or B response."
                 )
@@ -47,10 +54,9 @@ class SHCBMTMedScenario(Scenario):
         return data

     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path =
-
+        check_file_exists(self.data_path, msg=f"[SHCBMTMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)

         for prompt, answer in benchmark_data.items():
             assert answer in SHCBMTMedScenario.POSSIBLE_ANSWER_CHOICES
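This and the following SHC scenario diffs all make the same change: the data_path that get_instances previously assigned inline (shown truncated in this diff) becomes a constructor argument that is validated with check_file_exists before loading. A hedged sketch of how a run spec can supply that argument through HELM's ScenarioSpec; the CSV path is a placeholder, since the clinical data itself is not distributed with crfm-helm:

# Sketch only: wiring the new data_path constructor argument via a ScenarioSpec.
from helm.benchmark.scenarios.scenario import ScenarioSpec

scenario_spec = ScenarioSpec(
    class_name="helm.benchmark.scenarios.shc_bmt_scenario.SHCBMTMedScenario",
    args={"data_path": "/path/to/bmt_status.csv"},  # hypothetical local path
)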
helm/benchmark/scenarios/shc_cdi_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists

 csv.field_size_limit(sys.maxsize)

@@ -24,13 +25,18 @@ class SHCCDIMedScenario(Scenario):

     name = "shc_cdi_med"
     description = (
-        "
-        "
+        "CDI-QA is a benchmark constructed from Clinical Documentation Integrity (CDI)"
+        "notes. It is used to evaluate a model's ability to verify clinical conditions based on"
+        "documented evidence in patient records."
     )
     tags = ["knowledge", "reasoning", "biomedical"]

     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]

+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -40,7 +46,7 @@ class SHCCDIMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"Provide an answer to the following {question} with the following context: {context} , "
+                    f"Provide an answer to the following question: {question} with the following context: {context} , "
                     "Answer the question with either 'A' for yes or 'B' for no. Do not provide any "
                     "additional details or response, just a simple A or B response."
                 )
@@ -48,10 +54,9 @@ class SHCCDIMedScenario(Scenario):
         return data

     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path =
-
+        check_file_exists(self.data_path, msg=f"[SHCCDIMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)

         for prompt, answer in benchmark_data.items():
             assert answer in SHCCDIMedScenario.POSSIBLE_ANSWER_CHOICES
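Each rewritten get_instances now fails fast through check_file_exists from helm.common.general (itself touched in this release, +9 -2). Its implementation is not visible in this diff; a minimal sketch of the assumed behaviour, namely raising with the supplied message when the path is missing:

import os

# Assumed stand-in for helm.common.general.check_file_exists;
# the real helper's exception type and wording may differ.
def check_file_exists(path: str, msg: str) -> None:
    if not os.path.isfile(path):
        raise ValueError(msg)

try:
    check_file_exists("/tmp/missing.csv", msg="Required data file not found: '/tmp/missing.csv'")
except ValueError as e:
    print(e)  # raised before any instances are built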
helm/benchmark/scenarios/shc_conf_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists

 csv.field_size_limit(sys.maxsize)

@@ -24,13 +25,19 @@ class SHCCONFMedScenario(Scenario):

     name = "shc_conf_med"
     description = (
-        "
-        "
+        "MedConfInfo is a benchmark comprising clinical notes from adolescent patients. It is"
+        "used to evaluate whether the content contains sensitive protected health information"
+        "(PHI) that should be restricted from parental access, in accordance with adolescent"
+        "confidentiality policies in clinical care."
     )
     tags = ["knowledge", "reasoning", "biomedical"]

     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]

+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -40,7 +47,7 @@ class SHCCONFMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"Provide an answer to the following {question} with the following context: {context} "
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
                     ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
                     "details or response, just a simple A or B response."
                 )
@@ -48,10 +55,9 @@ class SHCCONFMedScenario(Scenario):
         return data

     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path =
-
+        check_file_exists(self.data_path, msg=f"[SHCCONFMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)

         for prompt, answer in benchmark_data.items():
             assert answer in SHCCONFMedScenario.POSSIBLE_ANSWER_CHOICES
helm/benchmark/scenarios/shc_ent_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists

 csv.field_size_limit(sys.maxsize)

@@ -23,13 +24,18 @@ class SHCENTMedScenario(Scenario):

     name = "shc_ent_med"
     description = (
-        "
-        "
+        "ENT-Referral is a benchmark designed to evaluate whether a patient's clinical note"
+        "supports a referral to an Ear, Nose, and Throat (ENT) specialist. It helps assess"
+        "models' abilities to make referral decisions based on unstructured clinical text."
     )
     tags = ["knowledge", "reasoning", "biomedical"]

     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B", "C"]

+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         counter = 1
@@ -41,7 +47,7 @@ class SHCENTMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"{counter} Provide an answer to the following {question} with the following context:"
+                    f"{counter} Provide an answer to the following question: {question} with the following context:"
                     f" {context} , Answer the question with either 'A' for yes, 'B' for no, or 'C' for no mention."
                     " Do not provide any additional details or response, just a simple A, B, or C response."
                 )
@@ -50,10 +56,9 @@ class SHCENTMedScenario(Scenario):
         return data

     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path =
-
+        check_file_exists(self.data_path, msg=f"[SHCENTMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)

         for prompt, answer in benchmark_data.items():
             assert answer in SHCENTMedScenario.POSSIBLE_ANSWER_CHOICES
helm/benchmark/scenarios/shc_gip_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists

 csv.field_size_limit(sys.maxsize)

@@ -22,11 +23,19 @@ class SHCGIPMedScenario(Scenario):
     """

     name = "shc_gip_med"
-    description =
+    description = (
+        "HospiceReferral is a benchmark that evaluates model performance in identifying"
+        "whether patients are eligible for hospice care based on palliative care clinical notes."
+        "The benchmark focuses on end-of-life care referral decisions."
+    )
     tags = ["knowledge", "reasoning", "biomedical"]

     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]

+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -36,7 +45,7 @@ class SHCGIPMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"Provide an answer to the following {question} with the following context: {context} "
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
                     ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
                     "details or response, just a simple A or B response."
                 )
@@ -44,10 +53,9 @@ class SHCGIPMedScenario(Scenario):
         return data

     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path =
-
+        check_file_exists(self.data_path, msg=f"[SHCGIPMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)

         for prompt, answer in benchmark_data.items():
             assert answer in SHCGIPMedScenario.POSSIBLE_ANSWER_CHOICES
helm/benchmark/scenarios/shc_privacy_scenario.py (new file)

@@ -0,0 +1,78 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import check_file_exists
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCPRIVACYMedScenario(Scenario):
+    """
+    This dataset features messages sent generated by an LLM from patient clinical notes data.
+    The scenario evaluates the ability of an LLM to determine if any potentially confidential
+    information about the patient was included. From publication: https://doi.org/10.1001/jamapediatrics.2024.4438
+    """
+
+    name = "shc_privacy_med"
+    description = (
+        "PrivacyDetection is a benchmark composed of patient portal messages submitted by"
+        "patients or caregivers. The task is to determine whether the message contains any"
+        "confidential or privacy-leaking information that should be protected."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    "You are reviewing clinical messages in order to determine if any confidential "
+                    f"information was included. Please answer the following question: {question} with the "
+                    f"following context: {context} , Answer the question with a 'A' for yes or 'B' "
+                    "for no. Do not provide any additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(
+            self.data_path, msg=f"[SHCPRIVACYMedScenario] Required data file not found: '{self.data_path}'"
+        )
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(self.data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCPRIVACYMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCPRIVACYMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
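Like the other SHC scenarios, the new class reads a CSV with prompt, context, and label columns, where label is "A" or "B". A self-contained sketch that exercises it against an invented one-row file; the row is fabricated for illustration and the real dataset is not distributed with the package:

# Sketch only: running SHCPRIVACYMedScenario over an invented CSV row.
import csv
import tempfile

from helm.benchmark.scenarios.shc_privacy_scenario import SHCPRIVACYMedScenario

with tempfile.NamedTemporaryFile("w", suffix=".csv", delete=False, newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["prompt", "context", "label"])
    writer.writeheader()
    writer.writerow({
        "prompt": "Does this message contain confidential information?",
        "context": "Please refill my allergy medication.",
        "label": "B",
    })
    data_path = f.name

scenario = SHCPRIVACYMedScenario(data_path=data_path)
instances = scenario.get_instances(output_path="unused")
print(len(instances))  # 1; the reference for "B" carries CORRECT_TAG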
helm/benchmark/scenarios/shc_proxy_scenario.py (new file)

@@ -0,0 +1,76 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import check_file_exists
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCPROXYMedScenario(Scenario):
+    """
+    This dataset features messages sent by proxy users and non proxy users, for evaluation of
+    LLM capabilities to determine the sender. From publication: https://doi.org/10.1001/jamapediatrics.2024.4438
+    """
+
+    name = "shc_proxy_med"
+    description = (
+        "ProxySender is a benchmark composed of patient portal messages received by clinicians."
+        "It evaluates whether the message was sent by the patient or by a proxy user (e.g., parent,"
+        "spouse), which is critical for understanding who is communicating with healthcare"
+        "providers."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    "You are reviewing a clinical messages in order to determine if they have been "
+                    f"sent by a proxy user. Please determine the following: {question} with the "
+                    f"following context: {context} , Answer the question with a 'A' for yes or 'B' "
+                    "for no. Do not provide any additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(self.data_path, msg=f"[SHCPROXYMedScenario] Required data file not found: '{self.data_path}'")
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(self.data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCPROXYMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCPROXYMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
helm/benchmark/scenarios/shc_ptbm_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists

 csv.field_size_limit(sys.maxsize)

@@ -27,14 +28,19 @@ class SHCPTBMMedScenario(Scenario):

     name = "shc_ptbm_med"
     description = (
-        "
-        "
-        "
+        "ADHD-Behavior is a benchmark that evaluates a model’s ability to detect whether"
+        "a clinician recommends parent training in behavior management, an evidence-based"
+        "first-line treatment for young children diagnosed with ADHD. Each instance includes"
+        "a clinical note from a pediatric visit and a binary classification task."
     )
     tags = ["knowledge", "reasoning", "biomedical"]

     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]

+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -46,7 +52,7 @@ class SHCPTBMMedScenario(Scenario):
                 prompt = (
                     "You are reviewing a clinical note from health records of children with "
                     "attention deficit hyperactivity disorder (ADHD) and classifying mentions of "
-                    f"behavioral therapy. Provide an answer to the following {question} with the "
+                    f"behavioral therapy. Provide an answer to the following question: {question} with the "
                     f"following context: {context} , Answer the question with a 'A' for yes or 'B' "
                     "for no. Do not provide any additional details or response, just a simple A or B response."
                 )
@@ -54,10 +60,9 @@ class SHCPTBMMedScenario(Scenario):
         return data

     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path =
-
+        check_file_exists(self.data_path, msg=f"[SHCPTBMMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)

         for prompt, answer in benchmark_data.items():
             assert answer in SHCPTBMMedScenario.POSSIBLE_ANSWER_CHOICES
helm/benchmark/scenarios/shc_sei_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists

 csv.field_size_limit(sys.maxsize)

@@ -30,14 +31,19 @@ class SHCSEIMedScenario(Scenario):

     name = "shc_sei_med"
     description = (
-        "
-        "
-        "
+        "ADHD-MedEffects is a benchmark designed to evaluate whether clinical notes for"
+        "pediatric ADHD visits document medication side effect monitoring, which is a key recommendation"
+        "in clinical practice guidelines. The dataset supports binary classification"
+        "to detect presence or absence of side effect inquiries (SEI) within notes."
     )
     tags = ["knowledge", "reasoning", "biomedical"]

     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]

+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -59,7 +65,7 @@ class SHCSEIMedScenario(Scenario):
                     "categorized as SEI because they consist of a plan or an explanation about "
                     "side effects without actual side effect monitoring taking place, and "
                     "No Side Effects Inquiry (NSEI): No evidence of side effects monitoring. "
-                    f"Provide an answer to the following {question} with the following context: {context} "
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
                     ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
                     "details or response, just a simple A or B response."
                 )
@@ -67,10 +73,9 @@ class SHCSEIMedScenario(Scenario):
         return data

     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path =
-
+        check_file_exists(self.data_path, msg=f"[SHCSEIMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)

         for prompt, answer in benchmark_data.items():
             assert answer in SHCSEIMedScenario.POSSIBLE_ANSWER_CHOICES
helm/benchmark/scenarios/shc_sequoia_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists

 csv.field_size_limit(sys.maxsize)

@@ -22,12 +23,18 @@ class SHCSequoiaMedScenario(Scenario):

     name = "shc_sequoia_med"
     description = (
-        "
+        "ClinicReferral is a benchmark that determines patient eligibility for referral to the"
+        "Sequoia Clinic based on information from palliative care notes. The dataset provides"
+        "curated decisions on referral appropriateness to assist in automating clinic workflows."
     )
     tags = ["knowledge", "reasoning", "biomedical"]

     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]

+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         counter = 1
@@ -38,7 +45,7 @@ class SHCSequoiaMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f" {counter} Provide an answer to the following {question} with the following context:"
+                    f" {counter} Provide an answer to the following question: {question} with the following context:"
                     f" {context} , Answer the question with a 'A' for yes or 'B' for no. Do not provide any "
                     "additional details or response, just a simple A or B response."
                 )
@@ -47,10 +54,11 @@ class SHCSequoiaMedScenario(Scenario):
         return data

     def get_instances(self, output_path: str) -> List[Instance]:
-
-
+        check_file_exists(
+            self.data_path, msg=f"[SHCSequoiaMedScenario] Required data file not found: '{self.data_path}'"
+        )
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)

         for prompt, answer in benchmark_data.items():
             assert answer in SHCSequoiaMedScenario.POSSIBLE_ANSWER_CHOICES